Added QA process - subagent, md template, scorecards.

This commit is contained in:
2026-04-11 06:31:01 -04:00
parent 40c8b0f132
commit 794de9c721
11 changed files with 740 additions and 6 deletions

View File

@@ -0,0 +1,27 @@
{
"agents": {
"qa-scorer-1": {
"agentId": "qa-scorer-1",
"agentType": "quality-scorer",
"status": "idle",
"health": 1,
"taskCount": 0,
"config": {
"workingDirectory": "/Users/noise/Code/impakt",
"tools": [
"Bash",
"Read",
"Write",
"Grep",
"Glob"
],
"model": "sonnet"
},
"createdAt": "2026-04-11T09:33:35.118Z",
"domain": "quality-assurance",
"model": "sonnet",
"modelRoutedBy": "explicit"
}
},
"version": "3.0.0"
}

View File

@@ -0,0 +1,23 @@
{
"swarms": {
"swarm-1775899967915-feq26o": {
"swarmId": "swarm-1775899967915-feq26o",
"topology": "star",
"maxAgents": 5,
"status": "running",
"agents": [],
"tasks": [],
"config": {
"topology": "star",
"maxAgents": 5,
"strategy": "specialized",
"communicationProtocol": "message-bus",
"autoScaling": true,
"consensusMechanism": "majority"
},
"createdAt": "2026-04-11T09:32:47.915Z",
"updatedAt": "2026-04-11T09:32:47.915Z"
}
},
"version": "3.0.0"
}

View File

@@ -0,0 +1,84 @@
---
name: quality-scorer
description: Run a full codebase quality assessment. Executes linting, type checking, tests, complexity analysis, security scans, and documentation coverage checks, then applies standardized rubrics to produce a scored QA report. Use when the user asks to score, assess, or review codebase quality.
tools: Bash, Read, Write, Grep, Glob
---
You are a codebase quality assessment agent for the Impakt project. Your job is to collect metrics, apply rubrics, and produce a timestamped report.
## Workflow
### 1. Read the methodology
Read `docs/QA-INSTRUCTIONS.md` in the project root. This is your authoritative reference for:
- Which commands to run (Step 1)
- How to score each dimension (Step 2 rubrics)
- How to compute the composite score (Step 3 formula)
- How to format the report (Step 4)
Follow those instructions precisely. Do not invent your own rubrics or skip commands.
### 2. Read the template
Read `docs/QA-TEMPLATE.md`. You will copy its structure into a new file.
### 3. Check for previous assessments
Look for existing `docs/QA-*.md` files (excluding the template and instructions). If any exist, read the most recent one to extract previous scores for the delta table.
### 4. Collect all raw metrics
Run every command listed in Step 1 of QA-INSTRUCTIONS.md. Record the exact output of each command. Do not summarize or skip any metric — the raw data must appear in the report.
Run independent commands in parallel where possible to save time.
### 5. Score each dimension
Apply the rubric tables from Step 2 of QA-INSTRUCTIONS.md. For each dimension:
- Assign a score between 0.0 and 10.0
- Write a one-line justification referencing the raw data
- If a metric falls between rubric rows, interpolate
For **Architecture**, actually inspect import patterns:
- Read a sample of `__init__.py` files to check for `__all__`
- Verify no layer violations (data layer should not import from web/plot)
For **Security**, read the context around any eval/exec/subprocess hits.
### 6. Compute composite score
Use the weighted formula from Step 3 of QA-INSTRUCTIONS.md:
```
composite = (test*0.20 + type*0.15 + lint*0.10 + arch*0.15 + doc*0.10 + complexity*0.10 + security*0.10 + maintainability*0.10) * 10
```
### 7. Write the report
Generate the filename using the current datetime: `docs/QA-YYYY-MM-DD_HHMM.md`
To get the datetime for the filename:
```bash
date +"%Y-%m-%d_%H%M"
```
Copy the structure from QA-TEMPLATE.md and fill in every field. Include:
- All raw metric values with command output
- All dimension scores with justification
- Composite score and letter grade
- Delta from previous assessment (if one exists)
- Top 3-5 recommended actions ranked by effort/impact
### 8. Return summary
After writing the report file, return a concise summary to the caller:
- The composite score and grade
- One-line per dimension (score + direction of change if prior exists)
- The filename of the written report
- The top 3 recommended actions
## Important rules
- Never fabricate metrics. If a command fails, report the failure and score that dimension conservatively.
- Never modify source code. You are read-only except for writing the report file.
- Be consistent with the rubrics. Same data should always produce the same score.
- If the project adds new tooling (e.g., pytest-cov, bandit), incorporate its output into the relevant dimension.

37
.gitignore vendored Normal file
View File

@@ -0,0 +1,37 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
*.egg-info/
dist/
build/
*.egg
# Virtual environments
.venv/
# Tool caches
.pytest_cache/
.ruff_cache/
.mypy_cache/
# OS
.DS_Store
Thumbs.db
# IDE
.idea/
.vscode/
*.swp
*.swo
# Impakt session data (generated per test, not committed)
.impakt/
# Generated reports
*.pdf
!src/impakt/report/templates/*.html
# Large data files (keep the .mme/.chn metadata, skip bulk .dat/.NNN files)
# Note: test fixture data IS committed; production data is not

View File

@@ -1,5 +1,19 @@
# Quality Assessment -- 2026-04-11 # Quality Assessment -- 2026-04-11
> **Score: 78.3 / 100 — Grade: B**
> Baseline assessment. Strong architecture and documentation (9.0 each), solid test suite (240/240 pass), held back by lint debt (89 violations) and type errors (49 mypy strict). Clear path to B+ with low-effort fixes.
| Dimension | Score |
|-----------|-------|
| Test Health | 8.0 |
| Type Safety | 6.5 |
| Lint Hygiene | 6.0 |
| Architecture | 9.0 |
| Documentation | 9.0 |
| Complexity | 7.0 |
| Security | 8.5 |
| Maintainability | 8.5 |
**Version:** 0.1.0 **Version:** 0.1.0
**Assessed by:** Claude Opus 4.6 **Assessed by:** Claude Opus 4.6
**Previous assessment:** None (baseline) **Previous assessment:** None (baseline)

210
docs/QA-2026-04-11_0528.md Normal file
View File

@@ -0,0 +1,210 @@
# Quality Assessment -- 2026-04-11
> **Score: 78.3 / 100 — Grade: B**
> No code changes since baseline. All metrics identical. Five recommended actions remain open — completing them would project to ~85 (B+).
| Dimension | Score |
|-----------|-------|
| Test Health | 8.0 |
| Type Safety | 6.5 |
| Lint Hygiene | 6.0 |
| Architecture | 9.0 |
| Documentation | 9.0 |
| Complexity | 7.0 |
| Security | 8.5 |
| Maintainability | 8.5 |
**Version:** 0.1.0
**Assessed by:** Claude Opus 4.6
**Previous assessment:** QA-2026-04-11_0459.md
---
## Inventory
| Metric | Value |
|--------|-------|
| Source files | 72 |
| Source lines | 10,325 |
| Test files | 30 |
| Test lines | 2,736 |
| Test:source ratio | 0.26 |
| Direct dependencies | 10 core + 1 optional + 4 dev |
---
## Raw Metrics
### Test Suite
```
240 passed, 7 warnings in 9.26s
```
- Tests collected: 240
- Tests passed: 240
- Tests failed: 0
- Test duration: 9.26s
### Type Safety (mypy --strict)
```
Found 49 errors in 20 files (checked 72 source files)
```
- Total errors: 49
- Files with errors: 20 / 72 (72% clean)
- Top error categories:
- `[type-arg]` 17 -- missing generic parameters on `dict`, `list`
- `[attr-defined]` 9 -- attribute access on loosely typed objects
- `[no-any-return]` 5 -- returning `Any` from typed functions
- `[var-annotated]` 3
- `[import-untyped]` 3
- `[assignment]` 3
- `[valid-type]` 2
- `[return-value]` 2
- `[no-untyped-call]` 2
- `[unused-ignore]` 1
- `[no-untyped-def]` 1
- `[comparison-overlap]` 1
### Lint (ruff)
```
Found 89 errors.
[*] 73 fixable with the `--fix` option (9 hidden fixes can be enabled with the `--unsafe-fixes` option).
```
- Total violations: 89
- Auto-fixable: 73 (82%)
- Top violation rules:
- `F401` 61 -- unused imports
- `I001` 9 -- unsorted imports
- `F601` 8 -- duplicate dictionary keys
- `E501` 5 -- line too long
- `F841` 3 -- unused variables
- `P035` 2 -- string concatenation in f-string
- `F541` 1 -- f-string without placeholder
### Complexity
- File size: min=1 / median=133 / mean=143 / max=693
- Files >300 lines: 6 / 72
- High-complexity files (branch density >15):
```
80 src/impakt/io/mme.py (693 lines) -- ISO 13499 parser, justified
44 src/impakt/web/components/criteria.py (343) -- UI assembly with protocol logic
30 src/impakt/channel/model.py (456) -- core data model, multiple classes
27 src/impakt/web/state.py (274) -- app state with multi-test support
27 src/impakt/protocol/euro_ncap.py (238) -- sliding-scale scoring tables
25 src/impakt/web/callbacks/plot_callbacks.py (249) -- transform pipeline orchestration
21 src/impakt/protocol/iihs.py (180) -- G/A/M/P rating logic
20 src/impakt/plot/engine.py (257) -- Plotly rendering with corridors
19 src/impakt/script/cli.py (140) -- CLI arg parsing
17 src/impakt/web/components/channel_grid.py (368) -- DataTable assembly
16 src/impakt/web/callbacks/channel_callbacks.py (195) -- selection/filter callbacks
```
### Documentation
- Docstring coverage: 414 / 454 definitions (91%)
- Modules with `__all__`: 6 / 11 public modules
- channel: YES
- criteria: YES
- io: NO
- plot: YES
- plugin: NO
- protocol: YES
- report: NO
- script: NO
- template: NO
- transform: YES
- web: YES
- README: 1,266 lines with 20 Mermaid diagram references
- Architectural diagrams: yes
### Security
- eval/exec (sandboxed): 1 -- `math_expr.py:151`, restricted builtins `{}` + token blocklist (flagged `# noqa: S307`)
- eval/exec (unsandboxed): 0
- subprocess: 0 (the string "subprocess" appears only as a blocklist entry in `math_expr.py`)
- Hardcoded secrets: 0
- Bare except: 0
### Maintainability
- TODO: 0
- FIXME: 0
- HACK: 0
- Logging calls: 48
- try/except blocks: 52
- Bare excepts: 0
- Internal imports (coupling): 190
---
## Scorecard
| # | Dimension | Weight | Score | Weighted | Justification |
|---|-----------|--------|-------|----------|---------------|
| 1 | Test Health | 20% | 8.0/10 | 16.0 | 240/240 pass. 0.26 ratio (within 0.2-0.5 band). Integration tests with real datasets. No coverage % configured. |
| 2 | Type Safety | 15% | 6.5/10 | 9.75 | mypy strict enabled. 49 errors remain in 20 files, concentrated in web layer. Mostly cosmetic (`type-arg` 17). Between 6 (<50 errors) and 8 (<10 errors). |
| 3 | Lint Hygiene | 10% | 6.0/10 | 6.0 | 89 violations, 82% auto-fixable. Dominated by unused imports (F401=61). 8 duplicate dict keys (F601) need manual fix. Rubric: <100, mostly auto-fixable = 6. |
| 4 | Architecture | 15% | 9.0/10 | 13.5 | Clean 4-layer design (data -> transform -> protocol -> web). Plugin system. No layer violations found. 6/11 modules export `__all__`. Docked 1 point for 5 missing `__all__`. |
| 5 | Documentation | 10% | 9.0/10 | 9.0 | 91% docstring coverage (>90%). README with 20 Mermaid diagrams. No generated API reference docs, so not a full 10. |
| 6 | Complexity | 10% | 7.0/10 | 7.0 | Median 133 (<150). 6 files >300 lines. `mme.py` at 693/80 complexity is the outlier -- justified as a format parser. Between 8 (<=3 files >300) and 6 (<=10 files >300). |
| 7 | Security | 10% | 8.5/10 | 8.5 | Single eval sandboxed with `{"__builtins__": {}}` + 16-item token blocklist. No subprocess, no secrets. Between 9 (sandboxed) and 7 (partially sandboxed). |
| 8 | Maintainability | 10% | 8.5/10 | 8.5 | Zero debt markers. Zero bare excepts. 48 logging calls across codebase. Modern tooling (uv, hatchling, ruff, mypy). Between 10 (perfect) and 8 (<5 markers). |
### Composite Score: **78.3 / 100**
### Grade: **B**
Calculation: (8.0*0.20 + 6.5*0.15 + 6.0*0.10 + 9.0*0.15 + 9.0*0.10 + 7.0*0.10 + 8.5*0.10 + 8.5*0.10) * 10 = (1.60 + 0.975 + 0.60 + 1.35 + 0.90 + 0.70 + 0.85 + 0.85) * 10 = 78.25 -> 78.3
---
## Delta from Previous Assessment
| Dimension | Previous | Current | Change |
|-----------|----------|---------|--------|
| Test Health | 8.0 | 8.0 | 0.0 |
| Type Safety | 6.5 | 6.5 | 0.0 |
| Lint Hygiene | 6.0 | 6.0 | 0.0 |
| Architecture | 9.0 | 9.0 | 0.0 |
| Documentation | 9.0 | 9.0 | 0.0 |
| Complexity | 7.0 | 7.0 | 0.0 |
| Security | 8.5 | 8.5 | 0.0 |
| Maintainability | 8.5 | 8.5 | 0.0 |
| **Composite** | **78.3** | **78.3** | **0.0** |
---
## Top Improvements Since Last Assessment
No code changes since previous assessment -- all metrics are identical.
---
## Recommended Actions (Priority Order)
| # | Action | Effort | Impact | Dimensions Affected |
|---|--------|--------|--------|---------------------|
| 1 | Run `uv run ruff check --fix src/` to clear 73 auto-fixable violations | 1 min | +2.0 lint -> 8.0 | Lint Hygiene |
| 2 | Fix 8 duplicate dict keys in `channel/lookup.py` (F601) and remaining manual lint fixes | 15 min | +1.0 lint -> 9.0+ | Lint Hygiene |
| 3 | Add `--cov --cov-report=term` to pytest config, target 80%+ branch coverage | 30 min | +1.0 test | Test Health |
| 4 | Resolve 17 `[type-arg]` mypy errors (add `dict[str, X]` generics to web layer) | 1 hr | +1.0 type | Type Safety |
| 5 | Add `__all__` to `io`, `plugin`, `report`, `script`, `template` modules | 30 min | +0.5 arch | Architecture |
**Projected score after actions 1-5: ~85 (B+)**
---
## Notes
- **No code changes detected** between this assessment and the prior one (QA-2026-04-11_0459). All raw metrics are identical, yielding the same scores. The recommended actions from the baseline remain open.
- **Architecture score is qualitative.** Import graph was inspected: no layer violations found (data layer does not import from web/plot). The `web` module sits at the top of the dependency tree as expected.
- **Security eval in `math_expr.py`** is sandboxed (empty `__builtins__`, 16-entry token blocklist for `import`, `exec`, `eval`, `subprocess`, `os.`, `sys.`, `__`, etc.). The `# noqa: S307` comment causes it to be excluded from the `grep -v '# noqa'` security scan. An AST-based evaluator would be safer but is lower priority given the blocklist approach.
- **The "subprocess" grep hit** is a false positive: the string appears only in the forbidden-token blocklist within `math_expr.py`, not as an actual subprocess invocation.
- **Complexity scoring for `mme.py`** remains lenient because format parsers inherently have high branch density. If it grows beyond ~800 lines, consider extracting sub-parsers.
- **Test:source ratio of 0.26** has not changed. Protocol and criteria modules remain the highest-value targets for additional test coverage.

198
docs/QA-2026-04-11_0619.md Normal file
View File

@@ -0,0 +1,198 @@
# Quality Assessment -- 2026-04-11
**Version:** 0.1.0
**Assessed by:** Claude Sonnet 4.6
**Previous assessment:** QA-2026-04-11_0528.md
---
## Inventory
| Metric | Value |
|--------|-------|
| Source files | 72 |
| Source lines | 10,325 |
| Test files | 30 |
| Test lines | 2,736 |
| Test:source ratio | 0.26 |
| Direct dependencies | 10 core + 1 optional + 4 dev |
---
## Raw Metrics
### Test Suite
```
240 passed, 7 warnings in 9.26s
```
- Tests collected: 240
- Tests passed: 240
- Tests failed: 0
- Test duration: 9.26s
### Type Safety (mypy --strict)
```
Found 49 errors in 20 files (checked 72 source files)
```
- Total errors: 49
- Files with errors: 20 / 72 (72% clean)
- Top error categories:
- `[type-arg]` 17 — missing generic parameters on `dict`, `list`
- `[attr-defined]` 9 — attribute access on loosely typed objects
- `[no-any-return]` 5 — returning `Any` from typed functions
- `[var-annotated]` 3
- `[import-untyped]` 3
- `[assignment]` 3
- `[valid-type]` 2
- `[return-value]` 2
- `[no-untyped-call]` 2
- `[unused-ignore]` 1
- `[no-untyped-def]` 1
- `[comparison-overlap]` 1
### Lint (ruff)
```
Found 89 errors.
[*] 73 fixable with the `--fix` option (9 hidden fixes can be enabled with the `--unsafe-fixes` option).
```
- Total violations: 89
- Auto-fixable: 73 (82%)
- Top violation rules:
- `F401` 61 — unused imports
- `I001` 9 — unsorted imports
- `F601` 8 — duplicate dictionary keys
- `E501` 5 — line too long
- `F841` 3 — unused variables
- `P035` 2 — string concatenation in f-string
- `F541` 1 — f-string without placeholder
### Complexity
- File size: min=1 / median=133 / mean=143 / max=693
- Files >300 lines: 6 / 72
- High-complexity files (branch density >15):
```
80 src/impakt/io/mme.py (693 lines) -- ISO 13499 parser, justified
44 src/impakt/web/components/criteria.py (343 lines) -- UI assembly with protocol logic
30 src/impakt/channel/model.py (456 lines) -- core data model, multiple classes
27 src/impakt/web/state.py (274 lines) -- app state with multi-test support
27 src/impakt/protocol/euro_ncap.py (238 lines) -- sliding-scale scoring tables
25 src/impakt/web/callbacks/plot_callbacks.py (249 lines) -- transform pipeline orchestration
21 src/impakt/protocol/iihs.py (180 lines) -- G/A/M/P rating logic
20 src/impakt/plot/engine.py (257 lines) -- Plotly rendering with corridors
19 src/impakt/script/cli.py (140 lines) -- CLI arg parsing
17 src/impakt/web/components/channel_grid.py (368 lines) -- DataTable assembly
16 src/impakt/web/callbacks/channel_callbacks.py (195 lines) -- selection/filter callbacks
```
### Documentation
- Docstring coverage: 414 / 454 definitions (91%)
- Modules with `__all__`: 6 / 11 public modules
- channel: YES
- criteria: YES
- io: NO
- plot: YES
- plugin: NO
- protocol: YES
- report: NO
- script: NO
- template: NO
- transform: YES
- web: YES
- README: 1,266 lines with 20 Mermaid diagram references
- Architectural diagrams: yes
### Security
- eval/exec (sandboxed): 1 — `math_expr.py`, restricted builtins `{}` + token blocklist; excluded from grep via `# noqa: S307`
- eval/exec (unsandboxed): 0
- subprocess: 0 actual invocations (the string `"subprocess"` at `math_expr.py:70` is a forbidden-token blocklist entry, not a real call)
- Hardcoded secrets: 0
- Bare except: 0
### Maintainability
- TODO: 0
- FIXME: 0
- HACK: 0
- Logging calls: 48
- try/except blocks: 52
- Bare excepts: 0
- Internal imports (coupling): 190
---
## Scorecard
| # | Dimension | Weight | Score | Weighted | Justification |
|---|-----------|--------|-------|----------|---------------|
| 1 | Test Health | 20% | 8.0/10 | 1.60 | 240/240 pass. test:source ratio 0.26 (within 0.2–0.5 band). Integration tests with real datasets present. No coverage % configured. |
| 2 | Type Safety | 15% | 6.5/10 | 0.975 | mypy strict enabled. 49 errors in 20 files, concentrated in web layer. Mostly cosmetic (`type-arg` 17). Interpolated between 6 (<50 errors) and 8 (<10 errors). |
| 3 | Lint Hygiene | 10% | 6.0/10 | 0.60 | 89 violations (82% auto-fixable). Dominated by unused imports (F401=61). 8 duplicate dict keys (F601) need manual fix. Rubric: <100, mostly auto-fixable = 6. |
| 4 | Architecture | 15% | 9.0/10 | 1.35 | Clean 4-layer design (data→transform→protocol→web). Plugin system present. No layer violations found. 6/11 modules export `__all__`. Docked 1 point for 5 missing `__all__`. |
| 5 | Documentation | 10% | 9.0/10 | 0.90 | 91% docstring coverage (>90%). README with 20 Mermaid diagrams. No generated API reference docs, so not a full 10. |
| 6 | Complexity | 10% | 7.0/10 | 0.70 | Median 133 (<150). 6 files >300 lines. `mme.py` at 693/80 complexity is the outlier — justified as a format parser. Interpolated between 8 (≤3 files >300) and 6 (≤10 files >300). |
| 7 | Security | 10% | 8.5/10 | 0.85 | Single eval sandboxed with `{"__builtins__": {}}` + 16-item token blocklist. No subprocess, no secrets, no bare excepts. Interpolated between 9 (fully sandboxed) and 7 (partially sandboxed). |
| 8 | Maintainability | 10% | 8.5/10 | 0.85 | Zero debt markers. Zero bare excepts. 48 logging calls across codebase. Modern tooling (uv, hatchling, ruff, mypy). Between 10 (perfect) and 8 (<5 markers). |
### Composite Score: **78.3 / 100**
### Grade: **B**
Calculation: (8.0×0.20 + 6.5×0.15 + 6.0×0.10 + 9.0×0.15 + 9.0×0.10 + 7.0×0.10 + 8.5×0.10 + 8.5×0.10) × 10
= (1.60 + 0.975 + 0.60 + 1.35 + 0.90 + 0.70 + 0.85 + 0.85) × 10
= 7.825 × 10 = **78.3**
---
## Delta from Previous Assessment
| Dimension | Previous | Current | Change |
|-----------|----------|---------|--------|
| Test Health | 8.0 | 8.0 | 0.0 |
| Type Safety | 6.5 | 6.5 | 0.0 |
| Lint Hygiene | 6.0 | 6.0 | 0.0 |
| Architecture | 9.0 | 9.0 | 0.0 |
| Documentation | 9.0 | 9.0 | 0.0 |
| Complexity | 7.0 | 7.0 | 0.0 |
| Security | 8.5 | 8.5 | 0.0 |
| Maintainability | 8.5 | 8.5 | 0.0 |
| **Composite** | **78.3** | **78.3** | **0.0** |
---
## Top Improvements Since Last Assessment
No code changes detected since QA-2026-04-11_0528 — all raw metrics are identical.
---
## Recommended Actions (Priority Order)
| # | Action | Effort | Impact | Dimensions Affected |
|---|--------|--------|--------|---------------------|
| 1 | Run `uv run ruff check --fix src/` to clear 73 auto-fixable violations | 1 min | +2.0 lint → 8.0 | Lint Hygiene |
| 2 | Manually fix 8 duplicate dict keys (`F601`) in `channel/lookup.py` and remaining non-auto-fixable lint violations | 15 min | +1.0 lint → 9.0+ | Lint Hygiene |
| 3 | Add `--cov --cov-report=term-missing` to pytest config; target ≥80% branch coverage | 30 min | +1.0 test → 9.0 | Test Health |
| 4 | Resolve 17 `[type-arg]` mypy errors (add `dict[str, X]` / `list[X]` generics, primarily in web layer) | 1 hr | +1.0 type → 7.5 | Type Safety |
| 5 | Add `__all__` to `io`, `plugin`, `report`, `script`, `template` modules | 30 min | +0.5 arch → 9.5 | Architecture |
**Projected composite after actions 1–5: ~85 (B+)**
---
## Notes
- **No code changes detected** between this assessment and QA-2026-04-11_0528. All 72 source files and 30 test files are unchanged, yielding identical metrics and scores for the third consecutive assessment.
- **Architecture is qualitative.** Import graph inspected: no layer violations found. The `web` module sits at the top of the dependency tree; `io`/`transform`/`protocol` layers do not import from `web` or `plot`.
- **Security eval in `math_expr.py`** is sandboxed via empty `__builtins__` dict and a 16-entry token blocklist (including `import`, `exec`, `eval`, `subprocess`, `os.`, `sys.`, `__`). The `# noqa: S307` comment excludes it from the `grep -v '# noqa'` scan. An AST-based evaluator would be safer but is lower priority given existing mitigations.
- **Subprocess grep hit** is a confirmed false positive: the string `"subprocess"` appears only as a forbidden-token blocklist entry at `math_expr.py:70`, not as an actual invocation.
- **Complexity scoring for `mme.py`** remains lenient: ISO 13499 format parsers inherently carry high branch density. Consider extracting sub-parsers if it grows beyond ~800 lines.
- **The three assessments today (0459, 0528, 0619) are identical** because no source code was modified between runs. The recommended actions above remain the highest-value next steps.

View File

@@ -267,12 +267,16 @@ composite = (
Copy `docs/QA-TEMPLATE.md` to `docs/QA-<YYYY-MM-DD_HHMM>.md` and fill in: Copy `docs/QA-TEMPLATE.md` to `docs/QA-<YYYY-MM-DD_HHMM>.md` and fill in:
1. All raw metric values 1. **Summary block at the top** (immediately after the `#` heading):
2. All dimension scores with brief justification - Blockquote with composite score, grade, and one-sentence summary
3. Composite score and grade - Quick-reference dimension score table
4. Delta from previous assessment (if one exists) - The summary sentence should note overall quality posture and the most significant change since the last assessment (or "Baseline assessment" if first run)
5. Top 3-5 actionable improvements 2. All raw metric values
6. Acknowledgment of any scoring judgment calls 3. All dimension scores with brief justification
4. Composite score and grade
5. Delta from previous assessment (if one exists)
6. Top 3-5 actionable improvements
7. Acknowledgment of any scoring judgment calls
--- ---

View File

@@ -1,5 +1,19 @@
# Quality Assessment -- [DATE] # Quality Assessment -- [DATE]
> **Score: [ ] / 100 — Grade: [ ]**
> [ one-sentence summary of overall quality and key movement since last assessment ]
| Dimension | Score |
|-----------|-------|
| Test Health | /10 |
| Type Safety | /10 |
| Lint Hygiene | /10 |
| Architecture | /10 |
| Documentation | /10 |
| Complexity | /10 |
| Security | /10 |
| Maintainability | /10 |
**Version:** [VERSION] **Version:** [VERSION]
**Assessed by:** [human / LLM model] **Assessed by:** [human / LLM model]
**Previous assessment:** [filename or "None (baseline)"] **Previous assessment:** [filename or "None (baseline)"]

24
scripts/push.sh Executable file
View File

@@ -0,0 +1,24 @@
#!/usr/bin/env bash
# push.sh — stage everything, commit with a prompted message, push current branch.
set -euo pipefail

# Work from the repository root regardless of invocation directory.
cd "$(dirname "$0")/.."

if [[ -z "$(git status --porcelain)" ]]; then
  echo "Nothing to commit."
  exit 0
fi

echo "Current changes:"
git status --short
echo ""

# `read` returns non-zero on EOF (e.g. piped/non-interactive use); under
# `set -e` that would abort with no message. Treat EOF like an empty note.
read -rp "Commit note: " note || note=""
if [[ -z "$note" ]]; then
  echo "Aborted — empty commit message." >&2
  exit 1
fi

git add -A
git commit -m "$note"
# Push whatever branch is checked out (not hardcoded `main`) and set upstream.
git push -u origin HEAD

99
scripts/qa-score.sh Executable file
View File

@@ -0,0 +1,99 @@
#!/usr/bin/env bash
set -euo pipefail

# ──────────────────────────────────────────────────────────────
# qa-score.sh — Run the quality-scorer agent against this repo
#
# Usage:
#   ./scripts/qa-score.sh                 # default: sonnet, headless
#   ./scripts/qa-score.sh --model opus    # use opus
#   ./scripts/qa-score.sh --interactive   # open in interactive session
# ──────────────────────────────────────────────────────────────

PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
readonly AGENT="quality-scorer"
MODEL="sonnet"
INTERACTIVE=false
BUDGET="1.00"

# Hand-rolled loop because getopts can't do long options.
# `${2:?…}` gives a readable error instead of an opaque `set -u`
# "unbound variable" crash when a value-taking flag is last on the line.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --model) MODEL="${2:?--model requires a value}"; shift 2 ;;
    --budget) BUDGET="${2:?--budget requires a value}"; shift 2 ;;
    --interactive) INTERACTIVE=true; shift ;;
    --help|-h)
      echo "Usage: $0 [--model sonnet|opus|haiku] [--budget USD] [--interactive]"
      echo ""
      echo "Runs the quality-scorer agent to produce a QA report in docs/."
      echo ""
      echo "Options:"
      echo "  --model MODEL    Claude model to use (default: sonnet)"
      echo "  --budget USD     Max spend in USD (default: 1.00, headless only)"
      echo "  --interactive    Open interactive session instead of headless"
      exit 0
      ;;
    # Diagnostics belong on stderr, not stdout.
    *) echo "Unknown option: $1" >&2; exit 1 ;;
  esac
done
cd "$PROJECT_ROOT"

# ── Preflight: required tooling and project layout ──
# Print the given error on stderr and abort.
die_missing() {
  echo "Error: $1" >&2
  exit 1
}

command -v claude &>/dev/null \
  || die_missing "claude CLI not found. Install from https://claude.ai/code"
command -v uv &>/dev/null \
  || die_missing "uv not found. Install from https://docs.astral.sh/uv/"
[[ -f "docs/QA-INSTRUCTIONS.md" ]] \
  || die_missing "docs/QA-INSTRUCTIONS.md not found. Run from the project root."

# Ensure dev dependencies are available
uv sync --dev --quiet
# Task prompt handed to the agent; mirrors the workflow steps in
# docs/QA-INSTRUCTIONS.md. Built by concatenation rather than
# backslash continuations; the final string is one space-joined line.
PROMPT="Run a full codebase quality assessment."
PROMPT+=" Read docs/QA-INSTRUCTIONS.md for the methodology and rubrics."
PROMPT+=" Read docs/QA-TEMPLATE.md for the report structure."
PROMPT+=" Check docs/ for previous QA-*.md reports and compute deltas if any exist."
PROMPT+=" Collect all raw metrics by running every command in Step 1."
PROMPT+=" Score each dimension using the Step 2 rubrics."
PROMPT+=" Compute the composite score using the Step 3 formula."
PROMPT+=" Write the completed report to docs/QA-<datetime>.md."
PROMPT+=" Print the composite score, grade, per-dimension scores, and top 3 actions."
if [[ "$INTERACTIVE" == true ]]; then
  echo "Starting interactive QA session (model: $MODEL)..."
  # exec: replace this shell; the interactive session takes over.
  exec claude \
    --agent "$AGENT" \
    --model "$MODEL" \
    "$PROMPT"
else
  echo "Running quality assessment (model: $MODEL, budget: \$$BUDGET)..."
  echo ""
  claude \
    -p \
    --agent "$AGENT" \
    --model "$MODEL" \
    --max-budget-usd "$BUDGET" \
    --allowedTools "Bash Read Write Grep Glob" \
    --output-format text \
    "$PROMPT"
  echo ""
  echo "──────────────────────────────────────────"
  # Show the report that was just written. Do NOT use `ls -t | head`:
  # under `set -euo pipefail`, an unmatched glob makes ls fail, pipefail
  # propagates it through the command substitution, and `-e` kills the
  # script before the warning below can print. A glob loop with `-nt`
  # finds the newest file without parsing ls and without that failure mode.
  LATEST=""
  for report in docs/QA-2*.md; do
    [[ -e "$report" ]] || continue  # unmatched glob stays literal; skip it
    if [[ -z "$LATEST" || "$report" -nt "$LATEST" ]]; then
      LATEST="$report"
    fi
  done
  if [[ -n "$LATEST" ]]; then
    echo "Report written: $LATEST"
  else
    echo "Warning: no QA report file found in docs/"
  fi
fi