Coverage for eval_harness / dataset.py: 97%
34 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 20:06 +0200
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 20:06 +0200
1from __future__ import annotations
3from dataclasses import dataclass, field
4from pathlib import Path
6import yaml
# Default dataset location: "data/golden_set.yaml" two directory levels above this file.
DEFAULT_GOLDEN_SET_PATH = Path(__file__).parent.parent.joinpath("data", "golden_set.yaml")
@dataclass(frozen=True)
class GoldenItem:
    """One manually curated entry of the golden evaluation set.

    The dataclass is frozen: assigning to a field after construction raises,
    so a test cannot alter the dataset while a run is in progress, and the
    generated ``__hash__`` lets items be used as parametrize ids.
    """

    # Identifier of the item.
    id: str
    # The question text of the item.
    question: str
    # The expected answer for ``question``.
    expected: str
    # Difficulty label of the item.
    difficulty: str
    # Category label of the item.
    category: str
    # Optional tags; an empty tuple when none are given.
    tags: tuple[str, ...] = field(default_factory=tuple)
27_REQUIRED_FIELDS = {"id", "question", "expected", "difficulty", "category"}
28_VALID_DIFFICULTIES = {"easy", "medium", "hard"}
def _coerce_tags(item_id: object, raw_tags: object) -> tuple[str, ...]:
    """Convert a row's optional ``tags`` value into a tuple.

    A missing or otherwise falsy value yields ``()``. Any other value must be
    a list (or tuple); in particular a bare string is rejected, because
    ``tuple("foo")`` would silently split it into single characters.

    Raises:
        ValueError: if ``raw_tags`` is truthy but not a list or tuple.
    """
    if not raw_tags:
        return ()
    if not isinstance(raw_tags, (list, tuple)):
        raise ValueError(
            f"Item {item_id!r} has invalid tags {raw_tags!r}; expected a list"
        )
    return tuple(raw_tags)


def load_golden_set(path: Path | str = DEFAULT_GOLDEN_SET_PATH) -> list[GoldenItem]:
    """Load and validate the golden set.

    Validation is intentionally strict: a malformed dataset should fail loud
    at load time.

    Args:
        path: Location of the YAML file; defaults to ``DEFAULT_GOLDEN_SET_PATH``.

    Returns:
        One ``GoldenItem`` per row, in file order.

    Raises:
        ValueError: if the document is not a non-empty list, a row is not a
            mapping, a required field is missing, ``difficulty`` is not one
            of the accepted values, an ``id`` is duplicated or unhashable,
            or ``tags`` is not a list.

    Errors from reading the file or parsing the YAML are not caught here and
    propagate unchanged.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    raw = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
    if not isinstance(raw, list) or not raw:
        raise ValueError(f"Golden set at {path} must be a non-empty list")

    items: list[GoldenItem] = []
    seen_ids: set[str] = set()
    for i, row in enumerate(raw):
        if not isinstance(row, dict):
            raise ValueError(f"Item {i} is not a mapping: {row!r}")
        missing = _REQUIRED_FIELDS - row.keys()
        if missing:
            raise ValueError(f"Item {i} ({row.get('id')!r}) missing fields: {sorted(missing)}")

        item_id = row["id"]
        difficulty = row["difficulty"]
        # The isinstance check runs first: an unhashable value (e.g. a YAML
        # list) would otherwise raise TypeError inside the set membership test.
        if not isinstance(difficulty, str) or difficulty not in _VALID_DIFFICULTIES:
            raise ValueError(
                f"Item {item_id!r} has invalid difficulty {difficulty!r}; "
                f"expected one of {sorted(_VALID_DIFFICULTIES)}"
            )

        # An unhashable id cannot be stored in the seen-set; report it as a
        # dataset error instead of leaking a TypeError.
        try:
            duplicate = item_id in seen_ids
        except TypeError:
            raise ValueError(f"Item {i} has an unhashable id: {item_id!r}") from None
        if duplicate:
            raise ValueError(f"Duplicate item id: {item_id!r}")
        seen_ids.add(item_id)

        items.append(
            GoldenItem(
                id=item_id,
                question=row["question"],
                expected=row["expected"],
                difficulty=difficulty,
                category=row["category"],
                tags=_coerce_tags(item_id, row.get("tags")),
            )
        )
    return items