Coverage for eval_harness/dataset.py: 97%

1from __future__ import annotations

3from dataclasses import dataclass, field

4from pathlib import Path

6import yaml

# Default dataset location, resolved relative to this module rather than the
# current working directory: <two directories up>/data/golden_set.yaml.
DEFAULT_GOLDEN_SET_PATH = Path(__file__).parent.parent.joinpath("data", "golden_set.yaml")

@dataclass(frozen=True)
class GoldenItem:
    """One hand-curated entry of the golden evaluation set.

    Instances are immutable (``frozen=True``): that keeps them hashable, so
    they can serve as parametrize ids, and prevents a test from altering the
    dataset while a run is in progress.

    The class itself performs no validation; ``load_golden_set`` checks the
    rows before constructing items.
    """

    # Identifier of the item; ``load_golden_set`` rejects duplicates.
    id: str
    # Prompt text for the item.
    question: str
    # Reference answer for the item.
    expected: str
    # Difficulty label; ``load_golden_set`` restricts it to _VALID_DIFFICULTIES.
    difficulty: str
    # Free-form grouping label.
    category: str
    # Optional labels; defaults to an empty tuple when none are given.
    tags: tuple[str, ...] = field(default_factory=tuple)

# Keys that every golden-set row must provide ("tags" is optional).
_REQUIRED_FIELDS = {
    "id",
    "question",
    "expected",
    "difficulty",
    "category",
}

# The only accepted values for a row's "difficulty" key.
_VALID_DIFFICULTIES = {
    "easy",
    "medium",
    "hard",
}

def load_golden_set(path: Path | str = DEFAULT_GOLDEN_SET_PATH) -> list[GoldenItem]:
    """Load and validate the golden set.

    Validation is intentionally strict: a malformed dataset should fail loud
    at load time.

    Args:
        path: Location of the YAML file. The file is decoded as UTF-8.

    Returns:
        One ``GoldenItem`` per row, in file order.

    Raises:
        ValueError: If the document is not a non-empty list, a row is not a
            mapping, a required field is missing, ``difficulty`` is not one
            of the accepted values, an ``id`` occurs more than once, or
            ``tags`` is present but is not a list.
        OSError: If the file cannot be read.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Decode explicitly: the locale default would make loading depend on the
    # platform the tests happen to run on.
    raw = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
    if not isinstance(raw, list) or not raw:
        raise ValueError(f"Golden set at {path} must be a non-empty list")

    items: list[GoldenItem] = []
    seen_ids: set[str] = set()
    for i, row in enumerate(raw):
        if not isinstance(row, dict):
            raise ValueError(f"Item {i} is not a mapping: {row!r}")

        missing = _REQUIRED_FIELDS - row.keys()
        if missing:
            raise ValueError(f"Item {i} ({row.get('id')!r}) missing fields: {sorted(missing)}")

        difficulty = row["difficulty"]
        # Type check first: an unhashable value (e.g. a YAML list) would make
        # the set-membership test raise TypeError instead of ValueError.
        if not isinstance(difficulty, str) or difficulty not in _VALID_DIFFICULTIES:
            raise ValueError(
                f"Item {row['id']!r} has invalid difficulty {difficulty!r}; "
                f"expected one of {sorted(_VALID_DIFFICULTIES)}"
            )

        if row["id"] in seen_ids:
            raise ValueError(f"Duplicate item id: {row['id']!r}")
        seen_ids.add(row["id"])

        # A missing, null or empty "tags" entry means "no tags".
        raw_tags = row.get("tags") or ()
        # Reject scalars explicitly: tuple("foo") would silently become
        # ('f', 'o', 'o'), and tuple(5) would raise TypeError.
        if not isinstance(raw_tags, (list, tuple)):
            raise ValueError(
                f"Item {row['id']!r} has invalid tags {raw_tags!r}; expected a list"
            )

        items.append(
            GoldenItem(
                id=row["id"],
                question=row["question"],
                expected=row["expected"],
                difficulty=difficulty,
                category=row["category"],
                tags=tuple(raw_tags),
            )
        )
    return items

Coverage for eval_harness / dataset.py: 97%

34 statements