Coverage for eval_harness / dataset.py: 97%

34 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-02 20:06 +0200

1from __future__ import annotations 

2 

3from dataclasses import dataclass, field 

4from pathlib import Path 

5 

6import yaml 

7 

# Default dataset location, resolved relative to this file:
# <directory containing this file>/../data/golden_set.yaml
DEFAULT_GOLDEN_SET_PATH = Path(__file__).parent.parent.joinpath("data", "golden_set.yaml")

9 

10 

@dataclass(frozen=True)
class GoldenItem:
    """A single hand-curated eval item.

    Frozen so items can be used as parametrize ids and so a test can't
    mutate the dataset mid-run.

    Instances are normally built by ``load_golden_set``, which validates
    each row of the YAML file before constructing the item.
    """

    # Identifier of the item; ``load_golden_set`` rejects a dataset in which
    # two rows share the same id.
    id: str
    # NOTE(review): presumably the prompt posed to the system under
    # evaluation -- confirm against the callers.
    question: str
    # NOTE(review): presumably the reference answer the output is compared
    # with -- confirm against the callers.
    expected: str
    # ``load_golden_set`` only accepts values listed in ``_VALID_DIFFICULTIES``.
    difficulty: str
    # Required by ``load_golden_set``, but its value is not checked there.
    category: str
    # Optional labels; an empty tuple when none are given.
    tags: tuple[str, ...] = field(default_factory=tuple)

25 

26 

# Keys every row of the golden set must provide. Frozen so that no caller can
# alter the validation rules for the rest of the process by accident.
_REQUIRED_FIELDS = frozenset({"id", "question", "expected", "difficulty", "category"})

# The only values accepted for a row's "difficulty" key.
_VALID_DIFFICULTIES = frozenset({"easy", "medium", "hard"})

29 

30 

def load_golden_set(path: Path | str = DEFAULT_GOLDEN_SET_PATH) -> list[GoldenItem]:
    """Load and validate the golden set.

    Validation is intentionally strict: a malformed dataset should fail loud
    at load time.

    Args:
        path: YAML file to read. Its top-level value must be a non-empty
            list of mappings. Defaults to ``DEFAULT_GOLDEN_SET_PATH``.

    Returns:
        One ``GoldenItem`` per row, in file order.

    Raises:
        ValueError: If the document is not a non-empty list, a row is not a
            mapping, a required field is missing, ``difficulty`` is not an
            accepted value, ``tags`` is present but not a list, or an ``id``
            occurs more than once.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    raw = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
    if not isinstance(raw, list) or not raw:
        raise ValueError(f"Golden set at {path} must be a non-empty list")

    items: list[GoldenItem] = []
    seen_ids: set[str] = set()
    for i, row in enumerate(raw):
        if not isinstance(row, dict):
            raise ValueError(f"Item {i} is not a mapping: {row!r}")
        missing = _REQUIRED_FIELDS - row.keys()
        if missing:
            raise ValueError(f"Item {i} ({row.get('id')!r}) missing fields: {sorted(missing)}")

        difficulty = row["difficulty"]
        # The isinstance guard keeps an unhashable value (e.g. a YAML list)
        # from raising TypeError in the set lookup instead of this ValueError.
        if not isinstance(difficulty, str) or difficulty not in _VALID_DIFFICULTIES:
            raise ValueError(
                f"Item {row['id']!r} has invalid difficulty {difficulty!r}; "
                f"expected one of {sorted(_VALID_DIFFICULTIES)}"
            )

        # A missing, null or empty "tags" key means "no tags".
        raw_tags = row.get("tags") or ()
        # Reject scalars: tuple("math") would silently become
        # ('m', 'a', 't', 'h') rather than a single tag.
        if not isinstance(raw_tags, (list, tuple)):
            raise ValueError(
                f"Item {row['id']!r} has invalid tags {raw_tags!r}; expected a list"
            )

        if row["id"] in seen_ids:
            raise ValueError(f"Duplicate item id: {row['id']!r}")
        seen_ids.add(row["id"])

        items.append(
            GoldenItem(
                id=row["id"],
                question=row["question"],
                expected=row["expected"],
                difficulty=difficulty,
                category=row["category"],
                tags=tuple(raw_tags),
            )
        )
    return items