Coverage for eval_harness/scorers/base.py: 100%
10 statements
« prev ^ index » next — coverage.py v7.13.5, created at 2026-05-02 20:06 +0200
1from __future__ import annotations
3from abc import ABC, abstractmethod
4from dataclasses import dataclass
@dataclass(frozen=True)
class ScoreResult:
    """Result of running one scorer on one (output, expected) pair.

    Attributes:
        passed: The scorer's binary verdict. Threshold-based scorers
            (semantic, judge) derive it as `score >= threshold`.
        score: Normalized quality score in [0.0, 1.0].
        reason: Short human-readable explanation, surfaced in reports.
    """

    passed: bool  # binary verdict (threshold scorers: score >= threshold)
    score: float  # normalized to [0.0, 1.0]
    reason: str  # short explanation shown in reports
class Scorer(ABC):
    """Abstract scoring function over (question, output, expected).

    `score` is declared async because some scorers (LLM-as-judge) make
    HTTP calls at score time; scorers without I/O (exact match, semantic
    similarity) simply never await. The harness runs under
    `asyncio_mode = "auto"`, making every test async by default, so an
    async-only interface is the consistent choice for this single
    runtime context.

    Production frameworks (DeepEval, Ragas) instead expose dual
    sync+async interfaces to avoid async contagion across many runtime
    contexts.
    """

    # Identifier for this scorer; concrete subclasses must set it.
    name: str

    @abstractmethod
    async def score(self, question: str, output: str, expected: str) -> ScoreResult:
        """Score `output` against `expected` in the context of `question`."""