Coverage for eval_harness/scorers/base.py: 100%
10 statements
« prev ^ index » next — coverage.py v7.13.5, created at 2026-05-02 20:06 +0200
1from __future__ import annotations
3from abc import ABC, abstractmethod
4from dataclasses import dataclass
@dataclass(frozen=True)
class ScoreResult:
    """Result of running one scorer on one (output, expected) pair.

    Attributes:
        passed: The scorer's binary verdict. Threshold-based scorers
            (semantic, judge) derive it as `score >= threshold`.
        score: Normalized quality score in [0.0, 1.0].
        reason: Short human-readable explanation, surfaced in reports.
    """

    passed: bool  # binary verdict (threshold scorers: score >= threshold)
    score: float  # normalized to [0.0, 1.0]
    reason: str  # short explanation shown in reports
class Scorer(ABC):
    """Abstract scoring function over (question, output, expected).

    `score` is declared async because some scorers (LLM-as-judge) make
    HTTP calls at score time; scorers without I/O (exact match, semantic
    similarity) simply never await. The harness runs under
    `asyncio_mode = "auto"`, making every test async by default, so an
    async-only interface is the consistent choice for this single
    runtime context.

    Production frameworks (DeepEval, Ragas) instead expose dual
    sync+async interfaces to avoid async contagion across many runtime
    contexts.
    """

    # Identifier for this scorer; concrete subclasses must set it.
    name: str

    @abstractmethod
    async def score(self, question: str, output: str, expected: str) -> ScoreResult:
        """Score `output` against `expected` in the context of `question`."""