Coverage for eval_harness / scorers / bleu.py: 100%

16 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-02 20:06 +0200

1from __future__ import annotations 

2 

3from sacrebleu.metrics import BLEU 

4 

5from eval_harness.scorers.base import Scorer, ScoreResult 

6 

# Default pass/fail cutoff on the [0, 1]-rescaled BLEU scale; 0.30 follows
# the machine-translation literature's rule of thumb for a "decent
# translation" (see BleuScorer's docstring for how it behaves on this
# project's corpus).
DEFAULT_THRESHOLD = 0.30

8 

9 

class BleuScorer(Scorer):
    """Lexical-overlap scorer based on sentence-level BLEU.

    BLEU asks how many short word-sequences (n-grams: runs of 1 to 4
    consecutive words) the model's answer shares with the expected
    answer, then combines the counts with two corrections:

    - *Brevity penalty* -- an answer much shorter than the reference is
      scored down, so a model cannot earn high precision for free by
      emitting one well-chosen word.
    - *Clipping* -- a repeated word ("the the the") earns credit at most
      as many times as it appears in the reference.

    BLEU counts surface word sequences only; it has no notion of
    meaning. It was introduced in 2002 for machine translation, where
    the reference is a full human-translated sentence. ``sacrebleu``
    reports scores on a 0-100 scale; this scorer rescales to [0, 1].

    The default threshold 0.30 follows translation-paper convention
    ("decent translation"). On this project's short-factual corpus BLEU
    collapses near zero on prose-wrapped answers, just as exact match
    does, but rises above threshold on prose-vs-prose pairs.
    """

    name = "bleu"

    def __init__(self, threshold: float = DEFAULT_THRESHOLD) -> None:
        """Initialize the scorer.

        Args:
            threshold: pass/fail cutoff on the rescaled [0, 1] scale.

        Raises:
            ValueError: if ``threshold`` is outside [0, 1].
        """
        # Written as two chained checks (not De Morgan'd into or-of-
        # negations) so a NaN threshold still fails validation.
        if not (0.0 <= threshold and threshold <= 1.0):
            raise ValueError(f"threshold must be in [0, 1], got {threshold}")
        self.threshold = threshold
        # effective_order=True makes sentence-level BLEU use only the
        # n-gram orders that fit the candidate, so very short texts are
        # not forced to zero by empty 3-/4-gram counts.
        self._bleu = BLEU(effective_order=True)

    async def score(self, question: str, output: str, expected: str) -> ScoreResult:
        """Score ``output`` against ``expected``.

        ``question`` is accepted for interface compatibility but does
        not influence the BLEU computation.
        """
        sentence = self._bleu.sentence_score(output, [expected])
        normalized = sentence.score / 100.0  # sacrebleu scale is 0-100
        passed = normalized >= self.threshold
        comparator = ">=" if passed else "<"
        return ScoreResult(
            passed=passed,
            score=normalized,
            reason=f"bleu={normalized:.3f} {comparator} threshold={self.threshold}",
        )