Coverage for eval_harness / scorers / bleu.py: 100%

16 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-02 20:06 +0200

1from __future__ import annotations 

2 

3from sacrebleu.metrics import BLEU 

4 

5from eval_harness.scorers.base import Scorer, ScoreResult 

6 

# Default pass/fail cutoff on the [0, 1]-rescaled BLEU scale; 0.30 follows
# the machine-translation literature's rule of thumb for a "decent
# translation" (see BleuScorer's docstring for how it behaves on this
# project's corpus).
DEFAULT_THRESHOLD = 0.30

8 

9 

class BleuScorer(Scorer):
    """Lexical-overlap scorer based on sentence-level BLEU.

    BLEU asks how many short word-sequences (n-grams: runs of 1 to 4
    consecutive words) the model's answer shares with the expected
    answer, then combines the counts with two corrections:

    - *Brevity penalty* -- an answer much shorter than the reference is
      scored down, so a model cannot earn high precision for free by
      emitting one well-chosen word.
    - *Clipping* -- a repeated word ("the the the") earns credit at most
      as many times as it appears in the reference.

    BLEU counts surface word sequences only; it has no notion of
    meaning. It was introduced in 2002 for machine translation, where
    the reference is a full human-translated sentence. ``sacrebleu``
    reports scores on a 0-100 scale; this scorer rescales to [0, 1].

    The default threshold 0.30 follows translation-paper convention
    ("decent translation"). On this project's short-factual corpus BLEU
    collapses near zero on prose-wrapped answers, just as exact match
    does, but rises above threshold on prose-vs-prose pairs.
    """

    name = "bleu"

    def __init__(self, threshold: float = DEFAULT_THRESHOLD) -> None:
        """Initialize the scorer.

        Args:
            threshold: pass/fail cutoff on the rescaled [0, 1] scale.

        Raises:
            ValueError: if ``threshold`` is outside [0, 1].
        """
        # Written as two chained checks (not De Morgan'd into or-of-
        # negations) so a NaN threshold still fails validation.
        if not (0.0 <= threshold and threshold <= 1.0):
            raise ValueError(f"threshold must be in [0, 1], got {threshold}")
        self.threshold = threshold
        # effective_order=True makes sentence-level BLEU use only the
        # n-gram orders that fit the candidate, so very short texts are
        # not forced to zero by empty 3-/4-gram counts.
        self._bleu = BLEU(effective_order=True)

    async def score(self, question: str, output: str, expected: str) -> ScoreResult:
        """Score ``output`` against ``expected``.

        ``question`` is accepted for interface compatibility but does
        not influence the BLEU computation.
        """
        sentence = self._bleu.sentence_score(output, [expected])
        normalized = sentence.score / 100.0  # sacrebleu scale is 0-100
        passed = normalized >= self.threshold
        comparator = ">=" if passed else "<"
        return ScoreResult(
            passed=passed,
            score=normalized,
            reason=f"bleu={normalized:.3f} {comparator} threshold={self.threshold}",
        )