Coverage for eval_harness / scorers / bleu.py: 100%
16 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 20:06 +0200
1from __future__ import annotations
3from sacrebleu.metrics import BLEU
5from eval_harness.scorers.base import Scorer, ScoreResult
# Pass/fail cutoff on the rescaled [0, 1] BLEU score. 0.30 follows the
# machine-translation-paper convention for a "decent translation"
# (see BleuScorer docstring for how it behaves on this project's corpus).
DEFAULT_THRESHOLD = 0.30
class BleuScorer(Scorer):
    """Score lexical overlap between model output and reference via BLEU.

    Explanation: BLEU measures how many short word-sequences (n-grams)
    the two texts share. An *n-gram* is a run of n consecutive words —
    "the cat" is a 2-gram, "the cat sat" a 3-gram. BLEU combines the
    matching counts for 1- through 4-grams, with two corrections:

    - *Brevity penalty* — an answer much shorter than the reference is
      scaled down, so a model can't win precision by emitting a single
      well-chosen word.
    - *Clipping* — repeating "the the the" against a reference that
      contains "the" once earns credit for only one occurrence.

    BLEU counts surface word sequences only; it has no notion of
    meaning. It dates to 2002 machine-translation research, where the
    reference is a full human-translated sentence. The raw `sacrebleu`
    score is on a 0-100 scale and is rescaled here to [0, 1].

    The default threshold of 0.30 mirrors the translation-paper
    convention ("decent translation"). On this project's short-factual
    corpus BLEU collapses toward zero on prose-wrapped answers, much
    like exact match, but clears the threshold on prose-vs-prose pairs.
    """

    name = "bleu"

    def __init__(self, threshold: float = DEFAULT_THRESHOLD) -> None:
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"threshold must be in [0, 1], got {threshold}")
        self.threshold = threshold
        # effective_order keeps very short sentences from zeroing out on
        # missing high-order n-grams.
        self._bleu = BLEU(effective_order=True)

    async def score(self, question: str, output: str, expected: str) -> ScoreResult:
        sentence = self._bleu.sentence_score(output, [expected])
        # sacrebleu reports 0-100; rescale to [0, 1] for threshold compare.
        value = sentence.score / 100.0
        meets = value >= self.threshold
        comparator = ">=" if meets else "<"
        return ScoreResult(
            passed=meets,
            score=value,
            reason=f"bleu={value:.3f} {comparator} threshold={self.threshold}",
        )