Coverage for eval_harness / scorers / rouge.py: 100%
18 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 20:06 +0200
1from __future__ import annotations
3from rouge_score import rouge_scorer
5from eval_harness.scorers.base import Scorer, ScoreResult
# Minimum F1 required for a pass; 0.40 follows summarization-paper convention.
DEFAULT_THRESHOLD = 0.40
# rouge-score variant key; "rougeL" is longest-common-subsequence ROUGE.
DEFAULT_VARIANT = "rougeL"
class RougeScorer(Scorer):
    """Measure word overlap between the model's answer and the expected one.

    How it works: ROUGE-L finds the *longest common subsequence* (LCS)
    of words shared by the two texts.  A subsequence preserves word
    order but allows gaps, so "the cat sat on the mat" and "the cat
    quietly sat on the warm mat" share the six-word LCS "the cat sat
    on the mat" even though the strings are not equal.  From the LCS
    length three numbers follow:

    - *recall* — the fraction of the *reference's* words captured by
      the LCS (did the model cover the reference?).
    - *precision* — the fraction of the *output's* words captured by
      the LCS (did the model stick to relevant words?).
    - *F1* — the harmonic mean of the two; this is what we report.

    BLEU leans on precision (translation asks "did the model pick the
    right words?") while ROUGE leans on recall (summarization asks
    "did the model cover the source?").  They are siblings, not
    competitors — both are vocabulary-overlap metrics blind to
    meaning, just viewed from different angles.

    Backed by `rouge-score` (Google's reference implementation).  The
    default variant is ROUGE-L F1; the default threshold of 0.40
    follows summarization-paper convention.

    Like BLEU, ROUGE-L collapses on this project's short-factual cases
    (a one-token reference against prose output puts precision at 1/N,
    dragging F1 down with it) but clears its threshold on
    prose-vs-prose pairs.
    """

    name = "rouge"

    def __init__(
        self,
        threshold: float = DEFAULT_THRESHOLD,
        variant: str = DEFAULT_VARIANT,
    ) -> None:
        """Configure the scorer.

        Args:
            threshold: Minimum F1 (inclusive) required to pass; must
                lie within [0, 1].
            variant: ROUGE variant key forwarded to `rouge-score`
                (e.g. "rouge1", "rouge2", "rougeL").

        Raises:
            ValueError: If ``threshold`` falls outside [0, 1].
        """
        if not (0.0 <= threshold <= 1.0):
            raise ValueError(f"threshold must be in [0, 1], got {threshold}")
        self.threshold = threshold
        self.variant = variant
        # use_stemmer=True folds inflected forms together before matching.
        self._scorer = rouge_scorer.RougeScorer([variant], use_stemmer=True)

    async def score(self, question: str, output: str, expected: str) -> ScoreResult:
        """Score ``output`` against ``expected``; ``question`` is unused.

        Returns a ScoreResult carrying the variant's F1 as ``score``
        and ``passed=True`` when that F1 meets the threshold.
        """
        # rouge-score's call order is (target, prediction): reference first.
        result = self._scorer.score(expected, output)
        f1 = result[self.variant].fmeasure
        passed = f1 >= self.threshold
        comparator = ">=" if passed else "<"
        return ScoreResult(
            passed=passed,
            score=f1,
            reason=f"{self.variant}_f1={f1:.3f} {comparator} threshold={self.threshold}",
        )