Coverage for eval_harness / scorers / rouge.py: 100%

18 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-02 20:06 +0200

1from __future__ import annotations 

2 

3from rouge_score import rouge_scorer 

4 

5from eval_harness.scorers.base import Scorer, ScoreResult 

6 

# Minimum F1 for a pass; 0.40 follows the summarization-paper convention
# noted in RougeScorer's docstring.
7DEFAULT_THRESHOLD = 0.40 

# Key passed to rouge-score; "rougeL" selects the longest-common-subsequence
# variant (as opposed to n-gram variants like "rouge1"/"rouge2").
8DEFAULT_VARIANT = "rougeL" 

9 

10 

class RougeScorer(Scorer):
    """Score how well the model's output covers the expected answer's words.

    Explanation: ROUGE-L finds the *longest common subsequence* (LCS)
    of words shared by the two texts — same order required, adjacency
    not. "the cat sat on the mat" vs "the cat quietly sat on the warm
    mat" share the six-word LCS "the cat sat on the mat" despite being
    different strings. From the LCS length it derives:

    - *recall* — fraction of the *reference's* words in the LCS
      (did the model cover the reference?).
    - *precision* — fraction of the *output's* words in the LCS
      (did the model stick to relevant words?).
    - *F1* — harmonic mean of the two; this is what we report.

    BLEU leans on precision (translation asks "did the model pick the
    right words?"); ROUGE leans on recall (summarization asks "did the
    model cover the source?"). Siblings, not rivals — both are
    vocabulary-overlap metrics blind to meaning, viewed from opposite
    ends.

    Backed by `rouge-score` (Google's reference implementation).
    Default variant is ROUGE-L F1; the 0.40 default threshold follows
    summarization-paper convention.

    Like BLEU, ROUGE-L collapses on this project's short-factual cases
    (one-token reference vs prose output → precision is 1/N, dragging
    F1 down) but clears its threshold on prose-vs-prose pairs.
    """

    name = "rouge"

    def __init__(
        self,
        threshold: float = DEFAULT_THRESHOLD,
        variant: str = DEFAULT_VARIANT,
    ) -> None:
        """Configure the scorer.

        Args:
            threshold: pass/fail cutoff on the F1 score; must lie in [0, 1].
            variant: rouge-score variant key (e.g. "rougeL", "rouge1").

        Raises:
            ValueError: if threshold is outside [0, 1].
        """
        if not (0.0 <= threshold <= 1.0):
            raise ValueError(f"threshold must be in [0, 1], got {threshold}")
        self.threshold = threshold
        self.variant = variant
        # Stemming folds inflections ("running" -> "run") so surface-form
        # differences don't break the word match.
        self._scorer = rouge_scorer.RougeScorer([variant], use_stemmer=True)

    async def score(self, question: str, output: str, expected: str) -> ScoreResult:
        """Return a ScoreResult comparing output against expected.

        `question` is part of the Scorer interface but unused here —
        ROUGE only compares the two answer texts. `expected` is treated
        as the reference, `output` as the candidate.
        """
        per_variant = self._scorer.score(expected, output)
        f1 = per_variant[self.variant].fmeasure
        met = f1 >= self.threshold
        comparator = ">=" if met else "<"
        return ScoreResult(
            passed=met,
            score=f1,
            reason=f"{self.variant}_f1={f1:.3f} {comparator} threshold={self.threshold}",
        )