Coverage for eval_harness/scorers/judge.py: 100% (51 statements)


1"""LLM-as-judge: a second model grades the first model's answer. 

2 

3Plain-English version of what this file does, end to end: 

4 

51. Take the question, the expected answer, and the model's actual 

6 answer. Drop them into a grading-instructions template (see 

7 `RUBRIC_PROMPT`). 

82. Send that filled-in prompt to an LLM (a *separate* call from 

9 whatever produced the answer being graded). 

103. The LLM replies with a small JSON blob containing two integer 

11 scores (correctness 0-10, relevance 0-10) and a one-line 

12 explanation. 

134. Parse the JSON, average the two scores, divide by 10 to get a 

14 score in [0, 1], compare against a threshold, return PASS or FAIL. 
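
   (Worked example: correctness=8 and relevance=6 average to 7, so the
   combined score is 7 / 10 = 0.7, which just passes the default
   threshold of 0.7.)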

Why bother. The simpler scorers (exact match, BLEU, ROUGE, semantic
similarity) all compare *text shapes*. They can be fooled by:

- A right answer phrased very differently from the expected (e.g. a
  bulleted list of all 8 planets when the expected was just "8" —
  exact match and BLEU/ROUGE see no overlap; embeddings see different
  shape).
- A wrong answer that uses the right vocabulary (semantic similarity
  passes it).

A judge LLM reads both texts and can sometimes recognize "yes, this
answers the question" or "no, this is plausible-sounding nonsense"
where pure-text comparison can't.

Important caveat. When the judge and the answering model are the same
weights (the case in this project — both are llama3.2), the judge has
a blind spot for its own hallucinations. It produced the same
nonsense; it can't tell it apart from a real answer. Surfacing this
"self-grading bias" is one of the things this harness is here to
demonstrate — see `tests/test_calibration.py` and the
`procedural_001` item in `data/human_labels.yaml`.
"""

from __future__ import annotations

import json
import re
from collections.abc import Awaitable, Callable

from eval_harness.providers.ollama import OllamaProvider
from eval_harness.scorers.base import Scorer, ScoreResult

DEFAULT_THRESHOLD = 0.7
DEFAULT_MODEL = "llama3.2"

# The grading instructions sent to the judge. See LLMJudgeScorer's
# docstring for why the rubric looks the way it does.
RUBRIC_PROMPT = """You are a strict but fair grader for a question-answering system.

You will be given:
- A QUESTION
- An EXPECTED answer (a reference — the model's answer does NOT need to
  match it word-for-word to be correct)
- The MODEL'S answer to grade

Grade the model's answer on two dimensions, each 0-10:
- correctness: does the answer correctly answer the question?
  10 = fully correct (even if phrased differently from the expected answer)
  5 = partially correct, or correct but with significant inaccuracies
  0 = wrong, contradicts the expected answer, or hallucinated
- relevance: is the answer on-topic and responsive to the question?
  10 = directly addresses the question
  5 = partially on-topic, drifts
  0 = off-topic or refuses to answer

Respond with ONLY a JSON object, no other text:
{{"reasoning": "<one or two sentences explaining your scores>",
 "correctness": <int>,
 "relevance": <int>}}

QUESTION: {question}
EXPECTED: {expected}
MODEL'S ANSWER: {output}"""
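
# A note on the braces in the prompt above: the JSON skeleton uses
# doubled {{ }} so that str.format() emits literal braces, while
# {question}, {expected} and {output} stay live substitution fields.
# This is plain str.format() behavior, nothing project-specific:
#
#     >>> '{{"x": 1}} {q}'.format(q="hi")
#     '{"x": 1} hi'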


class JudgeParseError(RuntimeError):
    """Raised when the judge's response cannot be parsed into scores.

    A malformed judge response is an infrastructure problem (the judge
    model isn't following the rubric format), not "the model's answer
    was bad." Silently coercing to a 0 score would hide bugs in the
    judge prompt or the parsing logic.
    """


def _parse_judge_response(raw: str) -> tuple[int, int, str]:
    """Extract (correctness, relevance, reasoning) from the judge's text.

    Strategy: try strict JSON first. If the model wrapped the JSON in
    prose or markdown, try to find the first {...} block. As a last
    resort, regex-extract the two integer scores. If even that fails,
    raise JudgeParseError with the raw text — better than silently
    returning a 0.
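
    Illustrative examples, one per stage of the fallback chain (the
    strings are made up, not captured judge output):

    >>> _parse_judge_response('{"correctness": 8, "relevance": 9, "reasoning": "ok"}')
    (8, 9, 'ok')
    >>> _parse_judge_response('Sure! {"correctness": 8, "relevance": 9}')
    (8, 9, '')
    >>> _parse_judge_response("correctness: 7, relevance = 10")
    (7, 10, '(reasoning unparseable, scores extracted by regex fallback)')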

99 """ 

100 try: 

        data = json.loads(raw)
        return (
            int(data["correctness"]),
            int(data["relevance"]),
            str(data.get("reasoning", "")),
        )
    except (json.JSONDecodeError, KeyError, TypeError, ValueError):
        pass

    json_blob = re.search(r"\{.*\}", raw, re.DOTALL)
    if json_blob:
        try:
            data = json.loads(json_blob.group(0))
            return (
                int(data["correctness"]),
                int(data["relevance"]),
                str(data.get("reasoning", "")),
            )
        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
            pass

    correctness_match = re.search(r'"?correctness"?\s*[:=]\s*(\d+)', raw, re.IGNORECASE)
    relevance_match = re.search(r'"?relevance"?\s*[:=]\s*(\d+)', raw, re.IGNORECASE)
    if correctness_match and relevance_match:
        return (
            int(correctness_match.group(1)),
            int(relevance_match.group(1)),
            "(reasoning unparseable, scores extracted by regex fallback)",
        )

    raise JudgeParseError(f"Could not parse judge response: {raw!r}")


class LLMJudgeScorer(Scorer):
    """Calls a grading LLM and turns its reply into a PASS/FAIL.

    Three rubric design choices are worth knowing about, since they
    explain why `RUBRIC_PROMPT` looks the way it does:

    - **Reference is a hint, not a string to match.** The judge is told
      the expected answer is a reference — the model's answer can be
      phrased completely differently and still be correct. This is what
      lets the judge pass right-but-different-shape answers (a bulleted
      list when the expected was a single number, etc.).
    - **Two scores, not one.** Asking for *correctness* and *relevance*
      separately keeps two distinct failure modes from being smeared
      into a single number: a right-but-rambling answer scores high on
      correctness and low on relevance; a confident hallucination
      scores low on correctness and high on relevance. A single
      "quality" score loses that signal.
    - **Reasoning before score.** The rubric asks the judge to justify
      its score *before* writing the number down. A model asked for
      the score first tends to commit to a number and rationalize it.

    Construction:

    - `threshold` — pass/fail cutoff on the [0, 1] combined score.
    - `judge_fn` — async callable that takes a prompt and returns the
      judge's raw text. Defaults to a fresh `OllamaProvider` at
      temperature 0 (the judge should be deterministic). Tests inject a
      fake `judge_fn` to avoid touching HTTP at all (see the sketch
      below).
    - `model` — the Ollama model name to use when `judge_fn` is the
      default.
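
    A quick usage sketch (illustrative; a canned `judge_fn`, so no HTTP
    is involved and the judge's reply is fixed):

    >>> import asyncio
    >>> async def fake_judge(prompt: str) -> str:
    ...     return '{"correctness": 8, "relevance": 6, "reasoning": "close"}'
    >>> scorer = LLMJudgeScorer(threshold=0.7, judge_fn=fake_judge)
    >>> result = asyncio.run(scorer.score("Q?", "model answer", "reference"))
    >>> (result.passed, result.score)
    (True, 0.7)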

164 """ 

165 

166 name = "judge" 

    def __init__(
        self,
        threshold: float = DEFAULT_THRESHOLD,
        judge_fn: Callable[[str], Awaitable[str]] | None = None,
        model: str = DEFAULT_MODEL,
    ) -> None:
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"threshold must be in [0, 1], got {threshold}")
        self.threshold = threshold
        self.model = model
        if judge_fn is None:
            provider = OllamaProvider(model=model, timeout=180.0, temperature=0.0)
            judge_fn = provider.generate
        self._judge_fn = judge_fn

    def build_prompt(self, question: str, output: str, expected: str) -> str:
        """Render the rubric prompt the judge will see. Public so tests and
        debug paths can inspect what was actually sent without scoring."""
        return RUBRIC_PROMPT.format(question=question, expected=expected, output=output)

    async def score(self, question: str, output: str, expected: str) -> ScoreResult:
        prompt = self.build_prompt(question, output, expected)
        raw = await self._judge_fn(prompt)
        correctness, relevance, reasoning = _parse_judge_response(raw)

        # Clamp into 0-10 in case the judge invented an out-of-range score.
        correctness = max(0, min(10, correctness))
        relevance = max(0, min(10, relevance))

        score = (correctness + relevance) / 20.0
        passed = score >= self.threshold
        reason = (
            f"correctness={correctness}/10 relevance={relevance}/10 "
            f"score={score:.3f} {'>=' if passed else '<'} threshold={self.threshold} "
            f"| reasoning: {reasoning}"
        )
        return ScoreResult(passed=passed, score=score, reason=reason)
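

if __name__ == "__main__":  # pragma: no cover
    # Minimal smoke-test sketch, not exercised by the test suite: one
    # grading pass with a canned judge_fn so it runs without an Ollama
    # server. Drop the judge_fn argument to hit the real llama3.2 judge.
    import asyncio

    async def _canned_judge(prompt: str) -> str:
        return '{"correctness": 10, "relevance": 10, "reasoning": "exact"}'

    result = asyncio.run(
        LLMJudgeScorer(judge_fn=_canned_judge).score(
            question="How many planets are in the solar system?",
            output="8",
            expected="8",
        )
    )
    print(result)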