Coverage for eval_harness/scorers/judge.py: 100% (51 statements)


1"""LLM-as-judge: a second model grades the first model's answer. 

2 

3Plain-English version of what this file does, end to end: 

4 

51. Take the question, the expected answer, and the model's actual 

6 answer. Drop them into a grading-instructions template (see 

7 `RUBRIC_PROMPT`). 

82. Send that filled-in prompt to an LLM (a *separate* call from 

9 whatever produced the answer being graded). 

103. The LLM replies with a small JSON blob containing two integer 

11 scores (correctness 0-10, relevance 0-10) and a one-line 

12 explanation. 

134. Parse the JSON, average the two scores, divide by 10 to get a 

14 score in [0, 1], compare against a threshold, return PASS or FAIL. 
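
   (Worked example: correctness=8 and relevance=6 average to 7, so the
   combined score is 7 / 10 = 0.7, which just passes the default
   threshold of 0.7.)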

Why bother. The simpler scorers (exact match, BLEU, ROUGE, semantic
similarity) all compare *text shapes*. They can be fooled by:

- A right answer phrased very differently from the expected (e.g. a
  bulleted list of all 8 planets when the expected was just "8" —
  exact match and BLEU/ROUGE see no overlap; embeddings see different
  shape).
- A wrong answer that uses the right vocabulary (semantic similarity
  passes it).

A judge LLM reads both texts and can sometimes recognize "yes, this
answers the question" or "no, this is plausible-sounding nonsense"
where pure-text comparison can't.

Important caveat. When the judge and the answering model are the same
weights (the case in this project — both are llama3.2), the judge has
a blind spot for its own hallucinations. It produced the same
nonsense; it can't tell it apart from a real answer. Surfacing this
"self-grading bias" is one of the things this harness is here to
demonstrate — see `tests/test_calibration.py` and the
`procedural_001` item in `data/human_labels.yaml`.
"""

from __future__ import annotations

import json
import re
from collections.abc import Awaitable, Callable

from eval_harness.providers.ollama import OllamaProvider
from eval_harness.scorers.base import Scorer, ScoreResult

DEFAULT_THRESHOLD = 0.7
DEFAULT_MODEL = "llama3.2"

# The grading instructions sent to the judge. See LLMJudgeScorer's
# docstring for why the rubric looks the way it does.
RUBRIC_PROMPT = """You are a strict but fair grader for a question-answering system.

You will be given:
- A QUESTION
- An EXPECTED answer (a reference — the model's answer does NOT need to
  match it word-for-word to be correct)
- The MODEL'S answer to grade

Grade the model's answer on two dimensions, each 0-10:
- correctness: does the answer correctly answer the question?
  10 = fully correct (even if phrased differently from the expected answer)
  5 = partially correct, or correct but with significant inaccuracies
  0 = wrong, contradicts the expected answer, or hallucinated
- relevance: is the answer on-topic and responsive to the question?
  10 = directly addresses the question
  5 = partially on-topic, drifts
  0 = off-topic or refuses to answer

Respond with ONLY a JSON object, no other text:
{{"reasoning": "<one or two sentences explaining your scores>",
 "correctness": <int>,
 "relevance": <int>}}

QUESTION: {question}
EXPECTED: {expected}
MODEL'S ANSWER: {output}"""
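
# A note on the braces in the prompt above: the JSON skeleton uses
# doubled {{ }} so that str.format() emits literal braces, while
# {question}, {expected} and {output} stay live substitution fields.
# This is plain str.format() behavior, nothing project-specific:
#
#     >>> '{{"x": 1}} {q}'.format(q="hi")
#     '{"x": 1} hi'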


class JudgeParseError(RuntimeError):
    """Raised when the judge's response cannot be parsed into scores.

    A malformed judge response is an infrastructure problem (the judge
    model isn't following the rubric format), not "the model's answer
    was bad." Silently coercing to a 0 score would hide bugs in the
    judge prompt or the parsing logic.
    """


def _parse_judge_response(raw: str) -> tuple[int, int, str]:
    """Extract (correctness, relevance, reasoning) from the judge's text.

    Strategy: try strict JSON first. If the model wrapped the JSON in
    prose or markdown, try to find the first {...} block. As a last
    resort, regex-extract the two integer scores. If even that fails,
    raise JudgeParseError with the raw text — better than silently
    returning a 0.
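
    Illustrative examples, one per stage of the fallback chain (the
    strings are made up, not captured judge output):

    >>> _parse_judge_response('{"correctness": 8, "relevance": 9, "reasoning": "ok"}')
    (8, 9, 'ok')
    >>> _parse_judge_response('Sure! {"correctness": 8, "relevance": 9}')
    (8, 9, '')
    >>> _parse_judge_response("correctness: 7, relevance = 10")
    (7, 10, '(reasoning unparseable, scores extracted by regex fallback)')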

99 """ 

100 try: 

        data = json.loads(raw)
        return (
            int(data["correctness"]),
            int(data["relevance"]),
            str(data.get("reasoning", "")),
        )
    except (json.JSONDecodeError, KeyError, TypeError, ValueError):
        pass

    json_blob = re.search(r"\{.*\}", raw, re.DOTALL)
    if json_blob:
        try:
            data = json.loads(json_blob.group(0))
            return (
                int(data["correctness"]),
                int(data["relevance"]),
                str(data.get("reasoning", "")),
            )
        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
            pass

    correctness_match = re.search(r'"?correctness"?\s*[:=]\s*(\d+)', raw, re.IGNORECASE)
    relevance_match = re.search(r'"?relevance"?\s*[:=]\s*(\d+)', raw, re.IGNORECASE)
    if correctness_match and relevance_match:
        return (
            int(correctness_match.group(1)),
            int(relevance_match.group(1)),
            "(reasoning unparseable, scores extracted by regex fallback)",
        )

    raise JudgeParseError(f"Could not parse judge response: {raw!r}")


class LLMJudgeScorer(Scorer):
    """Calls a grading LLM and turns its reply into a PASS/FAIL.

    Three rubric design choices are worth knowing about, since they
    explain why `RUBRIC_PROMPT` looks the way it does:

    - **Reference is a hint, not a string to match.** The judge is told
      the expected answer is a reference — the model's answer can be
      phrased completely differently and still be correct. This is what
      lets the judge pass right-but-different-shape answers (a bulleted
      list when the expected was a single number, etc.).
    - **Two scores, not one.** Asking for *correctness* and *relevance*
      separately keeps two distinct failure modes from being smeared
      into a single number: a right-but-rambling answer scores high on
      correctness and low on relevance; a confident hallucination
      scores low on correctness and high on relevance. A single
      "quality" score loses that signal.
    - **Reasoning before score.** The rubric asks the judge to justify
      its score *before* writing the number down. A model asked for
      the score first tends to commit to a number and rationalize it.

    Construction:

    - `threshold` — pass/fail cutoff on the [0, 1] combined score.
    - `judge_fn` — async callable that takes a prompt and returns the
      judge's raw text. Defaults to a fresh `OllamaProvider` at
      temperature 0 (the judge should be deterministic). Tests inject a
      fake `judge_fn` to avoid touching HTTP at all (see the sketch
      below).
    - `model` — the Ollama model name to use when `judge_fn` is the
      default.
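
    A quick usage sketch (illustrative; a canned `judge_fn`, so no HTTP
    is involved and the judge's reply is fixed):

    >>> import asyncio
    >>> async def fake_judge(prompt: str) -> str:
    ...     return '{"correctness": 8, "relevance": 6, "reasoning": "close"}'
    >>> scorer = LLMJudgeScorer(threshold=0.7, judge_fn=fake_judge)
    >>> result = asyncio.run(scorer.score("Q?", "model answer", "reference"))
    >>> (result.passed, result.score)
    (True, 0.7)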

164 """ 

165 

166 name = "judge" 

    def __init__(
        self,
        threshold: float = DEFAULT_THRESHOLD,
        judge_fn: Callable[[str], Awaitable[str]] | None = None,
        model: str = DEFAULT_MODEL,
    ) -> None:
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"threshold must be in [0, 1], got {threshold}")
        self.threshold = threshold
        self.model = model
        if judge_fn is None:
            provider = OllamaProvider(model=model, timeout=180.0, temperature=0.0)
            judge_fn = provider.generate
        self._judge_fn = judge_fn

    def build_prompt(self, question: str, output: str, expected: str) -> str:
        """Render the rubric prompt the judge will see. Public so tests and
        debug paths can inspect what was actually sent without scoring."""
        return RUBRIC_PROMPT.format(question=question, expected=expected, output=output)

    async def score(self, question: str, output: str, expected: str) -> ScoreResult:
        prompt = self.build_prompt(question, output, expected)
        raw = await self._judge_fn(prompt)
        correctness, relevance, reasoning = _parse_judge_response(raw)

        # Clamp into 0-10 in case the judge invented an out-of-range score.
        correctness = max(0, min(10, correctness))
        relevance = max(0, min(10, relevance))

        score = (correctness + relevance) / 20.0
        passed = score >= self.threshold
        reason = (
            f"correctness={correctness}/10 relevance={relevance}/10 "
            f"score={score:.3f} {'>=' if passed else '<'} threshold={self.threshold} "
            f"| reasoning: {reasoning}"
        )
        return ScoreResult(passed=passed, score=score, reason=reason)
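

if __name__ == "__main__":  # pragma: no cover
    # Minimal smoke-test sketch, not exercised by the test suite: one
    # grading pass with a canned judge_fn so it runs without an Ollama
    # server. Drop the judge_fn argument to hit the real llama3.2 judge.
    import asyncio

    async def _canned_judge(prompt: str) -> str:
        return '{"correctness": 10, "relevance": 10, "reasoning": "exact"}'

    result = asyncio.run(
        LLMJudgeScorer(judge_fn=_canned_judge).score(
            question="How many planets are in the solar system?",
            output="8",
            expected="8",
        )
    )
    print(result)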