Coverage for eval_harness / scorers / judge.py: 100%
51 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 20:06 +0200
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 20:06 +0200
1"""LLM-as-judge: a second model grades the first model's answer.
3Plain-English version of what this file does, end to end:
51. Take the question, the expected answer, and the model's actual
6 answer. Drop them into a grading-instructions template (see
7 `RUBRIC_PROMPT`).
82. Send that filled-in prompt to an LLM (a *separate* call from
9 whatever produced the answer being graded).
103. The LLM replies with a small JSON blob containing two integer
11 scores (correctness 0-10, relevance 0-10) and a one-line
12 explanation.
134. Parse the JSON, average the two scores, divide by 10 to get a
14 score in [0, 1], compare against a threshold, return PASS or FAIL.
16Why bother. The simpler scorers (exact match, BLEU, ROUGE, semantic
17similarity) all compare *text shapes*. They can be fooled by:
19- A right answer phrased very differently from the expected (e.g. a
20 bulleted list of all 8 planets when the expected was just "8" —
21 exact match and BLEU/ROUGE see no overlap; embeddings see different
22 shape).
23- A wrong answer that uses the right vocabulary (semantic similarity
24 passes it).
26A judge LLM reads both texts and can sometimes recognize "yes, this
27answers the question" or "no, this is plausible-sounding nonsense"
28where pure-text comparison can't.
30Important caveat. When the judge and the answering model are the same
31weights (the case in this project — both are llama3.2), the judge has
32a blind spot for its own hallucinations. It produced the same
33nonsense; it can't tell it apart from a real answer. Surfacing this
34"self-grading bias" is one of the things this harness is here to
35demonstrate — see `tests/test_calibration.py` and the
36`procedural_001` item in `data/human_labels.yaml`.
37"""
39from __future__ import annotations
41import json
42import re
43from collections.abc import Awaitable, Callable
45from eval_harness.providers.ollama import OllamaProvider
46from eval_harness.scorers.base import Scorer, ScoreResult
# Pass/fail cutoff on the combined [0, 1] judge score (see LLMJudgeScorer).
DEFAULT_THRESHOLD = 0.7
# Ollama model used when no judge_fn is injected into LLMJudgeScorer.
DEFAULT_MODEL = "llama3.2"

# The grading instructions sent to the judge. See LLMJudgeScorer's
# docstring for why the rubric looks the way it does.
# Filled in via str.format in build_prompt(): {question}, {expected} and
# {output} are substituted; the doubled braces ({{ }}) around the example
# JSON render as literal braces.
RUBRIC_PROMPT = """You are a strict but fair grader for a question-answering system.

You will be given:
- A QUESTION
- An EXPECTED answer (a reference — the model's answer does NOT need to
  match it word-for-word to be correct)
- The MODEL'S answer to grade

Grade the model's answer on two dimensions, each 0-10:
- correctness: does the answer correctly answer the question?
  10 = fully correct (even if phrased differently from the expected answer)
  5 = partially correct, or correct but with significant inaccuracies
  0 = wrong, contradicts the expected answer, or hallucinated
- relevance: is the answer on-topic and responsive to the question?
  10 = directly addresses the question
  5 = partially on-topic, drifts
  0 = off-topic or refuses to answer

Respond with ONLY a JSON object, no other text:
{{"reasoning": "<one or two sentences explaining your scores>",
 "correctness": <int>,
 "relevance": <int>}}

QUESTION: {question}
EXPECTED: {expected}
MODEL'S ANSWER: {output}"""
class JudgeParseError(RuntimeError):
    """The judge's reply could not be turned into a pair of scores.

    This signals an infrastructure failure — the judge model ignored
    the rubric's output format — rather than a low-quality answer from
    the model under test. Raising loudly (instead of quietly coercing
    to a 0 score) keeps bugs in the judge prompt or the parsing logic
    from masquerading as model failures.
    """
91def _parse_judge_response(raw: str) -> tuple[int, int, str]:
92 """Extract (correctness, relevance, reasoning) from the judge's text.
94 Strategy: try strict JSON first. If the model wrapped the JSON in
95 prose or markdown, try to find the first {...} block. As a last
96 resort, regex-extract the two integer scores. If even that fails,
97 raise JudgeParseError with the raw text — better than silently
98 returning a 0.
99 """
100 try:
101 data = json.loads(raw)
102 return (
103 int(data["correctness"]),
104 int(data["relevance"]),
105 str(data.get("reasoning", "")),
106 )
107 except (json.JSONDecodeError, KeyError, TypeError, ValueError):
108 pass
110 json_blob = re.search(r"\{.*\}", raw, re.DOTALL)
111 if json_blob:
112 try:
113 data = json.loads(json_blob.group(0))
114 return (
115 int(data["correctness"]),
116 int(data["relevance"]),
117 str(data.get("reasoning", "")),
118 )
119 except (json.JSONDecodeError, KeyError, TypeError, ValueError):
120 pass
122 correctness_match = re.search(r'"?correctness"?\s*[:=]\s*(\d+)', raw, re.IGNORECASE)
123 relevance_match = re.search(r'"?relevance"?\s*[:=]\s*(\d+)', raw, re.IGNORECASE)
124 if correctness_match and relevance_match:
125 return (
126 int(correctness_match.group(1)),
127 int(relevance_match.group(1)),
128 "(reasoning unparseable, scores extracted by regex fallback)",
129 )
131 raise JudgeParseError(f"Could not parse judge response: {raw!r}")
class LLMJudgeScorer(Scorer):
    """Calls a grading LLM and turns its reply into a PASS/FAIL.

    Three rubric design choices are worth knowing about, since they
    explain why `RUBRIC_PROMPT` looks the way it does:

    - **Reference is a hint, not a string to match.** The judge is told
      the expected answer is a reference — the model's answer can be
      phrased completely differently and still be correct. This is what
      lets the judge pass right-but-different-shape answers (a bulleted
      list when the expected was a single number, etc).
    - **Two scores, not one.** Asking for *correctness* and *relevance*
      separately keeps two distinct failure modes from being smeared
      into a single number: a right-but-rambling answer scores high on
      correctness and low on relevance; a confident hallucination
      scores low on correctness and high on relevance. A single
      "quality" score loses that signal.
    - **Reasoning before score.** The rubric asks the judge to justify
      its score *before* writing the number down. A model asked for
      the score first tends to commit to a number and rationalize it.

    Construction.

    - `threshold` — pass/fail cutoff on the [0, 1] combined score.
    - `judge_fn` — async callable that takes a prompt and returns the
      judge's raw text. Defaults to a fresh `OllamaProvider` at
      temperature 0 (judge wants determinism). Tests inject a fake
      `judge_fn` to avoid touching HTTP at all.
    - `model` — the Ollama model name to use when `judge_fn` is the
      default.
    """

    name = "judge"

    def __init__(
        self,
        threshold: float = DEFAULT_THRESHOLD,
        judge_fn: Callable[[str], Awaitable[str]] | None = None,
        model: str = DEFAULT_MODEL,
    ) -> None:
        # Chained comparison also rejects NaN (NaN fails both bounds).
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"threshold must be in [0, 1], got {threshold}")
        self.threshold = threshold
        self.model = model
        # Default judge: a dedicated Ollama call at temperature 0, so the
        # grade is as deterministic as the backend allows. Tests replace
        # this with a fake to stay off the network entirely.
        self._judge_fn = (
            judge_fn
            if judge_fn is not None
            else OllamaProvider(model=model, timeout=180.0, temperature=0.0).generate
        )

    def build_prompt(self, question: str, output: str, expected: str) -> str:
        """Render the rubric prompt the judge will see. Public so tests and
        debug paths can inspect what was actually sent without scoring."""
        return RUBRIC_PROMPT.format(
            question=question, expected=expected, output=output
        )

    async def score(self, question: str, output: str, expected: str) -> ScoreResult:
        """Grade `output` against `expected` via one judge-LLM call.

        Raises JudgeParseError (via `_parse_judge_response`) when the
        judge's reply doesn't follow the rubric's JSON format.
        """
        reply = await self._judge_fn(self.build_prompt(question, output, expected))
        correctness, relevance, reasoning = _parse_judge_response(reply)

        # Clamp into 0-10 in case the judge invented an out-of-range score.
        correctness = min(10, max(0, correctness))
        relevance = min(10, max(0, relevance))

        # Average of the two 0-10 scores, normalized into [0, 1].
        combined = (correctness + relevance) / 20.0
        passed = combined >= self.threshold
        comparator = ">=" if passed else "<"
        reason = (
            f"correctness={correctness}/10 relevance={relevance}/10 "
            f"score={combined:.3f} {comparator} threshold={self.threshold} "
            f"| reasoning: {reasoning}"
        )
        return ScoreResult(passed=passed, score=combined, reason=reason)