Coverage for eval_harness/scorers/semantic.py: 100%
22 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 20:06 +0200
1from __future__ import annotations
3from sentence_transformers import SentenceTransformer, util
5from eval_harness.scorers.base import Scorer, ScoreResult
# Default embedding model passed to SentenceTransformer: small and fast,
# downloaded automatically the first time a scorer is constructed.
DEFAULT_MODEL = "all-MiniLM-L6-v2"
# Default pass/fail cutoff on the clamped cosine similarity, in [0, 1].
# A starting point only — tune against real data (see SemanticScorer).
DEFAULT_THRESHOLD = 0.75
class SemanticScorer(Scorer):
    """Does the model's answer *mean* the same thing as the expected answer?

    How it works:

    1. A small neural network (the *embedding model*) turns each piece of
       text into a list of 384 numbers — a coordinate in a 384-dimensional
       "meaning space."
    2. Texts with similar meanings end up at nearby coordinates because the
       model was trained that way. "Paris" and "The capital of France is
       Paris." sit close; "Paris" and "Water boils at 100°C" sit far.
    3. We measure how close the two coordinates are with *cosine
       similarity* (the angle between them, not the distance — same
       direction = 1.0, perpendicular = 0.0).
    4. If the similarity is above a threshold, the scorer says PASS.

    None of this is a real understanding of meaning — it is pattern
    matching over how words co-occur in the model's training data.

    Two implementation details:

    - The default model is `all-MiniLM-L6-v2`: small, fast, downloads
      automatically the first time the scorer is constructed.
    - The threshold is the knob you tune against your own dataset.
      Too high collapses into exact match. Too low lets unrelated
      answers through. 0.75 is a starting point — expect to revisit
      it once you have real data.
    """

    name = "semantic"

    # Class-level cache: loading a model takes seconds and downloads weights
    # on first use. One process, one model load per model name, regardless
    # of how many SemanticScorer instances are created.
    _model_cache: dict[str, SentenceTransformer] = {}

    def __init__(
        self,
        threshold: float = DEFAULT_THRESHOLD,
        model_name: str = DEFAULT_MODEL,
    ) -> None:
        """Create a semantic scorer.

        Args:
            threshold: pass/fail cutoff on the clamped cosine similarity;
                must lie in [0, 1].
            model_name: sentence-transformers model used for embedding;
                loaded (and downloaded, if necessary) once per process.

        Raises:
            ValueError: if ``threshold`` is outside [0, 1].
        """
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"threshold must be in [0, 1], got {threshold}")
        self.threshold = threshold
        self.model_name = model_name
        # EAFP: single dict lookup on the common (cached) path; only
        # construct the model — slow, may hit the network — on a miss.
        try:
            self._model = self._model_cache[model_name]
        except KeyError:
            self._model = self._model_cache[model_name] = SentenceTransformer(
                model_name
            )

    async def score(self, question: str, output: str, expected: str) -> ScoreResult:
        """Score ``output`` against ``expected`` by embedding similarity.

        ``question`` is part of the Scorer interface but unused here:
        only the two answers are compared. The raw cosine (which can be
        negative for opposed directions) is clamped into [0, 1] so the
        reported score stays in the conventional 0-1 range; the result
        passes when the clamped value meets ``self.threshold``.
        """
        embeddings = self._model.encode([output, expected], convert_to_tensor=True)
        cosine = util.cos_sim(embeddings[0], embeddings[1]).item()
        clamped = max(0.0, min(1.0, cosine))
        passed = clamped >= self.threshold
        return ScoreResult(
            passed=passed,
            score=clamped,
            reason=f"cosine={clamped:.3f} {'>=' if passed else '<'} threshold={self.threshold}",
        )