Coverage for eval_harness/scorers/semantic.py: 100%
22 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 20:06 +0200
1from __future__ import annotations
3from sentence_transformers import SentenceTransformer, util
5from eval_harness.scorers.base import Scorer, ScoreResult
# Default embedding model passed to SentenceTransformer: small and fast,
# downloaded automatically the first time a scorer is constructed.
DEFAULT_MODEL = "all-MiniLM-L6-v2"
# Default pass/fail cutoff on the clamped cosine similarity, in [0, 1].
# A starting point only — tune against real data (see SemanticScorer).
DEFAULT_THRESHOLD = 0.75
class SemanticScorer(Scorer):
    """Does the model's answer *mean* the same thing as the expected answer?

    How it works:

    1. A small neural network (the *embedding model*) turns each piece of
       text into a list of 384 numbers — a coordinate in a 384-dimensional
       "meaning space."
    2. Texts with similar meanings end up at nearby coordinates because the
       model was trained that way. "Paris" and "The capital of France is
       Paris." sit close; "Paris" and "Water boils at 100°C" sit far.
    3. We measure how close the two coordinates are with *cosine
       similarity* (the angle between them, not the distance — same
       direction = 1.0, perpendicular = 0.0).
    4. If the similarity is above a threshold, the scorer says PASS.

    None of this is a real understanding of meaning — it is pattern
    matching over how words co-occur in the model's training data.

    Two implementation details:

    - The default model is `all-MiniLM-L6-v2`: small, fast, downloads
      automatically the first time the scorer is constructed.
    - The threshold is the knob you tune against your own dataset.
      Too high collapses into exact match. Too low lets unrelated
      answers through. 0.75 is a starting point — expect to revisit
      it once you have real data.
    """

    name = "semantic"

    # Class-level cache: loading a model takes seconds and downloads weights
    # on first use. One process, one model load per model name, regardless
    # of how many SemanticScorer instances are created.
    _model_cache: dict[str, SentenceTransformer] = {}

    def __init__(
        self,
        threshold: float = DEFAULT_THRESHOLD,
        model_name: str = DEFAULT_MODEL,
    ) -> None:
        """Create a semantic scorer.

        Args:
            threshold: pass/fail cutoff on the clamped cosine similarity;
                must lie in [0, 1].
            model_name: sentence-transformers model used for embedding;
                loaded (and downloaded, if necessary) once per process.

        Raises:
            ValueError: if ``threshold`` is outside [0, 1].
        """
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"threshold must be in [0, 1], got {threshold}")
        self.threshold = threshold
        self.model_name = model_name
        # EAFP: single dict lookup on the common (cached) path; only
        # construct the model — slow, may hit the network — on a miss.
        try:
            self._model = self._model_cache[model_name]
        except KeyError:
            self._model = self._model_cache[model_name] = SentenceTransformer(
                model_name
            )

    async def score(self, question: str, output: str, expected: str) -> ScoreResult:
        """Score ``output`` against ``expected`` by embedding similarity.

        ``question`` is part of the Scorer interface but unused here:
        only the two answers are compared. The raw cosine (which can be
        negative for opposed directions) is clamped into [0, 1] so the
        reported score stays in the conventional 0-1 range; the result
        passes when the clamped value meets ``self.threshold``.
        """
        embeddings = self._model.encode([output, expected], convert_to_tensor=True)
        cosine = util.cos_sim(embeddings[0], embeddings[1]).item()
        clamped = max(0.0, min(1.0, cosine))
        passed = clamped >= self.threshold
        return ScoreResult(
            passed=passed,
            score=clamped,
            reason=f"cosine={clamped:.3f} {'>=' if passed else '<'} threshold={self.threshold}",
        )