Coverage for eval_harness/scorers/semantic.py: 100%

22 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-02 20:06 +0200

from __future__ import annotations

from sentence_transformers import SentenceTransformer, util

from eval_harness.scorers.base import Scorer, ScoreResult

# Embedding model used when the caller does not pick one: small, fast, and
# downloaded automatically the first time a scorer is constructed.
DEFAULT_MODEL = "all-MiniLM-L6-v2"
# Cosine-similarity cutoff for PASS. A starting point, not a universal value —
# tune it against real data (too high ≈ exact match; too low admits noise).
DEFAULT_THRESHOLD = 0.75

11class SemanticScorer(Scorer): 

12 """Does the model's answer *mean* the same thing as the expected answer?: 

13 

14 1. A small neural network (the *embedding model*) turns each piece of 

15 text into a list of 384 numbers — a coordinate in a 384-dimensional 

16 "meaning space." 

17 2. Texts with similar meanings end up at nearby coordinates because the 

18 model was trained that way. "Paris" and "The capital of France is 

19 Paris." sit close; "Paris" and "Water boils at 100°C" sit far. 

20 3. We measure how close the two coordinates are with *cosine 

21 similarity* (the angle between them, not the distance — same 

22 direction = 1.0, perpendicular = 0.0). 

23 4. If the similarity is above a threshold, the scorer says PASS. 

24 

25 None of this is a real understanding of meaning — it is pattern 

26 matching over how words co-occur in the model's training data. 

27 

28 Two implementation details: 

29 

30 - The default model is `all-MiniLM-L6-v2`: small, fast, downloads 

31 automatically the first time the scorer is constructed. 

32 - The threshold is the knob you tune against your own dataset. 

33 Too high collapses into exact match. Too low lets unrelated 

34 answers through. 0.75 is a starting point — expect to revisit 

35 it once you have real data. 

36 """ 

37 

38 name = "semantic" 

39 

40 # Class-level cache: loading a model takes seconds and downloads weights 

41 # on first use. One process, one model load per model name, regardless 

42 # of how many SemanticScorer instances are created. 

43 _model_cache: dict[str, SentenceTransformer] = {} 

44 

45 def __init__( 

46 self, 

47 threshold: float = DEFAULT_THRESHOLD, 

48 model_name: str = DEFAULT_MODEL, 

49 ) -> None: 

50 if not 0.0 <= threshold <= 1.0: 

51 raise ValueError(f"threshold must be in [0, 1], got {threshold}") 

52 self.threshold = threshold 

53 self.model_name = model_name 

54 if model_name not in self._model_cache: 

55 self._model_cache[model_name] = SentenceTransformer(model_name) 

56 self._model = self._model_cache[model_name] 

57 

58 async def score(self, question: str, output: str, expected: str) -> ScoreResult: 

59 embeddings = self._model.encode([output, expected], convert_to_tensor=True) 

60 cosine = util.cos_sim(embeddings[0], embeddings[1]).item() 

61 clamped = max(0.0, min(1.0, cosine)) 

62 passed = clamped >= self.threshold 

63 return ScoreResult( 

64 passed=passed, 

65 score=clamped, 

66 reason=f"cosine={clamped:.3f} {'>=' if passed else '<'} threshold={self.threshold}", 

67 )