From a2d6770a2339abad096fe6670045a8e81ecdba86 Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:33:03 +0200 Subject: [PATCH 01/67] feat(evaluation): add evaluation subpackage skeleton and pyproject entry point (#268) * feat(evaluation): add evaluation subpackage __init__ with gate/champion/judge/retrieval exports * feat(evaluation): add EvalConfig and GateVerdict models * feat(evaluation): add evaluation optional-deps and flyeval CLI entry point to pyproject.toml * feat(evaluation): note evaluation as optional subpackage in top-level __init__ docstring --------- Co-authored-by: miguelgfierro --- fireflyframework_agentic/__init__.py | 7 ++ .../evaluation/__init__.py | 57 +++++++++++++++ fireflyframework_agentic/evaluation/models.py | 70 +++++++++++++++++++ pyproject.toml | 7 ++ 4 files changed, 141 insertions(+) create mode 100644 fireflyframework_agentic/evaluation/__init__.py create mode 100644 fireflyframework_agentic/evaluation/models.py diff --git a/fireflyframework_agentic/__init__.py b/fireflyframework_agentic/__init__.py index 993b0248..1736f1f4 100644 --- a/fireflyframework_agentic/__init__.py +++ b/fireflyframework_agentic/__init__.py @@ -24,6 +24,13 @@ config = get_config() print(config.default_model) + +Optional subpackages (not imported eagerly at the top level): + fireflyframework_agentic.lab -- sessions, benchmarks, datasets, evaluation orchestration + fireflyframework_agentic.experiments -- experiment tracking and comparison + fireflyframework_agentic.evaluation -- gate-based quality gates, LLM-as-judge advisory, + champion/challenger tracking, retrieval metrics + (requires the ``evaluation`` optional extra) """ from importlib.metadata import PackageNotFoundError, version diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py new file mode 100644 index 00000000..1c264f07 --- /dev/null +++ 
b/fireflyframework_agentic/evaluation/__init__.py @@ -0,0 +1,57 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Evaluation subpackage -- gate-based quality gates, LLM-as-judge advisory, champion/challenger tracking, and retrieval metrics. + +Gate pipeline (flags, not vetoes): + G1 -- Structural & Safe (schema + PII + empty-registry guard) + G2 -- Must-finds & negative controls (recall + NC precision) + G3 -- Evidence (grounding / token-anchoring) + G4 -- LLM-as-a-Judge (advisory, opt-in, never decides promotion) + G5 -- No-regression / promotion (champion/challenger comparison) + +Retrieval metrics: + Precision@k, Recall@k, MRR, NDCG -- computed over ranked retrieval results. + +Champion tracking: + Persists the best-known run record so that promotion decisions can be made + against a stable baseline rather than the most recent run. 
+""" + +from importlib.metadata import PackageNotFoundError, version + +from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, render_scorecard, run_gates +from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion +from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge +from fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics + +try: + __version__ = version("fireflyframework-agentic") +except PackageNotFoundError: + __version__ = "0.0.0+dev" + +__all__ = [ + "GateResult", + "Verdict", + "run_gates", + "render_scorecard", + "ChampionRecord", + "load_champion", + "save_champion", + "invalidate_champion", + "AdvisoryReport", + "run_judge", + "RetrieverMetrics", + "compute_retrieval_metrics", +] diff --git a/fireflyframework_agentic/evaluation/models.py b/fireflyframework_agentic/evaluation/models.py new file mode 100644 index 00000000..a98cdf20 --- /dev/null +++ b/fireflyframework_agentic/evaluation/models.py @@ -0,0 +1,70 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shared config and model classes for the evaluation framework. + +EvalConfig captures the parameters of a single evaluation run: which model +is being tested, which corpus it runs against, and where the supporting +artefacts (registry, baseline, judge config) live. 
+ +GateVerdict constants define the two possible outcomes of the promotion gate: +PROMOTE (the challenger beats or ties the champion and is safe to deploy) +or HOLD (the challenger does not meet the bar and must be iterated on). +""" + +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel + + +class EvalConfig(BaseModel): + """Configuration for a single evaluation run. + + Parameters: + model_id: Identifier of the model under evaluation. + corpus: Name of the evaluation corpus (e.g. "ms_marco_mini", "finance_bench"). + run_id: Unique identifier for this run (e.g. a timestamp or git SHA). + registry_path: Path to the must-find / golden registry JSON file. + corpus_path: Path to the corpus directory or bundle. + baseline_path: Path to a baseline results file for regression comparison. + judge_model: Model identifier used for the LLM-as-judge advisory pass. + judge_runs: Number of independent judge calls to aggregate (majority vote). + embed_model: Model identifier used for embedding-based retrieval metrics. + metadata: Arbitrary key/value pairs for run bookkeeping. + """ + + model_id: str + corpus: str + run_id: str + registry_path: str = "" + corpus_path: str = "" + baseline_path: str = "" + judge_model: str = "" + judge_runs: int = 3 + embed_model: str = "" + metadata: dict[str, Any] = {} + + +class GateVerdict: + """Promotion gate verdict constants. + + Use ``GateVerdict.PROMOTE`` when the challenger meets the quality bar and + is safe to become the new champion. Use ``GateVerdict.HOLD`` when the + challenger does not meet the bar and must be iterated on. 
+ """ + + PROMOTE: str = "PROMOTE" + HOLD: str = "HOLD" diff --git a/pyproject.toml b/pyproject.toml index e575323e..bb74201f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -119,6 +119,10 @@ binary = [ all = [ "fireflyframework-agentic[postgres,mongodb,security,embeddings,openai-embeddings,cohere-embeddings,google-embeddings,mistral-embeddings,voyage-embeddings,azure-embeddings,bedrock-embeddings,ollama-embeddings,vectorstores-chroma,vectorstores-pinecone,vectorstores-qdrant,vectorstores-pgvector,vectorstores-sqlite-vec,watch,binary]", ] +evaluation = [ + "scipy>=1.11", + "numpy>=1.26.0", +] dev = [ "pytest>=8.3.0", "pytest-asyncio>=0.24.0", @@ -132,6 +136,9 @@ dev = [ "pre-commit>=3.8.0", ] +[project.scripts] +flyeval = "fireflyframework_agentic.evaluation.cli:main" + [project.urls] Homepage = "https://fireflyframework.org/" Documentation = "https://github.com/fireflyframework/fireflyframework-agentic/tree/main/docs" From 8676b6adbc3319845dc1f7b2faede2e8d4b9cd56 Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:36:17 +0200 Subject: [PATCH 02/67] feat(evaluation): add matcher primitives and statistics helpers (#269) * feat(evaluation): add matcher primitives (anchored, matches, source_stem, tokens) * feat(evaluation): add statistics helpers (aa_band, aggregate_grounding, left_skew_flag) * feat(evaluation): export matcher and stats primitives from evaluation package --------- Co-authored-by: miguelgfierro --- .../evaluation/__init__.py | 9 + .../evaluation/matcher.py | 374 ++++++++++++++++++ fireflyframework_agentic/evaluation/stats.py | 110 ++++++ 3 files changed, 493 insertions(+) create mode 100644 fireflyframework_agentic/evaluation/matcher.py create mode 100644 fireflyframework_agentic/evaluation/stats.py diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index 1c264f07..7d740b00 100644 --- 
a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -34,7 +34,9 @@ from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, render_scorecard, run_gates from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge +from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens from fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics +from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag try: __version__ = version("fireflyframework-agentic") @@ -54,4 +56,11 @@ "run_judge", "RetrieverMetrics", "compute_retrieval_metrics", + "anchored", + "matches", + "source_stem", + "tokens", + "aa_band", + "aggregate_grounding", + "left_skew_flag", ] diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py new file mode 100644 index 00000000..2f5065df --- /dev/null +++ b/fireflyframework_agentic/evaluation/matcher.py @@ -0,0 +1,374 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Single matching primitive reused across G2 (recall/precision) and G3 (grounding). + +anchored() is topic-level lexical overlap. matches() is the gate predicate. 
+One function, three uses — do not write three matching functions. + +Known limitation (EVALUATION_FRAMEWORK.md): anchored() is topic-anchored, not claim-verified. +A '45 days' claim cited to a '3 days' source passes if they share the process name. +Real claim entailment (NLI/AIS) is Phase 2. The G3 human spot-check is the +binding faithfulness signal until then. +""" + +from __future__ import annotations + +import re + +import numpy as np + + +def cosine(a, b) -> float: + """Cosine similarity between two vectors.""" + a = np.asarray(a, dtype=float) + b = np.asarray(b, dtype=float) + return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9)) + + +def tokens(text: str) -> list[str]: + return re.findall(r"\b\w+\b", text.lower()) + + +def anchored(claim: str, evidence: str, *, min_token: int = 5) -> bool: + """True if claim and evidence share at least one non-trivial token (>= min_token chars). + + Rejects a citation to an unrelated document. Does NOT verify the claim value — + that gap is closed by the deferred NLI/AIS check in Phase 2. + """ + a = {t for t in tokens(claim) if len(t) >= min_token} + b = {t for t in tokens(evidence) if len(t) >= min_token} + return bool(a & b) + + +def source_stem(locator: str) -> str: + """Normalize a locator/source path to a stable document stem for matching. + + Robust to the two locator conventions observed across runs: + - directory-prefixed ('sops/SOP-002-kyc-edd.md') and bare ('SOP-002-kyc-edd.md') + both reduce to 'sop-002-kyc-edd'; + - event-log row ids ('src-credit-underwriting:CU-2026-1003') reduce to the + process stem 'credit-underwriting', so they join the CSV the registry cites. + + Preserves the same-document anti-gaming property of matches(): it still keys + on which source document a finding cites — just independent of directory + prefix, file extension, and case, so one registry scores every run. 
+ """ + s = locator.split("#")[0] # drop the locator fragment (#page=N, #anchor) + s = s.rsplit("/", 1)[-1] # basename — strip any directory prefix + if s.startswith("src-") and ":" in s: # event-log row id: src-<process>:<row-id> + return s.split(":", 1)[0][len("src-") :].lower() + if "." in s: # strip a trailing file extension + s = s.rsplit(".", 1)[0] + return s.lower() + + +def _finding_sources(finding: dict, evidence_index: dict[str, dict]) -> set[str]: + """Return the set of normalized source-document stems cited by a finding.""" + sources: set[str] = set() + for ref in finding.get("evidence_refs", []): + ev = evidence_index.get(ref.get("evidence_id", "")) + if ev: + stem = source_stem(ev.get("locator", "")) + if stem: + sources.add(stem) + return sources + + +def shares_source(finding: dict, item, evidence_index: dict[str, dict]) -> bool: + """True iff the finding cites at least one source document the item lists as evidence. + + Source documents are compared by normalized stem (source_stem) so one registry + scores every run regardless of locator convention. This is the anti-gaming + anchor reused by both the lexical predicate (matches) and the semantic path + (semantic_hits): a finding on a different document cannot satisfy this item. + + Spec-style NC items list their mirror source (§4.1); legacy NC items carry + evidence=[], which makes this always False for them. + + Args: + finding: dict from DiscoveryResult.findings[i] (model_dump output). + item: RegistryItem dataclass from registry.py. + evidence_index: {evidence_id: Evidence dict} built from result['evidence_index']. + """ + finding_sources = _finding_sources(finding, evidence_index) + item_sources = {source_stem(e) for e in item.evidence} + return bool(finding_sources & item_sources) + + +def _keyword_anchored(desc: str, keywords: list[str]) -> bool: + """True iff any keyword appears as a whole word in desc (case-insensitive).
+ + Keyword rail: exempt from the 5-char token floor so short banking terms + (KYC, PEP, AML) can anchor a match even though they are too short for the + token rail. Whole-word matching prevents false substring hits (e.g. "risk" + inside "asterisk"); a hyphen counts as a word boundary, so "risk" still matches "enterprise-risk-management". + """ + if not keywords: + return False + desc_lower = desc.lower() + return any( + re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords + ) + + +def candidate_text(candidate: dict, scope: str) -> str: + """Extract the searchable text from a candidate on the given scope surface (§4.3). + + Each scope surface uses different fields as the match text: + - finding / action : title + description + - process / decision : name + description + - activity : name + notes + regulatory_links + - persona : name + role + goals + pain_points + - system : name + description + - informal_channel : name + usage_context + notes + - dependency_graph : name + description (diagnostic nodes; relation items bypass this) + """ + if scope in ("finding", "action"): + return " ".join(filter(None, [candidate.get("title", ""), candidate.get("description", "")])) + if scope == "activity": + rl = candidate.get("regulatory_links") or [] + rl_str = " ".join(rl) if isinstance(rl, list) else str(rl or "") + return " ".join(filter(None, [candidate.get("name", ""), candidate.get("notes", ""), rl_str])) + if scope == "persona": + goals = candidate.get("goals") or [] + pain = candidate.get("pain_points") or [] + goals_str = " ".join(goals) if isinstance(goals, list) else str(goals) + pain_str = " ".join(pain) if isinstance(pain, list) else str(pain) + return " ".join(filter(None, [ + candidate.get("name", ""), + candidate.get("role", ""), + goals_str, + pain_str, + ])) + if scope == "informal_channel": + return " ".join(filter(None, [ + candidate.get("name", ""), + candidate.get("usage_context", ""), + candidate.get("notes", ""), + ])) + # process, decision, system, dependency_graph (diagnostic nodes) + return "
".join(filter(None, [candidate.get("name", ""), candidate.get("description", "")])) + + +INSIGHT_ITEM_SCOPES = ("finding", "action") +INSIGHT_MATCH_SURFACES = ("finding", "action", "activity", "decision") + + +def allowed_scopes(item) -> tuple[str, ...]: + """Candidate surfaces that may satisfy a registry item. + + Insight items (finding / action) may be satisfied by any insight or process-graph + *leaf* surface (activity / decision): a run often grounds the same operational fact + on a different surface than the registry's scope tag anticipates (the BBVA case — + pain points the registry tags 'finding' that the run emitted as decision/activity + nodes). shares_source is still REQUIRED on every candidate (see matches / + semantic_hits), so a candidate on the wrong document never counts — cross-scope + widens WHERE we look, never the source anchor. + + Structural items (process / activity / decision) stay on their own surface: a + structural must-find requires the run to have actually built that node, not merely + mentioned the fact in a finding (test_process_scope_miss_when_no_matching_process). + NC items are likewise scope-strict — widening a negative control's pool could only + make it easier to trip (a specificity regression), never recover a legitimate hit. + + `process` is never a match surface for an insight item: _candidates_by_scope folds + every child's evidence_refs into the process node, so its citation set is a union of + many documents and shares_source goes vacuous (hence its exclusion from + INSIGHT_MATCH_SURFACES). + """ + if item.tier == "NC": + return (item.scope,) + if item.scope in INSIGHT_ITEM_SCOPES: + return INSIGHT_MATCH_SURFACES + return (item.scope,) + + +def matches( + candidate: dict, + item, + evidence_index: dict[str, dict], + scope: str = "finding", +) -> bool: + """True iff candidate cites a shared source document AND is topic-anchored to item. 
+ + Two-rail anchor (either rail suffices): + - Token rail: ≥1 shared token of ≥5 chars between candidate text and item description. + - Keyword rail: ≥1 item keyword appears as a whole word in the candidate text. + Exempt from the 5-char floor so short banking terms (KYC, PEP, AML) can anchor. + + The ``scope`` controls which fields are read as the candidate's match text (§4.3): + findings and actions use ``title + description``; processes and decisions use + ``name + description``; activities use ``name + notes + regulatory_links``. + + Anti-gaming guard: a candidate on a different document cannot satisfy this item + even if its text happens to match. Source documents are compared by + normalized stem (source_stem) so one registry scores every run regardless of + locator convention. + + Args: + candidate: dict from the DiscoveryResult surface matching ``scope``. + item: RegistryItem dataclass from registry.py. + evidence_index: {evidence_id: Evidence dict} built from result['evidence_index']. + scope: surface the candidate was drawn from (default "finding"). + """ + if not shares_source(candidate, item, evidence_index): + return False + desc = candidate_text(candidate, scope) + return _keyword_anchored(desc, list(item.keywords or [])) or anchored(desc, item.description) + + +def matches_dependency_graph_relation( + item, + result: dict, + evidence_index: dict[str, dict], +) -> bool: + """Endpoint matcher for dependency_graph relation items (§5.3b). + + Stage 1: Anchor both endpoints to activity nodes via token rail. + Stage 2: Verify a directed edge or path connects them in the asserted direction, + behind the shared-source guard on the edge's/path's evidence_refs. + + Returns False when either endpoint anchors to no activity, or when no connecting + edge/path shares a source document with the item. 
+ """ + if not item.from_node or not item.to_node: + return False + + processes = result.get("process_graph", {}).get("processes", []) + all_activities = [a for p in processes for a in p.get("activities", [])] + + def _anchor(endpoint_text: str) -> set[str]: + return { + a["id"] + for a in all_activities + if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text) + } + + from_ids = _anchor(item.from_node) + to_ids = _anchor(item.to_node) + if not from_ids or not to_ids: + return False + + item_stems = {source_stem(e) for e in item.evidence} + + def _node_stems(node: dict) -> set[str]: + return { + source_stem(evidence_index[r["evidence_id"]].get("locator", "")) + for r in node.get("evidence_refs", []) + if r.get("evidence_id") in evidence_index + } + + dg = result.get("dependency_graph", {}) + + for edge in dg.get("activity_edges", []): + if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids: + if _node_stems(edge) & item_stems: + return True + + for path in dg.get("critical_paths", []): + if not (_node_stems(path) & item_stems): + continue + node_ids = path.get("node_ids", []) + from_pos = [i for i, nid in enumerate(node_ids) if nid in from_ids] + to_pos = [i for i, nid in enumerate(node_ids) if nid in to_ids] + if any(fp < tp for fp in from_pos for tp in to_pos): + return True + + return False + + +def semantic_hits( + candidates: dict[str, list[dict]], + items, + evidence_index: dict[str, dict], + embed_fn, + tau: float = 0.70, + tau_nc: float = 0.85, +) -> dict[str, bool]: + """Opt-in embedding-semantic recall: {item.id: found-by-some-shared-source candidate}. + + Scope-aware: each registry item is evaluated against candidates from its own + scope surface (finding, process, activity, decision, action) using the same + per-scope field extraction as the lexical path (candidate_text). 
Passing only + the findings list (the previous behaviour) would leave process/activity/decision/ + action items with an empty candidate pool and a guaranteed False result. + + Real items (L0–L3): hit iff some scope-matching candidate shares a source + document with the item (shares_source) AND is embedding-similar (cosine >= tau). + Source anchor is preserved — a candidate on a different document cannot recover + a real item. + + NC items (tier=="NC"): hit iff some scope-matching candidate is embedding-similar + (cosine >= tau_nc). When the NC lists its mirror source (§4.1) the shared-source + guard applies; legacy NC items with evidence=[] skip the anchor, with the higher + threshold (default 0.85) compensating. + + Cost is two embed_fn calls — all scope-appropriate candidate texts once and all + item texts once — not O(n*m) per-pair embeddings. + + Args: + candidates: {scope: [candidate dicts]} from _candidates_by_scope(). + items: iterable of RegistryItem dataclasses. + evidence_index: {evidence_id: Evidence dict}. + embed_fn: callable(list[str]) -> array-like of row vectors. + tau: cosine threshold for real items (inclusive). + tau_nc: cosine threshold for NC items (inclusive; higher to compensate for no source anchor). + """ + items = list(items) + + # Flatten all candidates across scopes, preserving their scope tag for + # text extraction and per-item filtering. 
+ scoped: list[tuple[str, dict]] = [ + (scope, cand) + for scope, cands in candidates.items() + for cand in cands + ] + + if not scoped: + return {item.id: False for item in items} + + cand_texts = [candidate_text(cand, scope) for scope, cand in scoped] + item_texts = [ + " ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items + ] + + cand_vecs = np.asarray(embed_fn(cand_texts)) + item_vecs = np.asarray(embed_fn(item_texts)) + + hits: dict[str, bool] = {} + for i, item in enumerate(items): + item_vec = item_vecs[i] + allowed = allowed_scopes(item) + hit = False + for k, (scope, cand) in enumerate(scoped): + if scope not in allowed: + continue + if item.tier == "NC": + # Shared-source guard applies when the NC lists its mirror source + # (§4.2/§6.2); legacy evidence=[] NCs stay unanchored, with the + # higher tau_nc compensating. + if item.evidence and not shares_source(cand, item, evidence_index): + continue + if cosine(cand_vecs[k], item_vec) >= tau_nc: + hit = True + break + elif ( + shares_source(cand, item, evidence_index) + and cosine(cand_vecs[k], item_vec) >= tau + ): + hit = True + break + hits[item.id] = hit + return hits diff --git a/fireflyframework_agentic/evaluation/stats.py b/fireflyframework_agentic/evaluation/stats.py new file mode 100644 index 00000000..e70c629a --- /dev/null +++ b/fireflyframework_agentic/evaluation/stats.py @@ -0,0 +1,110 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Statistics helpers: A/A noise band + fixed aggregate_grounding. + +The A/A band replaces McNemar, Wilcoxon, BCa bootstrap, Cliff's delta, Holm +correction, and MCID power analysis. Four self-authored corpora with ~30-70 +non-independent items each cannot power those tests; gating on unpowered tests +is false precision. See EVALUATION_FRAMEWORK.md (regression statistics). + +This module also provides the fixed aggregate_grounding() that closes a prior +aggregation bug where the previous runner inherited run 0's grounding report +unchanged instead of merging across all runs. +""" +from __future__ import annotations + +import statistics +from typing import Sequence + + +def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float: + """95th-percentile pairwise delta from champion reruns — the noise floor. + + Rerun the champion ~10 times; the 95th-percentile of all pairwise absolute + differences is the A/A noise floor. A candidate must beat the champion by + more than this number on EVERY seed to count as a real improvement. + + This single number replaces MCID, power analysis, McNemar, Wilcoxon, + bootstrap CIs, and Holm correction. See EVALUATION_FRAMEWORK.md (the A/A noise band). + + Args: + scores: Per-run primary metric scores from champion reruns (>= 2 required). + percentile: Which percentile (default 95). + + Returns: + Noise floor as a float in the same units as the input scores. 
+ """ + scores = list(scores) + if len(scores) < 2: + raise ValueError(f"aa_band requires >= 2 reruns; got {len(scores)}") + deltas = [ + abs(x - y) + for i, x in enumerate(scores) + for y in scores[i + 1:] + ] + sorted_deltas = sorted(deltas) + # Index for the requested percentile; clamp to valid range + idx = max(0, min(len(sorted_deltas) - 1, int(len(sorted_deltas) * percentile / 100))) + return sorted_deltas[idx] + + +def aggregate_grounding(grounding_dicts: list[dict]) -> dict: + """Merge per-run grounding reports into a conservative aggregate. + + Fixes a prior aggregation bug where the previous runner inherited run 0's grounding + report unchanged. Correct behaviour: + - support_pct: mean across runs + - unsupported_ids: UNION across all runs (anything flagged in any run stays flagged) + + Args: + grounding_dicts: List of grounding report dicts, one per evaluation run. + Each must have 'support_pct' (float 0-100) and optionally + 'unsupported_ids' (list[str]). + + Returns: + Merged grounding dict. + """ + if not grounding_dicts: + return {"support_pct": 0.0, "unsupported_ids": []} + + support_pcts = [float(g.get("support_pct", 0.0)) for g in grounding_dicts] + mean_pct = statistics.mean(support_pcts) + + unsupported: set[str] = set() + for g in grounding_dicts: + unsupported.update(g.get("unsupported_ids", [])) + + first = grounding_dicts[0] + return { + **first, + "support_pct": round(mean_pct, 2), + "unsupported_ids": sorted(unsupported), + "_aggregate_runs": len(grounding_dicts), + "_support_pct_per_run": [round(p, 2) for p in support_pcts], + } + + +def left_skew_flag(scores: Sequence[float]) -> bool: + """True if min < median - 0.10 (HIGH_VARIANCE sentinel). + + A single catastrophic run cannot hide inside a decent mean. + True => HIGH_VARIANCE; block the run until investigated. + See EVALUATION_FRAMEWORK.md (anti-flakiness). 
+ """ + scores = list(scores) + if len(scores) < 2: + return False + med = statistics.median(scores) + return min(scores) < med - 0.10 From 8eb2110ef25ad1579d0f093e011664dfe40935e6 Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:39:04 +0200 Subject: [PATCH 03/67] feat(evaluation): add corpus loader and registry modules (#270) * feat(evaluation): add corpus loader and evidence verification module * feat(evaluation): add lean-1 registry loader and RegistryItem/Registry models * feat(evaluation): re-export corpus and registry symbols from evaluation package --------- Co-authored-by: miguelgfierro --- .../evaluation/__init__.py | 13 ++ fireflyframework_agentic/evaluation/corpus.py | 185 +++++++++++++++ .../evaluation/registry.py | 214 ++++++++++++++++++ 3 files changed, 412 insertions(+) create mode 100644 fireflyframework_agentic/evaluation/corpus.py create mode 100644 fireflyframework_agentic/evaluation/registry.py diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index 7d740b00..b6283d8b 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -31,10 +31,12 @@ from importlib.metadata import PackageNotFoundError, version +from fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, render_scorecard, run_gates from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens +from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256 
from fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag @@ -44,6 +46,13 @@ __version__ = "0.0.0+dev" __all__ = [ + "EMPTY", + "FABRICATED", + "SOURCE_UNKNOWN", + "VERIFIED", + "corpus_sha256", + "load_corpus", + "verify_evidence_index", "GateResult", "Verdict", "run_gates", @@ -54,6 +63,10 @@ "invalidate_champion", "AdvisoryReport", "run_judge", + "Registry", + "RegistryItem", + "load_registry", + "registry_sha256", "RetrieverMetrics", "compute_retrieval_metrics", "anchored", diff --git a/fireflyframework_agentic/evaluation/corpus.py b/fireflyframework_agentic/evaluation/corpus.py new file mode 100644 index 00000000..32835f2c --- /dev/null +++ b/fireflyframework_agentic/evaluation/corpus.py @@ -0,0 +1,185 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Corpus loading and evidence verification (EVALUATION_FRAMEWORK.md §6.3). + +The corpus is the third pinned evaluation input, next to the DiscoveryResult +and the registry: the raw document bundle (input.json) the discovery pipeline +read. It is the trusted side of every evidence anchor — the registry tells +the evaluator what *should* be found; only the corpus can tell it whether what +a run cited is *real*. 
+ +verify_entry() closes the fabricated-evidence channel: a run controls every +byte of its own evidence_index, so any check computable from (result, registry) +alone can be satisfied by self-reported evidence. Checking each excerpt +against the actual corpus text is the only deterministic counter. + +Excerpt contract: excerpts are verbatim quotes from the source document. +Spliced quotes (fragments joined with '...' or '…') are supported — each +fragment is verified independently. Paraphrase belongs in the finding +description, never in an excerpt. +""" + +from __future__ import annotations + +import base64 +import difflib +import hashlib +import json +import re +import unicodedata +from dataclasses import dataclass +from pathlib import Path + +from fireflyframework_agentic.evaluation.matcher import source_stem + +# Verification statuses for one evidence_index entry. +VERIFIED = "verified" # excerpt found (verbatim or spliced) in the cited source +EMPTY = "empty" # entry carries no excerpt text — nothing to verify +SOURCE_UNKNOWN = "source_unknown" # locator resolves to no corpus document +FABRICATED = "fabricated" # populated excerpt not found in the cited source + +# A spliced excerpt is split on these joiners; fragments shorter than +# _MIN_FRAGMENT_CHARS are too generic to verify and are skipped. +_SPLICE_PATTERN = re.compile(r"\.\.\.|…| -- ") +_MIN_FRAGMENT_CHARS = 15 + +# A fragment passes fuzzily when matching blocks (>= _MIN_BLOCK_CHARS chars) +# cover at least _COVERAGE_THRESHOLD of it — tolerates punctuation/whitespace +# drift while rejecting invented text (measured ~0.10-0.32 coverage). +_COVERAGE_THRESHOLD = 0.85 +_MIN_BLOCK_CHARS = 4 + + +@dataclass +class Corpus: + """The decoded, normalized corpus: {source stem: normalized text}. + + sha256 pins the corpus file exactly like the registry pin (§4.6): the + champion record stores it, and G1 re-hashes the file at scoring time to + flag CORPUS_DRIFT. 
+ """ + + texts: dict[str, str] + sha256: str + path: str + + +def normalize(text: str) -> str: + """Normalize text for excerpt matching: NFKC, strip markdown emphasis and + smart quotes, collapse whitespace, casefold.""" + text = unicodedata.normalize("NFKC", text) + text = text.replace("**", "").replace("*", "") + text = re.sub(r"[\"""''']", "", text) + return re.sub(r"\s+", " ", text).strip().casefold() + + +def corpus_sha256(path: str | Path) -> str: + """SHA-256 of the corpus file on disk (the CORPUS_DRIFT re-hash).""" + return hashlib.sha256(Path(path).read_bytes()).hexdigest() + + +def load_corpus(path: str | Path) -> Corpus: + """Load a FlyRadar input.json bundle into a stem-indexed normalized Corpus. + + Decodes every artifacts[] file and signals[] event log (base64), normalizes + the text, and keys each by the same source_stem the matcher uses — so a + locator in any convention resolves to its document. + + Raises: + ValueError: when the bundle contains no documents, or two documents + reduce to the same stem (a collision would let a fabricated + citation resolve against the wrong real file). 
+ """ + path = Path(path) + raw = json.loads(path.read_text(encoding="utf-8")) + + named_contents: list[tuple[str, str]] = [] + for artifact in raw.get("artifacts", []): + named_contents.append((artifact["filename"], artifact["content_base64"])) + for signal in raw.get("signals", []): + named_contents.append((signal["name"], signal["content_base64"])) + + if not named_contents: + raise ValueError(f"corpus bundle {path} contains no artifacts or signals") + + texts: dict[str, str] = {} + for name, content_b64 in named_contents: + stem = source_stem(name) + if stem in texts: + raise ValueError( + f"corpus stem collision: two documents reduce to {stem!r} — " + "rename one; a collision would verify citations against the wrong file" + ) + decoded = base64.b64decode(content_b64).decode("utf-8", errors="replace") + texts[stem] = normalize(decoded) + + return Corpus(texts=texts, sha256=corpus_sha256(path), path=str(path)) + + +def _fragment_coverage(fragment: str, source: str) -> float: + """Fraction of fragment covered by matching blocks of >= _MIN_BLOCK_CHARS chars.""" + blocks = difflib.SequenceMatcher( + None, fragment, source, autojunk=False + ).get_matching_blocks() + covered = sum(b.size for b in blocks if b.size >= _MIN_BLOCK_CHARS) + return covered / len(fragment) + + +def verify_entry(corpus: Corpus, entry: dict) -> str: + """Verify one evidence_index entry against the corpus. + + Returns one of VERIFIED / EMPTY / SOURCE_UNKNOWN / FABRICATED: + - the locator must resolve (by source stem) to a corpus document, and + - every fragment of the excerpt must appear in that document's text, + verbatim after normalization or with matching-block coverage >= + _COVERAGE_THRESHOLD. + + The score is the minimum over fragments, so one invented fragment sinks a + spliced excerpt. 
+ + """ + stem = source_stem(entry.get("locator", "")) + source = corpus.texts.get(stem) + if source is None: + return SOURCE_UNKNOWN + + excerpt = normalize(entry.get("excerpt") or "") + if not excerpt: + return EMPTY + + fragments = [ + f.strip() + for f in _SPLICE_PATTERN.split(excerpt) + if len(f.strip()) >= _MIN_FRAGMENT_CHARS + ] or [excerpt] + + for fragment in fragments: + if fragment in source: + continue + if _fragment_coverage(fragment, source) < _COVERAGE_THRESHOLD: + return FABRICATED + return VERIFIED + + +def verify_evidence_index(corpus: Corpus, result: dict) -> dict[str, str]: + """Verify every evidence_index entry of a DiscoveryResult. + + Returns {evidence_id: status} over all entries — referenced or not — so + the gates share one verification pass. + """ + return { + ev["id"]: verify_entry(corpus, ev) + for ev in result.get("evidence_index", []) + if ev.get("id") + } diff --git a/fireflyframework_agentic/evaluation/registry.py b/fireflyframework_agentic/evaluation/registry.py new file mode 100644 index 00000000..2b869ba9 --- /dev/null +++ b/fireflyframework_agentic/evaluation/registry.py @@ -0,0 +1,214 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""lean-1 registry loader — one schema for all four corpora. + +Replaces the four mutually incompatible schemes in use today (L1-L5, +documented/observed/pain-point, critical/important, and no tiers). 
+Loader enforces all invariants; they are not documentation. + +Invariants (EVALUATION_FRAMEWORK.md, the must-find registry): +- schema_version == "lean-1" +- every tier is one of L0 L1 L2 L3 NC +- negative_control_count >= ceil(real_items / 10) +- kappa present (0.0 placeholder allowed; G2 advisory until >= 0.70) +- ABANCA DILO items must target a single measured sub-population +""" +from __future__ import annotations + +import hashlib +import json +import math +from dataclasses import dataclass, field +from pathlib import Path +from typing import Literal + +VALID_TIERS = ("L0", "L1", "L2", "L3", "NC") +VALID_SCOPES = ( + "process", "activity", "decision", "finding", "action", + "persona", "system", "informal_channel", "dependency_graph", +) +SCHEMA_VERSION = "lean-1" +KAPPA_ADVISORY_THRESHOLD = 0.70 + + +@dataclass(frozen=True) +class RegistryItem: + id: str + tier: Literal["L0", "L1", "L2", "L3", "NC"] + description: str + evidence: list[str] # source file paths (path portion of locator, no #page=N) + scope: str = "finding" # which DiscoveryResult surface to match against (§4.3) + keywords: list[str] = field(default_factory=list) + weight: float = 1.0 + from_node: str = "" # dependency_graph relation items only + to_node: str = "" # dependency_graph relation items only + relation: str = "" # defaults to "precedes" when from/to present + + +@dataclass(frozen=True) +class Registry: + schema_version: str + corpus: str + author: str + date: str + kappa: float + items: list[RegistryItem] + _sha256: str = field(default="", compare=False) + + @property + def real_items(self) -> list[RegistryItem]: + return [i for i in self.items if i.tier != "NC"] + + @property + def nc_items(self) -> list[RegistryItem]: + return [i for i in self.items if i.tier == "NC"] + + @property + def l0_items(self) -> list[RegistryItem]: + return [i for i in self.items if i.tier == "L0"] + + def is_kappa_advisory(self) -> bool: + return self.kappa < KAPPA_ADVISORY_THRESHOLD + + def sha256(self) 
-> str: + return self._sha256 + + +def _validate(raw: dict, path: Path) -> None: + if raw.get("schema_version") != SCHEMA_VERSION: + raise ValueError( + f"{path.name}: schema_version must be '{SCHEMA_VERSION}', " + f"got {raw.get('schema_version')!r}" + ) + for fname in ("corpus", "author", "date"): + if not raw.get(fname): + raise ValueError(f"{path.name}: missing required field '{fname}'") + if "kappa" not in raw: + raise ValueError(f"{path.name}: missing 'kappa' field (use 0.0 as placeholder)") + + items = raw.get("items", []) + + # EMPTY_MUST_FIND guard — must be first; kills fake-champion bug + if not items: + raise ValueError( + f"{path.name}: EMPTY_MUST_FIND — items list is empty; " + "cannot evaluate recall. This guard exists to prevent the " + "fake-100%-champion failure." + ) + + ids = [it.get("id") for it in items] + if len(ids) != len(set(ids)): + dupes = sorted({i for i in ids if ids.count(i) > 1}) + raise ValueError(f"{path.name}: duplicate item ids: {dupes}") + + for it in items: + tier = it.get("tier") + if tier not in VALID_TIERS: + raise ValueError( + f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; " + f"must be one of {VALID_TIERS}" + ) + scope = it.get("scope", "finding") + if scope not in VALID_SCOPES: + raise ValueError( + f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; " + f"must be one of {VALID_SCOPES}" + ) + if scope == "dependency_graph": + if not it.get("from") or not it.get("to"): + raise ValueError( + f"{path.name}: dependency_graph item '{it.get('id')}' must have " + "non-empty 'from' and 'to'" + ) + else: + if "from" in it or "to" in it or "relation" in it: + raise ValueError( + f"{path.name}: item '{it.get('id')}' has 'from'/'to'/'relation' " + f"but scope is '{scope}'; these fields are only valid on " + "dependency_graph-scoped items" + ) + + real_count = sum(1 for it in items if it.get("tier") != "NC") + nc_count = sum(1 for it in items if it.get("tier") == "NC") + required_nc = max(1, 
math.ceil(real_count / 10)) + if nc_count < required_nc: + raise ValueError( + f"{path.name}: NC density too low — {nc_count} NC item(s) for " + f"{real_count} real items; need >= {required_nc} (ceil(real/10)). " + "Without NC items the eval measures recall only; a verbose hallucinator " + "scores perfectly." + ) + + # ABANCA DILO blend guard: items must assert a single sub-population target. + # Checks for phrases that would indicate a blended numeric target is asserted. + # "blend" alone is too broad (items may reference it negatively). + BLEND_PHRASES = ("combined distribution", "across all offices regardless of segment") + for it in items: + if it.get("tier") == "NC": + continue + desc = it.get("description", "").lower() + iid = it.get("id", "") + if any(phrase in desc for phrase in BLEND_PHRASES): + raise ValueError( + f"{path.name}: item '{iid}' description targets a blended distribution; " + "ABANCA DILO items must target a single measured sub-population " + "(Empresas or PyMEs). Use segment-keyed items: " + "dilo-empresas-operativa-42pct AND dilo-pymes-operativa-29pct separately." + ) + + +def _compute_sha256(path: Path) -> str: + return hashlib.sha256(path.read_bytes()).hexdigest() + + +def load_registry(path: str | Path) -> Registry: + """Load and validate a lean-1 registry file. + + Raises ValueError with a descriptive message on any invariant violation. + The EMPTY_MUST_FIND check runs first — it is the fake-champion guard. 
+ """ + path = Path(path) + raw = json.loads(path.read_text(encoding="utf-8")) + _validate(raw, path) + sha = _compute_sha256(path) + + items = [ + RegistryItem( + id=it["id"], + tier=it["tier"], + scope=it.get("scope", "finding"), + description=it.get("description", ""), + evidence=it.get("evidence", []), + keywords=it.get("keywords", []), + weight=float(it.get("weight", 1.0)), + from_node=it.get("from", "") if it.get("scope") == "dependency_graph" else "", + to_node=it.get("to", "") if it.get("scope") == "dependency_graph" else "", + relation=it.get("relation", "precedes") if it.get("scope") == "dependency_graph" else "", + ) + for it in raw["items"] + ] + + return Registry( + schema_version=raw["schema_version"], + corpus=raw["corpus"], + author=raw["author"], + date=raw["date"], + kappa=float(raw["kappa"] or 0.0), + items=items, + _sha256=sha, + ) + + +def registry_sha256(path: str | Path) -> str: + return _compute_sha256(Path(path)) From ee64cfad1881e32be569a28acd5981373c8cd04f Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:43:24 +0200 Subject: [PATCH 04/67] feat(evaluation): add G1-G5 gate framework (#271) * feat(evaluation): add G1-G5 gate framework (GateResult, run_gates, g2_recall_precision) * feat(evaluation): export g2_recall_precision from evaluation package --------- Co-authored-by: miguelgfierro --- .../evaluation/__init__.py | 3 +- fireflyframework_agentic/evaluation/gates.py | 840 ++++++++++++++++++ 2 files changed, 842 insertions(+), 1 deletion(-) create mode 100644 fireflyframework_agentic/evaluation/gates.py diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index b6283d8b..401244c9 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -32,7 +32,7 @@ from importlib.metadata import PackageNotFoundError, version from 
fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index -from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, render_scorecard, run_gates +from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, render_scorecard, run_gates from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens @@ -55,6 +55,7 @@ "verify_evidence_index", "GateResult", "Verdict", + "g2_recall_precision", "run_gates", "render_scorecard", "ChampionRecord", diff --git a/fireflyframework_agentic/evaluation/gates.py b/fireflyframework_agentic/evaluation/gates.py new file mode 100644 index 00000000..057bfea7 --- /dev/null +++ b/fireflyframework_agentic/evaluation/gates.py @@ -0,0 +1,840 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Four gates — every gate always runs; a failure raises a flag, not a veto. + +Gate pipeline (EVALUATION_FRAMEWORK.md §6): + G1 — Structural & Safe + G2 — Must-finds & negative controls + G3 — Evidence (grounding) + G5 — No-regression / promotion (human decision) + +Each gate is a pure function of the result dict + supporting inputs. 
+run_gates() always executes all four gates and returns all four results so +the scorecard carries the complete picture regardless of which flags fire. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field + +from fireflyframework_agentic.evaluation import matcher +from fireflyframework_agentic.evaluation.corpus import ( + EMPTY, + FABRICATED, + SOURCE_UNKNOWN, + VERIFIED, + Corpus, + corpus_sha256, + verify_evidence_index, +) +from fireflyframework_agentic.evaluation.matcher import anchored, matches +from fireflyframework_agentic.evaluation.registry import Registry, registry_sha256 + + +@dataclass +class GateResult: + gate: str + passed: bool + reason_code: str = "" + details: dict = field(default_factory=dict) + + def __str__(self) -> str: + status = "PASS" if self.passed else f"FLAG:{self.reason_code}" + return f"[{self.gate}] {status}" + + +class Verdict: + """Promotion gate verdict constants. + + Use ``Verdict.PROMOTE`` when the challenger meets the quality bar and + is safe to become the new champion. Use ``Verdict.HOLD`` when the + challenger does not meet the bar and must be iterated on. + """ + + PROMOTE: str = "PROMOTE" + HOLD: str = "HOLD" + + +def render_scorecard(gate_results: list[GateResult]) -> str: + """Render a human-readable scorecard from a list of GateResult objects. + + Emits one line per gate: ``[G1] PASS`` or ``[G2] FLAG:RECALL_BELOW_FLOOR``. + The overall verdict (PROMOTE / HOLD) appears on the final line. A run + promotes only when every gate passes; any flag signals HOLD. + """ + lines = [str(r) for r in gate_results] + all_passed = all(r.passed for r in gate_results) + verdict = Verdict.PROMOTE if all_passed else Verdict.HOLD + lines.append(f"VERDICT: {verdict}") + return "\n".join(lines) + + +def _build_evidence_index(result: dict, corpus: Corpus | None = None) -> dict[str, dict]: + """Index evidence by id; with a corpus, drop entries that fail verification. 
+ + Dropped entries (FABRICATED excerpt or SOURCE_UNKNOWN locator) cannot + contribute source stems to G2's shared-source guard or excerpts to G3's + grounding — a run cannot anchor anything on evidence it invented. EMPTY + entries are kept: an empty excerpt is a format problem, not fabrication, + and its (verified) locator stem is still a legitimate citation. + """ + index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")} + if corpus is None: + return index + statuses = verify_evidence_index(corpus, result) + return { + eid: ev + for eid, ev in index.items() + if statuses[eid] in (VERIFIED, EMPTY) + } + + +# ── G1: Structural & Safe ──────────────────────────────────────────────────── + + +def _name_duplication_rate(nodes: list[dict]) -> float: + """Tier-1 + Tier-2 name clustering; returns 1 - clusters/count. + + Tier 1: same normalized id (lower-case) merges nodes into one cluster. + Tier 2: name token-Jaccard >= 0.6 merges nodes into one cluster. + + Report-only: no gate flag fires on any threshold. + """ + n = len(nodes) + if n < 2: + return 0.0 + + group = list(range(n)) + + def _root(i: int) -> int: + while group[i] != i: + group[i] = group[group[i]] + i = group[i] + return i + + seen: dict[str, int] = {} + for i, node in enumerate(nodes): + nid = node.get("id", "").lower() + if nid in seen: + group[_root(i)] = _root(seen[nid]) + else: + seen[nid] = i + + toks = [frozenset(node.get("name", "").lower().split()) for node in nodes] + for i in range(n): + for j in range(i + 1, n): + a, b = toks[i], toks[j] + union_ab = a | b + if union_ab and len(a & b) / len(union_ab) >= 0.6: + group[_root(i)] = _root(j) + + clusters = len({_root(i) for i in range(n)}) + return round(1 - clusters / n, 4) + + +def g1_structural( + result: dict, + registry: Registry, + registry_path: str, + *, + pii_list: list[str] | None = None, + corpus: Corpus | None = None, +) -> GateResult: + """G1 — Structural & Safe (hard veto). + + Checks (in order): + 1. 
EMPTY_MUST_FIND — must run first; kills the fake-100%-champion bug. + 2. Registry SHA-256 pin: loaded Registry matches the file on disk. + 3. Corpus SHA-256 pin (when a corpus is supplied): same drift guard for + the evidence universe (CORPUS_DRIFT). + 4. Required top-level keys present in result. + 5. PII non-disclosure: no corpus PII name in any finding/report text. + """ + # Guard 1: empty registry (fake-champion guard — always first) + if not registry.real_items: + return GateResult( + gate="G1", + passed=False, + reason_code="EMPTY_MUST_FIND", + details={"message": "Registry has zero real items — cannot evaluate recall."}, + ) + + # Guard 2: registry SHA-256 pin + computed_sha = registry_sha256(registry_path) + if computed_sha != registry.sha256(): + return GateResult( + gate="G1", + passed=False, + reason_code="GOLD_DRIFT", + details={ + "message": "Registry file has changed since it was loaded.", + "expected": registry.sha256(), + "actual": computed_sha, + }, + ) + + # Guard 3: corpus SHA-256 pin (CORPUS_DRIFT — the GOLD_DRIFT twin for evidence) + if corpus is not None: + current_corpus_sha = corpus_sha256(corpus.path) + if current_corpus_sha != corpus.sha256: + return GateResult( + gate="G1", + passed=False, + reason_code="CORPUS_DRIFT", + details={ + "message": "Corpus file has changed since it was loaded.", + "expected": corpus.sha256, + "actual": current_corpus_sha, + }, + ) + + # Guard 4: required result keys + required = ("process_graph", "findings", "evidence_index") + missing = [k for k in required if k not in result] + if missing: + return GateResult( + gate="G1", + passed=False, + reason_code="SCHEMA_INVALID", + details={"missing_keys": missing}, + ) + + # Guard 5: PII check + if pii_list: + free_text: list[str] = [] + for finding in result.get("findings", []): + free_text.extend([finding.get("title", ""), finding.get("description", "")]) + for report in result.get("reports", []): + free_text.append(str(report)) + combined = " 
".join(free_text).lower() + hits = [name for name in pii_list if name.lower() in combined] + if hits: + return GateResult( + gate="G1", + passed=False, + reason_code="PII_LEAK", + details={ + "message": "Corpus PII names found in findings/reports.", + "matches": hits[:5], + }, + ) + + pg = result.get("process_graph", {}) + processes = pg.get("processes", []) + activities = [a for p in processes for a in p.get("activities", [])] + decisions = [d for p in processes for d in p.get("decisions", [])] + dg = result.get("dependency_graph", {}) + + details = { + "registry_sha256": registry.sha256(), + "real_items": len(registry.real_items), + "nc_items": len(registry.nc_items), + "map": { + "processes": { + "count": len(processes), + "duplication_rate": _name_duplication_rate(processes), + }, + "activities": { + "count": len(activities), + "duplication_rate": _name_duplication_rate(activities), + }, + "decisions": { + "count": len(decisions), + "duplication_rate": _name_duplication_rate(decisions), + }, + "personas": { + "count": len(result.get("personas", [])), + "duplication_rate": _name_duplication_rate(result.get("personas", [])), + }, + "systems": { + "count": len(result.get("systems", [])), + "duplication_rate": _name_duplication_rate(result.get("systems", [])), + }, + "informal_channels": { + "count": len(result.get("informal_channels", [])), + "duplication_rate": _name_duplication_rate(result.get("informal_channels", [])), + }, + "dependency_graph_edges": len(dg.get("activity_edges", [])), + }, + } + if corpus is not None: + details["corpus_sha256"] = corpus.sha256 + return GateResult(gate="G1", passed=True, details=details) + + +# ── G2: Recall & Precision ─────────────────────────────────────────────────── + + +def _candidates_by_scope(result: dict) -> dict[str, list[dict]]: + """Build per-scope candidate lists from a DiscoveryResult (§4.3). 
+ + Process candidates are augmented with their children's evidence_refs because + process nodes typically carry no own refs — the source-document guard uses the + union of the process's own refs and all its activities' and decisions' refs. + + dependency_graph-scoped items are relation items (all carry from/to) and are + matched via matcher.matches_dependency_graph_relation() — not through per-candidate + iteration — so no "dependency_graph" key is included here. + """ + pg = result.get("process_graph", {}) + processes = pg.get("processes", []) + + def _merge_refs(proc: dict) -> dict: + children_refs = [ + ref + for child_list in (proc.get("activities", []), proc.get("decisions", [])) + for child in child_list + for ref in child.get("evidence_refs", []) + ] + return {**proc, "evidence_refs": list(proc.get("evidence_refs", [])) + children_refs} + + return { + "process": [_merge_refs(p) for p in processes], + "activity": [a for p in processes for a in p.get("activities", [])], + "decision": [d for p in processes for d in p.get("decisions", [])], + "finding": result.get("findings", []), + "action": result.get("proposed_actions", []), + "persona": result.get("personas", []), + "system": result.get("systems", []), + "informal_channel": result.get("informal_channels", []), + } + + +def _weighted_recall(scored_items: list, hits: dict[str, bool]) -> float: + """Weighted recall of a hit map over the scored (non-L3) items.""" + total_weight = sum(item.weight for item in scored_items) or 1.0 + weighted_hit = sum(item.weight for item in scored_items if hits[item.id]) + return weighted_hit / total_weight + + +def _finding_redundancy_rate(findings: list[dict]) -> float: + """Fraction of findings that are near-duplicates of another (Jaccard ≥0.6 on ≥5-char tokens).""" + if len(findings) < 2: + return 0.0 + def _tok(text: str) -> frozenset[str]: + return frozenset(t.lower() for t in text.split() if len(t) >= 5) + token_sets = [_tok(f.get("description", "")) for f in findings] + 
in_redundant: set[int] = set() + for i in range(len(token_sets)): + for j in range(i + 1, len(token_sets)): + a, b = token_sets[i], token_sets[j] + union = a | b + sim = len(a & b) / len(union) if union else 1.0 + if sim >= 0.6: + in_redundant.add(i) + in_redundant.add(j) + return round(len(in_redundant) / len(findings), 4) + + +def g2_recall_precision( + result: dict, + registry: Registry, + *, + recall_floor: float = 0.70, + embed_fn=None, + tau: float = 0.70, + tau_nc: float = 0.85, + recall_metric: str = "lexical", + corpus: Corpus | None = None, +) -> GateResult: + """G2 — Recall & Precision (hard veto). + + - L0 miss -> BLOCK (zeros the evaluation; regulatory-mandatory item absent) + - NC hit -> BLOCK (precision failure; plausible-but-false item was emitted) + - recall < floor -> BLOCK + + With a ``corpus``, evidence entries that fail verification (fabricated + excerpt or unknown source) are excluded from the evidence index before + matching, so the shared-source guard only accepts citations to real + corpus documents — a fabricated locator cannot satisfy any item. + + ``recall_metric`` ("lexical"/"semantic"/"hybrid") selects which hit map GATES. + "lexical" is matcher.matches (shared-source + topic-anchored token overlap) and + needs no embedder. "semantic"/"hybrid" add the embedding path (matcher.semantic_hits, + threshold ``tau`` for real items, ``tau_nc`` for NC items) and REQUIRE ``embed_fn`` + — passing them without one raises ValueError (use "lexical" for the offline path). + When an embedder is supplied, all three recalls (lexical/semantic/hybrid) are + reported in details regardless of which one gates. + """ + evidence_index = _build_evidence_index(result, corpus) + candidates = _candidates_by_scope(result) + findings = candidates["finding"] + + # NC items anchor via the embedding path only (§6.2): a correct finding about + # the true mirror fact shares vocabulary with the false description, so a + # token or keyword match would falsely convict it. 
Lexical NC is always False. + # dependency_graph relation items (those with from_node) use the endpoint + # matcher (§5.3b) instead of the per-candidate text predicate. + lexical: dict[str, bool] = {} + for item in registry.items: + if item.tier == "NC": + lexical[item.id] = False + elif item.scope == "dependency_graph" and item.from_node: + lexical[item.id] = matcher.matches_dependency_graph_relation( + item, result, evidence_index + ) + else: + lexical[item.id] = any( + matches(c, item, evidence_index, scope=scope) + for scope in matcher.allowed_scopes(item) + for c in candidates.get(scope, []) + ) + + if recall_metric not in ("lexical", "semantic", "hybrid"): + raise ValueError(f"unknown recall_metric {recall_metric!r}") + if recall_metric in ("semantic", "hybrid") and embed_fn is None: + raise ValueError( + f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn" + ) + + if embed_fn is not None: + semantic = matcher.semantic_hits( + candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc + ) + # dependency_graph relation items have no embedding candidates (§5.3b uses + # the endpoint matcher, not per-candidate text embeddings); mirror the + # lexical result so semantic/hybrid never under-credits them. 
+ for item in registry.items: + if item.scope == "dependency_graph" and item.from_node: + semantic[item.id] = lexical[item.id] + else: + semantic = None + + metric = recall_metric + + if semantic is None or metric == "lexical": + hits = lexical + elif metric == "semantic": + hits = semantic + else: # hybrid + hits = {iid: lexical[iid] or semantic[iid] for iid in lexical} + + # Signal-to-noise panel — report-only, §6.2 item 3 + finding_count = len(findings) + finding_scoped_items = [i for i in registry.real_items if i.scope == "finding"] + findings_matched = sum( + 1 for f in findings + if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items) + ) + _sn = { + "finding_count": finding_count, + "findings_matched_to_registry": { + "count": findings_matched, + "fraction": round(findings_matched / finding_count, 4) if finding_count else 0.0, + }, + "finding_redundancy_rate": _finding_redundancy_rate(findings), + } + if corpus is not None: + excluded = len(_build_evidence_index(result)) - len(evidence_index) + _sn["evidence_entries_excluded_unverified"] = excluded + + # L0 misses + l0_misses = [item.id for item in registry.l0_items if not hits[item.id]] + if l0_misses: + return GateResult( + gate="G2", + passed=False, + reason_code="L0_MISSING", + details={ + "l0_misses": l0_misses, + "message": "Regulatory-mandatory items not found — evaluation zeroed.", + **_sn, + }, + ) + + # NC precision + nc_hits = [item.id for item in registry.nc_items if hits[item.id]] + if nc_hits: + return GateResult( + gate="G2", + passed=False, + reason_code="NC_HIT", + details={ + "nc_hits": nc_hits, + "message": "Plausible-but-false negative control items were matched — precision failure.", + **_sn, + }, + ) + + # Weighted recall — over scored items only (L0/L1/L2). L3 is a bonus tier + # ("extra credit"): an L3 miss must not lower recall, so L3 is excluded from + # the denominator and only reported in per_tier below. 
Recall is computed over + # the GATING hit map so the gate is internally consistent with the chosen metric. + real_items = registry.real_items + scored_items = [item for item in real_items if item.tier != "L3"] + recall = _weighted_recall(scored_items, hits) + + per_tier: dict[str, dict] = {} + for tier in ("L0", "L1", "L2", "L3"): + tier_items = [i for i in real_items if i.tier == tier] + if not tier_items: + continue + per_tier[tier] = { + "hit": sum(1 for i in tier_items if hits[i.id]), + "total": len(tier_items), + } + + def _semantic_details() -> dict: + """The extra recall-breakdown keys, only emitted when an embedder is given.""" + if semantic is None: + return {} + return { + "lexical_recall": round(_weighted_recall(scored_items, lexical), 4), + "semantic_recall": round(_weighted_recall(scored_items, semantic), 4), + "hybrid_recall": round( + _weighted_recall( + scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical} + ), + 4, + ), + "tau": tau, + } + + if recall < recall_floor: + return GateResult( + gate="G2", + passed=False, + reason_code="RECALL_BELOW_FLOOR", + details={ + "recall": round(recall, 4), + "recall_metric": metric, + "floor": recall_floor, + "per_tier": per_tier, + "misses": [item.id for item in scored_items if not hits[item.id]], + **_semantic_details(), + **_sn, + }, + ) + + return GateResult( + gate="G2", + passed=True, + details={ + "recall": round(recall, 4), + "recall_metric": metric, + "floor": recall_floor, + "per_tier": per_tier, + "nc_items_checked": len(registry.nc_items), + **_semantic_details(), + **_sn, + }, + ) + + +# ── G3: Grounded ───────────────────────────────────────────────────────────── + + +def g3_grounded( + result: dict, + *, + grounding_floor: float = 0.90, + human_spot_check_n: int = 5, + corpus: Corpus | None = None, +) -> GateResult: + """G3 — Grounded (automated portion; human spot-check triggered on pass). 
+ + For each finding, verifies that at least one cited evidence excerpt shares a + non-trivial token with the finding description (topic-anchoring). + + With a ``corpus``, the gate also looks in a third direction — cited -> + exists: every evidence entry is verified against the actual corpus text + (corpus.verify_entry). A populated excerpt not found in its cited source + raises EVIDENCE_FABRICATED; a locator resolving to no corpus document + raises EVIDENCE_SOURCE_UNKNOWN; and only verified excerpts can ground a + finding, so a run cannot ground itself on evidence it invented. + + Also reports excerpt fill rate and source coverage so the reviewer can tell + whether ungrounded findings are a format problem (empty excerpts) or a real + faithfulness signal (populated excerpts that do not anchor). + + Known limitation: topic-anchoring, not claim entailment. A '45 days' claim + cited to a '3 days' source passes if they share the process name (excerpt + verification confirms the quote is real, not that the claim matches it). + The human spot-check is the binding faithfulness signal until NLI/AIS lands. + """ + evidence_index = _build_evidence_index(result) + findings = result.get("findings", []) + statuses = verify_evidence_index(corpus, result) if corpus is not None else None + + if not findings: + return GateResult( + gate="G3", + passed=False, + reason_code="NO_FINDINGS", + details={"message": "Result has zero findings — cannot compute grounding."}, + ) + + grounded_ids: list[str] = [] + # Ungrounded split (§6.3): distinguish format issues from real faithfulness failures. + ungrounded_empty_only: list[str] = [] # every ref had an empty excerpt + ungrounded_populated: list[str] = [] # had populated excerpt(s) but none anchored + + # Excerpt fill: count all resolved refs and how many carry a non-empty excerpt. + total_refs = 0 + populated_refs = 0 + + # Source coverage: which source stems are cited by at least one finding. 
+ cited_stems: set[str] = set() + + for finding in findings: + fid = finding.get("id", "?") + desc = finding.get("description", "") + is_grounded = False + had_populated = False + for ref in finding.get("evidence_refs", []): + ev = evidence_index.get(ref.get("evidence_id", "")) + if ev: + total_refs += 1 + excerpt = ev.get("excerpt") or "" + if excerpt: + populated_refs += 1 + had_populated = True + # Track source coverage (even for ungrounded findings). + stem = matcher.source_stem(ev.get("locator", "")) + if stem: + cited_stems.add(stem) + # Only a corpus-verified excerpt can ground a finding. + if statuses is not None and statuses.get(ev.get("id")) != VERIFIED: + continue + if anchored(desc, excerpt): + is_grounded = True + break + if is_grounded: + grounded_ids.append(fid) + elif had_populated: + ungrounded_populated.append(fid) + else: + ungrounded_empty_only.append(fid) + + grounding_pct = len(grounded_ids) / len(findings) + + # All source stems present in the evidence index (not just those cited). 
+ all_stems: set[str] = set() + for ev in result.get("evidence_index", []): + stem = matcher.source_stem(ev.get("locator", "")) + if stem: + all_stems.add(stem) + orphaned = sorted(all_stems - cited_stems) + + excerpt_fill = f"{populated_refs}/{total_refs}" if total_refs else "0/0" + source_coverage = f"{len(cited_stems)}/{len(all_stems)}" if all_stems else "0/0" + + details = { + "grounding_pct": round(grounding_pct, 4), + "grounded": len(grounded_ids), + "total": len(findings), + "excerpt_fill": excerpt_fill, + "source_coverage": source_coverage, + "orphaned_sources": orphaned, + } + + fabricated_ids: list[str] = [] + unknown_source_ids: list[str] = [] + if statuses is not None: + fabricated_ids = sorted(e for e, s in statuses.items() if s == FABRICATED) + unknown_source_ids = sorted(e for e, s in statuses.items() if s == SOURCE_UNKNOWN) + details["evidence_verification"] = { + "entries": len(statuses), + "verified": sum(1 for s in statuses.values() if s == VERIFIED), + "empty_excerpt": sum(1 for s in statuses.values() if s == EMPTY), + "fabricated": fabricated_ids, + "source_unknown": unknown_source_ids, + } + + if fabricated_ids: + details["message"] = ( + "Populated excerpt(s) not found in the cited corpus document — " + "the run asserts evidence the source does not contain." + ) + return GateResult( + gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details + ) + + if unknown_source_ids: + details["message"] = ( + "Evidence locator(s) resolve to no corpus document — either the " + "corpus bundle is incomplete or the run invented a source." 
+ ) + return GateResult( + gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details + ) + + if grounding_pct < grounding_floor: + details["floor"] = grounding_floor + details["ungrounded_with_populated_excerpts"] = ungrounded_populated + details["ungrounded_with_empty_excerpts_only"] = ungrounded_empty_only + return GateResult(gate="G3", passed=False, reason_code="UNGROUNDED", details=details) + + spot_n = min(human_spot_check_n, len(findings)) + details["human_spot_check"] = ( + f"ACTION REQUIRED: manually review {spot_n} sampled findings for " + "field-consistency, citation-accuracy, and client-readiness. " + "This is the binding faithfulness signal until NLI/AIS lands." + ) + return GateResult(gate="G3", passed=True, details=details) + + +# ── G5: No-regression / promotion (human decision) ─────────────────────────── + + +def g5_no_regression( + candidate_scores: dict[str, float], + champion_scores: dict[str, float] | None, + aa_noise: dict[str, float] | None, + *, + is_day_zero: bool = False, + human_signed_off: bool = False, + signoff_count: int = 0, +) -> GateResult: + """G5 — No-regression / promotion gate (human decision). + + Day-Zero: no champion exists. Requires G1-G3 pass + 2 independent sign-offs. + Normal promotion: candidate must beat champion by > aa_noise on every metric, + no guardrail regresses, + 1 human sign-off. + + Champions are per-corpus. Do not compare across corpora. + """ + if is_day_zero or champion_scores is None: + required = 2 + if signoff_count < required: + return GateResult( + gate="G5", + passed=False, + reason_code="HOLD", + details={ + "reason": ( + f"Day-Zero requires {required} independent human sign-offs " + f"(kappa >= 0.70); got {signoff_count}." 
+ ), + "action": "Collect sign-offs, then re-run with --day-zero --signoffs 2", + }, + ) + return GateResult( + gate="G5", + passed=True, + details={"day_zero": True, "signoffs": signoff_count}, + ) + + if not human_signed_off: + return GateResult( + gate="G5", + passed=False, + reason_code="HOLD", + details={"reason": "Human sign-off required for promotion."}, + ) + + noise = aa_noise or {} + regressions: list[str] = [] + improvements: list[str] = [] + + for metric, cand_val in candidate_scores.items(): + champ_val = champion_scores.get(metric) + if champ_val is None: + continue + delta = cand_val - champ_val + band = noise.get(metric, 0.0) + if delta < -band: + regressions.append( + f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} " + f"delta={delta:+.4f} < -band={-band:.4f}" + ) + elif delta > band: + improvements.append(f"{metric}: delta={delta:+.4f} > band={band:.4f}") + + if regressions: + return GateResult( + gate="G5", + passed=False, + reason_code="HOLD", + details={ + "regressions": regressions, + "improvements": improvements, + "message": "Guardrail metric(s) regressed beyond A/A noise band.", + }, + ) + + return GateResult( + gate="G5", + passed=True, + details={"improvements": improvements, "noise_band": noise}, + ) + + +# ── Full gate pipeline ──────────────────────────────────────────────────────── + + +def run_gates( + result: dict, + registry: Registry, + registry_path: str, + *, + pii_list: list[str] | None = None, + recall_floor: float = 0.70, + grounding_floor: float = 0.90, + champion_scores: dict[str, float] | None = None, + aa_noise: dict[str, float] | None = None, + is_day_zero: bool = False, + human_signed_off: bool = False, + signoff_count: int = 0, + embed_fn=None, + tau: float = 0.70, + tau_nc: float = 0.85, + recall_metric: str = "lexical", + corpus: Corpus | None = None, +) -> list[GateResult]: + """Run all gates G1 -> G2 -> G3 -> G5; every gate always executes. 
+ + A failed gate raises a flag in its GateResult but never prevents the + remaining gates from running. The scorecard therefore always carries the + complete picture: a run that misses a regulatory item *and* grounds poorly + shows both flags. See EVALUATION_FRAMEWORK.md §2 ('No gate vetoes'). + + ``corpus`` (optional) enables deterministic evidence verification: G1 pins + the corpus hash, G2 ignores unverified evidence entries, and G3 flags + fabricated excerpts and unknown sources. Without it, evidence is taken at + face value from the run's own evidence_index (disclosed on the scorecard). + + Returns all four GateResult objects. + """ + g1 = g1_structural(result, registry, registry_path, pii_list=pii_list, corpus=corpus) + + g2 = g2_recall_precision( + result, + registry, + recall_floor=recall_floor, + embed_fn=embed_fn, + tau=tau, + tau_nc=tau_nc, + recall_metric=recall_metric, + corpus=corpus, + ) + + g3 = g3_grounded(result, grounding_floor=grounding_floor, corpus=corpus) + + # G5 uses whatever scores G2/G3 produced; 0.0 when a gate flagged and did + # not emit the metric (e.g. L0_MISSING returns before computing recall). 
+ candidate_scores = { + "recall": g2.details.get("recall", 0.0), + "grounding_pct": g3.details.get("grounding_pct", 0.0), + } + g5 = g5_no_regression( + candidate_scores, + champion_scores, + aa_noise, + is_day_zero=is_day_zero, + human_signed_off=human_signed_off, + signoff_count=signoff_count, + ) + + return [g1, g2, g3, g5] From d964ba10735b918ed8e62ae7a5b1533238696495 Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:46:21 +0200 Subject: [PATCH 05/67] feat(evaluation): add scorecard renderer (#272) * feat(evaluation): add scorecard renderer * feat(evaluation): export render_scorecard, verdict, VERDICT_PROMOTE/HOLD from scorecard module --------- Co-authored-by: miguelgfierro --- .../evaluation/__init__.py | 6 +- .../evaluation/scorecard.py | 489 ++++++++++++++++++ 2 files changed, 494 insertions(+), 1 deletion(-) create mode 100644 fireflyframework_agentic/evaluation/scorecard.py diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index 401244c9..61562db3 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -32,7 +32,8 @@ from importlib.metadata import PackageNotFoundError, version from fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index -from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, render_scorecard, run_gates +from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, run_gates +from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict, VERDICT_PROMOTE, VERDICT_HOLD from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge 
from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens @@ -58,6 +59,9 @@ "g2_recall_precision", "run_gates", "render_scorecard", + "verdict", + "VERDICT_PROMOTE", + "VERDICT_HOLD", "ChampionRecord", "load_champion", "save_champion", diff --git a/fireflyframework_agentic/evaluation/scorecard.py b/fireflyframework_agentic/evaluation/scorecard.py new file mode 100644 index 00000000..b34885e8 --- /dev/null +++ b/fireflyframework_agentic/evaluation/scorecard.py @@ -0,0 +1,489 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Scorecard renderer: gate results -> Markdown report. + +Every scorecard states whether it is self-graded. Until Phase 3 independent +re-annotation lands, all Lean-Core PROMOTE verdicts are self-graded against +team-authored ground truth. See EVALUATION_FRAMEWORK.md. 
+""" + +from __future__ import annotations + +import json + +VERDICT_PROMOTE = "PROMOTE" +VERDICT_HOLD = "HOLD" + + +def verdict(gate_results: list) -> str: + """PROMOTE iff all gates passed and G5 is in the list; HOLD otherwise.""" + if not gate_results: + return VERDICT_HOLD + if not all(g.passed for g in gate_results): + return VERDICT_HOLD + gate_names = {g.gate for g in gate_results} + if "G5" not in gate_names: + return VERDICT_HOLD + return VERDICT_PROMOTE + + +def render_scorecard( + gate_results: list, + *, + corpus: str = "unknown", + model_id: str = "unknown", + run_id: str = "run", + is_self_graded: bool = True, + kappa_advisory: bool = False, + evidence_unverified: bool = False, + bpi2017_f1: float | None = None, + advisory=None, + config: dict | None = None, + experiment_config: dict | None = None, +) -> str: + """Render a Markdown evaluation scorecard. + + The scorecard always discloses self-graded status and advisory flags. + """ + v = verdict(gate_results) + lines = [ + "# FlyRadar Evaluation Scorecard", + "", + f"**Corpus**: {corpus}", + f"**Model**: {model_id}", + f"**Run**: {run_id}", + f"**Verdict**: **{v}**", + "", + ] + + if is_self_graded: + lines += [ + "> **SELF-GRADED**: All ground truth (must-find, gold, DILO, human sign-off) is", + "> authored by the FlyRadar team. This PROMOTE has no contamination-free signal", + "> until Phase 3. See EVALUATION_FRAMEWORK.md.", + "", + ] + + if kappa_advisory: + lines += [ + "> **ADVISORY**: Registry kappa < 0.70 — a second independent annotator has not", + "> verified the must-find items. Promotion is advisory for this corpus until", + "> kappa >= 0.70 from an independent re-annotation.", + "", + ] + + if evidence_unverified: + lines += [ + "> **EVIDENCE UNVERIFIED**: no corpus supplied (--corpus) — evidence locators", + "> and excerpts are taken at face value from the run's own evidence_index.", + "> Grounding certifies self-consistency, not corpus reality. 
Supply the run's", + "> input.json to enable deterministic excerpt verification (G3, §6.3).", + "", + ] + + if experiment_config is not None: + lines += [ + "## Experiment configuration", + "How this run was generated. Recorded fields (cost, tokens, latency, agents) are " + "read from the run's output.json; `model` is the value passed to the harness via " + "--model-id. Generation params (temperature, prompt/pipeline version, seed) are not " + "captured in output.json.", + "", + "```json", + json.dumps(experiment_config, indent=2, default=str), + "```", + "", + ] + + if config is not None: + lines += [ + "## Evaluation configuration", + "These are the parameters used to compute the evaluation.", + "", + "```json", + json.dumps(config, indent=2, default=str), + "```", + "", + ] + + lines += ["## Gate Results", ""] + g5_result = None + for g in gate_results: + if g.gate == "G5": + g5_result = g + continue + status = "PASS" if g.passed else f"FLAG ({g.reason_code})" + lines.append(f"### {g.gate}: {status}") + if g.details: + lines.append("```json") + lines.append(json.dumps(g.details, indent=2, default=str)) + lines.append("```") + lines.append("") + + if bpi2017_f1 is not None: + ok = bpi2017_f1 >= 0.60 + anchor_status = "PASS (>= 0.60)" if ok else "BELOW THRESHOLD (< 0.60)" + lines += [ + "## External Sanity Anchor (non-blocking)", + f"BPI-2017 variant-recovery F1: **{bpi2017_f1:.3f}** — {anchor_status}", + "_One non-self-graded signal. 
Non-blocking; informational only._", + "", + ] + + if advisory is not None: + lines += _render_advisory(advisory) + + if g5_result is not None: + status = "PASS" if g5_result.passed else f"FLAG ({g5_result.reason_code})" + lines.append(f"### G5: {status}") + if g5_result.details: + lines.append("```json") + lines.append(json.dumps(g5_result.details, indent=2, default=str)) + lines.append("```") + lines.append("") + + lines += _render_analysis(gate_results, advisory) + + return "\n".join(lines) + + +def _num(x) -> str: + """Format a metric leaf: None -> 'n/a', float -> 3dp, else str.""" + if x is None: + return "n/a" + if isinstance(x, float): + return f"{x:.3f}" + return str(x) + + +def _render_advisory(report) -> list[str]: + """Render the non-blocking G4 LLM-as-a-Judge section from an AdvisoryReport. + + Best-effort: only metrics present in report.metrics are shown. G4 never + affects the PROMOTE/HOLD verdict; this section is decision-support for the + G5 human sign-off, and is advisory until LLM-as-a-Judge calibration (§10). 
+ """ + m = report.metrics + cal = "calibrated" if report.calibrated else "uncalibrated" + lines = [ + "## G4 — LLM-as-a-Judge (non-blocking — does NOT affect the PROMOTE/HOLD verdict)", + f"Judge: {report.judge_model} · {cal} · {report.runs}-run median", + ] + if report.same_provider_caveat: + lines.append("> same-provider as the pipeline — results may share blind spots.") + lines.append("```text") + + if "faithfulness" in m: + d = m["faithfulness"] + u = d.get("unsupported_ids", []) + extra = f" (unsupported: {', '.join(u)})" if u else "" + lines.append( + f"Faithfulness (entailment): {d.get('supported')}/{d.get('total')} supported{extra}" + ) + if "numeric_temporal_fidelity" in m: + lines.append( + f"Numeric/temporal fidelity: {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)" + ) + if "citation_relevance" in m: + d = m["citation_relevance"] + lines.append( + f"Citation relevance (ctx-prec): {_num(d.get('precision'))} ({d.get('relevant')}/{d.get('total')})" + ) + if "semantic_recovery" in m: + d = m["semantic_recovery"] + rec = d.get("recovered", []) + rids = ", ".join(r.get("id", "") for r in rec) if rec else "none" + lines.append( + f"Semantic recovery (ctx-recall): lexical {_num(d.get('lexical_recall'))} -> {_num(d.get('recovered_recall'))} (recovered: {rids})" + ) + if "nc_semantic_precision" in m: + d = m["nc_semantic_precision"] + a = d.get("asserted_ids", []) + extra = f" ({', '.join(a)})" if a else "" + lines.append(f"NC semantic precision: {d.get('asserted', 0)} asserted{extra}") + if "fabricated_entity" in m: + lines.append(f"Fabricated-entity check: {m['fabricated_entity'].get('count', 0)}") + if "contradiction" in m: + lines.append(f"Contradiction detection: {m['contradiction'].get('count', 0)}") + if "actionability" in m: + d = m["actionability"] + lines.append( + f"Actionability: {_num(d.get('score'))} (rated {d.get('rated', 0)})" + ) + if "severity_calibration" in m: + d = m["severity_calibration"] + lines.append( + f"Severity 
calibration: {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated" + ) + if "answer_relevancy" in m: + lines.append(f"Answer relevancy: {_num(m['answer_relevancy'].get('score'))}") + if "comparative_vs_champion" in m: + lines.append( + f"Comparative vs champion: more consistent -> {m['comparative_vs_champion'].get('more_consistent', 'n/a')}" + ) + if "source_coverage" in m: + d = m["source_coverage"] + o = d.get("orphaned", []) + extra = f" (orphaned: {', '.join(o)})" if o else "" + lines.append( + f"Source coverage [D]: {d.get('cited')}/{d.get('total')} documents cited{extra}" + ) + if "excerpt_fill_rate" in m: + d = m["excerpt_fill_rate"] + lines.append( + f"Evidence-excerpt fill [D]: {d.get('populated')}/{d.get('total')} populated" + ) + if "open_gap" in m: + gap = (m["open_gap"].get("gap") or "").strip() + if gap: + lines.append(f"Open gap probe: {gap}") + if report.errors: + lines.append(f"(errors: {len(report.errors)} metric(s) failed: {'; '.join(report.errors)})") + lines.append("```") + # Full detail — nothing truncated: every id, pair, verdict, and complete text. + lines += [ + "", + "**G4 — full metric detail:**", + "```json", + json.dumps({"metrics": report.metrics, "details": report.details}, indent=2, default=str), + "```", + ] + lines.append( + "> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10)." 
+ ) + lines.append("") + return lines + + +def _render_analysis(gate_results: list, advisory=None) -> list[str]: + """Render a plain-language interpretation of all evaluation signals.""" + g2 = next((g for g in gate_results if g.gate == "G2"), None) + g3 = next((g for g in gate_results if g.gate == "G3"), None) + + lines = ["## Analysis", ""] + + # ── Topic coverage (G2) ────────────────────────────────────────────────── + lines.append("### Topic coverage (G2)") + if g2 and g2.details: + d = g2.details + recall = d.get("recall", 0.0) + tiers = d.get("per_tier", {}) + finding_count = d.get("finding_count", 0) + redundancy = d.get("finding_redundancy_rate", 0.0) + matched = d.get("findings_matched_to_registry", {}).get("fraction", 0.0) + + tier_summary = ", ".join( + f"{t} {v['hit']}/{v['total']}" + for t, v in tiers.items() + if "hit" in v and "total" in v + ) + lines.append( + f"Lexical recall is **{recall:.3f}** ({tier_summary}). " + f"The run produced {finding_count} findings, " + f"all of which map to a registry item (match rate {matched:.0%}). " + ) + if redundancy > 0.15: + lines.append( + f"Finding redundancy is **{redundancy:.0%}** — a meaningful share of " + "findings are near-duplicates of each other (Jaccard ≥ 0.6). " + "The run is covering the same ground multiple times rather than broadening coverage." + ) + else: + lines.append( + f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic." + ) + lines.append( + "_G2 is a topic-level test. A recall of 1.000 means every required topic was " + "mentioned somewhere — it does not verify that the specific claims about those " + "topics are accurate. 
Claim accuracy is G4 Faithfulness._" + ) + else: + lines.append("G2 result unavailable.") + lines.append("") + + # ── Evidence quality (G3) ──────────────────────────────────────────────── + lines.append("### Evidence quality (G3)") + if g3 and g3.details: + d = g3.details + grounding = d.get("grounding_pct", 0.0) + ev = d.get("evidence_verification", {}) + verified = ev.get("verified", 0) + entries = ev.get("entries", 0) + fabricated = ev.get("fabricated", []) + unknown = ev.get("source_unknown", []) + orphaned = d.get("orphaned_sources", []) + source_cov = d.get("source_coverage", "") + + lines.append( + f"Grounding is **{grounding:.0%}**: every finding cites at least one " + "corpus document, and all excerpts are populated. " + f"Evidence verification checked {entries} entries against the raw corpus: " + f"{verified} verified" + + (f", **{len(fabricated)} fabricated** (locators that do not exist in the corpus)" if fabricated else "") + + (f", **{len(unknown)} source-unknown** (locators that resolve to no corpus file)" if unknown else "") + + "." + ) + if unknown: + lines.append( + f"The source-unknown locator(s) are: `{'`, `'.join(unknown)}`. " + "This is most likely a corpus bundle gap rather than a hallucinated source — " + "verify that all expected files are included in `input.json`." + ) + if orphaned: + lines.append( + f"**{len(orphaned)} corpus documents were never cited** by this run " + f"({', '.join(orphaned)}). These are blind spots: the run extracted nothing " + "from these sources, so any findings they contain are silently missed." + ) + if source_cov: + cited, total = (int(x) for x in source_cov.split("/")) + if cited < total: + lines.append( + f"Overall source coverage is {cited}/{total} — " + f"{total - cited} corpus file(s) left entirely uncited." 
+ ) + else: + lines.append("G3 result unavailable.") + lines.append("") + + # ── Claim accuracy (G4) ────────────────────────────────────────────────── + if advisory is not None: + m = advisory.metrics + lines.append("### Claim accuracy (G4 — advisory)") + + faith = m.get("faithfulness", {}) + supported = faith.get("supported", 0) + total_f = faith.get("total", 0) + if total_f: + faith_pct = supported / total_f + lines.append( + f"**Faithfulness: {supported}/{total_f} findings ({faith_pct:.0%}) are entailed by their cited evidence.** " + ) + if faith_pct < 0.5: + lines.append( + "This is a critical signal: the majority of findings contain claims " + "that the judge cannot verify from the cited sources. " + "The run is presenting inferences, extrapolations, or hallucinated details " + "as if they were directly evidenced. " + "Each unsupported finding should be reviewed against its cited document before use." + ) + elif faith_pct < 0.8: + lines.append( + "A significant minority of findings contain claims not traceable to cited sources. " + "These may be reasonable inferences, but they should be flagged for human verification." + ) + else: + lines.append("Most findings are directly supported by their cited evidence.") + + ntf = m.get("numeric_temporal_fidelity", {}) + mismatch_count = ntf.get("count", 0) + if mismatch_count: + lines.append( + f"**Numeric/temporal fidelity: {mismatch_count} mismatches detected.** " + "Specific figures — FTE costs, durations, timestamps, percentages, case IDs — " + "appear in findings but cannot be traced to the cited evidence. " + "These numbers should be treated as estimates or fabrications until verified " + "against the source documents." 
+ ) + + fab = m.get("fabricated_entity", {}) + fab_count = fab.get("count", 0) + fab_entities = fab.get("entities", []) + if fab_count: + lines.append( + f"**Fabricated entities: {fab_count}** — the following names/identifiers appear " + f"in the output but are absent from the corpus: " + f"{', '.join(f'`{e}`' for e in fab_entities)}. " + "These should be removed or verified before sharing the output." + ) + + sev = m.get("severity_calibration", {}) + misc = sev.get("miscalibrated", 0) + total_s = sev.get("total", 0) + verdicts = sev.get("verdicts", {}) + over_count = sum(1 for v in verdicts.values() if v == "over") + under_count = sum(1 for v in verdicts.values() if v == "under") + if misc and total_s: + direction = "" + if over_count > under_count: + direction = f" (predominantly over-rated: {over_count} findings rated too high)" + elif under_count > over_count: + direction = f" (predominantly under-rated: {under_count} findings rated too low)" + lines.append( + f"**Severity calibration: {misc}/{total_s} findings miscalibrated{direction}.** " + "Over-rated findings inflate perceived urgency and can cause the client to " + "prioritise the wrong items." + ) + + act = m.get("actionability", {}) + act_score = act.get("score") + if act_score is not None: + if act_score < 0.6: + lines.append( + f"**Actionability score: {act_score:.3f}** — proposed actions are below the " + "0.6 threshold for concrete, quantified recommendations. " + "Actions tend to be generic rather than specific enough to assign and execute." 
+ ) + else: + lines.append(f"Actionability score: {act_score:.3f} — actions are sufficiently concrete.") + + og = m.get("open_gap", {}) + gap_text = (og.get("gap") or "").strip() + if gap_text: + lines.append(f"**Most important missed finding:** {gap_text}") + + lines.append("") + + # ── Bottom line ────────────────────────────────────────────────────────── + lines.append("### Bottom line") + g5 = next((g for g in gate_results if g.gate == "G5"), None) + g5_reason = (g5.details or {}).get("reason", "") if g5 else "" + flags = [g for g in gate_results if not g.passed] + flag_names = [g.gate for g in flags] + + if not flags: + lines.append( + "All deterministic gates pass. The run is ready for G5 human sign-off." + ) + else: + flag_str = ", ".join(flag_names) + lines.append( + f"The run is at **HOLD** due to flags on: {flag_str}. " + ) + for g in flags: + if g.gate == "G3" and g.reason_code == "EVIDENCE_SOURCE_UNKNOWN": + lines.append( + "- **G3**: One evidence locator points to a file not in the corpus bundle. " + "Regenerate `input.json` to include all corpus sources, then re-run." + ) + elif g.gate == "G5": + lines.append(f"- **G5**: {g5_reason}") + + if advisory is not None: + m = advisory.metrics + faith = m.get("faithfulness", {}) + supported = faith.get("supported", 0) + total_f = faith.get("total", 1) + ntf_count = m.get("numeric_temporal_fidelity", {}).get("count", 0) + fab_count = m.get("fabricated_entity", {}).get("count", 0) + lines.append( + f"\nG4 advisory signals (non-blocking but important for the G5 reviewer): " + f"faithfulness {supported}/{total_f}, " + f"{ntf_count} numeric mismatches, " + f"{fab_count} fabricated entities. " + "The G5 reviewer should focus on the unsupported findings and verify figures " + "against the source documents before certifying the output." 
+ ) + lines.append("") + return lines From 09cfc34bd75869498a0d5a10216625a784d9e638 Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:52:05 +0200 Subject: [PATCH 06/67] feat(evaluation): add LLM-as-judge and judge client (#273) * feat(evaluation): add JudgeClient and OllamaEmbedder (judge_client.py) * feat(evaluation): add AdvisoryReport and run_judge with [D]/[E]/[J] metric families (judge.py) * feat(evaluation): import cosine from judge_client in matcher.py * feat(evaluation): export JudgeClient, OllamaEmbedder, build_embedder, cosine from evaluation package --------- Co-authored-by: miguelgfierro --- .../evaluation/__init__.py | 5 + fireflyframework_agentic/evaluation/judge.py | 829 ++++++++++++++++++ .../evaluation/judge_client.py | 454 ++++++++++ .../evaluation/matcher.py | 7 +- 4 files changed, 1289 insertions(+), 6 deletions(-) create mode 100644 fireflyframework_agentic/evaluation/judge.py create mode 100644 fireflyframework_agentic/evaluation/judge_client.py diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index 61562db3..37093075 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -36,6 +36,7 @@ from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict, VERDICT_PROMOTE, VERDICT_HOLD from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge +from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256 from 
fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics @@ -68,6 +69,10 @@ "invalidate_champion", "AdvisoryReport", "run_judge", + "JudgeClient", + "OllamaEmbedder", + "build_embedder", + "cosine", "Registry", "RegistryItem", "load_registry", diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py new file mode 100644 index 00000000..a347c8e1 --- /dev/null +++ b/fireflyframework_agentic/evaluation/judge.py @@ -0,0 +1,829 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""G4 — LLM-as-a-Judge: an opt-in, NON-BLOCKING, NON-DETERMINISTIC advisory gate. + +G4 NEVER affects the PROMOTE/HOLD verdict and NEVER raises into the caller. +run_judge() wraps every metric in try/except; a failing metric appends to +report.errors and the run continues (best-effort). The result is an +AdvisoryReport, NOT a GateResult — it is carried separately so it can never +enter verdict() or the Skipped tuple (see scorecard / verdict_unaffected_note). + +Three families of metric (matching the flyradar contracts): +- [D] DETERMINISTIC — pure python, no LLM, printed even when the judge is off: + source_coverage, excerpt_fill_rate. +- [E] EMBEDDING — needs an embed_fn (local Ollama BGE by default): + semantic_recovery (context recall). 
+- [J] JUDGE — needs a chat_fn(system, user) -> dict; each [J] metric instructs + the model to reply with ONLY JSON: faithfulness, numeric_temporal_fidelity, + citation_relevance, nc_semantic_precision, fabricated_entity, contradiction, + open_gap, actionability, severity_calibration, answer_relevancy, + comparative_vs_champion. + +Aggregation follows the flycanon custom-judge design: run each [J] metric `runs` +times and take the MEDIAN of its numeric scores (robust to an outlier vote). + +Zero new dependencies: stdlib (json, statistics) + numpy. All imports at top. +calibrated is ALWAYS False for now (LLM-as-a-Judge calibration is §14, future work). +""" + +from __future__ import annotations + +import concurrent.futures +import statistics +from dataclasses import dataclass, field + +import numpy as np + +from fireflyframework_agentic.evaluation.judge_client import ( + JudgeClient, + OllamaEmbedder, + cosine, + same_provider, +) +from fireflyframework_agentic.evaluation.matcher import source_stem + +SYSTEM = "You are a meticulous evaluator of a process-mining discovery report. Return ONLY a JSON object." + + +@dataclass +class AdvisoryReport: + """The G4 output: a plain metrics bag, never a GateResult. + + metrics maps metric-name -> small dict (the per-metric summary). details + carries supporting context (counts, ids). errors lists per-metric failures + captured by run_judge's best-effort try/except so nothing propagates. 
+ """ + + judge_model: str + same_provider_caveat: bool + calibrated: bool # ALWAYS False for now (§14) + runs: int + metrics: dict = field(default_factory=dict) + details: dict = field(default_factory=dict) + errors: list[str] = field(default_factory=list) + + +# ── shared accessors ─────────────────────────────────────────────────────────── + + +def _evidence_index(result: dict) -> dict[str, dict]: + return {ev.get("id"): ev for ev in result.get("evidence_index", []) if ev.get("id")} + + +def _cited_excerpts(finding: dict, evidence_index: dict[str, dict]) -> list[str]: + """Excerpts of the evidence a finding cites (via evidence_refs.evidence_id).""" + out: list[str] = [] + for ref in finding.get("evidence_refs", []): + ev = evidence_index.get(ref.get("evidence_id", "")) + if ev: + excerpt = ev.get("excerpt") or "" + if excerpt: + out.append(excerpt) + return out + + +def _output_text(result: dict) -> str: + """All free text the model emitted: finding titles+descriptions + reports.""" + parts: list[str] = [] + for f in result.get("findings", []): + parts.append(f.get("title", "")) + parts.append(f.get("description", "")) + for r in result.get("reports", []): + parts.append(str(r)) + return "\n".join(p for p in parts if p) + + +def _workspace_intention(result: dict) -> str: + ws = result.get("workspace") or {} + return f"{ws.get('name', '')}\n{ws.get('description', '')}".strip() + + +def _coerce_float(value, default=None): + """Coerce a model-returned number/numeric-string to float; total (never raises). + + Returns ``default`` (None) on junk so one malformed vote drops that single + vote instead of discarding the whole metric. + """ + try: + return float(value) + except (TypeError, ValueError): + return default + + +def _map_chat(chat_fn, prompts, workers=1): + """Run a list of (system, user) chat prompts, returning ordered result dicts. 
+ + ``workers <= 1`` calls ``chat_fn`` SEQUENTIALLY — byte-for-byte identical to + the in-line loops it replaces, INCLUDING letting a raise propagate (so + run_judge's per-metric try/except still drops that whole metric, the + behaviour the suite locks in). + + ``workers >= 2`` fans the calls out across a ThreadPoolExecutor while + PRESERVING input order in the returned list. Concurrency cannot let one + raising future poison the batch, so in that path a raising call's slot + becomes ``{}`` — the metric's aggregation degrades for that one vote but + never raises (the same best-effort contract as run_judge). + """ + prompts = list(prompts) + if workers <= 1: + return [chat_fn(system, user) for system, user in prompts] + + results: list[dict] = [{} for _ in prompts] + with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: + futures = { + executor.submit(chat_fn, system, user): idx + for idx, (system, user) in enumerate(prompts) + } + for future in concurrent.futures.as_completed(futures): + idx = futures[future] + try: + results[idx] = future.result() + except Exception: # best-effort: a dropped vote, never a raise + results[idx] = {} + return results + + +# ── [D] DETERMINISTIC — no LLM, always available ──────────────────────────────── + + +def source_coverage(result: dict) -> dict: + """Distinct source documents cited by >=1 finding vs all source documents. + + Returns {cited, total, orphaned} where orphaned is the sorted list of + source stems present in evidence_index but cited by no finding. 
+ """ + evidence_index = _evidence_index(result) + all_stems = { + source_stem(ev.get("locator", "")) + for ev in result.get("evidence_index", []) + if ev.get("locator") + } + cited_stems: set[str] = set() + for f in result.get("findings", []): + for ref in f.get("evidence_refs", []): + ev = evidence_index.get(ref.get("evidence_id", "")) + if ev and ev.get("locator"): + cited_stems.add(source_stem(ev["locator"])) + cited_stems &= all_stems + orphaned = sorted(all_stems - cited_stems) + return {"cited": len(cited_stems), "total": len(all_stems), "orphaned": orphaned} + + +def excerpt_fill_rate(result: dict) -> dict: + """Fraction of evidence_index entries with a non-empty excerpt. + + Returns {populated, total}. This is the signal behind older runs' low G3 + grounding: empty excerpts cannot ground anything. + """ + entries = result.get("evidence_index", []) + populated = sum(1 for ev in entries if (ev.get("excerpt") or "").strip()) + return {"populated": populated, "total": len(entries)} + + +# ── [E] EMBEDDING — needs embed_fn ─────────────────────────────────────────────── + + +def semantic_recovery( + result: dict, + registry, + lexical_missed_ids: list[str], + embed_fn, + tau: float = 0.70, +) -> dict: + """Context-recall: recover G2 lexical misses by embedding similarity. + + For each registry item flagged a LEXICAL MISS by G2, embed its + description+keywords and take the max cosine against the embeddings of every + finding description (and their cited excerpts). If max cosine >= tau the + item is counted semantically present (recovered). + + recovered_recall = (lexical_hits + recovered) / scored_denominator, where + the scored denominator is the count of non-NC items scored by G2 (real + items, matching G2's recall denominator family). Returns the lexical recall, + the recovered recall, the recovered item list (with cosine), and tau. 
+ """ + missed = set(lexical_missed_ids or []) + real_items = registry.real_items + scored_items = [i for i in real_items if i.tier != "L3"] + denom = len(scored_items) or 1 + lexical_hits = sum(1 for i in scored_items if i.id not in missed) + + # Candidate texts the findings actually surfaced. + evidence_index = _evidence_index(result) + candidate_texts: list[str] = [] + for f in result.get("findings", []): + desc = f.get("description", "") + if desc: + candidate_texts.append(desc) + candidate_texts.extend(_cited_excerpts(f, evidence_index)) + + missed_items = [i for i in scored_items if i.id in missed] + if not missed_items or not candidate_texts: + recovered_recall = lexical_hits / denom + return { + "lexical_recall": round(lexical_hits / denom, 4), + "recovered_recall": round(recovered_recall, 4), + "recovered": [], + "tau": tau, + "scored_denominator": denom, + } + + item_texts = [f"{i.description} {' '.join(i.keywords)}".strip() for i in missed_items] + item_vecs = np.asarray(embed_fn(item_texts), dtype=np.float64) + cand_vecs = np.asarray(embed_fn(candidate_texts), dtype=np.float64) + + recovered: list[dict] = [] + for item, ivec in zip(missed_items, item_vecs): + best = max((cosine(ivec, cvec) for cvec in cand_vecs), default=0.0) + if best >= tau: + recovered.append({"id": item.id, "cosine": round(best, 4)}) + + recovered_recall = (lexical_hits + len(recovered)) / denom + return { + "lexical_recall": round(lexical_hits / denom, 4), + "recovered_recall": round(recovered_recall, 4), + "recovered": recovered, + "tau": tau, + "scored_denominator": denom, + } + + +# ── [J] JUDGE — needs chat_fn(system, user) -> dict ────────────────────────────── + + +def faithfulness(result: dict, chat_fn, *, workers: int = 1) -> dict: + """Entailment: does each finding's cited evidence SUPPORT its claim? + + Per (finding, cited-excerpts) pair, ask SUPPORTED / NOT_SUPPORTED. Returns + {supported, total, unsupported_ids}. 
Findings with no cited evidence are + counted as not-supported (nothing to entail against). + """ + evidence_index = _evidence_index(result) + findings = result.get("findings", []) + cited = [(f, _cited_excerpts(f, evidence_index)) for f in findings] + prompts = [ + ( + SYSTEM, + "Does the cited evidence span ENTAIL the claim made in this finding?\n" + 'Reply with ONLY {"verdict": "SUPPORTED" or "NOT_SUPPORTED", "reason": ""}.\n\n' + f"FINDING: {f.get('description', '')}\n" + f"CITED EVIDENCE: {' || '.join(excerpts)}", + ) + for f, excerpts in cited + if excerpts + ] + answers = iter(_map_chat(chat_fn, prompts, workers)) + supported = 0 + unsupported_ids: list[str] = [] + for f, excerpts in cited: + fid = f.get("id", "?") + if not excerpts: + unsupported_ids.append(fid) + continue + verdict = str(next(answers).get("verdict", "")).upper() + if verdict == "SUPPORTED": + supported += 1 + else: + unsupported_ids.append(fid) + return {"supported": supported, "total": len(findings), "unsupported_ids": unsupported_ids} + + +def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dict: + """Flag numbers/dates asserted in a finding that do NOT match its evidence. + + Closes the 45-days-vs-3-days gap. Returns {mismatches: [{finding_id, value, + source}], count}. + """ + evidence_index = _evidence_index(result) + scored = [ + (f, excerpts) + for f in result.get("findings", []) + if (excerpts := _cited_excerpts(f, evidence_index)) + ] + prompts = [ + ( + SYSTEM, + "List every specific number or date asserted in the FINDING that does " + "NOT match the CITED EVIDENCE.\n" + 'Reply with ONLY {"mismatches": [{"value": "", "source": ""}]}. 
' + "Empty list if all match.\n\n" + f"FINDING: {f.get('description', '')}\n" + f"CITED EVIDENCE: {' || '.join(excerpts)}", + ) + for f, excerpts in scored + ] + answers = _map_chat(chat_fn, prompts, workers) + mismatches: list[dict] = [] + for (f, _excerpts), answer in zip(scored, answers): + for m in answer.get("mismatches", []) or []: + mismatches.append( + { + "finding_id": f.get("id", "?"), + "value": m.get("value", ""), + "source": m.get("source", ""), + } + ) + return {"mismatches": mismatches, "count": len(mismatches)} + + +def citation_relevance(result: dict, chat_fn, *, workers: int = 1) -> dict: + """Context precision: fraction of cited passages actually relevant to the claim. + + Per evidence_ref, ask yes/no relevance. precision = relevant / total_refs. + Returns {precision, relevant, total}; when total == 0 (no cited passages with + excerpts) precision is None — the kept ``total`` lets a reader tell "perfect" + apart from "nothing to score". + """ + evidence_index = _evidence_index(result) + prompts: list[tuple[str, str]] = [] + for f in result.get("findings", []): + desc = f.get("description", "") + for ref in f.get("evidence_refs", []): + ev = evidence_index.get(ref.get("evidence_id", "")) + if not ev: + continue + excerpt = ev.get("excerpt") or "" + if not excerpt: + continue + prompts.append( + ( + SYSTEM, + "Is this cited passage actually relevant to / used by this claim?\n" + 'Reply with ONLY {"relevant": "yes" or "no"}.\n\n' + f"CLAIM: {desc}\n" + f"CITED PASSAGE: {excerpt}", + ) + ) + answers = _map_chat(chat_fn, prompts, workers) + total = len(prompts) + relevant = sum(1 for a in answers if str(a.get("relevant", "")).lower() == "yes") + if not total: + return {"precision": None, "relevant": relevant, "total": total} + return {"precision": round(relevant / total, 4), "relevant": relevant, "total": total} + + +def nc_semantic_precision(result: dict, registry, chat_fn, *, workers: int = 1) -> dict: + """Count negative-control falsehoods the 
output asserts or endorses. + + Per NC item, ask yes/no whether the output endorses the false statement. + Returns {asserted, total, asserted_ids}. The semantic counterpart to the + inert deterministic NC check. + """ + output_text = _output_text(result) + nc_items = registry.nc_items + prompts = [ + ( + SYSTEM, + "Does the OUTPUT assert or endorse the following FALSE statement?\n" + 'Reply with ONLY {"asserted": "yes" or "no"}.\n\n' + f"FALSE STATEMENT: {item.description}\n" + f"OUTPUT:\n{output_text}", + ) + for item in nc_items + ] + answers = _map_chat(chat_fn, prompts, workers) + asserted_ids = [ + item.id for item, a in zip(nc_items, answers) if str(a.get("asserted", "")).lower() == "yes" + ] + return {"asserted": len(asserted_ids), "total": len(nc_items), "asserted_ids": asserted_ids} + + +def fabricated_entity(result: dict, chat_fn) -> dict: + """Count systems/orgs/metrics named in the output but absent from the corpus. + + Returns {count, entities}. The corpus universe is the set of evidence + excerpts + locators. + """ + output_text = _output_text(result) + corpus = "\n".join( + f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" + for ev in result.get("evidence_index", []) + ) + user = ( + "List any system, organization, or metric NAMED in the OUTPUT that does NOT " + "appear anywhere in the CORPUS EVIDENCE.\n" + 'Reply with ONLY {"fabricated": ["", ...]}. Empty list if none.\n\n' + f"OUTPUT:\n{output_text}\n\n" + f"CORPUS EVIDENCE:\n{corpus}" + ) + entities = chat_fn(SYSTEM, user).get("fabricated", []) or [] + return {"count": len(entities), "entities": list(entities)} + + +def contradiction(result: dict, chat_fn) -> dict: + """Count internally contradictory finding pairs. + + Returns {count, pairs}. pairs is the list of contradicting finding-id pairs + the judge reports. 
+ """ + lines = [] + for f in result.get("findings", []): + lines.append(f"{f.get('id', '?')}: {f.get('title', '')} — {f.get('description', '')}") + user = ( + "Are any two of these FINDINGS mutually contradictory? List each contradicting pair.\n" + 'Reply with ONLY {"pairs": [["", ""], ...]}. Empty list if none.\n\n' + + "\n".join(lines) + ) + pairs = chat_fn(SYSTEM, user).get("pairs", []) or [] + return {"count": len(pairs), "pairs": [list(p) for p in pairs]} + + +def open_gap(result: dict, chat_fn) -> dict: + """G-Eval open probe: the most important process issue the output missed. + + Returns {gap} — a free-text advisory narrative (no score). + """ + pg = result.get("process_graph") or {} + pg_summary = f"process_graph has {len(pg.get('processes', []))} processes" + user = ( + "Given this corpus scope and output, what important process issue did the " + "output FAIL to surface?\n" + 'Reply with ONLY {"gap": ""}.\n\n' + f"WORKSPACE SCOPE: {_workspace_intention(result)}\n" + f"{pg_summary}\n" + f"OUTPUT:\n{_output_text(result)}" + ) + return {"gap": str(chat_fn(SYSTEM, user).get("gap", ""))} + + +def actionability(result: dict, chat_fn, *, workers: int = 1) -> dict: + """Average 0-1 rating of whether proposed actions are specific+quantified+linked. + + Returns {score, rated}. Each action is rated against whether it is specific, + quantified, and linked to a finding. 
+ """ + actions = result.get("proposed_actions", []) or [] + finding_ids = {f.get("id") for f in result.get("findings", [])} + prompts = [ + ( + SYSTEM, + "Rate whether this proposed action is SPECIFIC, QUANTIFIED, and LINKED to a " + "finding.\n" + 'Reply with ONLY {"score": }.\n\n' + f"TITLE: {a.get('title', '')}\n" + f"DESCRIPTION: {a.get('description', '')}\n" + f"OWNER: {a.get('owner_persona', '')} HORIZON: {a.get('horizon', '')} " + f"LEVER: {a.get('lever', '')} EFFORT: {a.get('effort', '')}\n" + f"EXPECTED_SAVINGS_FTE: {a.get('expected_savings_fte', '')} " + f"EXPECTED_SAVINGS_USD: {a.get('expected_savings_usd', '')}\n" + f"LINKED_TO_FINDING: {a.get('finding_id') in finding_ids}", + ) + for a in actions + ] + answers = _map_chat(chat_fn, prompts, workers) + scores: list[float] = [] + for a in answers: + value = _coerce_float(a.get("score")) + if value is None: # malformed vote -> skip this action, keep the metric + continue + scores.append(value) + score = round(sum(scores) / len(scores), 4) if scores else None + return {"score": score, "rated": len(scores)} + + +def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict: + """Per-finding judgment of whether stated severity matches the evidence. + + Returns {miscalibrated, total, verdicts: {finding_id: under|over|calibrated}}. 
+ """ + evidence_index = _evidence_index(result) + findings = result.get("findings", []) + prompts = [ + ( + SYSTEM, + "Does the STATED SEVERITY match what the CITED EVIDENCE supports?\n" + 'Reply with ONLY {"calibration": "under" or "over" or "calibrated"}.\n\n' + f"STATED SEVERITY: {f.get('severity', '')} SCORE: {f.get('score', '')}\n" + f"FINDING: {f.get('description', '')}\n" + f"CITED EVIDENCE: {' || '.join(_cited_excerpts(f, evidence_index))}", + ) + for f in findings + ] + answers = _map_chat(chat_fn, prompts, workers) + verdicts: dict[str, str] = {} + miscalibrated = 0 + for f, a in zip(findings, answers): + verdict = str(a.get("calibration", "calibrated")).lower() + verdicts[f.get("id", "?")] = verdict + if verdict in ("under", "over"): + miscalibrated += 1 + return {"miscalibrated": miscalibrated, "total": len(findings), "verdicts": verdicts} + + +def answer_relevancy(result: dict, chat_fn) -> dict: + """RAGAS-style: does the output address the stated workspace intention? + + Returns {score} in [0,1], or {"score": None} when the vote fails to coerce. + """ + user = ( + "Does the OUTPUT address the stated WORKSPACE INTENTION (on-topic, responsive)?\n" + 'Reply with ONLY {"score": }.\n\n' + f"WORKSPACE INTENTION: {_workspace_intention(result)}\n" + f"OUTPUT:\n{_output_text(result)}" + ) + return {"score": _coerce_float(chat_fn(SYSTEM, user).get("score"))} + + +def surface_deduplication(result: dict, chat_fn, *, workers: int = 1) -> dict: + """Fraction of near-duplicate process-graph node pairs that are genuinely distinct. + + Scoping rules: + - Processes: all pairs compared (cross-process is valid at this level). + - Activities and decisions: ONLY within the same parent process. The same + activity name appearing in two different processes is a legitimate repetition + (e.g. "Approve Request" in both Loan and Credit-Card flows), not a duplicate. + + For each surface, the top-10 most name-similar pairs (token-Jaccard >= 0.30) + are selected. 
For activities/decisions the parent process name is included in + the judge prompt so it can reason about intra-process context. 30 pairs total. + + Returns {distinct, redundant, total, distinct_rate, redundant_pairs}. + """ + pg = result.get("process_graph", {}) + procs = pg.get("processes", []) + + def _toks(node: dict) -> frozenset[str]: + return frozenset(node.get("name", "").lower().split()) + + PER_SURFACE_CAP = 10 + # candidates: (surface, node_a, node_b, parent_process_name) + candidates: list[tuple[str, dict, dict, str]] = [] + + # Processes: compare all pairs + if len(procs) >= 2: + pairs: list[tuple[float, dict, dict]] = [] + for i in range(len(procs)): + for j in range(i + 1, len(procs)): + a_t, b_t = _toks(procs[i]), _toks(procs[j]) + union = a_t | b_t + if not union: + continue + jac = len(a_t & b_t) / len(union) + if jac >= 0.30: + pairs.append((jac, procs[i], procs[j])) + pairs.sort(key=lambda x: x[0], reverse=True) + for _jac, a, b in pairs[:PER_SURFACE_CAP]: + candidates.append(("process", a, b, "")) + + # Activities and decisions: within the same parent process only + for surface_key, attr in (("activity", "activities"), ("decision", "decisions")): + all_pairs: list[tuple[float, dict, dict, str]] = [] + for proc in procs: + nodes = proc.get(attr, []) + proc_name = proc.get("name", "") + if len(nodes) < 2: + continue + for i in range(len(nodes)): + for j in range(i + 1, len(nodes)): + a_t, b_t = _toks(nodes[i]), _toks(nodes[j]) + union = a_t | b_t + if not union: + continue + jac = len(a_t & b_t) / len(union) + if jac >= 0.30: + all_pairs.append((jac, nodes[i], nodes[j], proc_name)) + all_pairs.sort(key=lambda x: x[0], reverse=True) + for _jac, a, b, proc_name in all_pairs[:PER_SURFACE_CAP]: + candidates.append((surface_key, a, b, proc_name)) + + if not candidates: + return {"distinct": 0, "redundant": 0, "total": 0, "distinct_rate": None, "redundant_pairs": []} + + prompts = [] + for surface, a, b, parent_proc in candidates: + ctx = f"\nPARENT 
PROCESS: {parent_proc}\n" if parent_proc else "" + prompts.append(( + SYSTEM, + f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a " + f"duplicate / sub-case / restatement of the other?\n" + f"{ctx}" + 'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n' + f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n" + f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}", + )) + + answers = _map_chat(chat_fn, prompts, workers) + + distinct = 0 + redundant = 0 + redundant_pairs: list[dict] = [] + for (surface, a, b, _parent), answer in zip(candidates, answers): + verdict = str(answer.get("verdict", "")).upper() + if verdict == "DISTINCT": + distinct += 1 + else: + redundant += 1 + redundant_pairs.append({ + "surface": surface, + "a": a.get("name", ""), + "b": b.get("name", ""), + "reason": str(answer.get("reason", "")), + }) + + total = distinct + redundant + return { + "distinct": distinct, + "redundant": redundant, + "total": total, + "distinct_rate": round(distinct / total, 4) if total else None, + "redundant_pairs": redundant_pairs, + } + + +def comparative_vs_champion(result: dict, champion_result: dict, chat_fn) -> dict: + """Pairwise MT-Bench-style review of candidate vs champion (advisory only). + + Returns {candidate, champion, more_consistent} where candidate/champion are + 1-5 ratings on Coverage/Quality/Evidence/Actionability/Regression. Never + feeds G5. + """ + user = ( + "Score the CANDIDATE and the CHAMPION outputs on five axes (1-5 each): " + "Coverage, Quality, Evidence, Actionability, Regression. 
Then say which is " + "more internally consistent.\n" + "Reply with ONLY " + '{"candidate": {"coverage": x, "quality": x, "evidence": x, "actionability": x, "regression": x}, ' + '"champion": {"coverage": x, "quality": x, "evidence": x, "actionability": x, "regression": x}, ' + '"more_consistent": "candidate" or "champion"}.\n\n' + f"CANDIDATE:\n{_output_text(result)}\n\n" + f"CHAMPION:\n{_output_text(champion_result)}" + ) + out = chat_fn(SYSTEM, user) + return { + "candidate": out.get("candidate", {}), + "champion": out.get("champion", {}), + "more_consistent": out.get("more_consistent", ""), + } + + +# ── median-of-N for [J] metrics ────────────────────────────────────────────────── + + +def _numeric_leaves(d: dict) -> dict[tuple, float]: + """Flatten a metric dict to {path: float} over its FLOAT score-leaves only. + + Median applies to continuous scores only. A leaf counts as numeric-for-median + only when its value is a ``float``; ``bool`` and ``int`` leaves (counts, + denominators, 1-5 axes, and other bookkeeping) are deliberately skipped and + taken from the first run unchanged — this avoids fractional counts (rated=0.5) + and count/len(list) disagreement under runs>1 with an even N. + """ + out: dict[tuple, float] = {} + + def walk(node, path: tuple) -> None: + if isinstance(node, float): + out[path] = node + elif isinstance(node, dict): + for k, v in node.items(): + walk(v, path + (k,)) + + walk(d, ()) + return out + + +def _set_leaf(d: dict, path: tuple, value: float) -> None: + node = d + for key in path[:-1]: + node = node[key] + node[path[-1]] = value + + +def _median_runs(samples: list[dict]) -> dict: + """Median across N metric-dicts: FLOAT score-leaves -> per-key median; rest = first. + + Only continuous float scores are medianed; integer bookkeeping (counts, + denominators, 1-5 axes) and all non-numeric fields are taken from the first run. 
+ """ + samples = [s for s in samples if isinstance(s, dict)] + if not samples: + return {} + base = samples[0] + if len(samples) == 1: + return base + leaf_values: dict[tuple, list[float]] = {} + for s in samples: + for path, val in _numeric_leaves(s).items(): + leaf_values.setdefault(path, []).append(val) + merged = dict(base) + for path, vals in leaf_values.items(): + try: + _set_leaf(merged, path, round(statistics.median(vals), 4)) + except (KeyError, TypeError): + continue + return merged + + +# ── orchestrator ───────────────────────────────────────────────────────────────── + + +def run_judge( + result: dict, + registry, + *, + judge_model: str, + runs: int = 1, + concurrency: int = 1, + pipeline_model: str = "", + champion_result: dict | None = None, + chat_fn=None, + embed_fn=None, + tau: float = 0.70, + lexical_missed_ids: list[str] | None = None, +) -> AdvisoryReport: + """Run the G4 advisory gate, best-effort. NEVER raises; NEVER affects verdict. + + If chat_fn / embed_fn are None, real ones are built from JudgeClient / + OllamaEmbedder (tests inject stubs instead). Each [J] metric runs `runs` + times and the median of its numeric scores is kept. Every metric is wrapped + in try/except: a failure appends to report.errors and the run continues. + + ``concurrency`` (opt-in, default 1) bounds the per-item [J] metrics' internal + fan-out: 1 keeps the sequential per-item loops; >=2 runs each metric's items + across a thread pool (order preserved). The median-of-N ``runs`` loop stays + sequential and the single-call metrics are unaffected. The result is + byte-for-byte identical at concurrency=1. + + Returns an AdvisoryReport (a plain dict carrier) with calibrated=False and + same_provider_caveat = same_provider(pipeline_model, judge_model). 
+ """ + if chat_fn is None: + client = JudgeClient(judge_model) + chat_fn = client.chat_json + if embed_fn is None: + embed_fn = OllamaEmbedder().embed + + report = AdvisoryReport( + judge_model=judge_model, + same_provider_caveat=same_provider(pipeline_model, judge_model), + calibrated=False, + runs=runs, + ) + + def _run_det(name: str, fn) -> None: + try: + report.metrics[name] = fn() + except Exception as exc: # best-effort: never raise + report.errors.append(f"{name}: {type(exc).__name__}: {exc}") + + def _run_judge_metric(name: str, fn) -> None: + try: + samples = [fn() for _ in range(max(1, runs))] + report.metrics[name] = _median_runs(samples) + except Exception as exc: # best-effort: never raise + report.errors.append(f"{name}: {type(exc).__name__}: {exc}") + + # [D] deterministic — always computed, no LLM. + _run_det("source_coverage", lambda: source_coverage(result)) + _run_det("excerpt_fill_rate", lambda: excerpt_fill_rate(result)) + + # [E] embedding — context recall. + _run_det( + "semantic_recovery", + lambda: semantic_recovery(result, registry, lexical_missed_ids or [], embed_fn, tau=tau), + ) + + # [J] judge — median-of-N. Per-item metrics fan out at workers=concurrency. 
+ _run_judge_metric("faithfulness", lambda: faithfulness(result, chat_fn, workers=concurrency)) + _run_judge_metric( + "numeric_temporal_fidelity", + lambda: numeric_temporal_fidelity(result, chat_fn, workers=concurrency), + ) + _run_judge_metric( + "citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency) + ) + _run_judge_metric( + "nc_semantic_precision", + lambda: nc_semantic_precision(result, registry, chat_fn, workers=concurrency), + ) + _run_judge_metric("fabricated_entity", lambda: fabricated_entity(result, chat_fn)) + _run_judge_metric("contradiction", lambda: contradiction(result, chat_fn)) + _run_judge_metric("open_gap", lambda: open_gap(result, chat_fn)) + _run_judge_metric("actionability", lambda: actionability(result, chat_fn, workers=concurrency)) + _run_judge_metric( + "severity_calibration", + lambda: severity_calibration(result, chat_fn, workers=concurrency), + ) + _run_judge_metric("answer_relevancy", lambda: answer_relevancy(result, chat_fn)) + _run_judge_metric( + "surface_deduplication", + lambda: surface_deduplication(result, chat_fn, workers=concurrency), + ) + if champion_result is not None: + _run_judge_metric( + "comparative_vs_champion", + lambda: comparative_vs_champion(result, champion_result, chat_fn), + ) + + return report diff --git a/fireflyframework_agentic/evaluation/judge_client.py b/fireflyframework_agentic/evaluation/judge_client.py new file mode 100644 index 00000000..1af17f53 --- /dev/null +++ b/fireflyframework_agentic/evaluation/judge_client.py @@ -0,0 +1,454 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Provider-agnostic LLM-as-a-Judge client for the G4 advisory gate. + +Zero new dependencies: stdlib (urllib.request, json, os, time, re) + numpy. +The client is a thin POST wrapper over four chat providers (Anthropic, OpenAI, +Azure OpenAI, Ollama) plus an Ollama embedder. It is deliberately tolerant: +chat_json extracts the FIRST JSON object from the model text (models wrap JSON +in prose / code fences), and retries transient HTTP errors with backoff. + +This module is import-safe: importing it touches NO network and reads NO API +key. Keys are read lazily, per-call, only when a real request is made — so the +judge tests can import and inject stubs without any secret present. + +Provider/model spec format: ":", e.g. "anthropic:claude-sonnet-4-6", +"openai:gpt-4o", "azure:gpt-4o", "ollama:llama3". A bare model with no prefix is +treated as provider "unknown" (see parse_model / same_provider). +""" + +from __future__ import annotations + +import json +import os +import re +import time +import urllib.error +import urllib.request + +import numpy as np + +# Transient HTTP status codes worth retrying (rate limit + 5xx). +_RETRY_STATUS = (429, 500, 502, 503, 504) + +# Hard cap on a honoured Retry-After sleep (a hostile header should not stall us). +_MAX_RETRY_AFTER = 30.0 + + +def _env(name, default=None): + """Read an env var, stripping surrounding whitespace; empty-after-strip -> default. + + Defensive against a ``.env`` value that arrives with a trailing ``\\r`` / + whitespace (CRLF), which would otherwise corrupt a request URL or header. 
+ An unset OR blank value falls back to ``default`` so the existing + missing-key -> RuntimeError behaviour is preserved. + """ + value = os.environ.get(name) + if value is None: + return default + value = value.strip() + return value if value else default + + +def _retry_delay(exc: urllib.error.HTTPError, attempt: int) -> float: + """Seconds to sleep before retrying an HTTPError. + + On 429 honour the ``Retry-After`` header (capped at 30s) when it is present + and numeric; otherwise fall back to exponential backoff (2 ** attempt). + """ + if exc.code == 429: + headers = getattr(exc, "headers", None) + retry_after = headers.get("retry-after") if headers is not None else None + if retry_after is not None: + try: + return min(float(retry_after), _MAX_RETRY_AFTER) + except (TypeError, ValueError): + pass + return 2.0**attempt + + +def parse_model(spec: str) -> tuple[str, str]: + """Split a "provider:model" spec into (provider, model). + + A bare spec with no ':' is returned as provider "unknown" with the whole + string as the model, e.g. "claude-sonnet-4-6" -> ("unknown", "claude-sonnet-4-6"). + The provider is lower-cased; the model keeps its original case. + """ + spec = (spec or "").strip() + if ":" not in spec: + return "unknown", spec + provider, model = spec.split(":", 1) + return provider.strip().lower(), model.strip() + + +def same_provider(pipeline_model: str, judge_model: str) -> bool: + """True iff both specs name the SAME known provider prefix. + + A missing or "unknown" provider on either side -> not-same (False). This is + the same-provider caveat signal: when the judge and the pipeline share a + provider the judged metrics are advisory (no cross-provider isolation). 
+ """ + p_provider, _ = parse_model(pipeline_model) + j_provider, _ = parse_model(judge_model) + if p_provider == "unknown" or j_provider == "unknown": + return False + return p_provider == j_provider + + +def _first_json_object(text: str) -> dict: + """Extract and parse the FIRST balanced JSON object embedded in text. + + Models wrap JSON in prose, preambles, or ```json code fences. This scans + for the first '{' and walks the string tracking brace depth (string-aware, + so braces inside quoted values do not confuse the matcher) to find its + matching '}'. Falls back to a greedy regex span if no balanced object is + found. Raises ValueError when nothing parses. + """ + if not text: + raise ValueError("empty model response") + + # Fast path: a clean JSON object with no surrounding prose. A non-dict + # clean parse (e.g. a top-level array) is intentionally ignored so the brace + # scanner can still find an embedded object rather than returning arr[0]. + try: + parsed = json.loads(text.strip()) + except (json.JSONDecodeError, ValueError): + parsed = None + if isinstance(parsed, dict): + return parsed + + start = text.find("{") + while start != -1: + depth = 0 + in_string = False + escape = False + for i in range(start, len(text)): + ch = text[i] + if in_string: + if escape: + escape = False + elif ch == "\\": + escape = True + elif ch == '"': + in_string = False + continue + if ch == '"': + in_string = True + elif ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + candidate = text[start : i + 1] + try: + return json.loads(candidate) + except json.JSONDecodeError: + break # try the next '{' + start = text.find("{", start + 1) + + # Greedy fallback: first '{' .. last '}' across newlines. 
+ match = re.search(r"\{.*\}", text, re.DOTALL) + if match: + return json.loads(match.group(0)) + raise ValueError("no JSON object found in model response") + + +def _http_post_json(url: str, headers: dict, body: dict, timeout: int) -> dict: + """POST a JSON body and return the parsed JSON response (single attempt).""" + data = json.dumps(body).encode("utf-8") + req_headers = {"content-type": "application/json", **headers} + req = urllib.request.Request(url, data=data, headers=req_headers, method="POST") + with urllib.request.urlopen(req, timeout=timeout) as resp: + return json.loads(resp.read().decode("utf-8")) + + +def _extract_openai_text(resp: dict) -> str: + """Pull the assistant text from an OpenAI/Azure chat-completions response. + + Guards an empty ``choices`` list and a null ``message.content`` and raises a + descriptive RuntimeError (not a KeyError) when no text is present, so the + judge layer records a clean dropped-vote reason instead of a stack trace. + """ + choices = resp.get("choices") or [] + if choices: + text = (choices[0].get("message") or {}).get("content") + if text: + return text + raise RuntimeError(f"judge returned no text: {resp}") + + +class JudgeClient: + """Minimal multi-provider chat client returning parsed JSON dicts. + + Dispatch is by the provider prefix of the model spec. temperature is pinned + to 0.0 for deterministic verdicts. Transient HTTP errors (429/5xx) and URL + errors are retried up to max_retries: a 429 honours the ``Retry-After`` + header (capped at 30s) when present, otherwise backoff is exponential + (2 ** attempt seconds). + + The API key / endpoint env vars are read lazily inside chat_json, so + constructing a JudgeClient never requires a secret. 
+ """ + + def __init__(self, model: str, timeout: int = 120, max_retries: int = 3) -> None: + self.model_spec = model + self.provider, self.model = parse_model(model) + self.timeout = timeout + self.max_retries = max_retries + + def chat_json(self, system: str, user: str, max_tokens: int = 1024) -> dict: + """Send (system, user) to the provider and parse the first JSON object. + + Raises on exhausted retries / unknown provider / unparseable output. + The judge module wraps every call in try/except, so a raise here becomes + a dropped vote rather than a crash. + """ + last_exc: Exception | None = None + for attempt in range(self.max_retries): + try: + text = self._dispatch(system, user, max_tokens) + return _first_json_object(text) + except urllib.error.HTTPError as exc: + last_exc = exc + if exc.code not in _RETRY_STATUS or attempt == self.max_retries - 1: + raise + time.sleep(_retry_delay(exc, attempt)) + except (urllib.error.URLError, TimeoutError, ConnectionError) as exc: + last_exc = exc + if attempt == self.max_retries - 1: + raise + time.sleep(2**attempt) + if last_exc is not None: + raise last_exc + raise RuntimeError("chat_json exhausted retries without a response") + + def _dispatch(self, system: str, user: str, max_tokens: int) -> str: + """Route to the per-provider call and return the raw model text.""" + if self.provider == "anthropic": + return self._anthropic(system, user, max_tokens) + if self.provider == "openai": + return self._openai(system, user, max_tokens) + if self.provider == "azure": + return self._azure(system, user, max_tokens) + if self.provider == "ollama": + return self._ollama(system, user, max_tokens) + raise ValueError( + f"unknown judge provider {self.provider!r} in {self.model_spec!r}; " + "use anthropic:/openai:/azure:/ollama:" + ) + + def _anthropic(self, system: str, user: str, max_tokens: int) -> str: + api_key = _env("ANTHROPIC_API_KEY") + if not api_key: + raise RuntimeError("ANTHROPIC_API_KEY not set") + body = { + "model": 
self.model, + "max_tokens": max_tokens, + "temperature": 0.0, + "system": system, + "messages": [{"role": "user", "content": user}], + } + headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01"} + resp = _http_post_json("https://api.anthropic.com/v1/messages", headers, body, self.timeout) + text = next( + (b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None + ) + if not text: + raise RuntimeError(f"judge returned no text: {resp}") + return text + + def _openai(self, system: str, user: str, max_tokens: int) -> str: + api_key = _env("OPENAI_API_KEY") + if not api_key: + raise RuntimeError("OPENAI_API_KEY not set") + body = { + "model": self.model, + "max_tokens": max_tokens, + "temperature": 0.0, + "messages": [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + } + headers = {"Authorization": f"Bearer {api_key}"} + resp = _http_post_json( + "https://api.openai.com/v1/chat/completions", headers, body, self.timeout + ) + return _extract_openai_text(resp) + + def _azure(self, system: str, user: str, max_tokens: int) -> str: + endpoint = _env("AZURE_OPENAI_ENDPOINT") + api_key = _env("AZURE_OPENAI_API_KEY") + if not endpoint: + raise RuntimeError("AZURE_OPENAI_ENDPOINT not set") + if not api_key: + raise RuntimeError("AZURE_OPENAI_API_KEY not set") + api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" + # Azure deployment lives in the URL path, not the JSON body. 
+ url = ( + f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions" + f"?api-version={api_version}" + ) + body = { + "max_tokens": max_tokens, + "temperature": 0.0, + "messages": [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + } + headers = {"api-key": api_key} + resp = _http_post_json(url, headers, body, self.timeout) + return _extract_openai_text(resp) + + def _ollama(self, system: str, user: str, max_tokens: int) -> str: + host = _env("OLLAMA_HOST") or "http://localhost:11434" + body = { + "model": self.model, + "stream": False, + "options": {"temperature": 0.0, "num_predict": max_tokens}, + "messages": [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + } + resp = _http_post_json(f"{host.rstrip('/')}/api/chat", {}, body, self.timeout) + text = (resp.get("message") or {}).get("content") + if not text: + raise RuntimeError(f"judge returned no text: {resp}") + return text + + +class OpenAIEmbedder: + """OpenAI embeddings client over /v1/embeddings. + + Reads OPENAI_API_KEY from the environment. Default model: text-embedding-3-small. + """ + + def __init__(self, model: str = "text-embedding-3-small", timeout: int = 60) -> None: + self.model = model + self.timeout = timeout + + def embed(self, texts: list[str]) -> np.ndarray: + api_key = _env("OPENAI_API_KEY") + if not api_key: + raise RuntimeError("OPENAI_API_KEY not set") + headers = {"Authorization": f"Bearer {api_key}"} + body = {"model": self.model, "input": texts} + resp = _http_post_json("https://api.openai.com/v1/embeddings", headers, body, self.timeout) + data = resp.get("data", []) + vectors = [item["embedding"] for item in sorted(data, key=lambda x: x["index"])] + return np.asarray(vectors, dtype=np.float32) + + +class AzureOpenAIEmbedder: + """Azure OpenAI embeddings client. + + Reads AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, and optionally + AZURE_OPENAI_API_VERSION from the environment. 
The model name is the + deployment name. Default model: text-embedding-3-small. + """ + + def __init__(self, model: str = "text-embedding-3-small", timeout: int = 60) -> None: + self.model = model + self.timeout = timeout + + def embed(self, texts: list[str]) -> np.ndarray: + endpoint = _env("AZURE_OPENAI_ENDPOINT") + api_key = _env("AZURE_OPENAI_API_KEY") + if not endpoint: + raise RuntimeError("AZURE_OPENAI_ENDPOINT not set") + if not api_key: + raise RuntimeError("AZURE_OPENAI_API_KEY not set") + api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" + url = ( + f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings" + f"?api-version={api_version}" + ) + headers = {"api-key": api_key} + vectors = self._embed_with_split(texts, url, headers) + return np.asarray(vectors, dtype=np.float32) + + def _embed_with_split(self, texts: list[str], url: str, headers: dict) -> list[list[float]]: + """Send texts in one request; on HTTP 400 split in half and retry each half.""" + try: + resp = _http_post_json(url, headers, {"input": texts}, self.timeout) + data = resp.get("data", []) + return [item["embedding"] for item in sorted(data, key=lambda x: x["index"])] + except urllib.error.HTTPError as exc: + if exc.code == 400 and len(texts) > 1: + mid = len(texts) // 2 + left = self._embed_with_split(texts[:mid], url, headers) + right = self._embed_with_split(texts[mid:], url, headers) + return left + right + raise + + +class OllamaEmbedder: + """Local Ollama embedding client (default model bge-m3) over /api/embeddings. + + Posts one prompt per call (the stable single-prompt form) and stacks the + returned vectors into a 2-D numpy array. Constructing it touches no network; + the host is resolved from $OLLAMA_HOST at call time. 
+ """ + + def __init__(self, model: str = "bge-m3", host: str | None = None, timeout: int = 60) -> None: + self.model = model + self.host = (host or _env("OLLAMA_HOST") or "http://localhost:11434").rstrip("/") + self.timeout = timeout + + def embed(self, texts: list[str]) -> np.ndarray: + """Embed a list of strings -> float32 ndarray of shape (len(texts), dim).""" + vectors: list[list[float]] = [] + for text in texts: + body = {"model": self.model, "prompt": text} + resp = _http_post_json(f"{self.host}/api/embeddings", {}, body, self.timeout) + vectors.append(resp["embedding"]) + return np.asarray(vectors, dtype=np.float32) + + +def build_embedder(spec: str): + """Return an ``embed_fn(list[str]) -> np.ndarray`` for an embedder spec. + + Dispatch is on the provider prefix of a ":" spec: + - "ollama" / "ollama:" -> OllamaEmbedder(model or "bge-m3").embed. + - a bare "" with no ':' -> treated as an Ollama model. + - any other provider -> NotImplementedError (the extension point). + + Add a new backend by adding a branch here. 
+ """ + if (spec or "").strip() == "ollama": # bare provider, no model -> default model + return OllamaEmbedder("bge-m3").embed + provider, model = parse_model(spec) + if provider in ("unknown", "ollama"): # bare "" or "ollama:" + return OllamaEmbedder(model or "bge-m3").embed + if provider == "openai": + return OpenAIEmbedder(model or "text-embedding-3-small").embed + if provider == "azure": + return AzureOpenAIEmbedder(model or "text-embedding-3-small").embed + raise NotImplementedError( + f"embedder backend {provider!r} not implemented yet; add it in build_embedder()" + ) + + +def cosine(a, b) -> float: + """Cosine similarity between two 1-D vectors; 0.0 if either is the zero vector.""" + a = np.asarray(a, dtype=np.float64).ravel() + b = np.asarray(b, dtype=np.float64).ravel() + na = float(np.linalg.norm(a)) + nb = float(np.linalg.norm(b)) + if na == 0.0 or nb == 0.0: + return 0.0 + return float(np.dot(a, b) / (na * nb)) diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py index 2f5065df..b4d81f44 100644 --- a/fireflyframework_agentic/evaluation/matcher.py +++ b/fireflyframework_agentic/evaluation/matcher.py @@ -29,12 +29,7 @@ import numpy as np - -def cosine(a, b) -> float: - """Cosine similarity between two vectors.""" - a = np.asarray(a, dtype=float) - b = np.asarray(b, dtype=float) - return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9)) +from fireflyframework_agentic.evaluation.judge_client import cosine def tokens(text: str) -> list[str]: From 1906ede934bb82cca1b127341a2f457a66e59a3c Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:56:09 +0200 Subject: [PATCH 07/67] feat(evaluation): add champion tracking and flyeval CLI (#274) * feat(evaluation): add ChampionRecord and champion management functions * feat(evaluation): add run_config_snapshot for flyradar run configuration capture * feat(evaluation): add 
flyeval CLI with gate, aa-band, day-zero, invalidate subcommands --------- Co-authored-by: miguelgfierro --- .../evaluation/champion.py | 169 ++++++ fireflyframework_agentic/evaluation/cli.py | 573 ++++++++++++++++++ .../evaluation/run_config_snapshot.py | 160 +++++ 3 files changed, 902 insertions(+) create mode 100644 fireflyframework_agentic/evaluation/champion.py create mode 100644 fireflyframework_agentic/evaluation/cli.py create mode 100644 fireflyframework_agentic/evaluation/run_config_snapshot.py diff --git a/fireflyframework_agentic/evaluation/champion.py b/fireflyframework_agentic/evaluation/champion.py new file mode 100644 index 00000000..239429eb --- /dev/null +++ b/fireflyframework_agentic/evaluation/champion.py @@ -0,0 +1,169 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Per-corpus champion management. + +Champions are per-corpus — mode 2A (conformance) and mode 2B (extraction) +metrics live in incommensurable spaces. There is no global champion. +See EVALUATION_FRAMEWORK.md (per-corpus champions). + +The historical fake-100% incident: banca-cordobesa/baseline.json was populated +with a champion scored against an EMPTY must-find registry. The EMPTY_MUST_FIND +guard in G1 prevents a recurrence; the invalidate_champion() function provides +the corrective action when it does happen. 
+""" + +from __future__ import annotations + +import hashlib +import json +from dataclasses import dataclass, field +from pathlib import Path + + +@dataclass +class ChampionRecord: + """Per-corpus champion, stored as 'champion' key in baseline.json.""" + + corpus: str + run_id: str + model_id: str + registry_sha256: str + scores: dict # {metric_name: float} + aa_noise: dict = field(default_factory=dict) # {metric_name: noise_floor} + is_day_zero: bool = False + human_sign_offs: list[str] = field(default_factory=list) + config: dict = field(default_factory=dict) # evaluation config snapshot + corpus_sha256: str = "" # pin of the evidence corpus the champion was verified against + + def primary_metric(self) -> str: + return next(iter(self.scores)) if self.scores else "" + + def primary_score(self) -> float: + return float(self.scores.get(self.primary_metric(), 0.0)) + + +def load_champion(baseline_path: str | Path) -> ChampionRecord | None: + """Load the current per-corpus champion from baseline.json. + + Returns None when: + - The file does not exist (normal Day-Zero state). + - The file exists but 'champion' is null (post-invalidation state). 
+ """ + path = Path(baseline_path) + if not path.exists(): + return None + raw = json.loads(path.read_text(encoding="utf-8")) + champ_raw = raw.get("champion") + if champ_raw is None: + return None + return ChampionRecord( + corpus=champ_raw["corpus"], + run_id=champ_raw["run_id"], + model_id=champ_raw["model_id"], + registry_sha256=champ_raw["registry_sha256"], + scores=champ_raw.get("scores", {}), + aa_noise=champ_raw.get("aa_noise", {}), + is_day_zero=champ_raw.get("is_day_zero", False), + human_sign_offs=champ_raw.get("human_sign_offs", []), + config=champ_raw.get("config", {}), + corpus_sha256=champ_raw.get("corpus_sha256", ""), + ) + + +def save_champion( + baseline_path: str | Path, + champion: ChampionRecord, + *, + summary: str = "", + date: str = "", +) -> None: + """Persist a new champion and append a promotion log entry. + + Reads the existing file if it exists (to preserve the log), then writes + the new champion. The promotion log is append-only. + """ + path = Path(baseline_path) + if path.exists(): + raw = json.loads(path.read_text(encoding="utf-8")) + log = raw.get("promotion_log", []) + prev_run = raw.get("champion", {}) + prev_run_id = prev_run.get("run_id") if isinstance(prev_run, dict) else None + else: + log = [] + prev_run_id = None + + log.append( + { + "date": date or "unknown", + "from": prev_run_id, + "to": champion.run_id, + "label": "day-zero" if champion.is_day_zero else "promotion", + "summary": summary, + } + ) + + payload = { + "champion": { + "corpus": champion.corpus, + "run_id": champion.run_id, + "model_id": champion.model_id, + "registry_sha256": champion.registry_sha256, + "scores": champion.scores, + "aa_noise": champion.aa_noise, + "is_day_zero": champion.is_day_zero, + "human_sign_offs": champion.human_sign_offs, + "config": champion.config, + "corpus_sha256": champion.corpus_sha256, + }, + "promotion_log": log, + } + path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") + + +def 
invalidate_champion( + baseline_path: str | Path, + *, + reason: str, + date: str = "", +) -> None: + """Null out the current champion and record the invalidation reason. + + Used when a champion was locked in against an empty or tampered registry + (the banca-cordobesa fake-100% incident). + """ + path = Path(baseline_path) + if not path.exists(): + return + raw = json.loads(path.read_text(encoding="utf-8")) + log = raw.get("promotion_log", []) + prev_run = raw.get("champion", {}) + prev_run_id = prev_run.get("run_id") if isinstance(prev_run, dict) else None + log.append( + { + "date": date or "unknown", + "from": prev_run_id, + "to": None, + "label": "INVALIDATED", + "summary": reason, + } + ) + raw["champion"] = None + raw["promotion_log"] = log + path.write_text(json.dumps(raw, indent=2, ensure_ascii=False), encoding="utf-8") + + +def input_hash(result_dict: dict) -> str: + """Stable 16-char SHA-256 prefix of the DiscoveryResult for provenance.""" + canonical = json.dumps(result_dict, sort_keys=True, ensure_ascii=False) + return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16] diff --git a/fireflyframework_agentic/evaluation/cli.py b/fireflyframework_agentic/evaluation/cli.py new file mode 100644 index 00000000..7ac868d9 --- /dev/null +++ b/fireflyframework_agentic/evaluation/cli.py @@ -0,0 +1,573 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""flyeval — FlyRadar Lean Core evaluation CLI. 
+ +Usage +----- + flyeval gate --result R.json --registry REG.json [--baseline B.json] [--judge-model P:M] + flyeval aa-band --results R1.json R2.json ... --registry REG.json + flyeval day-zero --result R.json --registry REG.json --baseline B.json --signoffs 2 + flyeval invalidate --baseline B.json --reason "..." + +The deterministic gates G1-G3 + G5 (human sign-off) decide the verdict: every +subcommand exits 0 on PROMOTE, 1 on HOLD. G4 (the --judge-model LLM-as-a-Judge, +on by default, --no-judge to skip) is non-blocking — it prints advisory signals +and never changes the verdict or the exit code. +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import sys +from pathlib import Path + +from fireflyframework_agentic.evaluation import __version__ +from fireflyframework_agentic.evaluation.champion import ( + ChampionRecord, + invalidate_champion, + load_champion, + save_champion, +) +from fireflyframework_agentic.evaluation.corpus import load_corpus +from fireflyframework_agentic.evaluation.gates import g2_recall_precision, run_gates +from fireflyframework_agentic.evaluation.judge import run_judge +from fireflyframework_agentic.evaluation.judge_client import build_embedder +from fireflyframework_agentic.evaluation.matcher import matches +from fireflyframework_agentic.evaluation.registry import load_registry +from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict as get_verdict +from fireflyframework_agentic.evaluation.stats import aa_band, left_skew_flag + + +def _load_json(path: str) -> dict: + return json.loads(Path(path).read_text(encoding="utf-8")) + + +def _lexical_missed_ids(result: dict, registry) -> list[str]: + """Scored (non-L3) real-item ids matched by no finding — the G2 lexical misses G4 recovers.""" + evidence_index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")} + findings = result.get("findings", []) + scored = [i for i in registry.real_items 
if i.tier != "L3"] + return [i.id for i in scored if not any(matches(f, i, evidence_index) for f in findings)] + + +def _read_experiment_config(result_path: str) -> dict | None: + """Read the experiment_configuration.json recorded next to the run's output.json. + + The experiment config records how the run was generated; it is authored by the + generation step at run time. Evaluation only reads it for display and never + writes or overwrites it. Returns None when the run has no recorded config. + """ + path = Path(result_path).parent / "experiment_configuration.json" + if not path.exists(): + return None + return json.loads(path.read_text(encoding="utf-8")) + + +def _write_eval_config(result_path: str, config: dict) -> Path: + """Write evaluation_configuration.json next to the run's output.json. + + The evaluation config is authored by flyeval at gate time (registry/corpus SHAs, + recall metric, floors, judge settings), so unlike the experiment config it is + owned here and safe to (over)write each run. It mirrors the block embedded in + the scorecard, as a machine-readable artifact. + """ + path = Path(result_path).parent / "evaluation_configuration.json" + path.write_text(json.dumps(config, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") + return path + + +def _eval_config(args, registry, corpus=None) -> dict: + """Capture the run's evaluation configuration for provenance. + + Uses getattr defaults so it works for both `gate` (has every flag) and + `day-zero` (lacks the gate-only flags, falling back to the lexical/no-judge + defaults, which honestly reflects how day-zero scores). 
+ """ + jm = getattr(args, "judge_model", None) + baseline = getattr(args, "baseline", None) + tau = getattr(args, "tau", 0.70) + return { + "evaluator_version": __version__, + "registry_sha256": registry.sha256(), + "corpus_sha256": corpus.sha256 if corpus else None, + "model_id": getattr(args, "model_id", None) or "unknown", + "gates": { + "G1": { + "name": "Structural & Safe", + "pii_list": getattr(args, "pii_list", None) or [], + "metrics": { + "empty_must_find": "registry has >=1 must-find item; guards the fake-100% " + "champion (EMPTY_MUST_FIND)", + "registry_sha256_pin": "loaded registry matches its file hash (GOLD_DRIFT)", + "corpus_sha256_pin": "corpus matches its hash when supplied (CORPUS_DRIFT)", + "schema_valid": "required top-level keys present in the result " + "(SCHEMA_INVALID)", + "pii_non_disclosure": "no corpus PII name appears in any finding/report text " + "(PII_LEAK)", + }, + }, + "G2": { + "name": "Recall & Precision", + "recall_metric": getattr(args, "recall_metric", "lexical"), + "recall_floor": getattr(args, "recall_floor", 0.70), + "tau": tau, + "tau_nc": getattr(args, "tau_nc", 0.85), + "embedder": getattr(args, "embedder", None), + "metrics": { + "lexical_recall": "token-overlap recall (always reported)", + "semantic_recall": "embedding-similarity recall at >= tau (needs embedder)", + "hybrid_recall": "per item, a lexical OR semantic match", + "per_tier_recall": "hit/total per tier L0-L3; an L0 miss blocks", + "nc_precision": "negative-control items wrongly emitted; an NC hit blocks", + "finding_redundancy_rate": "fraction of findings duplicating another's topic", + }, + }, + "G3": { + "name": "Grounded", + "grounding_floor": getattr(args, "grounding_floor", 0.90), + "human_spot_check_n": 5, + "corpus_verification": corpus is not None, + "metrics": { + "grounding_pct": "findings whose cited excerpt shares a topic token; blocks " + "below grounding_floor", + "evidence_verified": "cited excerpts located in the actual corpus " + "(when 
supplied)", + "evidence_fabricated": "populated excerpts not found in their cited source " + "(EVIDENCE_FABRICATED)", + "evidence_source_unknown": "locators resolving to no corpus document " + "(EVIDENCE_SOURCE_UNKNOWN)", + "excerpt_fill_rate": "evidence entries carrying a populated excerpt", + "source_coverage": "distinct corpus documents cited", + }, + }, + "G4": { + "name": "LLM Judge (advisory, non-blocking)", + "judge_model": jm, + "judge_runs": getattr(args, "judge_runs", 1) if jm else None, + "judge_concurrency": getattr(args, "judge_concurrency", 1) if jm else None, + "judge_temperature": 0.0 if jm else None, + "tau": tau if jm else None, + "metrics": { + "faithfulness": "each finding's claim entailed by its cited evidence", + "numeric_temporal_fidelity": "numbers and dates in findings match the evidence", + "citation_relevance": "cited evidence refs are on-topic (context precision)", + "nc_semantic_precision": "negative-control items semantically asserted", + "fabricated_entity": "named entities absent from the corpus", + "contradiction": "findings contradicting the evidence or each other", + "open_gap": "a consequential issue the output failed to surface", + "actionability": "proposed actions are specific and actionable", + "severity_calibration": "stated severity matches the evidence", + "answer_relevancy": "output addresses the workspace intention", + "source_coverage": "distinct corpus documents cited (deterministic)", + "excerpt_fill_rate": "evidence entries with a populated excerpt " + "(deterministic)", + }, + }, + "G5": { + "name": "No-regression / promotion", + "is_day_zero": baseline is None, + "human_signed_off": getattr(args, "human_signed_off", False), + "signoffs": getattr(args, "signoffs", 0), + "baseline": baseline, + "baseline_sha256": _file_sha256(baseline) if baseline else None, + "metrics": { + "improvements": "metrics beating the champion by more than the AA noise band", + "regressions": "metrics that regressed versus the champion", + 
"noise_band": "per-metric AA noise floor a candidate must exceed", + "guardrail_regression": "any guardrail metric that dropped", + "signoffs": "independent human sign-offs recorded", + }, + }, + }, + } + + +def _file_sha256(path: str) -> str | None: + """SHA-256 of a file's bytes, or None when it can't be read.""" + try: + return hashlib.sha256(Path(path).read_bytes()).hexdigest() + except OSError: + return None + + +# ── gate ────────────────────────────────────────────────────────────────────── + + +def cmd_gate(args: argparse.Namespace) -> int: + if getattr(args, "no_judge", False): + args.judge_model = None # explicit opt-out; G4 runs by default otherwise + result = _load_json(args.result) + registry = load_registry(args.registry) + corpus = load_corpus(args.corpus) if args.corpus else None + champion = load_champion(args.baseline) if args.baseline else None + champion_scores = champion.scores if champion else None + aa_noise = champion.aa_noise if champion else None + + embed_fn = build_embedder(args.embedder) if args.embedder else None + + if args.recall_metric in ("hybrid", "semantic") and embed_fn is None: + print( + f"ERROR: --recall-metric {args.recall_metric} requires --embedder.\n" + " Example: --embedder openai:text-embedding-3-small", + file=sys.stderr, + ) + return 2 + + gate_results = run_gates( + result, + registry, + args.registry, + pii_list=args.pii_list or [], + recall_floor=args.recall_floor, + grounding_floor=args.grounding_floor, + champion_scores=champion_scores, + aa_noise=aa_noise, + is_day_zero=(champion is None), + human_signed_off=args.human_signed_off, + signoff_count=args.signoffs, + embed_fn=embed_fn, + tau=args.tau, + recall_metric=args.recall_metric, + tau_nc=args.tau_nc, + corpus=corpus, + ) + + # G4 — on by default, non-blocking. Skipped only with --no-judge; never affects the verdict. 
+ advisory = None + if args.judge_model: + champion_result = _load_json(args.champion_result) if args.champion_result else None + advisory = run_judge( + result, + registry, + judge_model=args.judge_model, + runs=args.judge_runs, + concurrency=args.judge_concurrency, + pipeline_model=args.model_id or "", + champion_result=champion_result, + embed_fn=embed_fn, + tau=args.tau, + lexical_missed_ids=_lexical_missed_ids(result, registry), + ) + + config = _eval_config(args, registry, corpus) + _write_eval_config(args.result, config) + experiment_config = _read_experiment_config(args.result) + scorecard = render_scorecard( + gate_results, + corpus=registry.corpus, + model_id=args.model_id or "unknown", + run_id=args.run_id or "run", + is_self_graded=True, + kappa_advisory=registry.is_kappa_advisory(), + evidence_unverified=corpus is None, + advisory=advisory, + config=config, + experiment_config=experiment_config, + ) + print(scorecard) + + v = get_verdict(gate_results) + return 0 if v == "PROMOTE" else 1 + + +# ── aa-band ─────────────────────────────────────────────────────────────────── + + +def cmd_aa_band(args: argparse.Namespace) -> int: + registry = load_registry(args.registry) + + if args.recall_metric in ("hybrid", "semantic") and not args.embedder: + print( + f"ERROR: --recall-metric {args.recall_metric} requires --embedder.\n" + " Example: --embedder openai:text-embedding-3-small", + file=sys.stderr, + ) + return 2 + + embed_fn = build_embedder(args.embedder) if args.embedder else None + corpus = load_corpus(args.corpus) if args.corpus else None + scores: list[float] = [] + + for rp in args.results: + result = _load_json(rp) + g2 = g2_recall_precision( + result, registry, + recall_metric=args.recall_metric, embed_fn=embed_fn, + tau=args.tau, tau_nc=args.tau_nc, + corpus=corpus, + ) + if g2.passed or g2.details.get("recall") is not None: + scores.append(g2.details.get("recall", 0.0)) + + if len(scores) < 2: + print( + f"ERROR: need >= 2 runs for aa_band; got 
{len(scores)}. " + "Make sure the registry is non-empty and the runs are valid.", + file=sys.stderr, + ) + return 1 + + band = aa_band(scores) + high_var = left_skew_flag(scores) + print(f"A/A noise band (95th-pct pairwise delta): {band:.4f}") + print(f"Scores across reruns: {[round(s, 4) for s in scores]}") + if high_var: + print("WARNING: HIGH_VARIANCE — min < median - 0.10. Investigate before using this band.") + return 0 + + +# ── day-zero ────────────────────────────────────────────────────────────────── + + +def cmd_day_zero(args: argparse.Namespace) -> int: + result = _load_json(args.result) + registry = load_registry(args.registry) + + if not args.corpus: + print( + "ERROR: day-zero (a promotion decision) requires --corpus for evidence\n" + "verification — a champion must not be minted on unverified evidence.\n" + " Supply the run's input bundle, e.g. --corpus experiments//input.json", + file=sys.stderr, + ) + return 2 + corpus = load_corpus(args.corpus) + + if args.signoffs < 2: + print( + f"ERROR: Day-Zero requires 2 independent human sign-offs; got {args.signoffs}.", + file=sys.stderr, + ) + return 1 + + gate_results = run_gates( + result, + registry, + args.registry, + is_day_zero=True, + human_signed_off=True, + signoff_count=args.signoffs, + corpus=corpus, + ) + + config = _eval_config(args, registry, corpus) + _write_eval_config(args.result, config) + experiment_config = _read_experiment_config(args.result) + v = get_verdict(gate_results) + scorecard = render_scorecard( + gate_results, + corpus=registry.corpus, + model_id=args.model_id or "unknown", + run_id=args.run_id or "day-zero", + is_self_graded=True, + kappa_advisory=registry.is_kappa_advisory(), + config=config, + experiment_config=experiment_config, + ) + print(scorecard) + + if v == "PROMOTE" and args.baseline: + g2 = next((g for g in gate_results if g.gate == "G2"), None) + g3 = next((g for g in gate_results if g.gate == "G3"), None) + scores = {} + if g2: + scores["recall"] = 
g2.details.get("recall", 0.0) + if g3: + scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0) + + champion = ChampionRecord( + corpus=registry.corpus, + run_id=args.run_id or "day-zero", + model_id=args.model_id or "unknown", + registry_sha256=registry.sha256(), + scores=scores, + is_day_zero=True, + human_sign_offs=[f"signoff-{i + 1}" for i in range(args.signoffs)], + config=config, + corpus_sha256=corpus.sha256, + ) + save_champion( + args.baseline, + champion, + summary=f"Day-Zero champion for {registry.corpus}", + date=args.date or "unknown", + ) + print(f"\nDay-Zero champion saved to {args.baseline}") + + return 0 if v == "PROMOTE" else 1 + + +# ── invalidate ──────────────────────────────────────────────────────────────── + + +def cmd_invalidate(args: argparse.Namespace) -> int: + invalidate_champion(args.baseline, reason=args.reason, date=args.date or "unknown") + print(f"Champion invalidated in {args.baseline}. Reason: {args.reason}") + return 0 + + +# ── parser ──────────────────────────────────────────────────────────────────── + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="flyeval", + description="FlyRadar Lean Core eval: G1-G3 + G5 deterministic, G4 judge on by default", + ) + sub = parser.add_subparsers(dest="command", required=True) + + def _add_common(p: argparse.ArgumentParser) -> None: + p.add_argument("--result", required=True, help="Path to DiscoveryResult JSON") + p.add_argument("--registry", required=True, help="Path to lean-1 registry JSON") + p.add_argument( + "--corpus", + help="Path to the run's input.json corpus bundle — enables deterministic " + "evidence verification (required for day-zero; without it, gate runs " + "carry an EVIDENCE UNVERIFIED disclosure)", + ) + p.add_argument("--baseline", help="Path to baseline.json (per-corpus champion store)") + p.add_argument("--model-id", default="unknown") + p.add_argument("--run-id", default="run") + p.add_argument("--date", 
default="", help="ISO date for promotion log") + + # gate + p_gate = sub.add_parser("gate", help="Run the gates and print a scorecard") + _add_common(p_gate) + p_gate.add_argument("--recall-floor", type=float, default=0.70) + p_gate.add_argument("--grounding-floor", type=float, default=0.90) + p_gate.add_argument("--pii-list", nargs="*", default=[]) + p_gate.add_argument( + "--embedder", + default=os.environ.get("FLYEVAL_EMBEDDER"), + help="opt-in embedder spec for the semantic recall path " + '(e.g. "azure:text-embedding-3-small"); omit for pure-lexical recall. ' + "Env: FLYEVAL_EMBEDDER", + ) + p_gate.add_argument( + "--recall-metric", + choices=["lexical", "semantic", "hybrid"], + default=os.environ.get("FLYEVAL_RECALL_METRIC", "hybrid"), + help="which recall metric GATES (default hybrid; hybrid/semantic require --embedder). " + "Env: FLYEVAL_RECALL_METRIC", + ) + p_gate.add_argument( + "--tau", + type=float, + default=float(os.environ.get("FLYEVAL_TAU", "0.70")), + help="cosine similarity threshold for the semantic recall path (real items). " + "Env: FLYEVAL_TAU", + ) + p_gate.add_argument( + "--tau-nc", + type=float, + default=float(os.environ.get("FLYEVAL_TAU_NC", "0.85")), + help="cosine similarity threshold for NC item detection (higher; no source anchor). " + "Env: FLYEVAL_TAU_NC", + ) + p_gate.add_argument("--human-signed-off", action="store_true") + p_gate.add_argument("--signoffs", type=int, default=0) + p_gate.add_argument( + "--judge-model", + default=os.environ.get("FLYEVAL_JUDGE_MODEL", "anthropic:claude-sonnet-4-6"), + help="provider:model for the non-blocking G4 LLM-as-a-Judge (e.g. azure:gpt-4o). " + "Runs by default; pass --no-judge to skip G4. 
Env: FLYEVAL_JUDGE_MODEL", + ) + p_gate.add_argument( + "--no-judge", + action="store_true", + help="skip the G4 LLM-as-a-Judge (it runs by default).", + ) + p_gate.add_argument( + "--judge-runs", + type=int, + default=int(os.environ.get("FLYEVAL_JUDGE_RUNS", "1")), + help="G4 judge runs; the median of numeric scores is kept (odd recommended). " + "Env: FLYEVAL_JUDGE_RUNS", + ) + p_gate.add_argument( + "--judge-concurrency", + type=int, + default=int(os.environ.get("FLYEVAL_JUDGE_CONCURRENCY", "1")), + help="bounded fan-out for the per-item G4 [J] metrics (1 = sequential; " + ">=2 runs each metric's chat calls across a thread pool, order preserved). " + "Env: FLYEVAL_JUDGE_CONCURRENCY", + ) + p_gate.add_argument( + "--champion-result", + help="Path to the champion's output.json for the G4 comparative-review metric", + ) + p_gate.set_defaults(func=cmd_gate) + + # aa-band + p_aa = sub.add_parser("aa-band", help="Compute A/A noise band from champion reruns") + p_aa.add_argument( + "--results", + nargs="+", + required=True, + help="Paths to champion-rerun result JSON files (>= 2)", + ) + p_aa.add_argument("--registry", required=True) + p_aa.add_argument( + "--recall-metric", + choices=["lexical", "semantic", "hybrid"], + default="hybrid", + help="recall metric to use — must match the champion's metric (default hybrid; " + "hybrid/semantic require --embedder)", + ) + p_aa.add_argument( + "--embedder", + default=None, + help="embedder spec for semantic/hybrid recall (e.g. 
ollama:bge-m3)", + ) + p_aa.add_argument("--tau", type=float, default=0.70) + p_aa.add_argument("--tau-nc", type=float, default=0.85) + p_aa.add_argument( + "--corpus", + help="Path to input.json — must match the gate's corpus setting so the " + "band is computed under the same evidence filtering as the champion", + ) + p_aa.set_defaults(func=cmd_aa_band) + + # day-zero + p_dz = sub.add_parser("day-zero", help="Promote the inaugural champion (Day-Zero protocol)") + _add_common(p_dz) + p_dz.add_argument( + "--signoffs", + type=int, + default=0, + help="Number of independent human sign-offs collected (need 2)", + ) + p_dz.set_defaults(func=cmd_day_zero) + + # invalidate + p_inv = sub.add_parser("invalidate", help="Invalidate the current champion") + p_inv.add_argument("--baseline", required=True) + p_inv.add_argument("--reason", required=True) + p_inv.add_argument("--date", default="") + p_inv.set_defaults(func=cmd_invalidate) + + return parser + + +def main() -> None: + parser = build_parser() + args = parser.parse_args() + sys.exit(args.func(args)) + + +if __name__ == "__main__": + main() diff --git a/fireflyframework_agentic/evaluation/run_config_snapshot.py b/fireflyframework_agentic/evaluation/run_config_snapshot.py new file mode 100644 index 00000000..db543129 --- /dev/null +++ b/fireflyframework_agentic/evaluation/run_config_snapshot.py @@ -0,0 +1,160 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Capture the effective flyradar run configuration into experiment_configuration.json. + +Non-invasive snapshot: it records how a run was generated by reading what flyradar +already exposes as data — the request options the caller sent, the ``/api/v1/version`` +endpoint, ``RadarSettings``, and the prompt catalog — without modifying flyradar. The +snapshot is written next to the run's ``output.json`` at generation time, which is the +moment the configuration is known. + +This is the bridge: the durable fix is for flyradar to stamp the same config into +``DiscoveryResult`` itself (the one place that knows the effective values and cannot +drift). See the "flyradar improvements" issue. ``temperature`` and ``seed`` are not +exposed by ``RadarSettings`` and are recorded as ``null`` here. + +Usage: + cd flyradar_experiments + set -a && source .env && set +a + uv run python -m fireflyframework_agentic.evaluation.run_config_snapshot \ + --output-dir experiments/bbva_españa/runs/2026-06-12-sonnet-01 \ + --options request_options.json \ + --commit c107918 +""" +from __future__ import annotations + +import argparse +import json +import os +import urllib.request +from importlib.resources import files +from pathlib import Path + +try: + from flyradar.config import RadarSettings +except ImportError: # flyradar is an optional dependency of this snapshot. + RadarSettings = None + +#: Path of the flyradar version endpoint (whitelisted in the service middleware). +VERSION_PATH = "/api/v1/version" + +#: RadarSettings fields that define scoring / dedup behaviour, captured verbatim. 
+_SETTINGS_KEYS = ( + "model", + "fallback_model", + "duplicity_similarity_threshold", + "rootcause_cost_weight", + "rootcause_frequency_weight", + "rootcause_actionability_weight", +) + + +def fetch_version(base_url: str, *, timeout: float = 10.0) -> dict: + """GET the flyradar version endpoint; return ``{}`` on any failure.""" + url = base_url.rstrip("/") + VERSION_PATH + try: + with urllib.request.urlopen(url, timeout=timeout) as resp: + return json.loads(resp.read().decode("utf-8")) + except Exception: + return {} + + +def load_radar_settings() -> dict | None: + """Dump the scoring / dedup RadarSettings, or ``None`` if flyradar isn't importable.""" + if RadarSettings is None: + return None + settings = RadarSettings() + return {key: getattr(settings, key, None) for key in _SETTINGS_KEYS} + + +def load_prompt_versions() -> dict | None: + """Read each stage prompt's ``version`` from the flyradar prompt catalog, or ``None``.""" + try: + catalog = files("flyradar.resources.prompts") + except ModuleNotFoundError: + return None + versions: dict[str, str] = {} + for entry in catalog.iterdir(): + if not entry.name.endswith(".yaml"): + continue + for line in entry.read_text(encoding="utf-8").splitlines(): + if line.strip().startswith("version:"): + versions[entry.name[:-5]] = line.split(":", 1)[1].strip().strip('"') + break + return versions or None + + +def build_run_config( + options: dict, + *, + version: dict, + settings: dict | None, + prompt_versions: dict | None, + commit: str | None = None, +) -> dict: + """Assemble the experiment-configuration snapshot from its captured parts.""" + return { + "captured_by": "config-snapshot (non-invasive)", + "flyradar_version": version.get("version"), + "flyradar_commit": commit or version.get("commit"), + "options": options, + "settings": settings, + "prompt_versions": prompt_versions, + "temperature": None, + "seed": None, + "_note": ( + "Non-invasive snapshot captured at generation time. 
`options` is the request " + "the caller sent; `settings` and `prompt_versions` are read from flyradar when " + "importable at the deployed commit. `temperature` and `seed` are not exposed by " + "RadarSettings and are recorded as null. The durable fix is for flyradar to stamp " + "this config into DiscoveryResult (see the 'flyradar improvements' issue)." + ), + } + + +def write_snapshot(output_dir: str | Path, config: dict) -> Path: + """Write ``experiment_configuration.json`` into the run's output directory.""" + path = Path(output_dir) / "experiment_configuration.json" + path.write_text(json.dumps(config, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") + return path + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Capture the flyradar run configuration.") + parser.add_argument("--output-dir", required=True, help="Run directory holding output.json.") + parser.add_argument( + "--options", required=True, help="JSON file of the DiscoveryRequest options that were sent." + ) + parser.add_argument( + "--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL)." 
+ ) + parser.add_argument("--commit", default=None, help="Deployed flyradar git commit, if known.") + args = parser.parse_args(argv) + + base_url = args.base_url or os.environ.get("FLYRADAR_BASE_URL", "") + options = json.loads(Path(args.options).read_text(encoding="utf-8")) + config = build_run_config( + options, + version=fetch_version(base_url) if base_url else {}, + settings=load_radar_settings(), + prompt_versions=load_prompt_versions(), + commit=args.commit, + ) + path = write_snapshot(args.output_dir, config) + print(f"Wrote {path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 4ab1d859d16d4ae92d6a6d3a4a283a236d25d29d Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:58:49 +0200 Subject: [PATCH 08/67] feat(lab): add retrieval metrics (hit@k, recall@k, MRR, MAP, nDCG) (#275) * feat(lab): add retrieval_metrics module with compute_retrieval_metrics and RetrieverMetrics * feat(lab): export RetrieverMetrics and compute_retrieval_metrics from lab package * feat(evaluation): import RetrieverMetrics and compute_retrieval_metrics from lab.retrieval_metrics --------- Co-authored-by: miguelgfierro --- .../evaluation/__init__.py | 2 +- fireflyframework_agentic/lab/__init__.py | 3 + .../lab/retrieval_metrics.py | 200 ++++++++++++++++++ 3 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 fireflyframework_agentic/lab/retrieval_metrics.py diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index 37093075..ad01980c 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -39,7 +39,7 @@ from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens from fireflyframework_agentic.evaluation.registry 
import Registry, RegistryItem, load_registry, registry_sha256 -from fireflyframework_agentic.evaluation.retrieval import RetrieverMetrics, compute_retrieval_metrics +from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag try: diff --git a/fireflyframework_agentic/lab/__init__.py b/fireflyframework_agentic/lab/__init__.py index 46cc08dc..8e127d8a 100644 --- a/fireflyframework_agentic/lab/__init__.py +++ b/fireflyframework_agentic/lab/__init__.py @@ -18,6 +18,7 @@ from fireflyframework_agentic.lab.comparison import ComparisonEntry, ModelComparison from fireflyframework_agentic.lab.dataset import EvalCase, EvalDataset from fireflyframework_agentic.lab.evaluator import EvalOrchestrator, EvalReport, EvalResult +from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics from fireflyframework_agentic.lab.session import LabSession, SessionEntry __all__ = [ @@ -31,5 +32,7 @@ "EvalResult", "LabSession", "ModelComparison", + "RetrieverMetrics", "SessionEntry", + "compute_retrieval_metrics", ] diff --git a/fireflyframework_agentic/lab/retrieval_metrics.py b/fireflyframework_agentic/lab/retrieval_metrics.py new file mode 100644 index 00000000..5f3e2373 --- /dev/null +++ b/fireflyframework_agentic/lab/retrieval_metrics.py @@ -0,0 +1,200 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Deterministic IR evaluation metrics for ranked retrieval results (no LLM, no network). + +Industry-standard information-retrieval metrics computed over a ranked list of +retrieved chunks vs the gold set each result carries (``gold`` + per-hit +``is_gold``). Metrics are reported at cut-offs k ∈ {1, 5, 10}: + +* **Hit@k** -- at least one gold document appears in the top-k results. +* **Recall@k** -- fraction of gold documents found in top-k. +* **Precision@k** -- fraction of top-k results that are gold. +* **MRR@10** -- mean reciprocal rank of the first gold hit (up to k=10). +* **MAP@10** -- mean average precision (up to k=10). +* **nDCG@10** -- normalised discounted cumulative gain (up to k=10). + +Optional fields (populated when the raw result rows contain them): + +* ``no_answer_rate`` -- fraction of rows where the model produced no answer. +* ``citation_precision`` -- precision of in-answer citations vs gold set. +* ``mean_search_ms`` / ``mean_answer_ms`` -- mean retrieval and generation latencies. + +Ported from ``flycanon_experiments/scripts/deterministic_eval.py``. +""" + +from __future__ import annotations + +import math + +from pydantic import BaseModel + +KS = (1, 5, 10) + + +def _dedup(retrieved: list[dict]) -> list[dict]: + """Return one entry per source, first chunk wins, preserving rank order. + + flycanon splits each ingested document into many chunks; a single gold + filing can therefore appear multiple times in the ranked list. Without + deduplication nDCG/MAP/Recall count every chunk separately, inflating + scores past 1.0 when a good embedding model retrieves several chunks from + the same filing. Taking only the first (highest-ranked) chunk per + source_id makes the list item-unique, matching the recommenders-library + contract that all IR formulae assume. 
+ """ + seen: set[str] = set() + out: list[dict] = [] + for r in sorted(retrieved, key=lambda x: x["rank"]): + key = r.get("source_id") or "|".join(r.get("identities", [])) + if key not in seen: + seen.add(key) + out.append(r) + return out + + +def _ndcg(retrieved: list[dict], n_gold: int, k: int = 10) -> float: + """Return nDCG@k for a single query.""" + dcg = sum( + 1.0 / math.log2(r["rank"] + 1) + for r in retrieved + if r.get("is_gold") and r["rank"] <= k + ) + ideal = sum(1.0 / math.log2(i + 2) for i in range(min(n_gold, k))) + return dcg / ideal if ideal else 0.0 + + +def _ap(retrieved: list[dict], n_gold: int, k: int = 10) -> float: + """Return average precision@k for a single query.""" + hits, precisions = 0, [] + for r in sorted(retrieved, key=lambda x: x["rank"]): + if r["rank"] > k: + break + if r.get("is_gold"): + hits += 1 + precisions.append(hits / r["rank"]) + return sum(precisions) / min(n_gold, k) if n_gold else 0.0 + + +def compute_retrieval_metrics(results: list[dict]) -> dict: + """Compute deterministic IR metrics over a list of retrieval result rows. + + Each element of *results* must be a dict with at least: + + * ``retrieved`` -- list of dicts with ``rank`` (int, 1-based), ``source_id`` + (str) or ``identities`` (list[str]), and ``is_gold`` (bool). + * ``gold`` -- list of gold source identifiers (used to compute ``n_gold``). + + Optional keys per row: + + * ``no_answer`` (bool) / ``answer`` (str) -- used for ``no_answer_rate``. + * ``citations`` (list[dict]) -- each with ``is_gold`` (bool) for citation precision. + * ``search_ms`` (float) / ``answer_ms`` (float) -- latency in milliseconds. + + Returns a flat dict with keys: ``n_queries``, ``hit@1``, ``hit@5``, + ``hit@10``, ``recall@1``, ``recall@5``, ``recall@10``, ``precision@1``, + ``precision@5``, ``precision@10``, ``mrr@10``, ``map@10``, ``ndcg@10``, + ``no_answer_rate``, ``citation_precision``, ``mean_search_ms``, + ``mean_answer_ms``. 
+ """ + n = len(results) + agg = {f"{m}@{k}": 0.0 for k in KS for m in ("hit", "recall", "precision")} + agg.update({"mrr@10": 0.0, "map@10": 0.0, "ndcg@10": 0.0}) + no_answer = 0 + cite_num = cite_den = 0.0 + search_ms: list[float] = [] + answer_ms: list[float] = [] + + for row in results: + retrieved = _dedup(row["retrieved"]) + n_gold = max(len(set(row["gold"])), 1) + gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")] + for k in KS: + in_k = [g for g in gold_ranks if g <= k] + agg[f"hit@{k}"] += 1.0 if in_k else 0.0 + agg[f"recall@{k}"] += len(in_k) / n_gold + agg[f"precision@{k}"] += len(in_k) / k + agg["mrr@10"] += (1.0 / min(gold_ranks)) if gold_ranks else 0.0 + agg["map@10"] += _ap(retrieved, n_gold) + agg["ndcg@10"] += _ndcg(retrieved, n_gold) + + if row.get("no_answer") or not row.get("answer", "").strip(): + no_answer += 1 + cites = row.get("citations", []) + if cites: + cite_num += sum(1 for c in cites if c.get("is_gold")) + cite_den += len(cites) + if row.get("search_ms") is not None: + search_ms.append(row["search_ms"]) + if row.get("answer_ms") is not None: + answer_ms.append(row["answer_ms"]) + + out = {k: round(v / n, 4) for k, v in agg.items()} if n else {} + out["n_queries"] = n + out["no_answer_rate"] = round(no_answer / n, 4) if n else None + out["citation_precision"] = round(cite_num / cite_den, 4) if cite_den else None + out["mean_search_ms"] = round(sum(search_ms) / len(search_ms)) if search_ms else None + out["mean_answer_ms"] = round(sum(answer_ms) / len(answer_ms)) if answer_ms else None + return out + + +class RetrieverMetrics(BaseModel): + """Structured IR metrics for a retrieval evaluation run. + + Fields mirror the flat dict returned by :func:`compute_retrieval_metrics`. + Optional fields are ``None`` when the raw result rows lack the required data + (e.g. no latency timestamps, no citations). 
+ """ + + n_queries: int = 0 + hit_at_1: float = 0.0 + hit_at_5: float = 0.0 + hit_at_10: float = 0.0 + recall_at_1: float = 0.0 + recall_at_5: float = 0.0 + recall_at_10: float = 0.0 + precision_at_1: float = 0.0 + precision_at_5: float = 0.0 + precision_at_10: float = 0.0 + mrr_at_10: float = 0.0 + map_at_10: float = 0.0 + ndcg_at_10: float = 0.0 + no_answer_rate: float | None = None + citation_precision: float | None = None + mean_search_ms: float | None = None + mean_answer_ms: float | None = None + + @classmethod + def from_results(cls, results: list[dict]) -> "RetrieverMetrics": + """Compute metrics from raw retrieval result rows and return a model instance.""" + m = compute_retrieval_metrics(results) + return cls( + n_queries=m.get("n_queries", 0), + hit_at_1=m.get("hit@1", 0.0), + hit_at_5=m.get("hit@5", 0.0), + hit_at_10=m.get("hit@10", 0.0), + recall_at_1=m.get("recall@1", 0.0), + recall_at_5=m.get("recall@5", 0.0), + recall_at_10=m.get("recall@10", 0.0), + precision_at_1=m.get("precision@1", 0.0), + precision_at_5=m.get("precision@5", 0.0), + precision_at_10=m.get("precision@10", 0.0), + mrr_at_10=m.get("mrr@10", 0.0), + map_at_10=m.get("map@10", 0.0), + ndcg_at_10=m.get("ndcg@10", 0.0), + no_answer_rate=m.get("no_answer_rate"), + citation_precision=m.get("citation_precision"), + mean_search_ms=m.get("mean_search_ms"), + mean_answer_ms=m.get("mean_answer_ms"), + ) From 0acac370f601451015b3f98366717b57a7c9c401 Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Fri, 19 Jun 2026 00:02:20 +0200 Subject: [PATCH 09/67] feat(examples): add flyradar and flycanon evaluation examples (#276) * feat(evaluation): add flyradar gate evaluation example * feat(evaluation): add flycanon RAG retrieval evaluation example --------- Co-authored-by: miguelgfierro --- examples/flycanon_eval_example.py | 379 ++++++++++++++++++++++++++++ examples/flyradar_eval_example.py | 406 ++++++++++++++++++++++++++++++ 2 files changed, 785 
insertions(+) create mode 100644 examples/flycanon_eval_example.py create mode 100644 examples/flyradar_eval_example.py diff --git a/examples/flycanon_eval_example.py b/examples/flycanon_eval_example.py new file mode 100644 index 00000000..9d8d071b --- /dev/null +++ b/examples/flycanon_eval_example.py @@ -0,0 +1,379 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""FlyCanon evaluation example — RAG retrieval benchmark with champion/challenger tracking. + +Demonstrates how to use ``fireflyframework_agentic.evaluation`` to replicate +the flycanon experiment evaluation workflow: + +1. Load a results JSONL file produced by a flycanon retrieval pipeline. +2. Compute deterministic IR metrics (Recall@k, Precision@k, MRR, nDCG, MAP). +3. Compare against a saved baseline to detect regression. +4. Print a formatted metrics table. +5. Offer to promote the new run to champion when it beats the baseline. + +The champion/challenger pattern mirrors the flycanon_experiments harness: +each run writes metrics to a file; ``approve`` promotes it by repointing +baseline.json. Here we replicate that flow using the framework's +``compute_retrieval_metrics`` / ``RetrieverMetrics`` API directly. 
+ +Usage:: + + # Score a results file (no baseline comparison) + python examples/flycanon_eval_example.py --results-file results.jsonl + + # Compare against a saved baseline + python examples/flycanon_eval_example.py \\ + --results-file results.jsonl \\ + --baseline baseline.json + + # Promote if better (write new champion to baseline.json) + python examples/flycanon_eval_example.py \\ + --results-file results.jsonl \\ + --baseline baseline.json \\ + --promote-if-better + +Exit codes: 0 = scored successfully, 1 = regression detected vs baseline. + +Results JSONL format +-------------------- +Each line is a JSON object representing one query's retrieval result:: + + { + "question": "What was Apple's revenue in Q4 2023?", + "gold": ["AAPL_10K_2023", "AAPL_10Q_Q4_2023"], + "retrieved": [ + {"rank": 1, "source_id": "AAPL_10K_2023", "is_gold": true}, + {"rank": 2, "source_id": "MSFT_10K_2023", "is_gold": false}, + {"rank": 3, "source_id": "AAPL_10Q_Q4_2023", "is_gold": true} + ], + "answer": "Apple's revenue in Q4 2023 was $89.5 billion.", + "no_answer": false, + "citations": [ + {"source_id": "AAPL_10K_2023", "is_gold": true} + ], + "search_ms": 142, + "answer_ms": 2310 + } + +The ``gold`` list contains the source IDs that are considered correct answers. +Each entry in ``retrieved`` must have a 1-based ``rank``, ``source_id`` (or +``identities`` list), and ``is_gold`` bool. + +Baseline JSON format +-------------------- +A flat JSON object with metric names as keys and float values:: + + { + "ndcg@10": 0.7234, + "mrr@10": 0.6891, + "recall@10": 0.8120, + "hit@10": 0.9100, + "map@10": 0.6543, + "n_queries": 200 + } + +This is the same format written by ``--promote-if-better``. 
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Needed for annotations only. The runtime import is deferred to
    # run_evaluation() so that ``--help`` works even when the optional
    # ``evaluation`` extra is not installed.
    from fireflyframework_agentic.evaluation import RetrieverMetrics


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Metrics that form the primary quality signal for champion/challenger
# comparisons.  These are listed in priority order: nDCG@10 is the primary
# ranking metric; MRR@10 measures how quickly the first gold result appears;
# Recall@10 measures overall coverage; Hit@10 measures binary success rate;
# MAP@10 measures precision across the ranked list.
PRIMARY_METRICS = ["ndcg@10", "mrr@10", "recall@10", "hit@10", "map@10"]

# Regression threshold: a metric must drop by more than this fraction of its
# baseline value to be flagged as a regression (guards against noise).
REGRESSION_THRESHOLD = 0.01

# Verdict labels.  The process exit status follows the verdict:
# 0 = PROMOTE, 1 = HOLD.
VERDICT_PROMOTE = "PROMOTE"
VERDICT_HOLD = "HOLD"


def _load_jsonl(path: str) -> list[dict]:
    """Load a newline-delimited JSON file, one object per line.

    Blank lines are skipped.  The file is iterated line by line rather than
    split with ``str.splitlines()``: the latter also breaks on U+2028/U+2029,
    which may legally appear unescaped inside a JSON string and would cut a
    record in half.

    Raises:
        ValueError: if a line is not valid JSON.  The message carries the
            file name and 1-based line number.  ``json.JSONDecodeError`` is
            itself a ``ValueError``, so existing handlers keep working.
    """
    records: list[dict] = []
    with Path(path).open(encoding="utf-8") as fh:
        for lineno, line in enumerate(fh, start=1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                raise ValueError(f"{path}:{lineno}: invalid JSON ({exc.msg})") from exc
    return records


def _load_baseline(path: str) -> dict | None:
    """Load a baseline JSON file, returning None if it does not exist."""
    p = Path(path)
    if not p.exists():
        return None
    return json.loads(p.read_text(encoding="utf-8"))


def _save_baseline(path: str, metrics: dict) -> None:
    """Write a flat metrics dict to the baseline JSON file.

    Missing parent directories are created so a first-ever promotion into a
    fresh output directory does not fail.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(metrics, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")


def _metrics_to_flat(m: RetrieverMetrics) -> dict:
    """Convert a RetrieverMetrics model to the flat dict stored in baseline.json."""
    return {
        "n_queries": m.n_queries,
        "hit@1": m.hit_at_1,
        "hit@5": m.hit_at_5,
        "hit@10": m.hit_at_10,
        "recall@1": m.recall_at_1,
        "recall@5": m.recall_at_5,
        "recall@10": m.recall_at_10,
        "precision@1": m.precision_at_1,
        "precision@5": m.precision_at_5,
        "precision@10": m.precision_at_10,
        "mrr@10": m.mrr_at_10,
        "map@10": m.map_at_10,
        "ndcg@10": m.ndcg_at_10,
        "no_answer_rate": m.no_answer_rate,
        "citation_precision": m.citation_precision,
        "mean_search_ms": m.mean_search_ms,
        "mean_answer_ms": m.mean_answer_ms,
    }


def _print_metrics_table(metrics: RetrieverMetrics, baseline: dict | None) -> None:
    """Print a formatted table comparing current metrics vs baseline.

    Metrics whose current value is None are omitted.  Baseline and delta
    columns are filled only for float metrics whose baseline entry is a real
    number; a missing, null or non-numeric baseline entry leaves them blank
    instead of crashing the formatter.
    """
    flat = _metrics_to_flat(metrics)

    col_w = 22
    num_w = 10
    header = f"{'Metric':<{col_w}} {'Current':>{num_w}}"
    if baseline:
        header += f" {'Baseline':>{num_w}} {'Delta':>{num_w}}"
    print(header)
    # Derive the rule from the header so the two can never drift apart.
    print("-" * len(header))

    for key, value in flat.items():
        if value is None:
            continue
        # Format floats as 4 decimal places; ints as plain integers.
        cur_str = f"{value:.4f}" if isinstance(value, float) else str(value)
        row = f"{key:<{col_w}} {cur_str:>{num_w}}"

        base_val = baseline.get(key) if baseline else None
        # bool is a subclass of int; exclude it so True/False never print as numbers.
        base_is_number = isinstance(base_val, (int, float)) and not isinstance(base_val, bool)
        if isinstance(value, float) and base_is_number:
            delta_str = f"{value - base_val:+.4f}"
            row += f" {base_val:>{num_w}.4f} {delta_str:>{num_w}}"
        print(row)

    print()


def _detect_regressions(flat: dict, baseline: dict) -> list[str]:
    """Return the names of primary metrics that regressed vs baseline.

    A regression is flagged when the new value drops by more than
    REGRESSION_THRESHOLD * baseline_value (relative threshold).  This
    guards against flagging noise as a regression.  Metrics missing on
    either side are skipped, and a baseline of zero (or less) can never
    regress because the relative drop is undefined.
    """
    regressions = []
    for key in PRIMARY_METRICS:
        new_val = flat.get(key)
        base_val = baseline.get(key)
        if new_val is None or base_val is None:
            continue
        if base_val > 0 and (base_val - new_val) / base_val > REGRESSION_THRESHOLD:
            regressions.append(key)
    return regressions


def _beats_baseline(flat: dict, baseline: dict) -> bool:
    """Return True if the new metrics are strictly better than the baseline.

    'Better' means no primary metric has regressed beyond REGRESSION_THRESHOLD
    AND at least one primary metric has improved.  A run that merely equals
    the baseline does not beat it.
    """
    if _detect_regressions(flat, baseline):
        return False
    # Check for at least one improvement.
    for key in PRIMARY_METRICS:
        new_val = flat.get(key)
        base_val = baseline.get(key)
        if new_val is not None and base_val is not None and new_val > base_val:
            return True
    return False


def evaluate_challenger(
    flat: dict,
    baseline: dict | None,
    *,
    promote_if_better: bool = False,
) -> tuple[str, bool]:
    """Decide the verdict for a challenger run.

    Args:
        flat: Flat metrics dict of the challenger (see ``_metrics_to_flat``).
        baseline: Flat metrics dict of the champion, or None on a first run.
        promote_if_better: Whether the caller asked for champion promotion.

    Returns:
        ``(verdict, should_promote)``.  ``verdict`` is VERDICT_HOLD when any
        primary metric regressed and VERDICT_PROMOTE otherwise.
        ``should_promote`` is True only when promotion was requested and
        there is either no champion yet or the challenger strictly beats it.
    """
    regressions = _detect_regressions(flat, baseline) if baseline else []
    if regressions:
        return VERDICT_HOLD, False
    should_promote = promote_if_better and (baseline is None or _beats_baseline(flat, baseline))
    return VERDICT_PROMOTE, should_promote


# ---------------------------------------------------------------------------
# Main evaluation flow
# ---------------------------------------------------------------------------


def run_evaluation(args: argparse.Namespace) -> int:
    """Run retrieval metric scoring and optional champion/challenger comparison.

    Returns:
        Process exit status: 0 when the verdict is PROMOTE, 1 when the
        verdict is HOLD or the results file is empty.
    """
    # Deferred so that importing this module (and ``--help``) does not need
    # the optional ``evaluation`` extra.
    from fireflyframework_agentic.evaluation import RetrieverMetrics

    # ------------------------------------------------------------------
    # Step 1 — Load results from the JSONL file.
    #
    # Each line is one query's retrieval result.  The file is produced by
    # a flycanon pipeline run (runner.run_queries writes results.jsonl).
    # ------------------------------------------------------------------
    print(f"Loading results : {args.results_file}")
    results = _load_jsonl(args.results_file)
    print(f" {len(results)} query results loaded.")

    if not results:
        print("ERROR: results file is empty.", file=sys.stderr)
        return 1

    # ------------------------------------------------------------------
    # Step 2 — Compute deterministic IR metrics.
    #
    # RetrieverMetrics.from_results() returns a typed model with one
    # attribute per metric.  Metrics are computed at cut-offs
    # k ∈ {1, 5, 10} and include:
    #   hit@k       -- at least one gold doc in top-k (binary)
    #   recall@k    -- fraction of gold docs found in top-k
    #   precision@k -- fraction of top-k that are gold
    #   mrr@10      -- mean reciprocal rank of first gold hit
    #   map@10      -- mean average precision
    #   ndcg@10     -- normalised discounted cumulative gain
    # ------------------------------------------------------------------
    print("\nComputing retrieval metrics ...")
    metrics = RetrieverMetrics.from_results(results)

    print(f" nDCG@10 : {metrics.ndcg_at_10:.4f}")
    print(f" MRR@10 : {metrics.mrr_at_10:.4f}")
    print(f" Recall@10 : {metrics.recall_at_10:.4f}")
    print(f" Hit@10 : {metrics.hit_at_10:.4f}")
    print(f" MAP@10 : {metrics.map_at_10:.4f}")

    # ------------------------------------------------------------------
    # Step 3 — Load the baseline (champion) for regression detection.
    # ------------------------------------------------------------------
    baseline = None
    if args.baseline:
        baseline = _load_baseline(args.baseline)
        if baseline:
            print(f"\nLoaded baseline : {args.baseline}")
        else:
            print(f"\nNo baseline found at {args.baseline} — first run, no comparison.")

    # ------------------------------------------------------------------
    # Step 4 — Print the full metrics table.
    # ------------------------------------------------------------------
    print("\n" + "=" * 56)
    print("Retrieval Metrics")
    print("=" * 56)
    _print_metrics_table(metrics, baseline)

    # ------------------------------------------------------------------
    # Step 5 — Regression check.
    #
    # Compare against the baseline on primary metrics.  Any regression
    # yields a HOLD verdict and exit status 1, with or without
    # --promote-if-better: a regressed run can never beat the baseline.
    # ------------------------------------------------------------------
    flat = _metrics_to_flat(metrics)

    if baseline:
        regressions = _detect_regressions(flat, baseline)
        if regressions:
            print(f"REGRESSION detected on: {', '.join(regressions)}")
            print(f" Threshold: {REGRESSION_THRESHOLD * 100:.0f}% relative drop on any primary metric.")
        elif _beats_baseline(flat, baseline):
            print("Challenger BEATS baseline on at least one primary metric.")
        else:
            print("Challenger is on-par with baseline (no regression, no improvement).")

    verdict_label, should_promote = evaluate_challenger(
        flat, baseline, promote_if_better=args.promote_if_better
    )

    # ------------------------------------------------------------------
    # Step 6 — Champion promotion.
    #
    # When --promote-if-better is set and the metrics strictly beat the
    # baseline (or no champion exists yet), save the new metrics as the
    # champion.  Future runs will compare against this updated record.
    # ------------------------------------------------------------------
    if args.promote_if_better and args.baseline:
        if should_promote:
            _save_baseline(args.baseline, flat)
            print(f"\nChampion PROMOTED — metrics saved to {args.baseline}")
        else:
            print("\nNot promoted — challenger did not beat baseline on primary metrics.")

    if verdict_label == VERDICT_HOLD:
        print("\nVerdict: HOLD — regression detected. Tune the pipeline and re-run.")
        return 1

    print("\nVerdict: PROMOTE")
    return 0


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the flycanon evaluation example."""
    p = argparse.ArgumentParser(
        prog="flycanon_eval_example",
        description=(
            "FlyCanon RAG retrieval benchmark — computes IR metrics from a results JSONL "
            "and compares against a champion baseline."
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    p.add_argument(
        "--results-file",
        required=True,
        help="Path to results.jsonl produced by the flycanon pipeline.",
    )
    p.add_argument(
        "--baseline",
        default=None,
        help=(
            "Path to baseline.json (champion store). When absent, scores are printed "
            "without comparison."
        ),
    )
    p.add_argument(
        "--promote-if-better",
        action="store_true",
        help=(
            "When set, write new metrics to baseline.json if the challenger beats the "
            "champion on primary metrics. Has no effect when --baseline is omitted."
        ),
    )
    return p


def main() -> None:
    """Parse the command line, run the evaluation and exit with its status."""
    parser = build_parser()
    args = parser.parse_args()
    sys.exit(run_evaluation(args))


if __name__ == "__main__":
    main()
Run gates G1-G5 to produce a structured verdict: + G1 -- Structural & Safe (schema validity, PII, empty-registry guard). + G2 -- Recall & Precision (must-find recall floor, NC precision). + G3 -- Grounded (finding-to-evidence anchoring). + G4 -- LLM-as-a-Judge (advisory only; never blocks promotion). + G5 -- No-regression / promotion (champion/challenger comparison). +4. Render a human-readable scorecard and print the final verdict. +5. Promote the challenger to champion when the verdict is PROMOTE. + +Usage:: + + # Minimal: deterministic gates only (no G4 judge, no baseline) + python examples/flyradar_eval_example.py \\ + --result output.json \\ + --registry registry.json + + # With corpus verification and a champion baseline + python examples/flyradar_eval_example.py \\ + --result output.json \\ + --registry registry.json \\ + --baseline baseline.json \\ + --corpus input.json + + # With the advisory G4 LLM judge (requires API key in environment) + FLYEVAL_JUDGE_MODEL=anthropic:claude-sonnet-4-6 \\ + python examples/flyradar_eval_example.py \\ + --result output.json \\ + --registry registry.json \\ + --judge-model anthropic:claude-sonnet-4-6 + +Exit codes: 0 = PROMOTE, 1 = HOLD. + +Input file formats +------------------ +``--result`` (output.json) + A DiscoveryResult JSON produced by a flyradar pipeline run. Must contain + at minimum ``findings`` (list) and ``evidence_index`` (list). + +``--registry`` (registry.json) + A lean-1 registry JSON. Each item has ``id``, ``tier`` (L0-L3), ``title``, + ``description``, and ``nc`` (bool, True for negative controls). + +``--baseline`` (baseline.json) + A ChampionRecord JSON written by a previous PROMOTE run. When omitted the + gate runs in day-zero mode (G5 always passes and a new champion is minted). + +``--corpus`` (input.json) + The corpus bundle used during the run. When supplied, G3 verifies that cited + evidence excerpts actually appear in the corpus documents. 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +from fireflyframework_agentic.evaluation import ( + ChampionRecord, + GateResult, + build_embedder, + load_champion, + load_corpus, + load_registry, + render_scorecard, + run_gates, + run_judge, + save_champion, + verdict, + VERDICT_PROMOTE, +) +from fireflyframework_agentic.evaluation.models import EvalConfig + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _load_json(path: str) -> dict: + """Read a JSON file and return its contents as a dict.""" + return json.loads(Path(path).read_text(encoding="utf-8")) + + +def _lexical_missed_ids(result: dict, registry) -> list[str]: + """Return the IDs of registry items not matched by any finding (lexically). + + The G4 judge uses these to focus its coverage checks on items that + lexical recall missed — the places where semantic recovery matters most. + """ + from fireflyframework_agentic.evaluation.matcher import matches + + evidence_index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")} + findings = result.get("findings", []) + # L3 items are informational-only and are never scored. + scored_items = [item for item in registry.real_items if item.tier != "L3"] + return [ + item.id + for item in scored_items + if not any(matches(f, item, evidence_index) for f in findings) + ] + + +# --------------------------------------------------------------------------- +# Main evaluation flow +# --------------------------------------------------------------------------- + + +def run_evaluation(args: argparse.Namespace) -> int: + """Run the full flyradar gate evaluation and return an exit code.""" + + # ------------------------------------------------------------------ + # Step 1 — Load inputs. 
+ # ------------------------------------------------------------------ + print(f"Loading result : {args.result}") + result = _load_json(args.result) + + print(f"Loading registry : {args.registry}") + registry = load_registry(args.registry) + print(f" {len(registry.real_items)} real items, {len(registry.nc_items)} NC items") + + # The EvalConfig captures provenance for the run record. + config = EvalConfig( + model_id=args.model_id, + corpus=registry.corpus, + run_id=args.run_id, + registry_path=args.registry, + corpus_path=args.corpus or "", + baseline_path=args.baseline or "", + judge_model=args.judge_model or "", + ) + + # Optional: corpus bundle for deterministic evidence verification (G3). + corpus = None + if args.corpus: + print(f"Loading corpus : {args.corpus}") + corpus = load_corpus(args.corpus) + + # Optional: champion record for regression detection (G5). + champion = None + champion_scores = None + aa_noise = None + if args.baseline: + print(f"Loading baseline : {args.baseline}") + champion = load_champion(args.baseline) + if champion: + champion_scores = champion.scores + aa_noise = champion.aa_noise + print(f" Champion run : {champion.run_id} ({champion.model_id})") + else: + print(" No champion found — running in day-zero mode.") + + # Optional: embedder for semantic/hybrid recall (G2). + embed_fn = None + if args.embedder: + print(f"Building embedder: {args.embedder}") + embed_fn = build_embedder(args.embedder) + + print() + + # ------------------------------------------------------------------ + # Step 2 — Run deterministic gates G1-G3 + G5. + # + # run_gates() returns a list of GateResult objects, one per gate. 
+ # Each GateResult carries: + # .gate -- "G1" | "G2" | "G3" | "G5" + # .passed -- bool + # .details -- dict with per-metric values + # .errors -- list[str] of blocking error codes + # ------------------------------------------------------------------ + print("Running gates G1-G3 + G5 ...") + gate_results: list[GateResult] = run_gates( + result, + registry, + args.registry, + pii_list=args.pii_list or [], + recall_floor=args.recall_floor, + grounding_floor=args.grounding_floor, + champion_scores=champion_scores, + aa_noise=aa_noise, + is_day_zero=(champion is None), + human_signed_off=args.human_signed_off, + signoff_count=args.signoffs, + embed_fn=embed_fn, + tau=args.tau, + recall_metric=args.recall_metric, + tau_nc=args.tau_nc, + corpus=corpus, + ) + + # Quick gate summary before the full scorecard. + for gr in gate_results: + status = "PASS" if gr.passed else "FAIL" + print(f" {gr.gate}: {status}") + + # ------------------------------------------------------------------ + # Step 3 — Run the advisory G4 LLM-as-a-Judge (optional). + # + # G4 is non-blocking: it never changes the verdict or exit code. + # It produces an AdvisoryReport with per-finding quality signals + # (faithfulness, citation relevance, fabricated entities, etc.). + # ------------------------------------------------------------------ + advisory = None + if args.judge_model: + print(f"\nRunning G4 judge ({args.judge_model}) ...") + missed_ids = _lexical_missed_ids(result, registry) + advisory = run_judge( + result, + registry, + judge_model=args.judge_model, + runs=args.judge_runs, + concurrency=args.judge_concurrency, + pipeline_model=args.model_id, + embed_fn=embed_fn, + tau=args.tau, + lexical_missed_ids=missed_ids, + ) + print(f" Judge completed ({args.judge_runs} run(s)).") + else: + print("\nG4 judge skipped (pass --judge-model to enable).") + + # ------------------------------------------------------------------ + # Step 4 — Render the scorecard. 
+ # + # render_scorecard() produces a markdown-formatted human-readable + # report that mirrors the output of `flyeval gate` in the playground. + # ------------------------------------------------------------------ + print() + scorecard = render_scorecard( + gate_results, + corpus=registry.corpus, + model_id=config.model_id, + run_id=config.run_id, + is_self_graded=True, + kappa_advisory=registry.is_kappa_advisory(), + evidence_unverified=(corpus is None), + advisory=advisory, + ) + print(scorecard) + + # ------------------------------------------------------------------ + # Step 5 — Inspect the verdict and handle promotion. + # + # verdict() returns "PROMOTE" or "HOLD" based on the gate results. + # On PROMOTE, save the challenger as the new champion so future runs + # can detect regressions against this baseline. + # ------------------------------------------------------------------ + v = verdict(gate_results) + print(f"\nFinal verdict: {v}") + + if v == VERDICT_PROMOTE and args.baseline: + # Extract the key scores from G2 and G3 to store in the champion record. + g2 = next((g for g in gate_results if g.gate == "G2"), None) + g3 = next((g for g in gate_results if g.gate == "G3"), None) + scores: dict[str, float] = {} + if g2: + scores["recall"] = g2.details.get("recall", 0.0) + if g3: + scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0) + + new_champion = ChampionRecord( + corpus=registry.corpus, + run_id=config.run_id, + model_id=config.model_id, + registry_sha256=registry.sha256(), + scores=scores, + is_day_zero=(champion is None), + ) + save_champion( + args.baseline, + new_champion, + summary=f"Promoted by flyradar_eval_example.py — {config.run_id}", + ) + print(f"Champion saved to {args.baseline}") + + # Exit 0 = PROMOTE, 1 = HOLD (mirrors `flyeval gate` convention). 
+ return 0 if v == VERDICT_PROMOTE else 1 + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="flyradar_eval_example", + description="FlyRadar gate evaluation — replicates the flyeval gate workflow.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + # Required inputs. + p.add_argument("--result", required=True, help="Path to DiscoveryResult JSON.") + p.add_argument("--registry", required=True, help="Path to lean-1 registry JSON.") + + # Optional inputs. + p.add_argument( + "--baseline", + help="Path to baseline.json (champion store). When absent, runs in day-zero mode.", + ) + p.add_argument( + "--corpus", + help="Path to input.json corpus bundle for deterministic evidence verification (G3).", + ) + + # Run metadata. + p.add_argument("--model-id", default="unknown", help="Model identifier for the scorecard.") + p.add_argument("--run-id", default="example-run", help="Run identifier for the scorecard.") + + # Gate thresholds. + p.add_argument( + "--recall-floor", + type=float, + default=0.70, + help="Minimum recall required for G2 to pass.", + ) + p.add_argument( + "--grounding-floor", + type=float, + default=0.90, + help="Minimum grounding percentage required for G3 to pass.", + ) + p.add_argument( + "--recall-metric", + choices=["lexical", "semantic", "hybrid"], + default="lexical", + help="Recall metric used by G2. 
'semantic' and 'hybrid' require --embedder.", + ) + p.add_argument( + "--tau", + type=float, + default=0.70, + help="Cosine similarity threshold for semantic recall (real items).", + ) + p.add_argument( + "--tau-nc", + type=float, + default=0.85, + help="Cosine similarity threshold for NC item detection.", + ) + p.add_argument("--pii-list", nargs="*", default=[], help="PII tokens to check for in findings.") + p.add_argument("--human-signed-off", action="store_true", help="Mark this run as human-reviewed.") + p.add_argument("--signoffs", type=int, default=0, help="Number of human sign-offs collected.") + + # G4 judge options. + p.add_argument( + "--judge-model", + default=None, + help=( + "Provider:model string for the advisory G4 LLM judge " + "(e.g. 'anthropic:claude-sonnet-4-6'). Omit to skip G4." + ), + ) + p.add_argument( + "--judge-runs", + type=int, + default=1, + help="Number of judge calls to aggregate (odd number recommended for median).", + ) + p.add_argument( + "--judge-concurrency", + type=int, + default=1, + help="Thread fan-out for per-item G4 metrics (1 = sequential).", + ) + + # Embedder for semantic recall. + p.add_argument( + "--embedder", + default=None, + help="Embedder spec for semantic/hybrid recall (e.g. 
'ollama:bge-m3').", + ) + + return p + + +def main() -> None: + parser = build_parser() + args = parser.parse_args() + sys.exit(run_evaluation(args)) + + +if __name__ == "__main__": + main() From cc048cf187371d99927072d03dd3016c8e765777 Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Fri, 19 Jun 2026 00:08:13 +0200 Subject: [PATCH 10/67] test(evaluation): add unit tests for evaluation package and retrieval metrics (#277) * feat(evaluation): add tests/unit/evaluation package init * feat(evaluation): add unit tests for matcher (anchored, source_stem, tokens, matches) * feat(evaluation): add unit tests for stats (aa_band, aggregate_grounding, left_skew_flag) * feat(evaluation): add unit tests for gates (GateResult, verdict, render_scorecard, g5_no_regression) * feat(evaluation): add unit tests for champion (ChampionRecord, load/save/invalidate, input_hash) * feat(evaluation): add unit tests for retrieval_metrics (compute_retrieval_metrics, RetrieverMetrics) * feat(evaluation): fix boundary test for left_skew_flag (floating-point precision) * feat(evaluation): fix no_answer_rate test to match implementation behaviour --------- Co-authored-by: miguelgfierro --- tests/unit/evaluation/__init__.py | 0 tests/unit/evaluation/test_champion.py | 199 ++++++++++++++++++ tests/unit/evaluation/test_gates.py | 219 ++++++++++++++++++++ tests/unit/evaluation/test_matcher.py | 221 ++++++++++++++++++++ tests/unit/evaluation/test_stats.py | 183 +++++++++++++++++ tests/unit/lab/test_retrieval_metrics.py | 247 +++++++++++++++++++++++ 6 files changed, 1069 insertions(+) create mode 100644 tests/unit/evaluation/__init__.py create mode 100644 tests/unit/evaluation/test_champion.py create mode 100644 tests/unit/evaluation/test_gates.py create mode 100644 tests/unit/evaluation/test_matcher.py create mode 100644 tests/unit/evaluation/test_stats.py create mode 100644 tests/unit/lab/test_retrieval_metrics.py diff --git 
a/tests/unit/evaluation/__init__.py b/tests/unit/evaluation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/evaluation/test_champion.py b/tests/unit/evaluation/test_champion.py new file mode 100644 index 00000000..948a9639 --- /dev/null +++ b/tests/unit/evaluation/test_champion.py @@ -0,0 +1,199 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for evaluation.champion: ChampionRecord, load/save/invalidate_champion, input_hash.""" + +from __future__ import annotations + +import json + +import pytest + +from fireflyframework_agentic.evaluation.champion import ( + ChampionRecord, + input_hash, + invalidate_champion, + load_champion, + save_champion, +) + + +def _make_champion(**overrides) -> ChampionRecord: + defaults = dict( + corpus="test-corpus", + run_id="run-2026-01", + model_id="claude-sonnet-4-5", + registry_sha256="abc123", + scores={"recall": 0.85, "grounding_pct": 0.92}, + aa_noise={"recall": 0.02}, + is_day_zero=False, + human_sign_offs=["reviewer-1"], + ) + defaults.update(overrides) + return ChampionRecord(**defaults) + + +# ── load_champion ───────────────────────────────────────────────────────────── + + +def test_load_champion_nonexistent_file_returns_none(tmp_path): + result = load_champion(tmp_path / "baseline.json") + assert result is None + + +def test_load_champion_file_with_null_champion_returns_none(tmp_path): + baseline = tmp_path / "baseline.json" + 
baseline.write_text(json.dumps({"champion": None, "promotion_log": []}), encoding="utf-8") + assert load_champion(baseline) is None + + +# ── save_champion / load_champion round-trip ────────────────────────────────── + + +def test_save_then_load_round_trips_all_fields(tmp_path): + baseline = tmp_path / "baseline.json" + champ = _make_champion() + save_champion(baseline, champ, summary="initial champion", date="2026-01-01") + + loaded = load_champion(baseline) + assert loaded is not None + assert loaded.corpus == champ.corpus + assert loaded.run_id == champ.run_id + assert loaded.model_id == champ.model_id + assert loaded.registry_sha256 == champ.registry_sha256 + assert loaded.scores == champ.scores + assert loaded.aa_noise == champ.aa_noise + assert loaded.is_day_zero == champ.is_day_zero + assert loaded.human_sign_offs == champ.human_sign_offs + + +def test_save_champion_appends_promotion_log_entry(tmp_path): + baseline = tmp_path / "baseline.json" + champ = _make_champion() + save_champion(baseline, champ, summary="first", date="2026-01-01") + + champ2 = _make_champion(run_id="run-2026-02", scores={"recall": 0.90}) + save_champion(baseline, champ2, summary="second", date="2026-02-01") + + raw = json.loads(baseline.read_text(encoding="utf-8")) + log = raw["promotion_log"] + assert len(log) == 2 + assert log[0]["to"] == "run-2026-01" + assert log[1]["to"] == "run-2026-02" + assert log[1]["from"] == "run-2026-01" + + +def test_save_champion_creates_file_when_missing(tmp_path): + baseline = tmp_path / "baseline.json" + assert not baseline.exists() + save_champion(baseline, _make_champion()) + assert baseline.exists() + + +def test_save_champion_day_zero_flag_preserved(tmp_path): + baseline = tmp_path / "baseline.json" + champ = _make_champion(is_day_zero=True) + save_champion(baseline, champ) + loaded = load_champion(baseline) + assert loaded.is_day_zero is True + + +def test_save_champion_label_is_day_zero_when_flag_set(tmp_path): + baseline = tmp_path / 
"baseline.json" + champ = _make_champion(is_day_zero=True) + save_champion(baseline, champ) + raw = json.loads(baseline.read_text(encoding="utf-8")) + assert raw["promotion_log"][0]["label"] == "day-zero" + + +def test_save_champion_label_is_promotion_when_flag_not_set(tmp_path): + baseline = tmp_path / "baseline.json" + save_champion(baseline, _make_champion(is_day_zero=False)) + raw = json.loads(baseline.read_text(encoding="utf-8")) + assert raw["promotion_log"][0]["label"] == "promotion" + + +# ── invalidate_champion ─────────────────────────────────────────────────────── + + +def test_invalidate_champion_sets_champion_to_null(tmp_path): + baseline = tmp_path / "baseline.json" + save_champion(baseline, _make_champion()) + invalidate_champion(baseline, reason="EMPTY_MUST_FIND fake champion", date="2026-03-01") + + loaded = load_champion(baseline) + assert loaded is None + + raw = json.loads(baseline.read_text(encoding="utf-8")) + assert raw["champion"] is None + + +def test_invalidate_champion_appends_invalidation_log(tmp_path): + baseline = tmp_path / "baseline.json" + save_champion(baseline, _make_champion(), date="2026-01-01") + invalidate_champion(baseline, reason="fake champion", date="2026-03-01") + + raw = json.loads(baseline.read_text(encoding="utf-8")) + log = raw["promotion_log"] + assert log[-1]["label"] == "INVALIDATED" + assert "fake champion" in log[-1]["summary"] + assert log[-1]["to"] is None + + +def test_invalidate_champion_noop_when_file_missing(tmp_path): + # Should not raise when file does not exist. 
+ invalidate_champion(tmp_path / "no-file.json", reason="test") + + +# ── ChampionRecord helpers ──────────────────────────────────────────────────── + + +def test_primary_metric_returns_first_key(): + champ = _make_champion(scores={"recall": 0.85, "grounding_pct": 0.92}) + assert champ.primary_metric() == "recall" + + +def test_primary_score_returns_first_value(): + champ = _make_champion(scores={"recall": 0.85, "grounding_pct": 0.92}) + assert champ.primary_score() == 0.85 + + +def test_primary_metric_empty_scores(): + champ = _make_champion(scores={}) + assert champ.primary_metric() == "" + assert champ.primary_score() == 0.0 + + +# ── input_hash ──────────────────────────────────────────────────────────────── + + +def test_input_hash_is_16_chars(): + result = input_hash({"key": "value"}) + assert len(result) == 16 + + +def test_input_hash_is_deterministic(): + data = {"process_graph": {"processes": []}, "findings": []} + h1 = input_hash(data) + h2 = input_hash(data) + assert h1 == h2 + + +def test_input_hash_differs_for_different_inputs(): + assert input_hash({"a": 1}) != input_hash({"a": 2}) + + +def test_input_hash_key_order_independent(): + # sort_keys=True in input_hash should make {"a":1, "b":2} == {"b":2, "a":1}. + assert input_hash({"a": 1, "b": 2}) == input_hash({"b": 2, "a": 1}) diff --git a/tests/unit/evaluation/test_gates.py b/tests/unit/evaluation/test_gates.py new file mode 100644 index 00000000..2edc3b99 --- /dev/null +++ b/tests/unit/evaluation/test_gates.py @@ -0,0 +1,219 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for evaluation.gates: GateResult, verdict, render_scorecard, g5_no_regression.""" + +from __future__ import annotations + +from fireflyframework_agentic.evaluation.gates import ( + GateResult, + Verdict, + g5_no_regression, + render_scorecard, +) +from fireflyframework_agentic.evaluation.scorecard import verdict + + +# ── GateResult ──────────────────────────────────────────────────────────────── + + +def test_gate_result_str_pass(): + gr = GateResult(gate="G1", passed=True) + assert str(gr) == "[G1] PASS" + + +def test_gate_result_str_flag(): + gr = GateResult(gate="G2", passed=False, reason_code="RECALL_BELOW_FLOOR") + assert str(gr) == "[G2] FLAG:RECALL_BELOW_FLOOR" + + +def test_gate_result_flag_without_reason_code(): + gr = GateResult(gate="G3", passed=False, reason_code="") + assert str(gr) == "[G3] FLAG:" + + +def test_gate_result_passed_true(): + gr = GateResult(gate="G5", passed=True, details={"note": "ok"}) + assert gr.passed is True + assert gr.details["note"] == "ok" + + +def test_gate_result_default_details_is_empty_dict(): + gr = GateResult(gate="G1", passed=True) + assert gr.details == {} + + +# ── verdict ─────────────────────────────────────────────────────────────────── + + +def test_verdict_promote_when_all_pass_and_g5_present(): + gates = [ + GateResult(gate="G1", passed=True), + GateResult(gate="G2", passed=True), + GateResult(gate="G3", passed=True), + GateResult(gate="G5", passed=True), + ] + assert verdict(gates) == "PROMOTE" + + +def test_verdict_hold_when_any_gate_fails(): + gates = [ + GateResult(gate="G1", 
passed=True), + GateResult(gate="G2", passed=False, reason_code="RECALL_BELOW_FLOOR"), + GateResult(gate="G3", passed=True), + GateResult(gate="G5", passed=True), + ] + assert verdict(gates) == "HOLD" + + +def test_verdict_hold_when_g5_missing(): + # All G1/G2/G3 pass but G5 is absent — no promotion without sign-off. + gates = [ + GateResult(gate="G1", passed=True), + GateResult(gate="G2", passed=True), + GateResult(gate="G3", passed=True), + ] + assert verdict(gates) == "HOLD" + + +def test_verdict_hold_on_empty_list(): + assert verdict([]) == "HOLD" + + +def test_verdict_hold_when_g5_fails(): + gates = [ + GateResult(gate="G1", passed=True), + GateResult(gate="G2", passed=True), + GateResult(gate="G3", passed=True), + GateResult(gate="G5", passed=False, reason_code="HOLD"), + ] + assert verdict(gates) == "HOLD" + + +# ── render_scorecard (from gates module) ────────────────────────────────────── + + +def test_render_scorecard_contains_verdict_line(): + gates = [ + GateResult(gate="G1", passed=True), + GateResult(gate="G2", passed=True), + GateResult(gate="G3", passed=True), + GateResult(gate="G5", passed=True), + ] + output = render_scorecard(gates) + assert "VERDICT: PROMOTE" in output + + +def test_render_scorecard_hold_when_flag(): + gates = [ + GateResult(gate="G1", passed=False, reason_code="SCHEMA_INVALID"), + GateResult(gate="G2", passed=True), + GateResult(gate="G3", passed=True), + GateResult(gate="G5", passed=True), + ] + output = render_scorecard(gates) + assert "VERDICT: HOLD" in output + + +def test_render_scorecard_includes_all_gate_lines(): + gates = [ + GateResult(gate="G1", passed=True), + GateResult(gate="G2", passed=True), + GateResult(gate="G3", passed=True), + GateResult(gate="G5", passed=True), + ] + output = render_scorecard(gates) + for gate_label in ("[G1]", "[G2]", "[G3]", "[G5]"): + assert gate_label in output + + +# ── g5_no_regression ────────────────────────────────────────────────────────── + + +def 
test_g5_day_zero_insufficient_signoffs(): + result = g5_no_regression( + candidate_scores={"recall": 0.85}, + champion_scores=None, + aa_noise=None, + is_day_zero=True, + human_signed_off=False, + signoff_count=1, + ) + assert result.passed is False + assert result.reason_code == "HOLD" + + +def test_g5_day_zero_sufficient_signoffs(): + result = g5_no_regression( + candidate_scores={"recall": 0.85}, + champion_scores=None, + aa_noise=None, + is_day_zero=True, + human_signed_off=False, + signoff_count=2, + ) + assert result.passed is True + assert result.details["day_zero"] is True + + +def test_g5_hold_when_no_human_signoff(): + result = g5_no_regression( + candidate_scores={"recall": 0.90}, + champion_scores={"recall": 0.80}, + aa_noise={"recall": 0.02}, + human_signed_off=False, + ) + assert result.passed is False + assert result.reason_code == "HOLD" + + +def test_g5_hold_when_regression_beyond_band(): + # Candidate recall 0.75 vs champion 0.80; delta=-0.05 < -band=-0.02. + result = g5_no_regression( + candidate_scores={"recall": 0.75}, + champion_scores={"recall": 0.80}, + aa_noise={"recall": 0.02}, + human_signed_off=True, + ) + assert result.passed is False + assert result.reason_code == "HOLD" + assert any("recall" in r for r in result.details["regressions"]) + + +def test_g5_promote_when_candidate_beats_champion(): + result = g5_no_regression( + candidate_scores={"recall": 0.90}, + champion_scores={"recall": 0.80}, + aa_noise={"recall": 0.02}, + human_signed_off=True, + ) + assert result.passed is True + assert result.details["improvements"] + + +def test_g5_promote_when_within_noise_band(): + # delta = 0.01 — positive but within band of 0.02; counts as no regression, no improvement. 
+ result = g5_no_regression( + candidate_scores={"recall": 0.81}, + champion_scores={"recall": 0.80}, + aa_noise={"recall": 0.02}, + human_signed_off=True, + ) + assert result.passed is True + assert result.details["improvements"] == [] + + +def test_g5_verdict_constants(): + assert Verdict.PROMOTE == "PROMOTE" + assert Verdict.HOLD == "HOLD" diff --git a/tests/unit/evaluation/test_matcher.py b/tests/unit/evaluation/test_matcher.py new file mode 100644 index 00000000..cc87564b --- /dev/null +++ b/tests/unit/evaluation/test_matcher.py @@ -0,0 +1,221 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Unit tests for evaluation.matcher: anchored, source_stem, tokens, matches.""" + +from __future__ import annotations + +import pytest + +from fireflyframework_agentic.evaluation.matcher import ( + anchored, + matches, + source_stem, + tokens, +) +from fireflyframework_agentic.evaluation.registry import RegistryItem + + +# ── tokens ─────────────────────────────────────────────────────────────────── + + +def test_tokens_basic(): + result = tokens("Hello World") + assert result == ["hello", "world"] + + +def test_tokens_lowercases(): + result = tokens("KYC AML PEP") + assert result == ["kyc", "aml", "pep"] + + +def test_tokens_strips_punctuation(): + result = tokens("risk-management: cost (FTE).") + assert "risk" in result + assert "management" in result + assert "cost" in result + assert "fte" in result + + +def test_tokens_empty_string(): + assert tokens("") == [] + + +def test_tokens_numbers_included(): + result = tokens("case-id CU-2026-1003") + assert "2026" in result or "cu" in result + + +def test_tokens_unicode(): + result = tokens("análisis de crédito") + assert "análisis" in result or "an" in result + + +# ── anchored ───────────────────────────────────────────────────────────────── + + +def test_anchored_overlapping_long_token(): + # "underwriting" is 12 chars — well above the 5-char floor. + assert anchored("credit underwriting risk", "underwriting process steps") is True + + +def test_anchored_no_overlap(): + # No token >= 5 chars shared between claim and evidence. + assert anchored("cat sat", "dog ran") is False + + +def test_anchored_short_tokens_ignored(): + # All tokens in both strings are < 5 chars; no overlap counts. + assert anchored("a big cat", "a big dog") is False + + +def test_anchored_mixed_lengths_match(): + # "kyc" is < 5, but "compliance" is long enough. + assert anchored("kyc compliance review", "compliance framework") is True + + +def test_anchored_custom_min_token(): + # Lower the floor so short tokens can anchor. 
+ assert anchored("kyc check", "kyc process", min_token=3) is True + + +def test_anchored_both_empty(): + assert anchored("", "") is False + + +def test_anchored_partial_token_no_match(): + # "risk" (4 chars) is below the default 5-char floor. + assert anchored("risk alert", "risk factor") is False + + +def test_anchored_returns_bool(): + result = anchored("credit underwriting", "underwriting model") + assert isinstance(result, bool) + + +# ── source_stem ─────────────────────────────────────────────────────────────── + + +def test_source_stem_bare_filename_with_extension(): + assert source_stem("SOP-002-kyc-edd.md") == "sop-002-kyc-edd" + + +def test_source_stem_directory_prefixed(): + assert source_stem("sops/SOP-002-kyc-edd.md") == "sop-002-kyc-edd" + + +def test_source_stem_deep_path_prefix(): + assert source_stem("docs/policies/SOP-002-kyc-edd.md") == "sop-002-kyc-edd" + + +def test_source_stem_lowercase(): + # Stems are always lowercased. + assert source_stem("REPORT-FINAL.pdf") == "report-final" + + +def test_source_stem_event_log_row_id(): + # src-: → process stem. + assert source_stem("src-credit-underwriting:CU-2026-1003") == "credit-underwriting" + + +def test_source_stem_event_log_row_id_preserves_hyphens(): + assert source_stem("src-kyc-onboarding:KYC-001") == "kyc-onboarding" + + +def test_source_stem_strips_fragment(): + # #page=N should be removed before stemming. + assert source_stem("docs/report.pdf#page=5") == "report" + + +def test_source_stem_strips_anchor(): + assert source_stem("sops/SOP-001.md#section-3") == "sop-001" + + +def test_source_stem_bare_no_extension(): + # No extension, no directory — stem is just the lowercase name. 
+ assert source_stem("my-document") == "my-document" + + +def test_source_stem_no_directory_no_extension_lowercase(): + assert source_stem("Signal") == "signal" + + +def test_source_stem_csv_extension(): + assert source_stem("activity-cost-fte.csv") == "activity-cost-fte" + + +# ── matches ─────────────────────────────────────────────────────────────────── + + +def _make_item(description: str, evidence: list[str], keywords: list[str] | None = None) -> RegistryItem: + """Construct a minimal RegistryItem for matching tests.""" + return RegistryItem( + id="test-item", + tier="L1", + description=description, + evidence=evidence, + scope="finding", + keywords=keywords or [], + ) + + +def _make_finding(title: str, description: str, evidence_id: str) -> dict: + return { + "title": title, + "description": description, + "evidence_refs": [{"evidence_id": evidence_id}], + } + + +def _make_evidence_index(evidence_id: str, locator: str, excerpt: str = "") -> dict: + return {evidence_id: {"id": evidence_id, "locator": locator, "excerpt": excerpt}} + + +def test_matches_true_when_source_and_topic_match(): + # Finding title shares a long token with item description and cites the same source. + item = _make_item("credit underwriting process", ["sop-kyc-credit.md"]) + finding = _make_finding("credit underwriting review", "credit underwriting risk assessment", "ev-1") + evidence_index = _make_evidence_index("ev-1", "sop-kyc-credit.md") + assert matches(finding, item, evidence_index, scope="finding") is True + + +def test_matches_false_when_source_differs(): + # Token match exists but sources don't overlap — anti-gaming guard fires. 
+ item = _make_item("credit underwriting process", ["sop-credit.md"]) + finding = _make_finding("credit underwriting review", "credit underwriting details", "ev-1") + evidence_index = _make_evidence_index("ev-1", "other-document.md") + assert matches(finding, item, evidence_index, scope="finding") is False + + +def test_matches_false_when_no_token_overlap(): + # Same source, but no shared long token between finding text and item description. + item = _make_item("regulatory capital requirement", ["sop-capital.md"]) + finding = _make_finding("kyc identity check", "client onboarding steps", "ev-1") + evidence_index = _make_evidence_index("ev-1", "sop-capital.md") + assert matches(finding, item, evidence_index, scope="finding") is False + + +def test_matches_keyword_rail_short_token(): + # "KYC" is 3 chars — below the 5-char token floor but valid as a keyword. + item = _make_item("some description about identity", ["sop-kyc.md"], keywords=["KYC"]) + finding = _make_finding("KYC onboarding", "KYC onboarding process", "ev-1") + evidence_index = _make_evidence_index("ev-1", "sop-kyc.md") + assert matches(finding, item, evidence_index, scope="finding") is True + + +def test_matches_empty_evidence_refs_returns_false(): + # Finding with no evidence refs cannot share a source with any item. + item = _make_item("credit underwriting", ["sop-credit.md"]) + finding = {"title": "credit underwriting", "description": "credit underwriting risk", "evidence_refs": []} + assert matches(finding, item, {}, scope="finding") is False diff --git a/tests/unit/evaluation/test_stats.py b/tests/unit/evaluation/test_stats.py new file mode 100644 index 00000000..9523be8c --- /dev/null +++ b/tests/unit/evaluation/test_stats.py @@ -0,0 +1,183 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for evaluation.stats: aa_band, aggregate_grounding, left_skew_flag.""" + +from __future__ import annotations + +import pytest + +from fireflyframework_agentic.evaluation.stats import ( + aa_band, + aggregate_grounding, + left_skew_flag, +) + + +# ── aa_band ────────────────────────────────────────────────────────────────── + + +def test_aa_band_two_identical_scores(): + # Two identical scores produce zero pairwise delta. + assert aa_band([0.80, 0.80]) == 0.0 + + +def test_aa_band_two_different_scores(): + # Single delta = |0.90 - 0.80| = 0.10; 95th percentile of one value is that value. + result = aa_band([0.80, 0.90]) + assert abs(result - 0.10) < 1e-9 + + +def test_aa_band_three_scores_known_deltas(): + # Scores: 0.70, 0.80, 0.90 + # Pairwise deltas: |0.70-0.80|=0.10, |0.70-0.90|=0.20, |0.80-0.90|=0.10 + # Sorted: [0.10, 0.10, 0.20] → 95th pct index = int(3 * 95 / 100) = 2 → 0.20 + result = aa_band([0.70, 0.80, 0.90]) + assert abs(result - 0.20) < 1e-9 + + +def test_aa_band_large_spread(): + # Max delta in [0.0, 1.0] is 1.0. + result = aa_band([0.0, 1.0]) + assert abs(result - 1.0) < 1e-9 + + +def test_aa_band_requires_at_least_two_scores(): + with pytest.raises(ValueError, match="aa_band requires >= 2 reruns"): + aa_band([0.80]) + + +def test_aa_band_empty_raises(): + with pytest.raises(ValueError, match="aa_band requires >= 2 reruns"): + aa_band([]) + + +def test_aa_band_custom_percentile(): + # 50th percentile of [0.10, 0.10, 0.20] at idx=1 → 0.10. 
+ result = aa_band([0.70, 0.80, 0.90], percentile=50) + assert abs(result - 0.10) < 1e-9 + + +def test_aa_band_returns_float(): + result = aa_band([0.80, 0.85, 0.90]) + assert isinstance(result, float) + + +# ── aggregate_grounding ─────────────────────────────────────────────────────── + + +def test_aggregate_grounding_single_dict(): + g = {"support_pct": 90.0, "unsupported_ids": ["ev-1"]} + result = aggregate_grounding([g]) + assert result["support_pct"] == 90.0 + assert result["unsupported_ids"] == ["ev-1"] + assert result["_aggregate_runs"] == 1 + + +def test_aggregate_grounding_mean_support_pct(): + dicts = [ + {"support_pct": 80.0, "unsupported_ids": []}, + {"support_pct": 100.0, "unsupported_ids": []}, + ] + result = aggregate_grounding(dicts) + assert result["support_pct"] == 90.0 + + +def test_aggregate_grounding_union_of_unsupported_ids(): + dicts = [ + {"support_pct": 90.0, "unsupported_ids": ["ev-1", "ev-2"]}, + {"support_pct": 85.0, "unsupported_ids": ["ev-2", "ev-3"]}, + ] + result = aggregate_grounding(dicts) + assert set(result["unsupported_ids"]) == {"ev-1", "ev-2", "ev-3"} + + +def test_aggregate_grounding_union_sorted(): + dicts = [ + {"support_pct": 90.0, "unsupported_ids": ["ev-b"]}, + {"support_pct": 90.0, "unsupported_ids": ["ev-a"]}, + ] + result = aggregate_grounding(dicts) + assert result["unsupported_ids"] == ["ev-a", "ev-b"] + + +def test_aggregate_grounding_empty_input(): + result = aggregate_grounding([]) + assert result["support_pct"] == 0.0 + assert result["unsupported_ids"] == [] + + +def test_aggregate_grounding_records_run_count(): + dicts = [ + {"support_pct": 80.0, "unsupported_ids": []}, + {"support_pct": 90.0, "unsupported_ids": []}, + {"support_pct": 100.0, "unsupported_ids": []}, + ] + result = aggregate_grounding(dicts) + assert result["_aggregate_runs"] == 3 + + +def test_aggregate_grounding_per_run_pct_recorded(): + dicts = [ + {"support_pct": 80.0, "unsupported_ids": []}, + {"support_pct": 100.0, "unsupported_ids": []}, 
+ ] + result = aggregate_grounding(dicts) + assert result["_support_pct_per_run"] == [80.0, 100.0] + + +def test_aggregate_grounding_missing_unsupported_ids_treated_as_empty(): + dicts = [ + {"support_pct": 90.0}, # no unsupported_ids key + {"support_pct": 80.0, "unsupported_ids": ["ev-1"]}, + ] + result = aggregate_grounding(dicts) + assert result["unsupported_ids"] == ["ev-1"] + + +# ── left_skew_flag ──────────────────────────────────────────────────────────── + + +def test_left_skew_flag_true_when_catastrophic_run(): + # median([0.60, 0.80, 0.80]) = 0.80; min = 0.60 < 0.80 - 0.10 = 0.70. + assert left_skew_flag([0.60, 0.80, 0.80]) is True + + +def test_left_skew_flag_false_when_min_close_to_median(): + # median = 0.80; min = 0.75; 0.75 >= 0.80 - 0.10 = 0.70 → no flag. + assert left_skew_flag([0.75, 0.80, 0.85]) is False + + +def test_left_skew_flag_false_when_all_equal(): + assert left_skew_flag([0.85, 0.85, 0.85]) is False + + +def test_left_skew_flag_boundary_just_above_threshold(): + # min = 0.71, median = 0.80; 0.71 >= 0.80 - 0.10 = 0.70 → no flag. + assert left_skew_flag([0.71, 0.80, 0.80]) is False + + +def test_left_skew_flag_single_score_always_false(): + # A single score has no meaningful distribution; function returns False. + assert left_skew_flag([0.50]) is False + + +def test_left_skew_flag_two_scores_with_large_gap(): + # median([0.50, 0.90]) = 0.70; min = 0.50 < 0.70 - 0.10 = 0.60.
+ assert left_skew_flag([0.50, 0.90]) is True + + +def test_left_skew_flag_returns_bool(): + result = left_skew_flag([0.80, 0.85, 0.90]) + assert isinstance(result, bool) diff --git a/tests/unit/lab/test_retrieval_metrics.py b/tests/unit/lab/test_retrieval_metrics.py new file mode 100644 index 00000000..a018a08b --- /dev/null +++ b/tests/unit/lab/test_retrieval_metrics.py @@ -0,0 +1,247 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for lab.retrieval_metrics: compute_retrieval_metrics and RetrieverMetrics.""" + +from __future__ import annotations + +import math + +import pytest + +from fireflyframework_agentic.lab.retrieval_metrics import ( + RetrieverMetrics, + compute_retrieval_metrics, +) + + +# ── helpers ─────────────────────────────────────────────────────────────────── + + +def _row(gold_rank: int | None = None, total: int = 5, n_gold: int = 1) -> dict: + """Build one result row with ``total`` retrieved items. + + If ``gold_rank`` is not None, the item at that rank is marked as gold. + All items get a unique ``source_id`` so dedup leaves them all. 
+ """ + retrieved = [] + for rank in range(1, total + 1): + retrieved.append({ + "rank": rank, + "source_id": f"doc-{rank}", + "is_gold": rank == gold_rank, + }) + gold_ids = [f"doc-{gold_rank}"] if gold_rank is not None else [] + return { + "retrieved": retrieved, + "gold": gold_ids * n_gold, + } + + +# ── hit@k ───────────────────────────────────────────────────────────────────── + + +def test_hit_at_1_perfect_when_gold_is_rank1(): + results = [_row(gold_rank=1)] + m = compute_retrieval_metrics(results) + assert m["hit@1"] == 1.0 + + +def test_hit_at_1_zero_when_gold_not_in_top1(): + results = [_row(gold_rank=2)] + m = compute_retrieval_metrics(results) + assert m["hit@1"] == 0.0 + + +def test_hit_at_5_one_when_gold_at_rank5(): + results = [_row(gold_rank=5)] + m = compute_retrieval_metrics(results) + assert m["hit@5"] == 1.0 + + +def test_hit_at_5_zero_when_gold_not_in_top5(): + # Ten items are retrieved and none of them is gold, so the top-5 window cannot contain a hit. + results = [_row(gold_rank=None, total=10)] # no gold in retrieved + m = compute_retrieval_metrics(results) + assert m["hit@5"] == 0.0 + + +def test_hit_at_10_one_when_gold_at_rank10(): + results = [_row(gold_rank=10, total=10)] + m = compute_retrieval_metrics(results) + assert m["hit@10"] == 1.0 + + +# ── recall@k ────────────────────────────────────────────────────────────────── + + +def test_recall_at_k_increases_with_k(): + # Gold at rank 3: recall@1=0, recall@5>=recall@1.
+ results = [_row(gold_rank=3)] + m = compute_retrieval_metrics(results) + assert m["recall@1"] <= m["recall@5"] <= m["recall@10"] + + +def test_recall_at_1_full_when_single_gold_at_rank1(): + results = [_row(gold_rank=1, n_gold=1)] + m = compute_retrieval_metrics(results) + assert m["recall@1"] == 1.0 + + +def test_recall_at_1_zero_when_no_gold_in_rank1(): + results = [_row(gold_rank=5)] + m = compute_retrieval_metrics(results) + assert m["recall@1"] == 0.0 + + +# ── MRR ─────────────────────────────────────────────────────────────────────── + + +def test_mrr_is_1_when_gold_at_rank1(): + results = [_row(gold_rank=1)] + m = compute_retrieval_metrics(results) + assert m["mrr@10"] == 1.0 + + +def test_mrr_is_half_when_gold_at_rank2(): + results = [_row(gold_rank=2)] + m = compute_retrieval_metrics(results) + assert abs(m["mrr@10"] - 0.5) < 1e-9 + + +def test_mrr_is_zero_when_no_gold(): + results = [_row(gold_rank=None)] + m = compute_retrieval_metrics(results) + assert m["mrr@10"] == 0.0 + + +def test_mrr_average_across_queries(): + # Query 1: gold at rank 1 (MRR=1.0); Query 2: gold at rank 2 (MRR=0.5). 
+ results = [_row(gold_rank=1), _row(gold_rank=2)] + m = compute_retrieval_metrics(results) + assert abs(m["mrr@10"] - 0.75) < 1e-3 + + +# ── nDCG ────────────────────────────────────────────────────────────────────── + + +def test_ndcg_is_1_when_gold_at_rank1(): + results = [_row(gold_rank=1, n_gold=1)] + m = compute_retrieval_metrics(results) + assert abs(m["ndcg@10"] - 1.0) < 1e-9 + + +def test_ndcg_is_less_than_1_when_gold_not_at_rank1(): + results = [_row(gold_rank=3, n_gold=1)] + m = compute_retrieval_metrics(results) + assert m["ndcg@10"] < 1.0 + assert m["ndcg@10"] > 0.0 + + +def test_ndcg_is_zero_when_no_gold(): + results = [_row(gold_rank=None)] + m = compute_retrieval_metrics(results) + assert m["ndcg@10"] == 0.0 + + +# ── n_queries ───────────────────────────────────────────────────────────────── + + +def test_n_queries_matches_input_length(): + results = [_row(gold_rank=1), _row(gold_rank=2), _row(gold_rank=3)] + m = compute_retrieval_metrics(results) + assert m["n_queries"] == 3 + + +def test_empty_results_returns_zero_n_queries(): + m = compute_retrieval_metrics([]) + assert m["n_queries"] == 0 + + +# ── optional fields ─────────────────────────────────────────────────────────── + + +def test_no_answer_rate_is_zero_when_answer_present(): + # Rows with a non-empty answer string are counted as answered. + results = [{**_row(gold_rank=1), "answer": "some answer text"}] + m = compute_retrieval_metrics(results) + assert m["no_answer_rate"] == 0.0 + + +def test_no_answer_rate_is_one_when_no_answer_field(): + # Rows without an answer field are treated as no-answer by the implementation. 
+ results = [_row(gold_rank=1)] + m = compute_retrieval_metrics(results) + assert m["no_answer_rate"] == 1.0 + + +def test_citation_precision_is_none_when_no_citations(): + results = [_row(gold_rank=1)] + m = compute_retrieval_metrics(results) + assert m["citation_precision"] is None + + +def test_latency_fields_are_none_when_absent(): + results = [_row(gold_rank=1)] + m = compute_retrieval_metrics(results) + assert m["mean_search_ms"] is None + assert m["mean_answer_ms"] is None + + +def test_mean_search_ms_computed_when_present(): + results = [{**_row(gold_rank=1), "search_ms": 100.0, "answer_ms": 200.0}] + m = compute_retrieval_metrics(results) + assert m["mean_search_ms"] == 100 + assert m["mean_answer_ms"] == 200 + + +# ── RetrieverMetrics.from_results ───────────────────────────────────────────── + + +def test_retriever_metrics_from_results_hit_at_1(): + results = [_row(gold_rank=1)] + rm = RetrieverMetrics.from_results(results) + assert rm.hit_at_1 == 1.0 + + +def test_retriever_metrics_from_results_n_queries(): + results = [_row(gold_rank=1), _row(gold_rank=2)] + rm = RetrieverMetrics.from_results(results) + assert rm.n_queries == 2 + + +def test_retriever_metrics_from_results_mrr(): + results = [_row(gold_rank=1)] + rm = RetrieverMetrics.from_results(results) + assert rm.mrr_at_10 == 1.0 + + +def test_retriever_metrics_from_results_defaults_on_empty(): + rm = RetrieverMetrics.from_results([]) + assert rm.n_queries == 0 + assert rm.hit_at_1 == 0.0 + assert rm.mrr_at_10 == 0.0 + + +def test_retriever_metrics_is_pydantic_model(): + rm = RetrieverMetrics() + assert rm.n_queries == 0 + assert rm.hit_at_1 == 0.0 + assert rm.no_answer_rate is None + + +def test_retriever_metrics_recall_increases_with_k(): + results = [_row(gold_rank=3)] + rm = RetrieverMetrics.from_results(results) + assert rm.recall_at_1 <= rm.recall_at_5 <= rm.recall_at_10 From f79439b0abec86dac42ae96ad3e61856a39cea60 Mon Sep 17 00:00:00 2001 From: Miguel Fierro 
<3491412+miguelgfierro@users.noreply.github.com> Date: Fri, 19 Jun 2026 00:12:26 +0200 Subject: [PATCH 11/67] docs(evaluation): add evaluation package documentation (#278) * feat(evaluation): add evaluation package documentation * docs(evaluation): mention evaluation subpackage in README --------- Co-authored-by: miguelgfierro --- README.md | 7 + docs/evaluation.md | 435 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 442 insertions(+) create mode 100644 docs/evaluation.md diff --git a/README.md b/README.md index 9d005b23..904237da 100644 --- a/README.md +++ b/README.md @@ -412,6 +412,12 @@ classDiagram `EvalDataset` loads/saves test cases from JSON. `ModelComparison` runs the same prompts across multiple agents for side-by-side analysis. +- **Evaluation** — Gate-based quality gates (G1–G5), LLM-as-judge advisory scoring, + champion/challenger tracking, and deterministic retrieval metrics for assessing + agent and pipeline outputs. The `flyeval` CLI drives the full gate pipeline from + the command line. Install with `pip install "fireflyframework-agentic[evaluation]"`. + See [docs/evaluation.md](docs/evaluation.md) for the full guide. 
+ > **Optional developer tooling.** `fireflyframework_agentic.experiments` (A/B > experiments) and `fireflyframework_agentic.lab` (offline evaluation / > benchmarking) are leaf modules — nothing in the core imports them and they add @@ -817,6 +823,7 @@ Detailed guides for each module: - [Security](docs/security.md) — Prompt/output guards, at-rest encryption - [Experiments](docs/experiments.md) — A/B testing, variant comparison - [Lab](docs/lab.md) — Benchmarks, datasets, evaluators +- [Evaluation](docs/evaluation.md) — Gate pipeline, flyeval CLI, champion/challenger, retrieval metrics - Studio — moved to [fireflyframework-agentic-studio](https://github.com/fireflyframework/fireflyframework-agentic-studio) --- diff --git a/docs/evaluation.md b/docs/evaluation.md new file mode 100644 index 00000000..c2abe319 --- /dev/null +++ b/docs/evaluation.md @@ -0,0 +1,435 @@ +# Evaluation Guide + +Copyright 2026 Firefly Software Foundation. Licensed under the Apache License 2.0. + +The Evaluation subpackage provides gate-based quality gates, LLM-as-judge advisory scoring, +champion/challenger tracking, and deterministic retrieval metrics for assessing agent outputs. + +--- + +## Concepts + +### Gate pipeline + +The evaluation framework runs **five gates** in sequence. Every gate always runs — a failed +gate raises a *flag*, not a veto, so the scorecard always carries the complete picture. + +| Gate | Name | Kind | Description | +|------|------|------|-------------| +| G1 | Structural & Safe | Deterministic | Schema validity, PII non-disclosure, empty-registry guard. | +| G2 | Must-finds & Negative Controls | Deterministic | Lexical/semantic recall against the must-find registry; NC precision. | +| G3 | Evidence (Grounding) | Deterministic | Excerpt-to-corpus anchoring; fabricated-evidence detection. | +| G4 | LLM-as-a-Judge | Advisory (non-blocking) | Semantic faithfulness, entailment, gap detection — never changes the verdict. 
| +| G5 | No-regression / Promotion | Human decision | Champion/challenger comparison with A/A noise band; collects sign-offs. | + +**No gate vetoes.** A failing gate returns a `GateResult` with `passed=False` and a `reason_code` (a *flag*), and scoring continues. +The scorecard carries every signal regardless of which gates fired. + +### GateResult + +`GateResult` is a dataclass returned by each gate: + +```python +@dataclass +class GateResult: + gate: str # "G1", "G2", …, "G5" + passed: bool + reason_code: str = "" # e.g. "SCHEMA_INVALID", "NC_HIT", "UNGROUNDED" + details: dict = field(default_factory=dict) +``` + +`str(gate_result)` prints `[G2] PASS` or `[G2] FLAG:NC_HIT`. + +### Verdict + +`verdict(gate_results)` returns `VERDICT_PROMOTE` or `VERDICT_HOLD`: + +- `VERDICT_PROMOTE` — all gates passed **and** G5 (the human sign-off gate) is present. +- `VERDICT_HOLD` — any gate flagged, or G5 is missing. + +The CLI exits `0` on PROMOTE and `1` on HOLD, so it composes into CI. + +### Must-find registry + +A registry (`lean-1` schema) is a JSON file listing items the discovery output is +expected to surface (`tier` L0–L3) and negative controls (NC) it must *not* assert. + +```json +{ + "schema_version": "lean-1", + "corpus": "banca-cordobesa", + "items": [ + { "id": "ao-pep-4eyes", "tier": "L0", "scope": "decision", + "description": "PEP cases require a second analyst sign-off (4-eyes)", + "keywords": ["PEP", "4-eyes"], + "evidence": ["SOP-002-kyc-edd.md"] }, + { "id": "ao-nc-realtime", "tier": "NC", "scope": "finding", + "description": "KYC-Hub synchronises in real time — factually false" } + ] +} +``` + +Tier semantics: L0 = must-find control (a single miss flags the run), L1 = high-priority, +L2 = important, L3 = nice-to-have (not counted in the recall floor). + +### Advisory judge (G4) + +G4 calls a chat LLM (or local Ollama model) for semantic checks the deterministic gates +cannot perform: faithfulness, entailment, numeric/temporal fidelity, actionability, +fabricated-entity detection, and more.
It is: + +- **Non-blocking** — `AdvisoryReport` is carried separately and never enters `verdict()`. +- **Non-deterministic** — each metric runs `judge_runs` times (default: 3) and the + median score is reported. +- **Opt-in** — pass `--judge-model provider:model` to activate it; omit the flag to skip. + +### Champion/challenger pattern + +Champions are **per-corpus**. `ChampionRecord` persists the best-known run so that +promotion decisions are made against a stable, signed baseline rather than the last run. + +``` + ┌──────────────────────────────────────────┐ + │ run result JSON (challenger) │ + └──────────────┬───────────────────────────┘ + │ + ┌───────────────▼───────────────┐ + │ G1 · G2 · G3 (deterministic) │ + │ G4 (advisory, opt-in) │ + └───────────────┬───────────────┘ + │ flags + scores + ┌───────────────▼───────────────┐ + │ G5 — no-regression vs │ + │ champion baseline + A/A band │ + └───────────────┬───────────────┘ + │ + ┌───────────────▼───────────────┐ + │ Markdown scorecard │ + │ PROMOTE / HOLD │ + └───────────────────────────────┘ +``` + +`invalidate_champion()` marks a baseline invalid. The `EMPTY_MUST_FIND` guard in G1 +prevents a fake-100% champion being created against an empty registry. + +--- + +## Installation + +The evaluation subpackage requires `scipy` and `numpy`. Install the optional extra: + +```bash +pip install "fireflyframework-agentic[evaluation]" +``` + +The `flyeval` CLI entry-point is registered automatically by the package. Verify: + +```bash +flyeval --version +``` + +--- + +## CLI + +All subcommands exit `0` on PROMOTE and `1` on HOLD. + +### `flyeval gate` + +Run the full gate pipeline against a result JSON and print a Markdown scorecard. 
+ +```bash +flyeval gate \ + --result runs/2026-06-18/output.json \ + --registry registries/banca-cordobesa.json \ + --baseline baselines/banca-cordobesa.json \ + --judge-model anthropic:claude-3-5-haiku \ + --judge-runs 3 +``` + +Key flags: + +| Flag | Default | Description | +|------|---------|-------------| +| `--result` | required | Path to the run's `output.json`. | +| `--registry` | required | Must-find registry (lean-1 JSON). | +| `--baseline` | — | Champion baseline JSON for G5 regression check. | +| `--judge-model` | — | `provider:model` for G4 advisory judge. | +| `--judge-runs` | 3 | Number of independent judge calls (median aggregation). | +| `--no-judge` | — | Skip G4 entirely. | +| `--recall-floor` | 0.70 | Minimum G2 recall before flagging. | +| `--grounding-floor` | 0.90 | Minimum G3 grounding rate before flagging. | +| `--corpus` | — | Path to the evidence corpus bundle for G3 verification. | +| `--pii-list` | — | Path to a JSON array of names to scan for PII leaks (G1). | +| `--embedder` | — | `provider:model` for semantic recall (G2 embedding path). | +| `--model-id` | "unknown" | Identifier of the model under evaluation (for scorecard). | + +### `flyeval aa-band` + +Compute the A/A noise band from multiple repeated runs of the same model to establish +the noise floor before setting up the champion comparison. + +```bash +flyeval aa-band \ + --results runs/aa-run-1/output.json runs/aa-run-2/output.json runs/aa-run-3/output.json \ + --registry registries/banca-cordobesa.json +``` + +The command prints per-metric variance and recommended noise floors. + +### `flyeval day-zero` + +Promote the very first champion for a corpus (Day-Zero protocol). Requires at least +`--signoffs` sign-offs (default: 2) before PROMOTE is issued. 
+ +```bash +flyeval day-zero \ + --result runs/2026-06-18/output.json \ + --registry registries/banca-cordobesa.json \ + --baseline baselines/banca-cordobesa.json \ + --signoffs 2 +``` + +The command writes the new `ChampionRecord` into `--baseline` on success. + +### `flyeval invalidate` + +Mark the current champion invalid with a documented reason. Use this when the registry +changes in a way that makes the existing champion incommensurable. + +```bash +flyeval invalidate \ + --baseline baselines/banca-cordobesa.json \ + --reason "Registry expanded from 39 to 94 items (lean-1 v2)." +``` + +--- + +## Python API + +### Running gates + +```python +import json +from fireflyframework_agentic.evaluation import ( + run_gates, + render_scorecard, + verdict, + load_registry, + VERDICT_PROMOTE, +) + +result = json.loads(open("runs/2026-06-18/output.json").read()) +registry = load_registry("registries/banca-cordobesa.json") + +gate_results = run_gates(result, registry) +scorecard_md = render_scorecard( + gate_results, + corpus="banca-cordobesa", + model_id="anthropic:claude-3-5-sonnet", + run_id="2026-06-18-sonnet-01", +) +print(scorecard_md) + +v = verdict(gate_results) +print("Verdict:", v) # "PROMOTE" or "HOLD" +assert v == VERDICT_PROMOTE +``` + +### Champion management + +```python +from fireflyframework_agentic.evaluation import ( + load_champion, + save_champion, + invalidate_champion, + ChampionRecord, +) + +# Load the current champion (returns None on Day Zero). +champ = load_champion("baselines/banca-cordobesa.json") +if champ is None: + print("Day Zero — no champion yet.") +else: + print(f"Champion: {champ.run_id} | {champ.primary_metric()}={champ.primary_score():.3f}") + +# Save a new champion after a successful PROMOTE. 
+new_champ = ChampionRecord( + corpus="banca-cordobesa", + run_id="2026-06-18-sonnet-01", + model_id="anthropic:claude-3-5-sonnet", + registry_sha256=registry.sha256(), + scores={"lexical_recall": 0.857, "grounding_pct": 0.941}, + human_sign_offs=["alice", "bob"], +) +save_champion("baselines/banca-cordobesa.json", new_champ) + +# Invalidate when the registry changes materially. +invalidate_champion( + "baselines/banca-cordobesa.json", + reason="Registry expanded from 39 to 94 items.", +) +``` + +### EvalConfig + +`EvalConfig` is a Pydantic model that captures the parameters of a single evaluation run. +Use it to build reproducible, serialisable run records. + +```python +from fireflyframework_agentic.evaluation.models import EvalConfig + +cfg = EvalConfig( + model_id="anthropic:claude-3-5-sonnet", + corpus="banca-cordobesa", + run_id="2026-06-18-sonnet-01", + registry_path="registries/banca-cordobesa.json", + corpus_path="corpora/banca-cordobesa/", + baseline_path="baselines/banca-cordobesa.json", + judge_model="anthropic:claude-3-5-haiku", + judge_runs=3, +) +print(cfg.model_dump_json(indent=2)) +``` + +### Advisory judge (G4) + +```python +from fireflyframework_agentic.evaluation import run_judge, JudgeClient, build_embedder + +client = JudgeClient( + chat_fn=my_chat_fn, # callable(system: str, user: str) -> dict + embed_fn=build_embedder("ollama:bge-m3"), +) + +advisory = run_judge( + result=result, + registry=registry, + client=client, + runs=3, + missed_ids=[], # IDs the deterministic G2 missed — judge tries to recover them +) +print(advisory.scores) # dict of metric -> float +print(advisory.errors) # any metrics that failed (best-effort, never raises) +``` + +--- + +## Retrieval Metrics + +The `compute_retrieval_metrics()` function computes standard IR metrics over ranked +retrieval results. It is imported from `fireflyframework_agentic.lab.retrieval_metrics` +and re-exported by the evaluation package. 
+ +Supported metrics at cut-offs k ∈ {1, 5, 10}: + +- **Hit@k** — at least one gold document in top-k. +- **Recall@k** — fraction of gold documents in top-k. +- **Precision@k** — fraction of top-k results that are gold. +- **MRR@10** — mean reciprocal rank of the first gold hit. +- **MAP@10** — mean average precision. +- **nDCG@10** — normalised discounted cumulative gain. + +```python +from fireflyframework_agentic.evaluation import compute_retrieval_metrics, RetrieverMetrics + +# Each row is a query; each row's "retrieved" list is ranked (rank=1 is top). +rows = [ + { + "query": "KYC enhanced due diligence steps", + "gold": ["SOP-002-kyc-edd.md", "INT-002-KYC-Jaime.md"], + "retrieved": [ + {"rank": 1, "source_id": "SOP-002-kyc-edd.md", "is_gold": True}, + {"rank": 2, "source_id": "SOP-001-account-opening.md", "is_gold": False}, + {"rank": 3, "source_id": "INT-002-KYC-Jaime.md", "is_gold": True}, + ], + }, +] + +metrics: RetrieverMetrics = compute_retrieval_metrics(rows) +print(f"Recall@5: {metrics.recall_5:.3f}") +print(f"nDCG@10: {metrics.ndcg_10:.3f}") +print(f"MRR@10: {metrics.mrr_10:.3f}") +``` + +`RetrieverMetrics` also carries optional fields when the raw rows include them: +`no_answer_rate`, `citation_precision`, `mean_search_ms`, `mean_answer_ms`. 
+ +--- + +## Architecture + +```mermaid +flowchart TD + R["result JSON\n(DiscoveryResult / output.json)"] + REG["Registry JSON\n(lean-1 must-find)"] + CORP["Corpus bundle\n(raw evidence documents)"] + BASE["Baseline JSON\n(champion record)"] + + R --> G1["G1 · Structural & Safe\n(schema, PII, empty-registry)"] + REG --> G1 + R --> G2["G2 · Recall & NC Precision\n(lexical + optional semantic)"] + REG --> G2 + R --> G3["G3 · Grounding\n(excerpt anchoring, fabrication)"] + CORP --> G3 + R --> G4["G4 · LLM Judge advisory\n(faithfulness, entailment, gaps)"] + REG --> G4 + G1 --> SC["Markdown Scorecard\nrender_scorecard()"] + G2 --> SC + G3 --> SC + G4 -.advisory.-> SC + BASE --> G5["G5 · No-regression\n(A/A band, sign-offs)"] + G1 --> G5 + G2 --> G5 + G3 --> G5 + G5 --> SC + SC --> V["verdict()\nPROMOTE / HOLD"] + V --> CHAMP["save_champion()\nor invalidate_champion()"] +``` + +--- + +## Reference + +### Exports + +All symbols below are importable from `fireflyframework_agentic.evaluation`. + +| Symbol | Kind | Description | +|--------|------|-------------| +| `EvalConfig` | Pydantic model | Parameters for a single evaluation run. | +| `GateResult` | Dataclass | Result of one gate: `gate`, `passed`, `reason_code`, `details`. | +| `Verdict` | Constants class | `Verdict.PROMOTE`, `Verdict.HOLD`. | +| `VERDICT_PROMOTE` | `str` | `"PROMOTE"`. | +| `VERDICT_HOLD` | `str` | `"HOLD"`. | +| `run_gates()` | Function | Run all four deterministic gates (G1–G3, G5 shape) and return results. | +| `g2_recall_precision()` | Function | Run only G2 (recall + NC precision) and return `GateResult`. | +| `verdict()` | Function | Derive PROMOTE/HOLD from a list of `GateResult`. | +| `render_scorecard()` | Function | Render a Markdown scorecard from gate results and metadata. | +| `ChampionRecord` | Dataclass | Per-corpus champion metadata and scores. | +| `load_champion()` | Function | Load the current champion from `baseline.json`; returns `None` on Day Zero. 
| +| `save_champion()` | Function | Persist a new champion to `baseline.json`. | +| `invalidate_champion()` | Function | Mark the champion invalid with a reason string. | +| `AdvisoryReport` | Dataclass | G4 judge output: `scores`, `errors`, `raw`. | +| `run_judge()` | Function | Run the LLM-as-a-Judge advisory pass. | +| `JudgeClient` | Dataclass | Holds `chat_fn` and `embed_fn` for the judge. | +| `OllamaEmbedder` | Class | Local Ollama embedding callable (default BGE-M3). | +| `build_embedder()` | Function | Factory: `"ollama:bge-m3"` → `OllamaEmbedder`. | +| `cosine()` | Function | Cosine similarity between two numpy vectors. | +| `Registry` | Dataclass | Parsed must-find registry with real items and NC items. | +| `RegistryItem` | Dataclass | One must-find or NC item: `id`, `tier`, `scope`, `description`, …. | +| `load_registry()` | Function | Parse and validate a lean-1 registry JSON file. | +| `registry_sha256()` | Function | SHA-256 of a registry file path. | +| `load_corpus()` | Function | Load and index a corpus bundle for G3 evidence verification. | +| `corpus_sha256()` | Function | SHA-256 of a corpus directory or bundle. | +| `verify_evidence_index()` | Function | Check each `evidence_index` entry against the corpus. | +| `EMPTY` / `FABRICATED` / `SOURCE_UNKNOWN` / `VERIFIED` | `str` | Evidence verification status constants. | +| `RetrieverMetrics` | Pydantic model | IR metrics: `recall_k`, `precision_k`, `ndcg_10`, `mrr_10`, `map_10`. | +| `compute_retrieval_metrics()` | Function | Compute IR metrics from a list of ranked-retrieval result rows. | +| `anchored()` | Function | True if claim and evidence share at least one non-trivial token. | +| `matches()` | Function | Gate predicate: does a candidate match a registry item? | +| `source_stem()` | Function | Normalise a `locator` path to its file stem for dedup. | +| `tokens()` | Function | Tokenise text to a list of lowercase word strings. 
| +| `aa_band()` | Function | Compute per-metric A/A noise floor from repeated runs. | +| `aggregate_grounding()` | Function | Summarise grounding stats across a result's findings. | +| `left_skew_flag()` | Function | True when the score distribution is left-skewed (over-optimistic). | From a1d28a597ad87559dad0e26a2f266cf516553d21 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 09:24:24 +0200 Subject: [PATCH 12/67] remove examples/flyradar_eval_example.py --- examples/flyradar_eval_example.py | 406 ------------------------------ 1 file changed, 406 deletions(-) delete mode 100644 examples/flyradar_eval_example.py diff --git a/examples/flyradar_eval_example.py b/examples/flyradar_eval_example.py deleted file mode 100644 index 706528f4..00000000 --- a/examples/flyradar_eval_example.py +++ /dev/null @@ -1,406 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""FlyRadar evaluation example — gate-based process-mining quality gate. - -Demonstrates how to use ``fireflyframework_agentic.evaluation`` to replicate -the flyradar experiment quality-gate workflow: - -1. Load a must-find registry (the gold standard items the model must discover). -2. Load a DiscoveryResult produced by a flyradar pipeline run. -3. Run gates G1-G5 to produce a structured verdict: - G1 -- Structural & Safe (schema validity, PII, empty-registry guard). - G2 -- Recall & Precision (must-find recall floor, NC precision). 
- G3 -- Grounded (finding-to-evidence anchoring). - G4 -- LLM-as-a-Judge (advisory only; never blocks promotion). - G5 -- No-regression / promotion (champion/challenger comparison). -4. Render a human-readable scorecard and print the final verdict. -5. Promote the challenger to champion when the verdict is PROMOTE. - -Usage:: - - # Minimal: deterministic gates only (no G4 judge, no baseline) - python examples/flyradar_eval_example.py \\ - --result output.json \\ - --registry registry.json - - # With corpus verification and a champion baseline - python examples/flyradar_eval_example.py \\ - --result output.json \\ - --registry registry.json \\ - --baseline baseline.json \\ - --corpus input.json - - # With the advisory G4 LLM judge (requires API key in environment) - FLYEVAL_JUDGE_MODEL=anthropic:claude-sonnet-4-6 \\ - python examples/flyradar_eval_example.py \\ - --result output.json \\ - --registry registry.json \\ - --judge-model anthropic:claude-sonnet-4-6 - -Exit codes: 0 = PROMOTE, 1 = HOLD. - -Input file formats ------------------- -``--result`` (output.json) - A DiscoveryResult JSON produced by a flyradar pipeline run. Must contain - at minimum ``findings`` (list) and ``evidence_index`` (list). - -``--registry`` (registry.json) - A lean-1 registry JSON. Each item has ``id``, ``tier`` (L0-L3), ``title``, - ``description``, and ``nc`` (bool, True for negative controls). - -``--baseline`` (baseline.json) - A ChampionRecord JSON written by a previous PROMOTE run. When omitted the - gate runs in day-zero mode (G5 always passes and a new champion is minted). - -``--corpus`` (input.json) - The corpus bundle used during the run. When supplied, G3 verifies that cited - evidence excerpts actually appear in the corpus documents. 
-""" - -from __future__ import annotations - -import argparse -import json -import sys -from pathlib import Path - -from fireflyframework_agentic.evaluation import ( - ChampionRecord, - GateResult, - build_embedder, - load_champion, - load_corpus, - load_registry, - render_scorecard, - run_gates, - run_judge, - save_champion, - verdict, - VERDICT_PROMOTE, -) -from fireflyframework_agentic.evaluation.models import EvalConfig - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def _load_json(path: str) -> dict: - """Read a JSON file and return its contents as a dict.""" - return json.loads(Path(path).read_text(encoding="utf-8")) - - -def _lexical_missed_ids(result: dict, registry) -> list[str]: - """Return the IDs of registry items not matched by any finding (lexically). - - The G4 judge uses these to focus its coverage checks on items that - lexical recall missed — the places where semantic recovery matters most. - """ - from fireflyframework_agentic.evaluation.matcher import matches - - evidence_index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")} - findings = result.get("findings", []) - # L3 items are informational-only and are never scored. - scored_items = [item for item in registry.real_items if item.tier != "L3"] - return [ - item.id - for item in scored_items - if not any(matches(f, item, evidence_index) for f in findings) - ] - - -# --------------------------------------------------------------------------- -# Main evaluation flow -# --------------------------------------------------------------------------- - - -def run_evaluation(args: argparse.Namespace) -> int: - """Run the full flyradar gate evaluation and return an exit code.""" - - # ------------------------------------------------------------------ - # Step 1 — Load inputs. 
- # ------------------------------------------------------------------ - print(f"Loading result : {args.result}") - result = _load_json(args.result) - - print(f"Loading registry : {args.registry}") - registry = load_registry(args.registry) - print(f" {len(registry.real_items)} real items, {len(registry.nc_items)} NC items") - - # The EvalConfig captures provenance for the run record. - config = EvalConfig( - model_id=args.model_id, - corpus=registry.corpus, - run_id=args.run_id, - registry_path=args.registry, - corpus_path=args.corpus or "", - baseline_path=args.baseline or "", - judge_model=args.judge_model or "", - ) - - # Optional: corpus bundle for deterministic evidence verification (G3). - corpus = None - if args.corpus: - print(f"Loading corpus : {args.corpus}") - corpus = load_corpus(args.corpus) - - # Optional: champion record for regression detection (G5). - champion = None - champion_scores = None - aa_noise = None - if args.baseline: - print(f"Loading baseline : {args.baseline}") - champion = load_champion(args.baseline) - if champion: - champion_scores = champion.scores - aa_noise = champion.aa_noise - print(f" Champion run : {champion.run_id} ({champion.model_id})") - else: - print(" No champion found — running in day-zero mode.") - - # Optional: embedder for semantic/hybrid recall (G2). - embed_fn = None - if args.embedder: - print(f"Building embedder: {args.embedder}") - embed_fn = build_embedder(args.embedder) - - print() - - # ------------------------------------------------------------------ - # Step 2 — Run deterministic gates G1-G3 + G5. - # - # run_gates() returns a list of GateResult objects, one per gate. 
- # Each GateResult carries: - # .gate -- "G1" | "G2" | "G3" | "G5" - # .passed -- bool - # .details -- dict with per-metric values - # .errors -- list[str] of blocking error codes - # ------------------------------------------------------------------ - print("Running gates G1-G3 + G5 ...") - gate_results: list[GateResult] = run_gates( - result, - registry, - args.registry, - pii_list=args.pii_list or [], - recall_floor=args.recall_floor, - grounding_floor=args.grounding_floor, - champion_scores=champion_scores, - aa_noise=aa_noise, - is_day_zero=(champion is None), - human_signed_off=args.human_signed_off, - signoff_count=args.signoffs, - embed_fn=embed_fn, - tau=args.tau, - recall_metric=args.recall_metric, - tau_nc=args.tau_nc, - corpus=corpus, - ) - - # Quick gate summary before the full scorecard. - for gr in gate_results: - status = "PASS" if gr.passed else "FAIL" - print(f" {gr.gate}: {status}") - - # ------------------------------------------------------------------ - # Step 3 — Run the advisory G4 LLM-as-a-Judge (optional). - # - # G4 is non-blocking: it never changes the verdict or exit code. - # It produces an AdvisoryReport with per-finding quality signals - # (faithfulness, citation relevance, fabricated entities, etc.). - # ------------------------------------------------------------------ - advisory = None - if args.judge_model: - print(f"\nRunning G4 judge ({args.judge_model}) ...") - missed_ids = _lexical_missed_ids(result, registry) - advisory = run_judge( - result, - registry, - judge_model=args.judge_model, - runs=args.judge_runs, - concurrency=args.judge_concurrency, - pipeline_model=args.model_id, - embed_fn=embed_fn, - tau=args.tau, - lexical_missed_ids=missed_ids, - ) - print(f" Judge completed ({args.judge_runs} run(s)).") - else: - print("\nG4 judge skipped (pass --judge-model to enable).") - - # ------------------------------------------------------------------ - # Step 4 — Render the scorecard. 
- # - # render_scorecard() produces a markdown-formatted human-readable - # report that mirrors the output of `flyeval gate` in the playground. - # ------------------------------------------------------------------ - print() - scorecard = render_scorecard( - gate_results, - corpus=registry.corpus, - model_id=config.model_id, - run_id=config.run_id, - is_self_graded=True, - kappa_advisory=registry.is_kappa_advisory(), - evidence_unverified=(corpus is None), - advisory=advisory, - ) - print(scorecard) - - # ------------------------------------------------------------------ - # Step 5 — Inspect the verdict and handle promotion. - # - # verdict() returns "PROMOTE" or "HOLD" based on the gate results. - # On PROMOTE, save the challenger as the new champion so future runs - # can detect regressions against this baseline. - # ------------------------------------------------------------------ - v = verdict(gate_results) - print(f"\nFinal verdict: {v}") - - if v == VERDICT_PROMOTE and args.baseline: - # Extract the key scores from G2 and G3 to store in the champion record. - g2 = next((g for g in gate_results if g.gate == "G2"), None) - g3 = next((g for g in gate_results if g.gate == "G3"), None) - scores: dict[str, float] = {} - if g2: - scores["recall"] = g2.details.get("recall", 0.0) - if g3: - scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0) - - new_champion = ChampionRecord( - corpus=registry.corpus, - run_id=config.run_id, - model_id=config.model_id, - registry_sha256=registry.sha256(), - scores=scores, - is_day_zero=(champion is None), - ) - save_champion( - args.baseline, - new_champion, - summary=f"Promoted by flyradar_eval_example.py — {config.run_id}", - ) - print(f"Champion saved to {args.baseline}") - - # Exit 0 = PROMOTE, 1 = HOLD (mirrors `flyeval gate` convention). 
- return 0 if v == VERDICT_PROMOTE else 1 - - -# --------------------------------------------------------------------------- -# CLI -# --------------------------------------------------------------------------- - - -def build_parser() -> argparse.ArgumentParser: - p = argparse.ArgumentParser( - prog="flyradar_eval_example", - description="FlyRadar gate evaluation — replicates the flyeval gate workflow.", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Required inputs. - p.add_argument("--result", required=True, help="Path to DiscoveryResult JSON.") - p.add_argument("--registry", required=True, help="Path to lean-1 registry JSON.") - - # Optional inputs. - p.add_argument( - "--baseline", - help="Path to baseline.json (champion store). When absent, runs in day-zero mode.", - ) - p.add_argument( - "--corpus", - help="Path to input.json corpus bundle for deterministic evidence verification (G3).", - ) - - # Run metadata. - p.add_argument("--model-id", default="unknown", help="Model identifier for the scorecard.") - p.add_argument("--run-id", default="example-run", help="Run identifier for the scorecard.") - - # Gate thresholds. - p.add_argument( - "--recall-floor", - type=float, - default=0.70, - help="Minimum recall required for G2 to pass.", - ) - p.add_argument( - "--grounding-floor", - type=float, - default=0.90, - help="Minimum grounding percentage required for G3 to pass.", - ) - p.add_argument( - "--recall-metric", - choices=["lexical", "semantic", "hybrid"], - default="lexical", - help="Recall metric used by G2. 
'semantic' and 'hybrid' require --embedder.", - ) - p.add_argument( - "--tau", - type=float, - default=0.70, - help="Cosine similarity threshold for semantic recall (real items).", - ) - p.add_argument( - "--tau-nc", - type=float, - default=0.85, - help="Cosine similarity threshold for NC item detection.", - ) - p.add_argument("--pii-list", nargs="*", default=[], help="PII tokens to check for in findings.") - p.add_argument("--human-signed-off", action="store_true", help="Mark this run as human-reviewed.") - p.add_argument("--signoffs", type=int, default=0, help="Number of human sign-offs collected.") - - # G4 judge options. - p.add_argument( - "--judge-model", - default=None, - help=( - "Provider:model string for the advisory G4 LLM judge " - "(e.g. 'anthropic:claude-sonnet-4-6'). Omit to skip G4." - ), - ) - p.add_argument( - "--judge-runs", - type=int, - default=1, - help="Number of judge calls to aggregate (odd number recommended for median).", - ) - p.add_argument( - "--judge-concurrency", - type=int, - default=1, - help="Thread fan-out for per-item G4 metrics (1 = sequential).", - ) - - # Embedder for semantic recall. - p.add_argument( - "--embedder", - default=None, - help="Embedder spec for semantic/hybrid recall (e.g. 
'ollama:bge-m3').", - ) - - return p - - -def main() -> None: - parser = build_parser() - args = parser.parse_args() - sys.exit(run_evaluation(args)) - - -if __name__ == "__main__": - main() From 61617186f1ed103c783197784497dd841a260b43 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 09:24:27 +0200 Subject: [PATCH 13/67] ci: add --extra evaluation to typecheck and test sync steps --- .github/workflows/pr-gate.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-gate.yml b/.github/workflows/pr-gate.yml index c0ef76d4..86e35717 100644 --- a/.github/workflows/pr-gate.yml +++ b/.github/workflows/pr-gate.yml @@ -57,7 +57,7 @@ jobs: - uses: actions/setup-python@v6 with: python-version: '3.13' - - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra openai-embeddings + - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra openai-embeddings --extra evaluation - run: uv run pyright test: @@ -72,7 +72,7 @@ jobs: - uses: actions/setup-python@v6 with: python-version: '3.13' - - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra vectorstores-pgvector --extra openai-embeddings + - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra vectorstores-pgvector --extra openai-embeddings --extra evaluation - run: uv run pytest -m "not nightly" --cov --cov-report=term-missing build: From 203134ca971377816c462b7d4c5125d9ebc9d4e0 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 09:24:32 +0200 Subject: [PATCH 14/67] fix(evaluation): resolve all ruff lint errors (import sort, SIM108, B905, N806, UP035) --- examples/flycanon_eval_example.py | 13 +-- .../evaluation/__init__.py | 21 ++++- fireflyframework_agentic/evaluation/cli.py | 42 +++++----- fireflyframework_agentic/evaluation/corpus.py | 20 ++--- fireflyframework_agentic/evaluation/gates.py | 42 +++------- fireflyframework_agentic/evaluation/judge.py 
| 79 ++++++++----------- .../evaluation/judge_client.py | 25 ++---- .../evaluation/matcher.py | 60 +++++++------- .../evaluation/registry.py | 40 +++++----- .../evaluation/run_config_snapshot.py | 9 +-- .../evaluation/scorecard.py | 44 +++-------- fireflyframework_agentic/evaluation/stats.py | 9 +-- uv.lock | 59 +++++++++++++- 13 files changed, 220 insertions(+), 243 deletions(-) diff --git a/examples/flycanon_eval_example.py b/examples/flycanon_eval_example.py index 9d8d071b..856b520b 100644 --- a/examples/flycanon_eval_example.py +++ b/examples/flycanon_eval_example.py @@ -94,8 +94,7 @@ import sys from pathlib import Path -from fireflyframework_agentic.evaluation import RetrieverMetrics, compute_retrieval_metrics - +from fireflyframework_agentic.evaluation import RetrieverMetrics # --------------------------------------------------------------------------- # Helpers @@ -171,10 +170,7 @@ def _print_metrics_table(metrics: RetrieverMetrics, baseline: dict | None) -> No if value is None: continue # Format floats as 4 decimal places; ints as plain integers. - if isinstance(value, float): - cur_str = f"{value:.4f}" - else: - cur_str = str(value) + cur_str = f"{value:.4f}" if isinstance(value, float) else str(value) row = f"{key:<{col_w}} {cur_str:>{num_w}}" if baseline and key in baseline and isinstance(value, float): @@ -353,10 +349,7 @@ def build_parser() -> argparse.ArgumentParser: p.add_argument( "--baseline", default=None, - help=( - "Path to baseline.json (champion store). When absent, scores are printed " - "without comparison." - ), + help=("Path to baseline.json (champion store). 
When absent, scores are printed without comparison."), ) p.add_argument( "--promote-if-better", diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index ad01980c..d986d09f 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -31,16 +31,29 @@ from importlib.metadata import PackageNotFoundError, version -from fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index +from fireflyframework_agentic.evaluation.champion import ( + ChampionRecord, + invalidate_champion, + load_champion, + save_champion, +) +from fireflyframework_agentic.evaluation.corpus import ( + EMPTY, + FABRICATED, + SOURCE_UNKNOWN, + VERIFIED, + corpus_sha256, + load_corpus, + verify_evidence_index, +) from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, run_gates -from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict, VERDICT_PROMOTE, VERDICT_HOLD -from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256 -from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics +from fireflyframework_agentic.evaluation.scorecard import VERDICT_HOLD, VERDICT_PROMOTE, render_scorecard, verdict from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag +from 
fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics try: __version__ = version("fireflyframework-agentic") diff --git a/fireflyframework_agentic/evaluation/cli.py b/fireflyframework_agentic/evaluation/cli.py index 7ac868d9..80dc418a 100644 --- a/fireflyframework_agentic/evaluation/cli.py +++ b/fireflyframework_agentic/evaluation/cli.py @@ -48,7 +48,8 @@ from fireflyframework_agentic.evaluation.judge_client import build_embedder from fireflyframework_agentic.evaluation.matcher import matches from fireflyframework_agentic.evaluation.registry import load_registry -from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict as get_verdict +from fireflyframework_agentic.evaluation.scorecard import render_scorecard +from fireflyframework_agentic.evaluation.scorecard import verdict as get_verdict from fireflyframework_agentic.evaluation.stats import aa_band, left_skew_flag @@ -114,10 +115,8 @@ def _eval_config(args, registry, corpus=None) -> dict: "champion (EMPTY_MUST_FIND)", "registry_sha256_pin": "loaded registry matches its file hash (GOLD_DRIFT)", "corpus_sha256_pin": "corpus matches its hash when supplied (CORPUS_DRIFT)", - "schema_valid": "required top-level keys present in the result " - "(SCHEMA_INVALID)", - "pii_non_disclosure": "no corpus PII name appears in any finding/report text " - "(PII_LEAK)", + "schema_valid": "required top-level keys present in the result (SCHEMA_INVALID)", + "pii_non_disclosure": "no corpus PII name appears in any finding/report text (PII_LEAK)", }, }, "G2": { @@ -142,14 +141,10 @@ def _eval_config(args, registry, corpus=None) -> dict: "human_spot_check_n": 5, "corpus_verification": corpus is not None, "metrics": { - "grounding_pct": "findings whose cited excerpt shares a topic token; blocks " - "below grounding_floor", - "evidence_verified": "cited excerpts located in the actual corpus " - "(when supplied)", - "evidence_fabricated": "populated excerpts not found in 
their cited source " - "(EVIDENCE_FABRICATED)", - "evidence_source_unknown": "locators resolving to no corpus document " - "(EVIDENCE_SOURCE_UNKNOWN)", + "grounding_pct": "findings whose cited excerpt shares a topic token; blocks below grounding_floor", + "evidence_verified": "cited excerpts located in the actual corpus (when supplied)", + "evidence_fabricated": "populated excerpts not found in their cited source (EVIDENCE_FABRICATED)", + "evidence_source_unknown": "locators resolving to no corpus document (EVIDENCE_SOURCE_UNKNOWN)", "excerpt_fill_rate": "evidence entries carrying a populated excerpt", "source_coverage": "distinct corpus documents cited", }, @@ -173,8 +168,7 @@ def _eval_config(args, registry, corpus=None) -> dict: "severity_calibration": "stated severity matches the evidence", "answer_relevancy": "output addresses the workspace intention", "source_coverage": "distinct corpus documents cited (deterministic)", - "excerpt_fill_rate": "evidence entries with a populated excerpt " - "(deterministic)", + "excerpt_fill_rate": "evidence entries with a populated excerpt (deterministic)", }, }, "G5": { @@ -305,9 +299,12 @@ def cmd_aa_band(args: argparse.Namespace) -> int: for rp in args.results: result = _load_json(rp) g2 = g2_recall_precision( - result, registry, - recall_metric=args.recall_metric, embed_fn=embed_fn, - tau=args.tau, tau_nc=args.tau_nc, + result, + registry, + recall_metric=args.recall_metric, + embed_fn=embed_fn, + tau=args.tau, + tau_nc=args.tau_nc, corpus=corpus, ) if g2.passed or g2.details.get("recall") is not None: @@ -468,15 +465,13 @@ def _add_common(p: argparse.ArgumentParser) -> None: "--tau", type=float, default=float(os.environ.get("FLYEVAL_TAU", "0.70")), - help="cosine similarity threshold for the semantic recall path (real items). " - "Env: FLYEVAL_TAU", + help="cosine similarity threshold for the semantic recall path (real items). 
Env: FLYEVAL_TAU", ) p_gate.add_argument( "--tau-nc", type=float, default=float(os.environ.get("FLYEVAL_TAU_NC", "0.85")), - help="cosine similarity threshold for NC item detection (higher; no source anchor). " - "Env: FLYEVAL_TAU_NC", + help="cosine similarity threshold for NC item detection (higher; no source anchor). Env: FLYEVAL_TAU_NC", ) p_gate.add_argument("--human-signed-off", action="store_true") p_gate.add_argument("--signoffs", type=int, default=0) @@ -495,8 +490,7 @@ def _add_common(p: argparse.ArgumentParser) -> None: "--judge-runs", type=int, default=int(os.environ.get("FLYEVAL_JUDGE_RUNS", "1")), - help="G4 judge runs; the median of numeric scores is kept (odd recommended). " - "Env: FLYEVAL_JUDGE_RUNS", + help="G4 judge runs; the median of numeric scores is kept (odd recommended). Env: FLYEVAL_JUDGE_RUNS", ) p_gate.add_argument( "--judge-concurrency", diff --git a/fireflyframework_agentic/evaluation/corpus.py b/fireflyframework_agentic/evaluation/corpus.py index 32835f2c..34926b41 100644 --- a/fireflyframework_agentic/evaluation/corpus.py +++ b/fireflyframework_agentic/evaluation/corpus.py @@ -80,7 +80,7 @@ def normalize(text: str) -> str: smart quotes, collapse whitespace, casefold.""" text = unicodedata.normalize("NFKC", text) text = text.replace("**", "").replace("*", "") - text = re.sub(r"[\"""''']", "", text) + text = re.sub(r"[\"" "''']", "", text) return re.sub(r"\s+", " ", text).strip().casefold() @@ -129,9 +129,7 @@ def load_corpus(path: str | Path) -> Corpus: def _fragment_coverage(fragment: str, source: str) -> float: """Fraction of fragment covered by matching blocks of >= _MIN_BLOCK_CHARS chars.""" - blocks = difflib.SequenceMatcher( - None, fragment, source, autojunk=False - ).get_matching_blocks() + blocks = difflib.SequenceMatcher(None, fragment, source, autojunk=False).get_matching_blocks() covered = sum(b.size for b in blocks if b.size >= _MIN_BLOCK_CHARS) return covered / len(fragment) @@ -158,11 +156,9 @@ def verify_entry(corpus: 
Corpus, entry: dict) -> str: if not excerpt: return EMPTY - fragments = [ - f.strip() - for f in _SPLICE_PATTERN.split(excerpt) - if len(f.strip()) >= _MIN_FRAGMENT_CHARS - ] or [excerpt] + fragments = [f.strip() for f in _SPLICE_PATTERN.split(excerpt) if len(f.strip()) >= _MIN_FRAGMENT_CHARS] or [ + excerpt + ] for fragment in fragments: if fragment in source: @@ -178,8 +174,4 @@ def verify_evidence_index(corpus: Corpus, result: dict) -> dict[str, str]: Returns {evidence_id: status} over all entries — referenced or not — so the gates share one verification pass. """ - return { - ev["id"]: verify_entry(corpus, ev) - for ev in result.get("evidence_index", []) - if ev.get("id") - } + return {ev["id"]: verify_entry(corpus, ev) for ev in result.get("evidence_index", []) if ev.get("id")} diff --git a/fireflyframework_agentic/evaluation/gates.py b/fireflyframework_agentic/evaluation/gates.py index 057bfea7..fc98d311 100644 --- a/fireflyframework_agentic/evaluation/gates.py +++ b/fireflyframework_agentic/evaluation/gates.py @@ -93,11 +93,7 @@ def _build_evidence_index(result: dict, corpus: Corpus | None = None) -> dict[st if corpus is None: return index statuses = verify_evidence_index(corpus, result) - return { - eid: ev - for eid, ev in index.items() - if statuses[eid] in (VERIFIED, EMPTY) - } + return {eid: ev for eid, ev in index.items() if statuses[eid] in (VERIFIED, EMPTY)} # ── G1: Structural & Safe ──────────────────────────────────────────────────── @@ -322,8 +318,10 @@ def _finding_redundancy_rate(findings: list[dict]) -> float: """Fraction of findings that are near-duplicates of another (Jaccard ≥0.6 on ≥5-char tokens).""" if len(findings) < 2: return 0.0 + def _tok(text: str) -> frozenset[str]: return frozenset(t.lower() for t in text.split() if len(t) >= 5) + token_sets = [_tok(f.get("description", "")) for f in findings] in_redundant: set[int] = set() for i in range(len(token_sets)): @@ -381,9 +379,7 @@ def g2_recall_precision( if item.tier == "NC": 
lexical[item.id] = False elif item.scope == "dependency_graph" and item.from_node: - lexical[item.id] = matcher.matches_dependency_graph_relation( - item, result, evidence_index - ) + lexical[item.id] = matcher.matches_dependency_graph_relation(item, result, evidence_index) else: lexical[item.id] = any( matches(c, item, evidence_index, scope=scope) @@ -394,14 +390,10 @@ def g2_recall_precision( if recall_metric not in ("lexical", "semantic", "hybrid"): raise ValueError(f"unknown recall_metric {recall_metric!r}") if recall_metric in ("semantic", "hybrid") and embed_fn is None: - raise ValueError( - f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn" - ) + raise ValueError(f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn") if embed_fn is not None: - semantic = matcher.semantic_hits( - candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc - ) + semantic = matcher.semantic_hits(candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc) # dependency_graph relation items have no embedding candidates (§5.3b uses # the endpoint matcher, not per-candidate text embeddings); mirror the # lexical result so semantic/hybrid never under-credits them. 
@@ -424,8 +416,7 @@ def g2_recall_precision( finding_count = len(findings) finding_scoped_items = [i for i in registry.real_items if i.scope == "finding"] findings_matched = sum( - 1 for f in findings - if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items) + 1 for f in findings if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items) ) _sn = { "finding_count": finding_count, @@ -493,9 +484,7 @@ def _semantic_details() -> dict: "lexical_recall": round(_weighted_recall(scored_items, lexical), 4), "semantic_recall": round(_weighted_recall(scored_items, semantic), 4), "hybrid_recall": round( - _weighted_recall( - scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical} - ), + _weighted_recall(scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical}), 4, ), "tau": tau, @@ -577,8 +566,8 @@ def g3_grounded( grounded_ids: list[str] = [] # Ungrounded split (§6.3): distinguish format issues from real faithfulness failures. - ungrounded_empty_only: list[str] = [] # every ref had an empty excerpt - ungrounded_populated: list[str] = [] # had populated excerpt(s) but none anchored + ungrounded_empty_only: list[str] = [] # every ref had an empty excerpt + ungrounded_populated: list[str] = [] # had populated excerpt(s) but none anchored # Excerpt fill: count all resolved refs and how many carry a non-empty excerpt. total_refs = 0 @@ -657,18 +646,14 @@ def g3_grounded( "Populated excerpt(s) not found in the cited corpus document — " "the run asserts evidence the source does not contain." ) - return GateResult( - gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details - ) + return GateResult(gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details) if unknown_source_ids: details["message"] = ( "Evidence locator(s) resolve to no corpus document — either the " "corpus bundle is incomplete or the run invented a source." 
) - return GateResult( - gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details - ) + return GateResult(gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details) if grounding_pct < grounding_floor: details["floor"] = grounding_floor @@ -746,8 +731,7 @@ def g5_no_regression( band = noise.get(metric, 0.0) if delta < -band: regressions.append( - f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} " - f"delta={delta:+.4f} < -band={-band:.4f}" + f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} delta={delta:+.4f} < -band={-band:.4f}" ) elif delta > band: improvements.append(f"{metric}: delta={delta:+.4f} > band={band:.4f}") diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index a347c8e1..80a90b04 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -142,10 +142,7 @@ def _map_chat(chat_fn, prompts, workers=1): results: list[dict] = [{} for _ in prompts] with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: - futures = { - executor.submit(chat_fn, system, user): idx - for idx, (system, user) in enumerate(prompts) - } + futures = {executor.submit(chat_fn, system, user): idx for idx, (system, user) in enumerate(prompts)} for future in concurrent.futures.as_completed(futures): idx = futures[future] try: @@ -165,11 +162,7 @@ def source_coverage(result: dict) -> dict: source stems present in evidence_index but cited by no finding. 
""" evidence_index = _evidence_index(result) - all_stems = { - source_stem(ev.get("locator", "")) - for ev in result.get("evidence_index", []) - if ev.get("locator") - } + all_stems = {source_stem(ev.get("locator", "")) for ev in result.get("evidence_index", []) if ev.get("locator")} cited_stems: set[str] = set() for f in result.get("findings", []): for ref in f.get("evidence_refs", []): @@ -245,7 +238,7 @@ def semantic_recovery( cand_vecs = np.asarray(embed_fn(candidate_texts), dtype=np.float64) recovered: list[dict] = [] - for item, ivec in zip(missed_items, item_vecs): + for item, ivec in zip(missed_items, item_vecs, strict=False): best = max((cosine(ivec, cvec) for cvec in cand_vecs), default=0.0) if best >= tau: recovered.append({"id": item.id, "cosine": round(best, 4)}) @@ -307,11 +300,7 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic source}], count}. """ evidence_index = _evidence_index(result) - scored = [ - (f, excerpts) - for f in result.get("findings", []) - if (excerpts := _cited_excerpts(f, evidence_index)) - ] + scored = [(f, excerpts) for f in result.get("findings", []) if (excerpts := _cited_excerpts(f, evidence_index))] prompts = [ ( SYSTEM, @@ -326,7 +315,7 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic ] answers = _map_chat(chat_fn, prompts, workers) mismatches: list[dict] = [] - for (f, _excerpts), answer in zip(scored, answers): + for (f, _excerpts), answer in zip(scored, answers, strict=False): for m in answer.get("mismatches", []) or []: mismatches.append( { @@ -395,7 +384,7 @@ def nc_semantic_precision(result: dict, registry, chat_fn, *, workers: int = 1) ] answers = _map_chat(chat_fn, prompts, workers) asserted_ids = [ - item.id for item, a in zip(nc_items, answers) if str(a.get("asserted", "")).lower() == "yes" + item.id for item, a in zip(nc_items, answers, strict=False) if str(a.get("asserted", "")).lower() == "yes" ] return {"asserted": len(asserted_ids), "total": 
len(nc_items), "asserted_ids": asserted_ids} @@ -407,10 +396,7 @@ def fabricated_entity(result: dict, chat_fn) -> dict: excerpts + locators. """ output_text = _output_text(result) - corpus = "\n".join( - f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" - for ev in result.get("evidence_index", []) - ) + corpus = "\n".join(f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" for ev in result.get("evidence_index", [])) user = ( "List any system, organization, or metric NAMED in the OUTPUT that does NOT " "appear anywhere in the CORPUS EVIDENCE.\n" @@ -433,8 +419,7 @@ def contradiction(result: dict, chat_fn) -> dict: lines.append(f"{f.get('id', '?')}: {f.get('title', '')} — {f.get('description', '')}") user = ( "Are any two of these FINDINGS mutually contradictory? List each contradicting pair.\n" - 'Reply with ONLY {"pairs": [["", ""], ...]}. Empty list if none.\n\n' - + "\n".join(lines) + 'Reply with ONLY {"pairs": [["", ""], ...]}. Empty list if none.\n\n' + "\n".join(lines) ) pairs = chat_fn(SYSTEM, user).get("pairs", []) or [] return {"count": len(pairs), "pairs": [list(p) for p in pairs]} @@ -514,7 +499,7 @@ def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict: answers = _map_chat(chat_fn, prompts, workers) verdicts: dict[str, str] = {} miscalibrated = 0 - for f, a in zip(findings, answers): + for f, a in zip(findings, answers, strict=False): verdict = str(a.get("calibration", "calibrated")).lower() verdicts[f.get("id", "?")] = verdict if verdict in ("under", "over"): @@ -557,7 +542,7 @@ def surface_deduplication(result: dict, chat_fn, *, workers: int = 1) -> dict: def _toks(node: dict) -> frozenset[str]: return frozenset(node.get("name", "").lower().split()) - PER_SURFACE_CAP = 10 + per_surface_cap = 10 # candidates: (surface, node_a, node_b, parent_process_name) candidates: list[tuple[str, dict, dict, str]] = [] @@ -574,7 +559,7 @@ def _toks(node: dict) -> frozenset[str]: if jac >= 0.30: pairs.append((jac, procs[i], procs[j])) 
pairs.sort(key=lambda x: x[0], reverse=True) - for _jac, a, b in pairs[:PER_SURFACE_CAP]: + for _jac, a, b in pairs[:per_surface_cap]: candidates.append(("process", a, b, "")) # Activities and decisions: within the same parent process only @@ -595,7 +580,7 @@ def _toks(node: dict) -> frozenset[str]: if jac >= 0.30: all_pairs.append((jac, nodes[i], nodes[j], proc_name)) all_pairs.sort(key=lambda x: x[0], reverse=True) - for _jac, a, b, proc_name in all_pairs[:PER_SURFACE_CAP]: + for _jac, a, b, proc_name in all_pairs[:per_surface_cap]: candidates.append((surface_key, a, b, proc_name)) if not candidates: @@ -604,33 +589,37 @@ def _toks(node: dict) -> frozenset[str]: prompts = [] for surface, a, b, parent_proc in candidates: ctx = f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else "" - prompts.append(( - SYSTEM, - f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a " - f"duplicate / sub-case / restatement of the other?\n" - f"{ctx}" - 'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n' - f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n" - f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}", - )) + prompts.append( + ( + SYSTEM, + f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a " + f"duplicate / sub-case / restatement of the other?\n" + f"{ctx}" + 'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n' + f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n" + f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}", + ) + ) answers = _map_chat(chat_fn, prompts, workers) distinct = 0 redundant = 0 redundant_pairs: list[dict] = [] - for (surface, a, b, _parent), answer in zip(candidates, answers): + for (surface, a, b, _parent), answer in zip(candidates, answers, strict=False): verdict = str(answer.get("verdict", "")).upper() if verdict == "DISTINCT": distinct += 1 else: redundant 
+= 1 - redundant_pairs.append({ - "surface": surface, - "a": a.get("name", ""), - "b": b.get("name", ""), - "reason": str(answer.get("reason", "")), - }) + redundant_pairs.append( + { + "surface": surface, + "a": a.get("name", ""), + "b": b.get("name", ""), + "reason": str(answer.get("reason", "")), + } + ) total = distinct + redundant return { @@ -800,9 +789,7 @@ def _run_judge_metric(name: str, fn) -> None: "numeric_temporal_fidelity", lambda: numeric_temporal_fidelity(result, chat_fn, workers=concurrency), ) - _run_judge_metric( - "citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency) - ) + _run_judge_metric("citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency)) _run_judge_metric( "nc_semantic_precision", lambda: nc_semantic_precision(result, registry, chat_fn, workers=concurrency), diff --git a/fireflyframework_agentic/evaluation/judge_client.py b/fireflyframework_agentic/evaluation/judge_client.py index 1af17f53..e4b58dea 100644 --- a/fireflyframework_agentic/evaluation/judge_client.py +++ b/fireflyframework_agentic/evaluation/judge_client.py @@ -245,8 +245,7 @@ def _dispatch(self, system: str, user: str, max_tokens: int) -> str: if self.provider == "ollama": return self._ollama(system, user, max_tokens) raise ValueError( - f"unknown judge provider {self.provider!r} in {self.model_spec!r}; " - "use anthropic:/openai:/azure:/ollama:" + f"unknown judge provider {self.provider!r} in {self.model_spec!r}; use anthropic:/openai:/azure:/ollama:" ) def _anthropic(self, system: str, user: str, max_tokens: int) -> str: @@ -262,9 +261,7 @@ def _anthropic(self, system: str, user: str, max_tokens: int) -> str: } headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01"} resp = _http_post_json("https://api.anthropic.com/v1/messages", headers, body, self.timeout) - text = next( - (b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None - ) + text = next((b.get("text") for b in 
resp.get("content", []) if b.get("type") == "text"), None) if not text: raise RuntimeError(f"judge returned no text: {resp}") return text @@ -283,9 +280,7 @@ def _openai(self, system: str, user: str, max_tokens: int) -> str: ], } headers = {"Authorization": f"Bearer {api_key}"} - resp = _http_post_json( - "https://api.openai.com/v1/chat/completions", headers, body, self.timeout - ) + resp = _http_post_json("https://api.openai.com/v1/chat/completions", headers, body, self.timeout) return _extract_openai_text(resp) def _azure(self, system: str, user: str, max_tokens: int) -> str: @@ -297,10 +292,7 @@ def _azure(self, system: str, user: str, max_tokens: int) -> str: raise RuntimeError("AZURE_OPENAI_API_KEY not set") api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" # Azure deployment lives in the URL path, not the JSON body. - url = ( - f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions" - f"?api-version={api_version}" - ) + url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions?api-version={api_version}" body = { "max_tokens": max_tokens, "temperature": 0.0, @@ -373,10 +365,7 @@ def embed(self, texts: list[str]) -> np.ndarray: if not api_key: raise RuntimeError("AZURE_OPENAI_API_KEY not set") api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" - url = ( - f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings" - f"?api-version={api_version}" - ) + url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings?api-version={api_version}" headers = {"api-key": api_key} vectors = self._embed_with_split(texts, url, headers) return np.asarray(vectors, dtype=np.float32) @@ -438,9 +427,7 @@ def build_embedder(spec: str): return OpenAIEmbedder(model or "text-embedding-3-small").embed if provider == "azure": return AzureOpenAIEmbedder(model or "text-embedding-3-small").embed - raise NotImplementedError( - f"embedder backend {provider!r} not implemented yet; add it in 
build_embedder()" - ) + raise NotImplementedError(f"embedder backend {provider!r} not implemented yet; add it in build_embedder()") def cosine(a, b) -> float: diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py index b4d81f44..ccf61c96 100644 --- a/fireflyframework_agentic/evaluation/matcher.py +++ b/fireflyframework_agentic/evaluation/matcher.py @@ -113,9 +113,7 @@ def _keyword_anchored(desc: str, keywords: list[str]) -> bool: if not keywords: return False desc_lower = desc.lower() - return any( - re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords - ) + return any(re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords) def candidate_text(candidate: dict, scope: str) -> str: @@ -141,18 +139,28 @@ def candidate_text(candidate: dict, scope: str) -> str: pain = candidate.get("pain_points") or [] goals_str = " ".join(goals) if isinstance(goals, list) else str(goals) pain_str = " ".join(pain) if isinstance(pain, list) else str(pain) - return " ".join(filter(None, [ - candidate.get("name", ""), - candidate.get("role", ""), - goals_str, - pain_str, - ])) + return " ".join( + filter( + None, + [ + candidate.get("name", ""), + candidate.get("role", ""), + goals_str, + pain_str, + ], + ) + ) if scope == "informal_channel": - return " ".join(filter(None, [ - candidate.get("name", ""), - candidate.get("usage_context", ""), - candidate.get("notes", ""), - ])) + return " ".join( + filter( + None, + [ + candidate.get("name", ""), + candidate.get("usage_context", ""), + candidate.get("notes", ""), + ], + ) + ) # process, decision, system, dependency_graph (diagnostic nodes) return " ".join(filter(None, [candidate.get("name", ""), candidate.get("description", "")])) @@ -246,9 +254,7 @@ def matches_dependency_graph_relation( def _anchor(endpoint_text: str) -> set[str]: return { - a["id"] - for a in all_activities - if a.get("id") and anchored(candidate_text(a, "activity"), 
endpoint_text) + a["id"] for a in all_activities if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text) } from_ids = _anchor(item.from_node) @@ -268,9 +274,8 @@ def _node_stems(node: dict) -> set[str]: dg = result.get("dependency_graph", {}) for edge in dg.get("activity_edges", []): - if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids: - if _node_stems(edge) & item_stems: - return True + if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids and _node_stems(edge) & item_stems: + return True for path in dg.get("critical_paths", []): if not (_node_stems(path) & item_stems): @@ -325,19 +330,13 @@ def semantic_hits( # Flatten all candidates across scopes, preserving their scope tag for # text extraction and per-item filtering. - scoped: list[tuple[str, dict]] = [ - (scope, cand) - for scope, cands in candidates.items() - for cand in cands - ] + scoped: list[tuple[str, dict]] = [(scope, cand) for scope, cands in candidates.items() for cand in cands] if not scoped: return {item.id: False for item in items} cand_texts = [candidate_text(cand, scope) for scope, cand in scoped] - item_texts = [ - " ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items - ] + item_texts = [" ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items] cand_vecs = np.asarray(embed_fn(cand_texts)) item_vecs = np.asarray(embed_fn(item_texts)) @@ -359,10 +358,7 @@ def semantic_hits( if cosine(cand_vecs[k], item_vec) >= tau_nc: hit = True break - elif ( - shares_source(cand, item, evidence_index) - and cosine(cand_vecs[k], item_vec) >= tau - ): + elif shares_source(cand, item, evidence_index) and cosine(cand_vecs[k], item_vec) >= tau: hit = True break hits[item.id] = hit diff --git a/fireflyframework_agentic/evaluation/registry.py b/fireflyframework_agentic/evaluation/registry.py index 2b869ba9..87c4beb1 100644 --- a/fireflyframework_agentic/evaluation/registry.py +++ 
b/fireflyframework_agentic/evaluation/registry.py @@ -24,6 +24,7 @@ - kappa present (0.0 placeholder allowed; G2 advisory until >= 0.70) - ABANCA DILO items must target a single measured sub-population """ + from __future__ import annotations import hashlib @@ -35,8 +36,15 @@ VALID_TIERS = ("L0", "L1", "L2", "L3", "NC") VALID_SCOPES = ( - "process", "activity", "decision", "finding", "action", - "persona", "system", "informal_channel", "dependency_graph", + "process", + "activity", + "decision", + "finding", + "action", + "persona", + "system", + "informal_channel", + "dependency_graph", ) SCHEMA_VERSION = "lean-1" KAPPA_ADVISORY_THRESHOLD = 0.70 @@ -47,13 +55,13 @@ class RegistryItem: id: str tier: Literal["L0", "L1", "L2", "L3", "NC"] description: str - evidence: list[str] # source file paths (path portion of locator, no #page=N) - scope: str = "finding" # which DiscoveryResult surface to match against (§4.3) + evidence: list[str] # source file paths (path portion of locator, no #page=N) + scope: str = "finding" # which DiscoveryResult surface to match against (§4.3) keywords: list[str] = field(default_factory=list) weight: float = 1.0 - from_node: str = "" # dependency_graph relation items only - to_node: str = "" # dependency_graph relation items only - relation: str = "" # defaults to "precedes" when from/to present + from_node: str = "" # dependency_graph relation items only + to_node: str = "" # dependency_graph relation items only + relation: str = "" # defaults to "precedes" when from/to present @dataclass(frozen=True) @@ -87,10 +95,7 @@ def sha256(self) -> str: def _validate(raw: dict, path: Path) -> None: if raw.get("schema_version") != SCHEMA_VERSION: - raise ValueError( - f"{path.name}: schema_version must be '{SCHEMA_VERSION}', " - f"got {raw.get('schema_version')!r}" - ) + raise ValueError(f"{path.name}: schema_version must be '{SCHEMA_VERSION}', got {raw.get('schema_version')!r}") for fname in ("corpus", "author", "date"): if not raw.get(fname): 
raise ValueError(f"{path.name}: missing required field '{fname}'") @@ -116,20 +121,17 @@ def _validate(raw: dict, path: Path) -> None: tier = it.get("tier") if tier not in VALID_TIERS: raise ValueError( - f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; " - f"must be one of {VALID_TIERS}" + f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; must be one of {VALID_TIERS}" ) scope = it.get("scope", "finding") if scope not in VALID_SCOPES: raise ValueError( - f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; " - f"must be one of {VALID_SCOPES}" + f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; must be one of {VALID_SCOPES}" ) if scope == "dependency_graph": if not it.get("from") or not it.get("to"): raise ValueError( - f"{path.name}: dependency_graph item '{it.get('id')}' must have " - "non-empty 'from' and 'to'" + f"{path.name}: dependency_graph item '{it.get('id')}' must have non-empty 'from' and 'to'" ) else: if "from" in it or "to" in it or "relation" in it: @@ -153,13 +155,13 @@ def _validate(raw: dict, path: Path) -> None: # ABANCA DILO blend guard: items must assert a single sub-population target. # Checks for phrases that would indicate a blended numeric target is asserted. # "blend" alone is too broad (items may reference it negatively). 
- BLEND_PHRASES = ("combined distribution", "across all offices regardless of segment") + blend_phrases = ("combined distribution", "across all offices regardless of segment") for it in items: if it.get("tier") == "NC": continue desc = it.get("description", "").lower() iid = it.get("id", "") - if any(phrase in desc for phrase in BLEND_PHRASES): + if any(phrase in desc for phrase in blend_phrases): raise ValueError( f"{path.name}: item '{iid}' description targets a blended distribution; " "ABANCA DILO items must target a single measured sub-population " diff --git a/fireflyframework_agentic/evaluation/run_config_snapshot.py b/fireflyframework_agentic/evaluation/run_config_snapshot.py index db543129..c029e8e6 100644 --- a/fireflyframework_agentic/evaluation/run_config_snapshot.py +++ b/fireflyframework_agentic/evaluation/run_config_snapshot.py @@ -32,6 +32,7 @@ --options request_options.json \ --commit c107918 """ + from __future__ import annotations import argparse @@ -133,12 +134,8 @@ def write_snapshot(output_dir: str | Path, config: dict) -> Path: def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser(description="Capture the flyradar run configuration.") parser.add_argument("--output-dir", required=True, help="Run directory holding output.json.") - parser.add_argument( - "--options", required=True, help="JSON file of the DiscoveryRequest options that were sent." - ) - parser.add_argument( - "--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL)." 
- ) + parser.add_argument("--options", required=True, help="JSON file of the DiscoveryRequest options that were sent.") + parser.add_argument("--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL).") parser.add_argument("--commit", default=None, help="Deployed flyradar git commit, if known.") args = parser.parse_args(argv) diff --git a/fireflyframework_agentic/evaluation/scorecard.py b/fireflyframework_agentic/evaluation/scorecard.py index b34885e8..da3c4a87 100644 --- a/fireflyframework_agentic/evaluation/scorecard.py +++ b/fireflyframework_agentic/evaluation/scorecard.py @@ -188,13 +188,9 @@ def _render_advisory(report) -> list[str]: d = m["faithfulness"] u = d.get("unsupported_ids", []) extra = f" (unsupported: {', '.join(u)})" if u else "" - lines.append( - f"Faithfulness (entailment): {d.get('supported')}/{d.get('total')} supported{extra}" - ) + lines.append(f"Faithfulness (entailment): {d.get('supported')}/{d.get('total')} supported{extra}") if "numeric_temporal_fidelity" in m: - lines.append( - f"Numeric/temporal fidelity: {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)" - ) + lines.append(f"Numeric/temporal fidelity: {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)") if "citation_relevance" in m: d = m["citation_relevance"] lines.append( @@ -218,14 +214,10 @@ def _render_advisory(report) -> list[str]: lines.append(f"Contradiction detection: {m['contradiction'].get('count', 0)}") if "actionability" in m: d = m["actionability"] - lines.append( - f"Actionability: {_num(d.get('score'))} (rated {d.get('rated', 0)})" - ) + lines.append(f"Actionability: {_num(d.get('score'))} (rated {d.get('rated', 0)})") if "severity_calibration" in m: d = m["severity_calibration"] - lines.append( - f"Severity calibration: {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated" - ) + lines.append(f"Severity calibration: {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated") if "answer_relevancy" in m: 
lines.append(f"Answer relevancy: {_num(m['answer_relevancy'].get('score'))}") if "comparative_vs_champion" in m: @@ -236,14 +228,10 @@ def _render_advisory(report) -> list[str]: d = m["source_coverage"] o = d.get("orphaned", []) extra = f" (orphaned: {', '.join(o)})" if o else "" - lines.append( - f"Source coverage [D]: {d.get('cited')}/{d.get('total')} documents cited{extra}" - ) + lines.append(f"Source coverage [D]: {d.get('cited')}/{d.get('total')} documents cited{extra}") if "excerpt_fill_rate" in m: d = m["excerpt_fill_rate"] - lines.append( - f"Evidence-excerpt fill [D]: {d.get('populated')}/{d.get('total')} populated" - ) + lines.append(f"Evidence-excerpt fill [D]: {d.get('populated')}/{d.get('total')} populated") if "open_gap" in m: gap = (m["open_gap"].get("gap") or "").strip() if gap: @@ -259,9 +247,7 @@ def _render_advisory(report) -> list[str]: json.dumps({"metrics": report.metrics, "details": report.details}, indent=2, default=str), "```", ] - lines.append( - "> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10)." - ) + lines.append("> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10).") lines.append("") return lines @@ -284,9 +270,7 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]: matched = d.get("findings_matched_to_registry", {}).get("fraction", 0.0) tier_summary = ", ".join( - f"{t} {v['hit']}/{v['total']}" - for t, v in tiers.items() - if "hit" in v and "total" in v + f"{t} {v['hit']}/{v['total']}" for t, v in tiers.items() if "hit" in v and "total" in v ) lines.append( f"Lexical recall is **{recall:.3f}** ({tier_summary}). " @@ -300,9 +284,7 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]: "The run is covering the same ground multiple times rather than broadening coverage." ) else: - lines.append( - f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic." 
- ) + lines.append(f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic.") lines.append( "_G2 is a topic-level test. A recall of 1.000 means every required topic was " "mentioned somewhere — it does not verify that the specific claims about those " @@ -453,14 +435,10 @@ def _render_analysis(gate_results: list, advisory=None) -> list[str]: flag_names = [g.gate for g in flags] if not flags: - lines.append( - "All deterministic gates pass. The run is ready for G5 human sign-off." - ) + lines.append("All deterministic gates pass. The run is ready for G5 human sign-off.") else: flag_str = ", ".join(flag_names) - lines.append( - f"The run is at **HOLD** due to flags on: {flag_str}. " - ) + lines.append(f"The run is at **HOLD** due to flags on: {flag_str}. ") for g in flags: if g.gate == "G3" and g.reason_code == "EVIDENCE_SOURCE_UNKNOWN": lines.append( diff --git a/fireflyframework_agentic/evaluation/stats.py b/fireflyframework_agentic/evaluation/stats.py index e70c629a..c622588c 100644 --- a/fireflyframework_agentic/evaluation/stats.py +++ b/fireflyframework_agentic/evaluation/stats.py @@ -23,10 +23,11 @@ aggregation bug where the previous runner inherited run 0's grounding report unchanged instead of merging across all runs. 
""" + from __future__ import annotations import statistics -from typing import Sequence +from collections.abc import Sequence def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float: @@ -49,11 +50,7 @@ def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float: scores = list(scores) if len(scores) < 2: raise ValueError(f"aa_band requires >= 2 reruns; got {len(scores)}") - deltas = [ - abs(x - y) - for i, x in enumerate(scores) - for y in scores[i + 1:] - ] + deltas = [abs(x - y) for i, x in enumerate(scores) for y in scores[i + 1 :]] sorted_deltas = sorted(deltas) # Index for the requested percentile; clamp to valid range idx = max(0, min(len(sorted_deltas) - 1, int(len(sorted_deltas) * percentile / 100))) diff --git a/uv.lock b/uv.lock index 7e3b501c..93e18075 100644 --- a/uv.lock +++ b/uv.lock @@ -1209,6 +1209,10 @@ dev = [ embeddings = [ { name = "numpy" }, ] +evaluation = [ + { name = "numpy" }, + { name = "scipy" }, +] google-embeddings = [ { name = "google-generativeai" }, ] @@ -1279,6 +1283,7 @@ requires-dist = [ { name = "mistralai", marker = "extra == 'mistral-embeddings'", specifier = ">=1.0.0" }, { name = "motor", marker = "extra == 'mongodb'", specifier = ">=3.6.0" }, { name = "numpy", marker = "extra == 'embeddings'", specifier = ">=1.26.0" }, + { name = "numpy", marker = "extra == 'evaluation'", specifier = ">=1.26.0" }, { name = "numpy", marker = "extra == 'reasoning-eval'", specifier = ">=2.0.0" }, { name = "openai", marker = "extra == 'azure-embeddings'", specifier = ">=1.0.0" }, { name = "openai", marker = "extra == 'openai-embeddings'", specifier = ">=1.0.0" }, @@ -1304,13 +1309,14 @@ requires-dist = [ { name = "python-dotenv", specifier = ">=1.0.0" }, { name = "qdrant-client", marker = "extra == 'vectorstores-qdrant'", specifier = ">=1.12.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" }, + { name = "scipy", marker = "extra == 'evaluation'", specifier = ">=1.11" }, { name = "sqlalchemy", marker 
= "extra == 'postgres'", specifier = ">=2.0.0" }, { name = "sqlite-vec", marker = "extra == 'vectorstores-sqlite-vec'", specifier = ">=0.1.6" }, { name = "testcontainers", marker = "extra == 'dev'", specifier = ">=4.10.0" }, { name = "voyageai", marker = "extra == 'voyage-embeddings'", specifier = ">=0.3.0" }, { name = "watchfiles", marker = "extra == 'watch'", specifier = ">=0.24.0" }, ] -provides-extras = ["postgres", "mongodb", "security", "embeddings", "openai-embeddings", "cohere-embeddings", "google-embeddings", "mistral-embeddings", "voyage-embeddings", "azure-embeddings", "bedrock-embeddings", "ollama-embeddings", "reasoning-eval", "vectorstores-chroma", "vectorstores-sqlite-vec", "vectorstores-pinecone", "vectorstores-qdrant", "vectorstores-pgvector", "watch", "binary", "all", "dev"] +provides-extras = ["postgres", "mongodb", "security", "embeddings", "openai-embeddings", "cohere-embeddings", "google-embeddings", "mistral-embeddings", "voyage-embeddings", "azure-embeddings", "bedrock-embeddings", "ollama-embeddings", "reasoning-eval", "vectorstores-chroma", "vectorstores-sqlite-vec", "vectorstores-pinecone", "vectorstores-qdrant", "vectorstores-pgvector", "watch", "binary", "all", "evaluation", "dev"] [[package]] name = "flatbuffers" @@ -4489,6 +4495,57 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/72/c6c32d2b657fa3dad1de340254e14390b1e334ce38268b7ad51abda3c8c2/s3transfer-0.17.0-py3-none-any.whl", hash = "sha256:ce3801712acf4ad3e89fb9990df97b4972e93f4b3b0004d214be5bce12814c20", size = 86811, upload-time = "2026-04-29T22:07:34.966Z" }, ] +[[package]] +name = "scipy" +version = "1.17.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822, upload-time = 
"2026-02-23T00:26:24.851Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/27/07ee1b57b65e92645f219b37148a7e7928b82e2b5dbeccecb4dff7c64f0b/scipy-1.17.1-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:5e3c5c011904115f88a39308379c17f91546f77c1667cea98739fe0fccea804c", size = 31590199, upload-time = "2026-02-23T00:19:17.192Z" }, + { url = "https://files.pythonhosted.org/packages/ec/ae/db19f8ab842e9b724bf5dbb7db29302a91f1e55bc4d04b1025d6d605a2c5/scipy-1.17.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6fac755ca3d2c3edcb22f479fceaa241704111414831ddd3bc6056e18516892f", size = 28154001, upload-time = "2026-02-23T00:19:22.241Z" }, + { url = "https://files.pythonhosted.org/packages/5b/58/3ce96251560107b381cbd6e8413c483bbb1228a6b919fa8652b0d4090e7f/scipy-1.17.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ff200bf9d24f2e4d5dc6ee8c3ac64d739d3a89e2326ba68aaf6c4a2b838fd7d", size = 20325719, upload-time = "2026-02-23T00:19:26.329Z" }, + { url = "https://files.pythonhosted.org/packages/b2/83/15087d945e0e4d48ce2377498abf5ad171ae013232ae31d06f336e64c999/scipy-1.17.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4b400bdc6f79fa02a4d86640310dde87a21fba0c979efff5248908c6f15fad1b", size = 22683595, upload-time = "2026-02-23T00:19:30.304Z" }, + { url = "https://files.pythonhosted.org/packages/b4/e0/e58fbde4a1a594c8be8114eb4aac1a55bcd6587047efc18a61eb1f5c0d30/scipy-1.17.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b64ca7d4aee0102a97f3ba22124052b4bd2152522355073580bf4845e2550b6", size = 32896429, upload-time = "2026-02-23T00:19:35.536Z" }, + { url = "https://files.pythonhosted.org/packages/f5/5f/f17563f28ff03c7b6799c50d01d5d856a1d55f2676f537ca8d28c7f627cd/scipy-1.17.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:581b2264fc0aa555f3f435a5944da7504ea3a065d7029ad60e7c3d1ae09c5464", size = 35203952, upload-time = "2026-02-23T00:19:42.259Z" }, + { url = 
"https://files.pythonhosted.org/packages/8d/a5/9afd17de24f657fdfe4df9a3f1ea049b39aef7c06000c13db1530d81ccca/scipy-1.17.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:beeda3d4ae615106d7094f7e7cef6218392e4465cc95d25f900bebabfded0950", size = 34979063, upload-time = "2026-02-23T00:19:47.547Z" }, + { url = "https://files.pythonhosted.org/packages/8b/13/88b1d2384b424bf7c924f2038c1c409f8d88bb2a8d49d097861dd64a57b2/scipy-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6609bc224e9568f65064cfa72edc0f24ee6655b47575954ec6339534b2798369", size = 37598449, upload-time = "2026-02-23T00:19:53.238Z" }, + { url = "https://files.pythonhosted.org/packages/35/e5/d6d0e51fc888f692a35134336866341c08655d92614f492c6860dc45bb2c/scipy-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:37425bc9175607b0268f493d79a292c39f9d001a357bebb6b88fdfaff13f6448", size = 36510943, upload-time = "2026-02-23T00:20:50.89Z" }, + { url = "https://files.pythonhosted.org/packages/2a/fd/3be73c564e2a01e690e19cc618811540ba5354c67c8680dce3281123fb79/scipy-1.17.1-cp313-cp313-win_arm64.whl", hash = "sha256:5cf36e801231b6a2059bf354720274b7558746f3b1a4efb43fcf557ccd484a87", size = 24545621, upload-time = "2026-02-23T00:20:55.871Z" }, + { url = "https://files.pythonhosted.org/packages/6f/6b/17787db8b8114933a66f9dcc479a8272e4b4da75fe03b0c282f7b0ade8cd/scipy-1.17.1-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:d59c30000a16d8edc7e64152e30220bfbd724c9bbb08368c054e24c651314f0a", size = 31936708, upload-time = "2026-02-23T00:19:58.694Z" }, + { url = "https://files.pythonhosted.org/packages/38/2e/524405c2b6392765ab1e2b722a41d5da33dc5c7b7278184a8ad29b6cb206/scipy-1.17.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:010f4333c96c9bb1a4516269e33cb5917b08ef2166d5556ca2fd9f082a9e6ea0", size = 28570135, upload-time = "2026-02-23T00:20:03.934Z" }, + { url = 
"https://files.pythonhosted.org/packages/fd/c3/5bd7199f4ea8556c0c8e39f04ccb014ac37d1468e6cfa6a95c6b3562b76e/scipy-1.17.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:2ceb2d3e01c5f1d83c4189737a42d9cb2fc38a6eeed225e7515eef71ad301dce", size = 20741977, upload-time = "2026-02-23T00:20:07.935Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b8/8ccd9b766ad14c78386599708eb745f6b44f08400a5fd0ade7cf89b6fc93/scipy-1.17.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:844e165636711ef41f80b4103ed234181646b98a53c8f05da12ca5ca289134f6", size = 23029601, upload-time = "2026-02-23T00:20:12.161Z" }, + { url = "https://files.pythonhosted.org/packages/6d/a0/3cb6f4d2fb3e17428ad2880333cac878909ad1a89f678527b5328b93c1d4/scipy-1.17.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:158dd96d2207e21c966063e1635b1063cd7787b627b6f07305315dd73d9c679e", size = 33019667, upload-time = "2026-02-23T00:20:17.208Z" }, + { url = "https://files.pythonhosted.org/packages/f3/c3/2d834a5ac7bf3a0c806ad1508efc02dda3c8c61472a56132d7894c312dea/scipy-1.17.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74cbb80d93260fe2ffa334efa24cb8f2f0f622a9b9febf8b483c0b865bfb3475", size = 35264159, upload-time = "2026-02-23T00:20:23.087Z" }, + { url = "https://files.pythonhosted.org/packages/4d/77/d3ed4becfdbd217c52062fafe35a72388d1bd82c2d0ba5ca19d6fcc93e11/scipy-1.17.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:dbc12c9f3d185f5c737d801da555fb74b3dcfa1a50b66a1a93e09190f41fab50", size = 35102771, upload-time = "2026-02-23T00:20:28.636Z" }, + { url = "https://files.pythonhosted.org/packages/bd/12/d19da97efde68ca1ee5538bb261d5d2c062f0c055575128f11a2730e3ac1/scipy-1.17.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:94055a11dfebe37c656e70317e1996dc197e1a15bbcc351bcdd4610e128fe1ca", size = 37665910, upload-time = "2026-02-23T00:20:34.743Z" }, + { url = 
"https://files.pythonhosted.org/packages/06/1c/1172a88d507a4baaf72c5a09bb6c018fe2ae0ab622e5830b703a46cc9e44/scipy-1.17.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e30bdeaa5deed6bc27b4cc490823cd0347d7dae09119b8803ae576ea0ce52e4c", size = 36562980, upload-time = "2026-02-23T00:20:40.575Z" }, + { url = "https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543, upload-time = "2026-02-23T00:20:45.313Z" }, + { url = "https://files.pythonhosted.org/packages/cf/83/333afb452af6f0fd70414dc04f898647ee1423979ce02efa75c3b0f2c28e/scipy-1.17.1-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:a48a72c77a310327f6a3a920092fa2b8fd03d7deaa60f093038f22d98e096717", size = 31584510, upload-time = "2026-02-23T00:21:01.015Z" }, + { url = "https://files.pythonhosted.org/packages/ed/a6/d05a85fd51daeb2e4ea71d102f15b34fedca8e931af02594193ae4fd25f7/scipy-1.17.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:45abad819184f07240d8a696117a7aacd39787af9e0b719d00285549ed19a1e9", size = 28170131, upload-time = "2026-02-23T00:21:05.888Z" }, + { url = "https://files.pythonhosted.org/packages/db/7b/8624a203326675d7746a254083a187398090a179335b2e4a20e2ddc46e83/scipy-1.17.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3fd1fcdab3ea951b610dc4cef356d416d5802991e7e32b5254828d342f7b7e0b", size = 20342032, upload-time = "2026-02-23T00:21:09.904Z" }, + { url = "https://files.pythonhosted.org/packages/c9/35/2c342897c00775d688d8ff3987aced3426858fd89d5a0e26e020b660b301/scipy-1.17.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7bdf2da170b67fdf10bca777614b1c7d96ae3ca5794fd9587dce41eb2966e866", size = 22678766, upload-time = "2026-02-23T00:21:14.313Z" }, + { url = 
"https://files.pythonhosted.org/packages/ef/f2/7cdb8eb308a1a6ae1e19f945913c82c23c0c442a462a46480ce487fdc0ac/scipy-1.17.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adb2642e060a6549c343603a3851ba76ef0b74cc8c079a9a58121c7ec9fe2350", size = 32957007, upload-time = "2026-02-23T00:21:19.663Z" }, + { url = "https://files.pythonhosted.org/packages/0b/2e/7eea398450457ecb54e18e9d10110993fa65561c4f3add5e8eccd2b9cd41/scipy-1.17.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eee2cfda04c00a857206a4330f0c5e3e56535494e30ca445eb19ec624ae75118", size = 35221333, upload-time = "2026-02-23T00:21:25.278Z" }, + { url = "https://files.pythonhosted.org/packages/d9/77/5b8509d03b77f093a0d52e606d3c4f79e8b06d1d38c441dacb1e26cacf46/scipy-1.17.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d2650c1fb97e184d12d8ba010493ee7b322864f7d3d00d3f9bb97d9c21de4068", size = 35042066, upload-time = "2026-02-23T00:21:31.358Z" }, + { url = "https://files.pythonhosted.org/packages/f9/df/18f80fb99df40b4070328d5ae5c596f2f00fffb50167e31439e932f29e7d/scipy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:08b900519463543aa604a06bec02461558a6e1cef8fdbb8098f77a48a83c8118", size = 37612763, upload-time = "2026-02-23T00:21:37.247Z" }, + { url = "https://files.pythonhosted.org/packages/4b/39/f0e8ea762a764a9dc52aa7dabcfad51a354819de1f0d4652b6a1122424d6/scipy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:3877ac408e14da24a6196de0ddcace62092bfc12a83823e92e49e40747e52c19", size = 37290984, upload-time = "2026-02-23T00:22:35.023Z" }, + { url = "https://files.pythonhosted.org/packages/7c/56/fe201e3b0f93d1a8bcf75d3379affd228a63d7e2d80ab45467a74b494947/scipy-1.17.1-cp314-cp314-win_arm64.whl", hash = "sha256:f8885db0bc2bffa59d5c1b72fad7a6a92d3e80e7257f967dd81abb553a90d293", size = 25192877, upload-time = "2026-02-23T00:22:39.798Z" }, + { url = 
"https://files.pythonhosted.org/packages/96/ad/f8c414e121f82e02d76f310f16db9899c4fcde36710329502a6b2a3c0392/scipy-1.17.1-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:1cc682cea2ae55524432f3cdff9e9a3be743d52a7443d0cba9017c23c87ae2f6", size = 31949750, upload-time = "2026-02-23T00:21:42.289Z" }, + { url = "https://files.pythonhosted.org/packages/7c/b0/c741e8865d61b67c81e255f4f0a832846c064e426636cd7de84e74d209be/scipy-1.17.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:2040ad4d1795a0ae89bfc7e8429677f365d45aa9fd5e4587cf1ea737f927b4a1", size = 28585858, upload-time = "2026-02-23T00:21:47.706Z" }, + { url = "https://files.pythonhosted.org/packages/ed/1b/3985219c6177866628fa7c2595bfd23f193ceebbe472c98a08824b9466ff/scipy-1.17.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:131f5aaea57602008f9822e2115029b55d4b5f7c070287699fe45c661d051e39", size = 20757723, upload-time = "2026-02-23T00:21:52.039Z" }, + { url = "https://files.pythonhosted.org/packages/c0/19/2a04aa25050d656d6f7b9e7b685cc83d6957fb101665bfd9369ca6534563/scipy-1.17.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9cdc1a2fcfd5c52cfb3045feb399f7b3ce822abdde3a193a6b9a60b3cb5854ca", size = 23043098, upload-time = "2026-02-23T00:21:56.185Z" }, + { url = "https://files.pythonhosted.org/packages/86/f1/3383beb9b5d0dbddd030335bf8a8b32d4317185efe495374f134d8be6cce/scipy-1.17.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e3dcd57ab780c741fde8dc68619de988b966db759a3c3152e8e9142c26295ad", size = 33030397, upload-time = "2026-02-23T00:22:01.404Z" }, + { url = "https://files.pythonhosted.org/packages/41/68/8f21e8a65a5a03f25a79165ec9d2b28c00e66dc80546cf5eb803aeeff35b/scipy-1.17.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9956e4d4f4a301ebf6cde39850333a6b6110799d470dbbb1e25326ac447f52a", size = 35281163, upload-time = "2026-02-23T00:22:07.024Z" }, + { url = 
"https://files.pythonhosted.org/packages/84/8d/c8a5e19479554007a5632ed7529e665c315ae7492b4f946b0deb39870e39/scipy-1.17.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:a4328d245944d09fd639771de275701ccadf5f781ba0ff092ad141e017eccda4", size = 35116291, upload-time = "2026-02-23T00:22:12.585Z" }, + { url = "https://files.pythonhosted.org/packages/52/52/e57eceff0e342a1f50e274264ed47497b59e6a4e3118808ee58ddda7b74a/scipy-1.17.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a77cbd07b940d326d39a1d1b37817e2ee4d79cb30e7338f3d0cddffae70fcaa2", size = 37682317, upload-time = "2026-02-23T00:22:18.513Z" }, + { url = "https://files.pythonhosted.org/packages/11/2f/b29eafe4a3fbc3d6de9662b36e028d5f039e72d345e05c250e121a230dd4/scipy-1.17.1-cp314-cp314t-win_amd64.whl", hash = "sha256:eb092099205ef62cd1782b006658db09e2fed75bffcae7cc0d44052d8aa0f484", size = 37345327, upload-time = "2026-02-23T00:22:24.442Z" }, + { url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165, upload-time = "2026-02-23T00:22:29.563Z" }, +] + [[package]] name = "secretstorage" version = "3.5.0" From 9c3555d03331bb8e05361dc49865df0355171d29 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:11 +0200 Subject: [PATCH 15/67] chore(evaluation): delete cli.py --- fireflyframework_agentic/evaluation/cli.py | 573 --------------------- 1 file changed, 573 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/cli.py diff --git a/fireflyframework_agentic/evaluation/cli.py b/fireflyframework_agentic/evaluation/cli.py deleted file mode 100644 index 7ac868d9..00000000 --- a/fireflyframework_agentic/evaluation/cli.py +++ /dev/null @@ -1,573 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may 
not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""flyeval — FlyRadar Lean Core evaluation CLI. - -Usage ------ - flyeval gate --result R.json --registry REG.json [--baseline B.json] [--judge-model P:M] - flyeval aa-band --results R1.json R2.json ... --registry REG.json - flyeval day-zero --result R.json --registry REG.json --baseline B.json --signoffs 2 - flyeval invalidate --baseline B.json --reason "..." - -The deterministic gates G1-G3 + G5 (human sign-off) decide the verdict: every -subcommand exits 0 on PROMOTE, 1 on HOLD. G4 (the --judge-model LLM-as-a-Judge, -on by default, --no-judge to skip) is non-blocking — it prints advisory signals -and never changes the verdict or the exit code. 
-""" - -from __future__ import annotations - -import argparse -import hashlib -import json -import os -import sys -from pathlib import Path - -from fireflyframework_agentic.evaluation import __version__ -from fireflyframework_agentic.evaluation.champion import ( - ChampionRecord, - invalidate_champion, - load_champion, - save_champion, -) -from fireflyframework_agentic.evaluation.corpus import load_corpus -from fireflyframework_agentic.evaluation.gates import g2_recall_precision, run_gates -from fireflyframework_agentic.evaluation.judge import run_judge -from fireflyframework_agentic.evaluation.judge_client import build_embedder -from fireflyframework_agentic.evaluation.matcher import matches -from fireflyframework_agentic.evaluation.registry import load_registry -from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict as get_verdict -from fireflyframework_agentic.evaluation.stats import aa_band, left_skew_flag - - -def _load_json(path: str) -> dict: - return json.loads(Path(path).read_text(encoding="utf-8")) - - -def _lexical_missed_ids(result: dict, registry) -> list[str]: - """Scored (non-L3) real-item ids matched by no finding — the G2 lexical misses G4 recovers.""" - evidence_index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")} - findings = result.get("findings", []) - scored = [i for i in registry.real_items if i.tier != "L3"] - return [i.id for i in scored if not any(matches(f, i, evidence_index) for f in findings)] - - -def _read_experiment_config(result_path: str) -> dict | None: - """Read the experiment_configuration.json recorded next to the run's output.json. - - The experiment config records how the run was generated; it is authored by the - generation step at run time. Evaluation only reads it for display and never - writes or overwrites it. Returns None when the run has no recorded config. 
- """ - path = Path(result_path).parent / "experiment_configuration.json" - if not path.exists(): - return None - return json.loads(path.read_text(encoding="utf-8")) - - -def _write_eval_config(result_path: str, config: dict) -> Path: - """Write evaluation_configuration.json next to the run's output.json. - - The evaluation config is authored by flyeval at gate time (registry/corpus SHAs, - recall metric, floors, judge settings), so unlike the experiment config it is - owned here and safe to (over)write each run. It mirrors the block embedded in - the scorecard, as a machine-readable artifact. - """ - path = Path(result_path).parent / "evaluation_configuration.json" - path.write_text(json.dumps(config, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") - return path - - -def _eval_config(args, registry, corpus=None) -> dict: - """Capture the run's evaluation configuration for provenance. - - Uses getattr defaults so it works for both `gate` (has every flag) and - `day-zero` (lacks the gate-only flags, falling back to the lexical/no-judge - defaults, which honestly reflects how day-zero scores). 
- """ - jm = getattr(args, "judge_model", None) - baseline = getattr(args, "baseline", None) - tau = getattr(args, "tau", 0.70) - return { - "evaluator_version": __version__, - "registry_sha256": registry.sha256(), - "corpus_sha256": corpus.sha256 if corpus else None, - "model_id": getattr(args, "model_id", None) or "unknown", - "gates": { - "G1": { - "name": "Structural & Safe", - "pii_list": getattr(args, "pii_list", None) or [], - "metrics": { - "empty_must_find": "registry has >=1 must-find item; guards the fake-100% " - "champion (EMPTY_MUST_FIND)", - "registry_sha256_pin": "loaded registry matches its file hash (GOLD_DRIFT)", - "corpus_sha256_pin": "corpus matches its hash when supplied (CORPUS_DRIFT)", - "schema_valid": "required top-level keys present in the result " - "(SCHEMA_INVALID)", - "pii_non_disclosure": "no corpus PII name appears in any finding/report text " - "(PII_LEAK)", - }, - }, - "G2": { - "name": "Recall & Precision", - "recall_metric": getattr(args, "recall_metric", "lexical"), - "recall_floor": getattr(args, "recall_floor", 0.70), - "tau": tau, - "tau_nc": getattr(args, "tau_nc", 0.85), - "embedder": getattr(args, "embedder", None), - "metrics": { - "lexical_recall": "token-overlap recall (always reported)", - "semantic_recall": "embedding-similarity recall at >= tau (needs embedder)", - "hybrid_recall": "per item, a lexical OR semantic match", - "per_tier_recall": "hit/total per tier L0-L3; an L0 miss blocks", - "nc_precision": "negative-control items wrongly emitted; an NC hit blocks", - "finding_redundancy_rate": "fraction of findings duplicating another's topic", - }, - }, - "G3": { - "name": "Grounded", - "grounding_floor": getattr(args, "grounding_floor", 0.90), - "human_spot_check_n": 5, - "corpus_verification": corpus is not None, - "metrics": { - "grounding_pct": "findings whose cited excerpt shares a topic token; blocks " - "below grounding_floor", - "evidence_verified": "cited excerpts located in the actual corpus " - "(when 
supplied)", - "evidence_fabricated": "populated excerpts not found in their cited source " - "(EVIDENCE_FABRICATED)", - "evidence_source_unknown": "locators resolving to no corpus document " - "(EVIDENCE_SOURCE_UNKNOWN)", - "excerpt_fill_rate": "evidence entries carrying a populated excerpt", - "source_coverage": "distinct corpus documents cited", - }, - }, - "G4": { - "name": "LLM Judge (advisory, non-blocking)", - "judge_model": jm, - "judge_runs": getattr(args, "judge_runs", 1) if jm else None, - "judge_concurrency": getattr(args, "judge_concurrency", 1) if jm else None, - "judge_temperature": 0.0 if jm else None, - "tau": tau if jm else None, - "metrics": { - "faithfulness": "each finding's claim entailed by its cited evidence", - "numeric_temporal_fidelity": "numbers and dates in findings match the evidence", - "citation_relevance": "cited evidence refs are on-topic (context precision)", - "nc_semantic_precision": "negative-control items semantically asserted", - "fabricated_entity": "named entities absent from the corpus", - "contradiction": "findings contradicting the evidence or each other", - "open_gap": "a consequential issue the output failed to surface", - "actionability": "proposed actions are specific and actionable", - "severity_calibration": "stated severity matches the evidence", - "answer_relevancy": "output addresses the workspace intention", - "source_coverage": "distinct corpus documents cited (deterministic)", - "excerpt_fill_rate": "evidence entries with a populated excerpt " - "(deterministic)", - }, - }, - "G5": { - "name": "No-regression / promotion", - "is_day_zero": baseline is None, - "human_signed_off": getattr(args, "human_signed_off", False), - "signoffs": getattr(args, "signoffs", 0), - "baseline": baseline, - "baseline_sha256": _file_sha256(baseline) if baseline else None, - "metrics": { - "improvements": "metrics beating the champion by more than the AA noise band", - "regressions": "metrics that regressed versus the champion", - 
"noise_band": "per-metric AA noise floor a candidate must exceed", - "guardrail_regression": "any guardrail metric that dropped", - "signoffs": "independent human sign-offs recorded", - }, - }, - }, - } - - -def _file_sha256(path: str) -> str | None: - """SHA-256 of a file's bytes, or None when it can't be read.""" - try: - return hashlib.sha256(Path(path).read_bytes()).hexdigest() - except OSError: - return None - - -# ── gate ────────────────────────────────────────────────────────────────────── - - -def cmd_gate(args: argparse.Namespace) -> int: - if getattr(args, "no_judge", False): - args.judge_model = None # explicit opt-out; G4 runs by default otherwise - result = _load_json(args.result) - registry = load_registry(args.registry) - corpus = load_corpus(args.corpus) if args.corpus else None - champion = load_champion(args.baseline) if args.baseline else None - champion_scores = champion.scores if champion else None - aa_noise = champion.aa_noise if champion else None - - embed_fn = build_embedder(args.embedder) if args.embedder else None - - if args.recall_metric in ("hybrid", "semantic") and embed_fn is None: - print( - f"ERROR: --recall-metric {args.recall_metric} requires --embedder.\n" - " Example: --embedder openai:text-embedding-3-small", - file=sys.stderr, - ) - return 2 - - gate_results = run_gates( - result, - registry, - args.registry, - pii_list=args.pii_list or [], - recall_floor=args.recall_floor, - grounding_floor=args.grounding_floor, - champion_scores=champion_scores, - aa_noise=aa_noise, - is_day_zero=(champion is None), - human_signed_off=args.human_signed_off, - signoff_count=args.signoffs, - embed_fn=embed_fn, - tau=args.tau, - recall_metric=args.recall_metric, - tau_nc=args.tau_nc, - corpus=corpus, - ) - - # G4 — on by default, non-blocking. Skipped only with --no-judge; never affects the verdict. 
- advisory = None - if args.judge_model: - champion_result = _load_json(args.champion_result) if args.champion_result else None - advisory = run_judge( - result, - registry, - judge_model=args.judge_model, - runs=args.judge_runs, - concurrency=args.judge_concurrency, - pipeline_model=args.model_id or "", - champion_result=champion_result, - embed_fn=embed_fn, - tau=args.tau, - lexical_missed_ids=_lexical_missed_ids(result, registry), - ) - - config = _eval_config(args, registry, corpus) - _write_eval_config(args.result, config) - experiment_config = _read_experiment_config(args.result) - scorecard = render_scorecard( - gate_results, - corpus=registry.corpus, - model_id=args.model_id or "unknown", - run_id=args.run_id or "run", - is_self_graded=True, - kappa_advisory=registry.is_kappa_advisory(), - evidence_unverified=corpus is None, - advisory=advisory, - config=config, - experiment_config=experiment_config, - ) - print(scorecard) - - v = get_verdict(gate_results) - return 0 if v == "PROMOTE" else 1 - - -# ── aa-band ─────────────────────────────────────────────────────────────────── - - -def cmd_aa_band(args: argparse.Namespace) -> int: - registry = load_registry(args.registry) - - if args.recall_metric in ("hybrid", "semantic") and not args.embedder: - print( - f"ERROR: --recall-metric {args.recall_metric} requires --embedder.\n" - " Example: --embedder openai:text-embedding-3-small", - file=sys.stderr, - ) - return 2 - - embed_fn = build_embedder(args.embedder) if args.embedder else None - corpus = load_corpus(args.corpus) if args.corpus else None - scores: list[float] = [] - - for rp in args.results: - result = _load_json(rp) - g2 = g2_recall_precision( - result, registry, - recall_metric=args.recall_metric, embed_fn=embed_fn, - tau=args.tau, tau_nc=args.tau_nc, - corpus=corpus, - ) - if g2.passed or g2.details.get("recall") is not None: - scores.append(g2.details.get("recall", 0.0)) - - if len(scores) < 2: - print( - f"ERROR: need >= 2 runs for aa_band; got 
{len(scores)}. " - "Make sure the registry is non-empty and the runs are valid.", - file=sys.stderr, - ) - return 1 - - band = aa_band(scores) - high_var = left_skew_flag(scores) - print(f"A/A noise band (95th-pct pairwise delta): {band:.4f}") - print(f"Scores across reruns: {[round(s, 4) for s in scores]}") - if high_var: - print("WARNING: HIGH_VARIANCE — min < median - 0.10. Investigate before using this band.") - return 0 - - -# ── day-zero ────────────────────────────────────────────────────────────────── - - -def cmd_day_zero(args: argparse.Namespace) -> int: - result = _load_json(args.result) - registry = load_registry(args.registry) - - if not args.corpus: - print( - "ERROR: day-zero (a promotion decision) requires --corpus for evidence\n" - "verification — a champion must not be minted on unverified evidence.\n" - " Supply the run's input bundle, e.g. --corpus experiments//input.json", - file=sys.stderr, - ) - return 2 - corpus = load_corpus(args.corpus) - - if args.signoffs < 2: - print( - f"ERROR: Day-Zero requires 2 independent human sign-offs; got {args.signoffs}.", - file=sys.stderr, - ) - return 1 - - gate_results = run_gates( - result, - registry, - args.registry, - is_day_zero=True, - human_signed_off=True, - signoff_count=args.signoffs, - corpus=corpus, - ) - - config = _eval_config(args, registry, corpus) - _write_eval_config(args.result, config) - experiment_config = _read_experiment_config(args.result) - v = get_verdict(gate_results) - scorecard = render_scorecard( - gate_results, - corpus=registry.corpus, - model_id=args.model_id or "unknown", - run_id=args.run_id or "day-zero", - is_self_graded=True, - kappa_advisory=registry.is_kappa_advisory(), - config=config, - experiment_config=experiment_config, - ) - print(scorecard) - - if v == "PROMOTE" and args.baseline: - g2 = next((g for g in gate_results if g.gate == "G2"), None) - g3 = next((g for g in gate_results if g.gate == "G3"), None) - scores = {} - if g2: - scores["recall"] = 
g2.details.get("recall", 0.0) - if g3: - scores["grounding_pct"] = g3.details.get("grounding_pct", 0.0) - - champion = ChampionRecord( - corpus=registry.corpus, - run_id=args.run_id or "day-zero", - model_id=args.model_id or "unknown", - registry_sha256=registry.sha256(), - scores=scores, - is_day_zero=True, - human_sign_offs=[f"signoff-{i + 1}" for i in range(args.signoffs)], - config=config, - corpus_sha256=corpus.sha256, - ) - save_champion( - args.baseline, - champion, - summary=f"Day-Zero champion for {registry.corpus}", - date=args.date or "unknown", - ) - print(f"\nDay-Zero champion saved to {args.baseline}") - - return 0 if v == "PROMOTE" else 1 - - -# ── invalidate ──────────────────────────────────────────────────────────────── - - -def cmd_invalidate(args: argparse.Namespace) -> int: - invalidate_champion(args.baseline, reason=args.reason, date=args.date or "unknown") - print(f"Champion invalidated in {args.baseline}. Reason: {args.reason}") - return 0 - - -# ── parser ──────────────────────────────────────────────────────────────────── - - -def build_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( - prog="flyeval", - description="FlyRadar Lean Core eval: G1-G3 + G5 deterministic, G4 judge on by default", - ) - sub = parser.add_subparsers(dest="command", required=True) - - def _add_common(p: argparse.ArgumentParser) -> None: - p.add_argument("--result", required=True, help="Path to DiscoveryResult JSON") - p.add_argument("--registry", required=True, help="Path to lean-1 registry JSON") - p.add_argument( - "--corpus", - help="Path to the run's input.json corpus bundle — enables deterministic " - "evidence verification (required for day-zero; without it, gate runs " - "carry an EVIDENCE UNVERIFIED disclosure)", - ) - p.add_argument("--baseline", help="Path to baseline.json (per-corpus champion store)") - p.add_argument("--model-id", default="unknown") - p.add_argument("--run-id", default="run") - p.add_argument("--date", 
default="", help="ISO date for promotion log") - - # gate - p_gate = sub.add_parser("gate", help="Run the gates and print a scorecard") - _add_common(p_gate) - p_gate.add_argument("--recall-floor", type=float, default=0.70) - p_gate.add_argument("--grounding-floor", type=float, default=0.90) - p_gate.add_argument("--pii-list", nargs="*", default=[]) - p_gate.add_argument( - "--embedder", - default=os.environ.get("FLYEVAL_EMBEDDER"), - help="opt-in embedder spec for the semantic recall path " - '(e.g. "azure:text-embedding-3-small"); omit for pure-lexical recall. ' - "Env: FLYEVAL_EMBEDDER", - ) - p_gate.add_argument( - "--recall-metric", - choices=["lexical", "semantic", "hybrid"], - default=os.environ.get("FLYEVAL_RECALL_METRIC", "hybrid"), - help="which recall metric GATES (default hybrid; hybrid/semantic require --embedder). " - "Env: FLYEVAL_RECALL_METRIC", - ) - p_gate.add_argument( - "--tau", - type=float, - default=float(os.environ.get("FLYEVAL_TAU", "0.70")), - help="cosine similarity threshold for the semantic recall path (real items). " - "Env: FLYEVAL_TAU", - ) - p_gate.add_argument( - "--tau-nc", - type=float, - default=float(os.environ.get("FLYEVAL_TAU_NC", "0.85")), - help="cosine similarity threshold for NC item detection (higher; no source anchor). " - "Env: FLYEVAL_TAU_NC", - ) - p_gate.add_argument("--human-signed-off", action="store_true") - p_gate.add_argument("--signoffs", type=int, default=0) - p_gate.add_argument( - "--judge-model", - default=os.environ.get("FLYEVAL_JUDGE_MODEL", "anthropic:claude-sonnet-4-6"), - help="provider:model for the non-blocking G4 LLM-as-a-Judge (e.g. azure:gpt-4o). " - "Runs by default; pass --no-judge to skip G4. 
Env: FLYEVAL_JUDGE_MODEL", - ) - p_gate.add_argument( - "--no-judge", - action="store_true", - help="skip the G4 LLM-as-a-Judge (it runs by default).", - ) - p_gate.add_argument( - "--judge-runs", - type=int, - default=int(os.environ.get("FLYEVAL_JUDGE_RUNS", "1")), - help="G4 judge runs; the median of numeric scores is kept (odd recommended). " - "Env: FLYEVAL_JUDGE_RUNS", - ) - p_gate.add_argument( - "--judge-concurrency", - type=int, - default=int(os.environ.get("FLYEVAL_JUDGE_CONCURRENCY", "1")), - help="bounded fan-out for the per-item G4 [J] metrics (1 = sequential; " - ">=2 runs each metric's chat calls across a thread pool, order preserved). " - "Env: FLYEVAL_JUDGE_CONCURRENCY", - ) - p_gate.add_argument( - "--champion-result", - help="Path to the champion's output.json for the G4 comparative-review metric", - ) - p_gate.set_defaults(func=cmd_gate) - - # aa-band - p_aa = sub.add_parser("aa-band", help="Compute A/A noise band from champion reruns") - p_aa.add_argument( - "--results", - nargs="+", - required=True, - help="Paths to champion-rerun result JSON files (>= 2)", - ) - p_aa.add_argument("--registry", required=True) - p_aa.add_argument( - "--recall-metric", - choices=["lexical", "semantic", "hybrid"], - default="hybrid", - help="recall metric to use — must match the champion's metric (default hybrid; " - "hybrid/semantic require --embedder)", - ) - p_aa.add_argument( - "--embedder", - default=None, - help="embedder spec for semantic/hybrid recall (e.g. 
ollama:bge-m3)", - ) - p_aa.add_argument("--tau", type=float, default=0.70) - p_aa.add_argument("--tau-nc", type=float, default=0.85) - p_aa.add_argument( - "--corpus", - help="Path to input.json — must match the gate's corpus setting so the " - "band is computed under the same evidence filtering as the champion", - ) - p_aa.set_defaults(func=cmd_aa_band) - - # day-zero - p_dz = sub.add_parser("day-zero", help="Promote the inaugural champion (Day-Zero protocol)") - _add_common(p_dz) - p_dz.add_argument( - "--signoffs", - type=int, - default=0, - help="Number of independent human sign-offs collected (need 2)", - ) - p_dz.set_defaults(func=cmd_day_zero) - - # invalidate - p_inv = sub.add_parser("invalidate", help="Invalidate the current champion") - p_inv.add_argument("--baseline", required=True) - p_inv.add_argument("--reason", required=True) - p_inv.add_argument("--date", default="") - p_inv.set_defaults(func=cmd_invalidate) - - return parser - - -def main() -> None: - parser = build_parser() - args = parser.parse_args() - sys.exit(args.func(args)) - - -if __name__ == "__main__": - main() From e9fd9651a017a037330ff698f0768572d0d3f557 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:18 +0200 Subject: [PATCH 16/67] chore(evaluation): delete gates.py --- fireflyframework_agentic/evaluation/gates.py | 840 ------------------- 1 file changed, 840 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/gates.py diff --git a/fireflyframework_agentic/evaluation/gates.py b/fireflyframework_agentic/evaluation/gates.py deleted file mode 100644 index 057bfea7..00000000 --- a/fireflyframework_agentic/evaluation/gates.py +++ /dev/null @@ -1,840 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Four gates — every gate always runs; a failure raises a flag, not a veto. - -Gate pipeline (EVALUATION_FRAMEWORK.md §6): - G1 — Structural & Safe - G2 — Must-finds & negative controls - G3 — Evidence (grounding) - G5 — No-regression / promotion (human decision) - -Each gate is a pure function of the result dict + supporting inputs. -run_gates() always executes all four gates and returns all four results so -the scorecard carries the complete picture regardless of which flags fire. -""" - -from __future__ import annotations - -from dataclasses import dataclass, field - -from fireflyframework_agentic.evaluation import matcher -from fireflyframework_agentic.evaluation.corpus import ( - EMPTY, - FABRICATED, - SOURCE_UNKNOWN, - VERIFIED, - Corpus, - corpus_sha256, - verify_evidence_index, -) -from fireflyframework_agentic.evaluation.matcher import anchored, matches -from fireflyframework_agentic.evaluation.registry import Registry, registry_sha256 - - -@dataclass -class GateResult: - gate: str - passed: bool - reason_code: str = "" - details: dict = field(default_factory=dict) - - def __str__(self) -> str: - status = "PASS" if self.passed else f"FLAG:{self.reason_code}" - return f"[{self.gate}] {status}" - - -class Verdict: - """Promotion gate verdict constants. - - Use ``Verdict.PROMOTE`` when the challenger meets the quality bar and - is safe to become the new champion. Use ``Verdict.HOLD`` when the - challenger does not meet the bar and must be iterated on. 
- """ - - PROMOTE: str = "PROMOTE" - HOLD: str = "HOLD" - - -def render_scorecard(gate_results: list[GateResult]) -> str: - """Render a human-readable scorecard from a list of GateResult objects. - - Emits one line per gate: ``[G1] PASS`` or ``[G2] FLAG:RECALL_BELOW_FLOOR``. - The overall verdict (PROMOTE / HOLD) appears on the final line. A run - promotes only when every gate passes; any flag signals HOLD. - """ - lines = [str(r) for r in gate_results] - all_passed = all(r.passed for r in gate_results) - verdict = Verdict.PROMOTE if all_passed else Verdict.HOLD - lines.append(f"VERDICT: {verdict}") - return "\n".join(lines) - - -def _build_evidence_index(result: dict, corpus: Corpus | None = None) -> dict[str, dict]: - """Index evidence by id; with a corpus, drop entries that fail verification. - - Dropped entries (FABRICATED excerpt or SOURCE_UNKNOWN locator) cannot - contribute source stems to G2's shared-source guard or excerpts to G3's - grounding — a run cannot anchor anything on evidence it invented. EMPTY - entries are kept: an empty excerpt is a format problem, not fabrication, - and its (verified) locator stem is still a legitimate citation. - """ - index = {ev["id"]: ev for ev in result.get("evidence_index", []) if ev.get("id")} - if corpus is None: - return index - statuses = verify_evidence_index(corpus, result) - return { - eid: ev - for eid, ev in index.items() - if statuses[eid] in (VERIFIED, EMPTY) - } - - -# ── G1: Structural & Safe ──────────────────────────────────────────────────── - - -def _name_duplication_rate(nodes: list[dict]) -> float: - """Tier-1 + Tier-2 name clustering; returns 1 - clusters/count. - - Tier 1: same normalized id (lower-case) merges nodes into one cluster. - Tier 2: name token-Jaccard >= 0.6 merges nodes into one cluster. - - Report-only: no gate flag fires on any threshold. 
- """ - n = len(nodes) - if n < 2: - return 0.0 - - group = list(range(n)) - - def _root(i: int) -> int: - while group[i] != i: - group[i] = group[group[i]] - i = group[i] - return i - - seen: dict[str, int] = {} - for i, node in enumerate(nodes): - nid = node.get("id", "").lower() - if nid in seen: - group[_root(i)] = _root(seen[nid]) - else: - seen[nid] = i - - toks = [frozenset(node.get("name", "").lower().split()) for node in nodes] - for i in range(n): - for j in range(i + 1, n): - a, b = toks[i], toks[j] - union_ab = a | b - if union_ab and len(a & b) / len(union_ab) >= 0.6: - group[_root(i)] = _root(j) - - clusters = len({_root(i) for i in range(n)}) - return round(1 - clusters / n, 4) - - -def g1_structural( - result: dict, - registry: Registry, - registry_path: str, - *, - pii_list: list[str] | None = None, - corpus: Corpus | None = None, -) -> GateResult: - """G1 — Structural & Safe (hard veto). - - Checks (in order): - 1. EMPTY_MUST_FIND — must run first; kills the fake-100%-champion bug. - 2. Registry SHA-256 pin: loaded Registry matches the file on disk. - 3. Corpus SHA-256 pin (when a corpus is supplied): same drift guard for - the evidence universe (CORPUS_DRIFT). - 4. Required top-level keys present in result. - 5. PII non-disclosure: no corpus PII name in any finding/report text. 
- """ - # Guard 1: empty registry (fake-champion guard — always first) - if not registry.real_items: - return GateResult( - gate="G1", - passed=False, - reason_code="EMPTY_MUST_FIND", - details={"message": "Registry has zero real items — cannot evaluate recall."}, - ) - - # Guard 2: registry SHA-256 pin - computed_sha = registry_sha256(registry_path) - if computed_sha != registry.sha256(): - return GateResult( - gate="G1", - passed=False, - reason_code="GOLD_DRIFT", - details={ - "message": "Registry file has changed since it was loaded.", - "expected": registry.sha256(), - "actual": computed_sha, - }, - ) - - # Guard 3: corpus SHA-256 pin (CORPUS_DRIFT — the GOLD_DRIFT twin for evidence) - if corpus is not None: - current_corpus_sha = corpus_sha256(corpus.path) - if current_corpus_sha != corpus.sha256: - return GateResult( - gate="G1", - passed=False, - reason_code="CORPUS_DRIFT", - details={ - "message": "Corpus file has changed since it was loaded.", - "expected": corpus.sha256, - "actual": current_corpus_sha, - }, - ) - - # Guard 4: required result keys - required = ("process_graph", "findings", "evidence_index") - missing = [k for k in required if k not in result] - if missing: - return GateResult( - gate="G1", - passed=False, - reason_code="SCHEMA_INVALID", - details={"missing_keys": missing}, - ) - - # Guard 5: PII check - if pii_list: - free_text: list[str] = [] - for finding in result.get("findings", []): - free_text.extend([finding.get("title", ""), finding.get("description", "")]) - for report in result.get("reports", []): - free_text.append(str(report)) - combined = " ".join(free_text).lower() - hits = [name for name in pii_list if name.lower() in combined] - if hits: - return GateResult( - gate="G1", - passed=False, - reason_code="PII_LEAK", - details={ - "message": "Corpus PII names found in findings/reports.", - "matches": hits[:5], - }, - ) - - pg = result.get("process_graph", {}) - processes = pg.get("processes", []) - activities = [a for p in 
processes for a in p.get("activities", [])] - decisions = [d for p in processes for d in p.get("decisions", [])] - dg = result.get("dependency_graph", {}) - - details = { - "registry_sha256": registry.sha256(), - "real_items": len(registry.real_items), - "nc_items": len(registry.nc_items), - "map": { - "processes": { - "count": len(processes), - "duplication_rate": _name_duplication_rate(processes), - }, - "activities": { - "count": len(activities), - "duplication_rate": _name_duplication_rate(activities), - }, - "decisions": { - "count": len(decisions), - "duplication_rate": _name_duplication_rate(decisions), - }, - "personas": { - "count": len(result.get("personas", [])), - "duplication_rate": _name_duplication_rate(result.get("personas", [])), - }, - "systems": { - "count": len(result.get("systems", [])), - "duplication_rate": _name_duplication_rate(result.get("systems", [])), - }, - "informal_channels": { - "count": len(result.get("informal_channels", [])), - "duplication_rate": _name_duplication_rate(result.get("informal_channels", [])), - }, - "dependency_graph_edges": len(dg.get("activity_edges", [])), - }, - } - if corpus is not None: - details["corpus_sha256"] = corpus.sha256 - return GateResult(gate="G1", passed=True, details=details) - - -# ── G2: Recall & Precision ─────────────────────────────────────────────────── - - -def _candidates_by_scope(result: dict) -> dict[str, list[dict]]: - """Build per-scope candidate lists from a DiscoveryResult (§4.3). - - Process candidates are augmented with their children's evidence_refs because - process nodes typically carry no own refs — the source-document guard uses the - union of the process's own refs and all its activities' and decisions' refs. - - dependency_graph-scoped items are relation items (all carry from/to) and are - matched via matcher.matches_dependency_graph_relation() — not through per-candidate - iteration — so no "dependency_graph" key is included here. 
- """ - pg = result.get("process_graph", {}) - processes = pg.get("processes", []) - - def _merge_refs(proc: dict) -> dict: - children_refs = [ - ref - for child_list in (proc.get("activities", []), proc.get("decisions", [])) - for child in child_list - for ref in child.get("evidence_refs", []) - ] - return {**proc, "evidence_refs": list(proc.get("evidence_refs", [])) + children_refs} - - return { - "process": [_merge_refs(p) for p in processes], - "activity": [a for p in processes for a in p.get("activities", [])], - "decision": [d for p in processes for d in p.get("decisions", [])], - "finding": result.get("findings", []), - "action": result.get("proposed_actions", []), - "persona": result.get("personas", []), - "system": result.get("systems", []), - "informal_channel": result.get("informal_channels", []), - } - - -def _weighted_recall(scored_items: list, hits: dict[str, bool]) -> float: - """Weighted recall of a hit map over the scored (non-L3) items.""" - total_weight = sum(item.weight for item in scored_items) or 1.0 - weighted_hit = sum(item.weight for item in scored_items if hits[item.id]) - return weighted_hit / total_weight - - -def _finding_redundancy_rate(findings: list[dict]) -> float: - """Fraction of findings that are near-duplicates of another (Jaccard ≥0.6 on ≥5-char tokens).""" - if len(findings) < 2: - return 0.0 - def _tok(text: str) -> frozenset[str]: - return frozenset(t.lower() for t in text.split() if len(t) >= 5) - token_sets = [_tok(f.get("description", "")) for f in findings] - in_redundant: set[int] = set() - for i in range(len(token_sets)): - for j in range(i + 1, len(token_sets)): - a, b = token_sets[i], token_sets[j] - union = a | b - sim = len(a & b) / len(union) if union else 1.0 - if sim >= 0.6: - in_redundant.add(i) - in_redundant.add(j) - return round(len(in_redundant) / len(findings), 4) - - -def g2_recall_precision( - result: dict, - registry: Registry, - *, - recall_floor: float = 0.70, - embed_fn=None, - tau: float = 0.70, - 
tau_nc: float = 0.85, - recall_metric: str = "lexical", - corpus: Corpus | None = None, -) -> GateResult: - """G2 — Recall & Precision (hard veto). - - - L0 miss -> BLOCK (zeros the evaluation; regulatory-mandatory item absent) - - NC hit -> BLOCK (precision failure; plausible-but-false item was emitted) - - recall < floor -> BLOCK - - With a ``corpus``, evidence entries that fail verification (fabricated - excerpt or unknown source) are excluded from the evidence index before - matching, so the shared-source guard only accepts citations to real - corpus documents — a fabricated locator cannot satisfy any item. - - ``recall_metric`` ("lexical"/"semantic"/"hybrid") selects which hit map GATES. - "lexical" is matcher.matches (shared-source + topic-anchored token overlap) and - needs no embedder. "semantic"/"hybrid" add the embedding path (matcher.semantic_hits, - threshold ``tau`` for real items, ``tau_nc`` for NC items) and REQUIRE ``embed_fn`` - — passing them without one raises ValueError (use "lexical" for the offline path). - When an embedder is supplied, all three recalls (lexical/semantic/hybrid) are - reported in details regardless of which one gates. - """ - evidence_index = _build_evidence_index(result, corpus) - candidates = _candidates_by_scope(result) - findings = candidates["finding"] - - # NC items anchor via the embedding path only (§6.2): a correct finding about - # the true mirror fact shares vocabulary with the false description, so a - # token or keyword match would falsely convict it. Lexical NC is always False. - # dependency_graph relation items (those with from_node) use the endpoint - # matcher (§5.3b) instead of the per-candidate text predicate. 
- lexical: dict[str, bool] = {} - for item in registry.items: - if item.tier == "NC": - lexical[item.id] = False - elif item.scope == "dependency_graph" and item.from_node: - lexical[item.id] = matcher.matches_dependency_graph_relation( - item, result, evidence_index - ) - else: - lexical[item.id] = any( - matches(c, item, evidence_index, scope=scope) - for scope in matcher.allowed_scopes(item) - for c in candidates.get(scope, []) - ) - - if recall_metric not in ("lexical", "semantic", "hybrid"): - raise ValueError(f"unknown recall_metric {recall_metric!r}") - if recall_metric in ("semantic", "hybrid") and embed_fn is None: - raise ValueError( - f"recall_metric={recall_metric!r} requires an embedder; pass embed_fn" - ) - - if embed_fn is not None: - semantic = matcher.semantic_hits( - candidates, registry.items, evidence_index, embed_fn, tau, tau_nc=tau_nc - ) - # dependency_graph relation items have no embedding candidates (§5.3b uses - # the endpoint matcher, not per-candidate text embeddings); mirror the - # lexical result so semantic/hybrid never under-credits them. 
- for item in registry.items: - if item.scope == "dependency_graph" and item.from_node: - semantic[item.id] = lexical[item.id] - else: - semantic = None - - metric = recall_metric - - if semantic is None or metric == "lexical": - hits = lexical - elif metric == "semantic": - hits = semantic - else: # hybrid - hits = {iid: lexical[iid] or semantic[iid] for iid in lexical} - - # Signal-to-noise panel — report-only, §6.2 item 3 - finding_count = len(findings) - finding_scoped_items = [i for i in registry.real_items if i.scope == "finding"] - findings_matched = sum( - 1 for f in findings - if any(matches(f, item, evidence_index, scope="finding") for item in finding_scoped_items) - ) - _sn = { - "finding_count": finding_count, - "findings_matched_to_registry": { - "count": findings_matched, - "fraction": round(findings_matched / finding_count, 4) if finding_count else 0.0, - }, - "finding_redundancy_rate": _finding_redundancy_rate(findings), - } - if corpus is not None: - excluded = len(_build_evidence_index(result)) - len(evidence_index) - _sn["evidence_entries_excluded_unverified"] = excluded - - # L0 misses - l0_misses = [item.id for item in registry.l0_items if not hits[item.id]] - if l0_misses: - return GateResult( - gate="G2", - passed=False, - reason_code="L0_MISSING", - details={ - "l0_misses": l0_misses, - "message": "Regulatory-mandatory items not found — evaluation zeroed.", - **_sn, - }, - ) - - # NC precision - nc_hits = [item.id for item in registry.nc_items if hits[item.id]] - if nc_hits: - return GateResult( - gate="G2", - passed=False, - reason_code="NC_HIT", - details={ - "nc_hits": nc_hits, - "message": "Plausible-but-false negative control items were matched — precision failure.", - **_sn, - }, - ) - - # Weighted recall — over scored items only (L0/L1/L2). L3 is a bonus tier - # ("extra credit"): an L3 miss must not lower recall, so L3 is excluded from - # the denominator and only reported in per_tier below. 
Recall is computed over - # the GATING hit map so the gate is internally consistent with the chosen metric. - real_items = registry.real_items - scored_items = [item for item in real_items if item.tier != "L3"] - recall = _weighted_recall(scored_items, hits) - - per_tier: dict[str, dict] = {} - for tier in ("L0", "L1", "L2", "L3"): - tier_items = [i for i in real_items if i.tier == tier] - if not tier_items: - continue - per_tier[tier] = { - "hit": sum(1 for i in tier_items if hits[i.id]), - "total": len(tier_items), - } - - def _semantic_details() -> dict: - """The extra recall-breakdown keys, only emitted when an embedder is given.""" - if semantic is None: - return {} - return { - "lexical_recall": round(_weighted_recall(scored_items, lexical), 4), - "semantic_recall": round(_weighted_recall(scored_items, semantic), 4), - "hybrid_recall": round( - _weighted_recall( - scored_items, {iid: lexical[iid] or semantic[iid] for iid in lexical} - ), - 4, - ), - "tau": tau, - } - - if recall < recall_floor: - return GateResult( - gate="G2", - passed=False, - reason_code="RECALL_BELOW_FLOOR", - details={ - "recall": round(recall, 4), - "recall_metric": metric, - "floor": recall_floor, - "per_tier": per_tier, - "misses": [item.id for item in scored_items if not hits[item.id]], - **_semantic_details(), - **_sn, - }, - ) - - return GateResult( - gate="G2", - passed=True, - details={ - "recall": round(recall, 4), - "recall_metric": metric, - "floor": recall_floor, - "per_tier": per_tier, - "nc_items_checked": len(registry.nc_items), - **_semantic_details(), - **_sn, - }, - ) - - -# ── G3: Grounded ───────────────────────────────────────────────────────────── - - -def g3_grounded( - result: dict, - *, - grounding_floor: float = 0.90, - human_spot_check_n: int = 5, - corpus: Corpus | None = None, -) -> GateResult: - """G3 — Grounded (automated portion; human spot-check triggered on pass). 
- - For each finding, verifies that at least one cited evidence excerpt shares a - non-trivial token with the finding description (topic-anchoring). - - With a ``corpus``, the gate also looks in a third direction — cited -> - exists: every evidence entry is verified against the actual corpus text - (corpus.verify_entry). A populated excerpt not found in its cited source - raises EVIDENCE_FABRICATED; a locator resolving to no corpus document - raises EVIDENCE_SOURCE_UNKNOWN; and only verified excerpts can ground a - finding, so a run cannot ground itself on evidence it invented. - - Also reports excerpt fill rate and source coverage so the reviewer can tell - whether ungrounded findings are a format problem (empty excerpts) or a real - faithfulness signal (populated excerpts that do not anchor). - - Known limitation: topic-anchoring, not claim entailment. A '45 days' claim - cited to a '3 days' source passes if they share the process name (excerpt - verification confirms the quote is real, not that the claim matches it). - The human spot-check is the binding faithfulness signal until NLI/AIS lands. - """ - evidence_index = _build_evidence_index(result) - findings = result.get("findings", []) - statuses = verify_evidence_index(corpus, result) if corpus is not None else None - - if not findings: - return GateResult( - gate="G3", - passed=False, - reason_code="NO_FINDINGS", - details={"message": "Result has zero findings — cannot compute grounding."}, - ) - - grounded_ids: list[str] = [] - # Ungrounded split (§6.3): distinguish format issues from real faithfulness failures. - ungrounded_empty_only: list[str] = [] # every ref had an empty excerpt - ungrounded_populated: list[str] = [] # had populated excerpt(s) but none anchored - - # Excerpt fill: count all resolved refs and how many carry a non-empty excerpt. - total_refs = 0 - populated_refs = 0 - - # Source coverage: which source stems are cited by at least one finding. 
- cited_stems: set[str] = set() - - for finding in findings: - fid = finding.get("id", "?") - desc = finding.get("description", "") - is_grounded = False - had_populated = False - for ref in finding.get("evidence_refs", []): - ev = evidence_index.get(ref.get("evidence_id", "")) - if ev: - total_refs += 1 - excerpt = ev.get("excerpt") or "" - if excerpt: - populated_refs += 1 - had_populated = True - # Track source coverage (even for ungrounded findings). - stem = matcher.source_stem(ev.get("locator", "")) - if stem: - cited_stems.add(stem) - # Only a corpus-verified excerpt can ground a finding. - if statuses is not None and statuses.get(ev.get("id")) != VERIFIED: - continue - if anchored(desc, excerpt): - is_grounded = True - break - if is_grounded: - grounded_ids.append(fid) - elif had_populated: - ungrounded_populated.append(fid) - else: - ungrounded_empty_only.append(fid) - - grounding_pct = len(grounded_ids) / len(findings) - - # All source stems present in the evidence index (not just those cited). 
- all_stems: set[str] = set() - for ev in result.get("evidence_index", []): - stem = matcher.source_stem(ev.get("locator", "")) - if stem: - all_stems.add(stem) - orphaned = sorted(all_stems - cited_stems) - - excerpt_fill = f"{populated_refs}/{total_refs}" if total_refs else "0/0" - source_coverage = f"{len(cited_stems)}/{len(all_stems)}" if all_stems else "0/0" - - details = { - "grounding_pct": round(grounding_pct, 4), - "grounded": len(grounded_ids), - "total": len(findings), - "excerpt_fill": excerpt_fill, - "source_coverage": source_coverage, - "orphaned_sources": orphaned, - } - - fabricated_ids: list[str] = [] - unknown_source_ids: list[str] = [] - if statuses is not None: - fabricated_ids = sorted(e for e, s in statuses.items() if s == FABRICATED) - unknown_source_ids = sorted(e for e, s in statuses.items() if s == SOURCE_UNKNOWN) - details["evidence_verification"] = { - "entries": len(statuses), - "verified": sum(1 for s in statuses.values() if s == VERIFIED), - "empty_excerpt": sum(1 for s in statuses.values() if s == EMPTY), - "fabricated": fabricated_ids, - "source_unknown": unknown_source_ids, - } - - if fabricated_ids: - details["message"] = ( - "Populated excerpt(s) not found in the cited corpus document — " - "the run asserts evidence the source does not contain." - ) - return GateResult( - gate="G3", passed=False, reason_code="EVIDENCE_FABRICATED", details=details - ) - - if unknown_source_ids: - details["message"] = ( - "Evidence locator(s) resolve to no corpus document — either the " - "corpus bundle is incomplete or the run invented a source." 
- ) - return GateResult( - gate="G3", passed=False, reason_code="EVIDENCE_SOURCE_UNKNOWN", details=details - ) - - if grounding_pct < grounding_floor: - details["floor"] = grounding_floor - details["ungrounded_with_populated_excerpts"] = ungrounded_populated - details["ungrounded_with_empty_excerpts_only"] = ungrounded_empty_only - return GateResult(gate="G3", passed=False, reason_code="UNGROUNDED", details=details) - - spot_n = min(human_spot_check_n, len(findings)) - details["human_spot_check"] = ( - f"ACTION REQUIRED: manually review {spot_n} sampled findings for " - "field-consistency, citation-accuracy, and client-readiness. " - "This is the binding faithfulness signal until NLI/AIS lands." - ) - return GateResult(gate="G3", passed=True, details=details) - - -# ── G5: No-regression / promotion (human decision) ─────────────────────────── - - -def g5_no_regression( - candidate_scores: dict[str, float], - champion_scores: dict[str, float] | None, - aa_noise: dict[str, float] | None, - *, - is_day_zero: bool = False, - human_signed_off: bool = False, - signoff_count: int = 0, -) -> GateResult: - """G5 — No-regression / promotion gate (human decision). - - Day-Zero: no champion exists. Requires G1-G3 pass + 2 independent sign-offs. - Normal promotion: candidate must beat champion by > aa_noise on every metric, - no guardrail regresses, + 1 human sign-off. - - Champions are per-corpus. Do not compare across corpora. - """ - if is_day_zero or champion_scores is None: - required = 2 - if signoff_count < required: - return GateResult( - gate="G5", - passed=False, - reason_code="HOLD", - details={ - "reason": ( - f"Day-Zero requires {required} independent human sign-offs " - f"(kappa >= 0.70); got {signoff_count}." 
- ), - "action": "Collect sign-offs, then re-run with --day-zero --signoffs 2", - }, - ) - return GateResult( - gate="G5", - passed=True, - details={"day_zero": True, "signoffs": signoff_count}, - ) - - if not human_signed_off: - return GateResult( - gate="G5", - passed=False, - reason_code="HOLD", - details={"reason": "Human sign-off required for promotion."}, - ) - - noise = aa_noise or {} - regressions: list[str] = [] - improvements: list[str] = [] - - for metric, cand_val in candidate_scores.items(): - champ_val = champion_scores.get(metric) - if champ_val is None: - continue - delta = cand_val - champ_val - band = noise.get(metric, 0.0) - if delta < -band: - regressions.append( - f"{metric}: candidate={cand_val:.4f} champion={champ_val:.4f} " - f"delta={delta:+.4f} < -band={-band:.4f}" - ) - elif delta > band: - improvements.append(f"{metric}: delta={delta:+.4f} > band={band:.4f}") - - if regressions: - return GateResult( - gate="G5", - passed=False, - reason_code="HOLD", - details={ - "regressions": regressions, - "improvements": improvements, - "message": "Guardrail metric(s) regressed beyond A/A noise band.", - }, - ) - - return GateResult( - gate="G5", - passed=True, - details={"improvements": improvements, "noise_band": noise}, - ) - - -# ── Full gate pipeline ──────────────────────────────────────────────────────── - - -def run_gates( - result: dict, - registry: Registry, - registry_path: str, - *, - pii_list: list[str] | None = None, - recall_floor: float = 0.70, - grounding_floor: float = 0.90, - champion_scores: dict[str, float] | None = None, - aa_noise: dict[str, float] | None = None, - is_day_zero: bool = False, - human_signed_off: bool = False, - signoff_count: int = 0, - embed_fn=None, - tau: float = 0.70, - tau_nc: float = 0.85, - recall_metric: str = "lexical", - corpus: Corpus | None = None, -) -> list[GateResult]: - """Run all gates G1 -> G2 -> G3 -> G5; every gate always executes. 
- - A failed gate raises a flag in its GateResult but never prevents the - remaining gates from running. The scorecard therefore always carries the - complete picture: a run that misses a regulatory item *and* grounds poorly - shows both flags. See EVALUATION_FRAMEWORK.md §2 ('No gate vetoes'). - - ``corpus`` (optional) enables deterministic evidence verification: G1 pins - the corpus hash, G2 ignores unverified evidence entries, and G3 flags - fabricated excerpts and unknown sources. Without it, evidence is taken at - face value from the run's own evidence_index (disclosed on the scorecard). - - Returns all four GateResult objects. - """ - g1 = g1_structural(result, registry, registry_path, pii_list=pii_list, corpus=corpus) - - g2 = g2_recall_precision( - result, - registry, - recall_floor=recall_floor, - embed_fn=embed_fn, - tau=tau, - tau_nc=tau_nc, - recall_metric=recall_metric, - corpus=corpus, - ) - - g3 = g3_grounded(result, grounding_floor=grounding_floor, corpus=corpus) - - # G5 uses whatever scores G2/G3 produced; 0.0 when a gate flagged and did - # not emit the metric (e.g. L0_MISSING returns before computing recall). 
- candidate_scores = { - "recall": g2.details.get("recall", 0.0), - "grounding_pct": g3.details.get("grounding_pct", 0.0), - } - g5 = g5_no_regression( - candidate_scores, - champion_scores, - aa_noise, - is_day_zero=is_day_zero, - human_signed_off=human_signed_off, - signoff_count=signoff_count, - ) - - return [g1, g2, g3, g5] From 38c3f60f5109d559c6fe385c1b12eea878282f2e Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:23 +0200 Subject: [PATCH 17/67] chore(evaluation): delete corpus.py --- fireflyframework_agentic/evaluation/corpus.py | 185 ------------------ 1 file changed, 185 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/corpus.py diff --git a/fireflyframework_agentic/evaluation/corpus.py b/fireflyframework_agentic/evaluation/corpus.py deleted file mode 100644 index 32835f2c..00000000 --- a/fireflyframework_agentic/evaluation/corpus.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Corpus loading and evidence verification (EVALUATION_FRAMEWORK.md §6.3). - -The corpus is the third pinned evaluation input, next to the DiscoveryResult -and the registry: the raw document bundle (input.json) the discovery pipeline -read. It is the trusted side of every evidence anchor — the registry tells -the evaluator what *should* be found; only the corpus can tell it whether what -a run cited is *real*. 
- -verify_entry() closes the fabricated-evidence channel: a run controls every -byte of its own evidence_index, so any check computable from (result, registry) -alone can be satisfied by self-reported evidence. Checking each excerpt -against the actual corpus text is the only deterministic counter. - -Excerpt contract: excerpts are verbatim quotes from the source document. -Spliced quotes (fragments joined with '...' or '…') are supported — each -fragment is verified independently. Paraphrase belongs in the finding -description, never in an excerpt. -""" - -from __future__ import annotations - -import base64 -import difflib -import hashlib -import json -import re -import unicodedata -from dataclasses import dataclass -from pathlib import Path - -from fireflyframework_agentic.evaluation.matcher import source_stem - -# Verification statuses for one evidence_index entry. -VERIFIED = "verified" # excerpt found (verbatim or spliced) in the cited source -EMPTY = "empty" # entry carries no excerpt text — nothing to verify -SOURCE_UNKNOWN = "source_unknown" # locator resolves to no corpus document -FABRICATED = "fabricated" # populated excerpt not found in the cited source - -# A spliced excerpt is split on these joiners; fragments shorter than -# _MIN_FRAGMENT_CHARS are too generic to verify and are skipped. -_SPLICE_PATTERN = re.compile(r"\.\.\.|…| -- ") -_MIN_FRAGMENT_CHARS = 15 - -# A fragment passes fuzzily when matching blocks (>= _MIN_BLOCK_CHARS chars) -# cover at least _COVERAGE_THRESHOLD of it — tolerates punctuation/whitespace -# drift while rejecting invented text (measured ~0.10-0.32 coverage). -_COVERAGE_THRESHOLD = 0.85 -_MIN_BLOCK_CHARS = 4 - - -@dataclass -class Corpus: - """The decoded, normalized corpus: {source stem: normalized text}. - - sha256 pins the corpus file exactly like the registry pin (§4.6): the - champion record stores it, and G1 re-hashes the file at scoring time to - flag CORPUS_DRIFT. 
- """ - - texts: dict[str, str] - sha256: str - path: str - - -def normalize(text: str) -> str: - """Normalize text for excerpt matching: NFKC, strip markdown emphasis and - smart quotes, collapse whitespace, casefold.""" - text = unicodedata.normalize("NFKC", text) - text = text.replace("**", "").replace("*", "") - text = re.sub(r"[\"""''']", "", text) - return re.sub(r"\s+", " ", text).strip().casefold() - - -def corpus_sha256(path: str | Path) -> str: - """SHA-256 of the corpus file on disk (the CORPUS_DRIFT re-hash).""" - return hashlib.sha256(Path(path).read_bytes()).hexdigest() - - -def load_corpus(path: str | Path) -> Corpus: - """Load a FlyRadar input.json bundle into a stem-indexed normalized Corpus. - - Decodes every artifacts[] file and signals[] event log (base64), normalizes - the text, and keys each by the same source_stem the matcher uses — so a - locator in any convention resolves to its document. - - Raises: - ValueError: when the bundle contains no documents, or two documents - reduce to the same stem (a collision would let a fabricated - citation resolve against the wrong real file). 
- """ - path = Path(path) - raw = json.loads(path.read_text(encoding="utf-8")) - - named_contents: list[tuple[str, str]] = [] - for artifact in raw.get("artifacts", []): - named_contents.append((artifact["filename"], artifact["content_base64"])) - for signal in raw.get("signals", []): - named_contents.append((signal["name"], signal["content_base64"])) - - if not named_contents: - raise ValueError(f"corpus bundle {path} contains no artifacts or signals") - - texts: dict[str, str] = {} - for name, content_b64 in named_contents: - stem = source_stem(name) - if stem in texts: - raise ValueError( - f"corpus stem collision: two documents reduce to {stem!r} — " - "rename one; a collision would verify citations against the wrong file" - ) - decoded = base64.b64decode(content_b64).decode("utf-8", errors="replace") - texts[stem] = normalize(decoded) - - return Corpus(texts=texts, sha256=corpus_sha256(path), path=str(path)) - - -def _fragment_coverage(fragment: str, source: str) -> float: - """Fraction of fragment covered by matching blocks of >= _MIN_BLOCK_CHARS chars.""" - blocks = difflib.SequenceMatcher( - None, fragment, source, autojunk=False - ).get_matching_blocks() - covered = sum(b.size for b in blocks if b.size >= _MIN_BLOCK_CHARS) - return covered / len(fragment) - - -def verify_entry(corpus: Corpus, entry: dict) -> str: - """Verify one evidence_index entry against the corpus. - - Returns one of VERIFIED / EMPTY / SOURCE_UNKNOWN / FABRICATED: - - the locator must resolve (by source stem) to a corpus document, and - - every fragment of the excerpt must appear in that document's text, - verbatim after normalization or with matching-block coverage >= - _COVERAGE_THRESHOLD. - - The score is the minimum over fragments, so one invented fragment sinks a - spliced excerpt. 
- - """ - stem = source_stem(entry.get("locator", "")) - source = corpus.texts.get(stem) - if source is None: - return SOURCE_UNKNOWN - - excerpt = normalize(entry.get("excerpt") or "") - if not excerpt: - return EMPTY - - fragments = [ - f.strip() - for f in _SPLICE_PATTERN.split(excerpt) - if len(f.strip()) >= _MIN_FRAGMENT_CHARS - ] or [excerpt] - - for fragment in fragments: - if fragment in source: - continue - if _fragment_coverage(fragment, source) < _COVERAGE_THRESHOLD: - return FABRICATED - return VERIFIED - - -def verify_evidence_index(corpus: Corpus, result: dict) -> dict[str, str]: - """Verify every evidence_index entry of a DiscoveryResult. - - Returns {evidence_id: status} over all entries — referenced or not — so - the gates share one verification pass. - """ - return { - ev["id"]: verify_entry(corpus, ev) - for ev in result.get("evidence_index", []) - if ev.get("id") - } From f81992336b5b0932a3d65c3acfc2b8a439a27a1b Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:30 +0200 Subject: [PATCH 18/67] chore(evaluation): delete registry.py --- .../evaluation/registry.py | 214 ------------------ 1 file changed, 214 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/registry.py diff --git a/fireflyframework_agentic/evaluation/registry.py b/fireflyframework_agentic/evaluation/registry.py deleted file mode 100644 index 2b869ba9..00000000 --- a/fireflyframework_agentic/evaluation/registry.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""lean-1 registry loader — one schema for all four corpora. - -Replaces the four mutually incompatible schemes in use today (L1-L5, -documented/observed/pain-point, critical/important, and no tiers). -Loader enforces all invariants; they are not documentation. - -Invariants (EVALUATION_FRAMEWORK.md, the must-find registry): -- schema_version == "lean-1" -- every tier is one of L0 L1 L2 L3 NC -- negative_control_count >= ceil(real_items / 10) -- kappa present (0.0 placeholder allowed; G2 advisory until >= 0.70) -- ABANCA DILO items must target a single measured sub-population -""" -from __future__ import annotations - -import hashlib -import json -import math -from dataclasses import dataclass, field -from pathlib import Path -from typing import Literal - -VALID_TIERS = ("L0", "L1", "L2", "L3", "NC") -VALID_SCOPES = ( - "process", "activity", "decision", "finding", "action", - "persona", "system", "informal_channel", "dependency_graph", -) -SCHEMA_VERSION = "lean-1" -KAPPA_ADVISORY_THRESHOLD = 0.70 - - -@dataclass(frozen=True) -class RegistryItem: - id: str - tier: Literal["L0", "L1", "L2", "L3", "NC"] - description: str - evidence: list[str] # source file paths (path portion of locator, no #page=N) - scope: str = "finding" # which DiscoveryResult surface to match against (§4.3) - keywords: list[str] = field(default_factory=list) - weight: float = 1.0 - from_node: str = "" # dependency_graph relation items only - to_node: str = "" # dependency_graph relation items only - relation: str = "" # defaults to "precedes" when from/to present - - -@dataclass(frozen=True) -class Registry: - schema_version: str - corpus: str - author: str - date: str - kappa: float - items: list[RegistryItem] - _sha256: str = field(default="", compare=False) - - @property - def real_items(self) -> list[RegistryItem]: - return [i for i in self.items if i.tier != "NC"] - - @property - def 
nc_items(self) -> list[RegistryItem]: - return [i for i in self.items if i.tier == "NC"] - - @property - def l0_items(self) -> list[RegistryItem]: - return [i for i in self.items if i.tier == "L0"] - - def is_kappa_advisory(self) -> bool: - return self.kappa < KAPPA_ADVISORY_THRESHOLD - - def sha256(self) -> str: - return self._sha256 - - -def _validate(raw: dict, path: Path) -> None: - if raw.get("schema_version") != SCHEMA_VERSION: - raise ValueError( - f"{path.name}: schema_version must be '{SCHEMA_VERSION}', " - f"got {raw.get('schema_version')!r}" - ) - for fname in ("corpus", "author", "date"): - if not raw.get(fname): - raise ValueError(f"{path.name}: missing required field '{fname}'") - if "kappa" not in raw: - raise ValueError(f"{path.name}: missing 'kappa' field (use 0.0 as placeholder)") - - items = raw.get("items", []) - - # EMPTY_MUST_FIND guard — must be first; kills fake-champion bug - if not items: - raise ValueError( - f"{path.name}: EMPTY_MUST_FIND — items list is empty; " - "cannot evaluate recall. This guard exists to prevent the " - "fake-100%-champion failure." 
- ) - - ids = [it.get("id") for it in items] - if len(ids) != len(set(ids)): - dupes = sorted({i for i in ids if ids.count(i) > 1}) - raise ValueError(f"{path.name}: duplicate item ids: {dupes}") - - for it in items: - tier = it.get("tier") - if tier not in VALID_TIERS: - raise ValueError( - f"{path.name}: item '{it.get('id')}' has invalid tier '{tier}'; " - f"must be one of {VALID_TIERS}" - ) - scope = it.get("scope", "finding") - if scope not in VALID_SCOPES: - raise ValueError( - f"{path.name}: item '{it.get('id')}' has invalid scope '{scope}'; " - f"must be one of {VALID_SCOPES}" - ) - if scope == "dependency_graph": - if not it.get("from") or not it.get("to"): - raise ValueError( - f"{path.name}: dependency_graph item '{it.get('id')}' must have " - "non-empty 'from' and 'to'" - ) - else: - if "from" in it or "to" in it or "relation" in it: - raise ValueError( - f"{path.name}: item '{it.get('id')}' has 'from'/'to'/'relation' " - f"but scope is '{scope}'; these fields are only valid on " - "dependency_graph-scoped items" - ) - - real_count = sum(1 for it in items if it.get("tier") != "NC") - nc_count = sum(1 for it in items if it.get("tier") == "NC") - required_nc = max(1, math.ceil(real_count / 10)) - if nc_count < required_nc: - raise ValueError( - f"{path.name}: NC density too low — {nc_count} NC item(s) for " - f"{real_count} real items; need >= {required_nc} (ceil(real/10)). " - "Without NC items the eval measures recall only; a verbose hallucinator " - "scores perfectly." - ) - - # ABANCA DILO blend guard: items must assert a single sub-population target. - # Checks for phrases that would indicate a blended numeric target is asserted. - # "blend" alone is too broad (items may reference it negatively). 
- BLEND_PHRASES = ("combined distribution", "across all offices regardless of segment") - for it in items: - if it.get("tier") == "NC": - continue - desc = it.get("description", "").lower() - iid = it.get("id", "") - if any(phrase in desc for phrase in BLEND_PHRASES): - raise ValueError( - f"{path.name}: item '{iid}' description targets a blended distribution; " - "ABANCA DILO items must target a single measured sub-population " - "(Empresas or PyMEs). Use segment-keyed items: " - "dilo-empresas-operativa-42pct AND dilo-pymes-operativa-29pct separately." - ) - - -def _compute_sha256(path: Path) -> str: - return hashlib.sha256(path.read_bytes()).hexdigest() - - -def load_registry(path: str | Path) -> Registry: - """Load and validate a lean-1 registry file. - - Raises ValueError with a descriptive message on any invariant violation. - The EMPTY_MUST_FIND check runs first — it is the fake-champion guard. - """ - path = Path(path) - raw = json.loads(path.read_text(encoding="utf-8")) - _validate(raw, path) - sha = _compute_sha256(path) - - items = [ - RegistryItem( - id=it["id"], - tier=it["tier"], - scope=it.get("scope", "finding"), - description=it.get("description", ""), - evidence=it.get("evidence", []), - keywords=it.get("keywords", []), - weight=float(it.get("weight", 1.0)), - from_node=it.get("from", "") if it.get("scope") == "dependency_graph" else "", - to_node=it.get("to", "") if it.get("scope") == "dependency_graph" else "", - relation=it.get("relation", "precedes") if it.get("scope") == "dependency_graph" else "", - ) - for it in raw["items"] - ] - - return Registry( - schema_version=raw["schema_version"], - corpus=raw["corpus"], - author=raw["author"], - date=raw["date"], - kappa=float(raw["kappa"] or 0.0), - items=items, - _sha256=sha, - ) - - -def registry_sha256(path: str | Path) -> str: - return _compute_sha256(Path(path)) From 3bc07861bafefab41c04e6e4697be779097b3f49 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:36 +0200 
Subject: [PATCH 19/67] chore(evaluation): delete matcher.py --- .../evaluation/matcher.py | 369 ------------------ 1 file changed, 369 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/matcher.py diff --git a/fireflyframework_agentic/evaluation/matcher.py b/fireflyframework_agentic/evaluation/matcher.py deleted file mode 100644 index b4d81f44..00000000 --- a/fireflyframework_agentic/evaluation/matcher.py +++ /dev/null @@ -1,369 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Single matching primitive reused across G2 (recall/precision) and G3 (grounding). - -anchored() is topic-level lexical overlap. matches() is the gate predicate. -One function, three uses — do not write three matching functions. - -Known limitation (EVALUATION_FRAMEWORK.md): anchored() is topic-anchored, not claim-verified. -A '45 days' claim cited to a '3 days' source passes if they share the process name. -Real claim entailment (NLI/AIS) is Phase 2. The G3 human spot-check is the -binding faithfulness signal until then. -""" - -from __future__ import annotations - -import re - -import numpy as np - -from fireflyframework_agentic.evaluation.judge_client import cosine - - -def tokens(text: str) -> list[str]: - return re.findall(r"\b\w+\b", text.lower()) - - -def anchored(claim: str, evidence: str, *, min_token: int = 5) -> bool: - """True if claim and evidence share at least one non-trivial token (>= min_token chars). 
- - Rejects a citation to an unrelated document. Does NOT verify the claim value — - that gap is closed by the deferred NLI/AIS check in Phase 2. - """ - a = {t for t in tokens(claim) if len(t) >= min_token} - b = {t for t in tokens(evidence) if len(t) >= min_token} - return bool(a & b) - - -def source_stem(locator: str) -> str: - """Normalize a locator/source path to a stable document stem for matching. - - Robust to the two locator conventions observed across runs: - - directory-prefixed ('sops/SOP-002-kyc-edd.md') and bare ('SOP-002-kyc-edd.md') - both reduce to 'sop-002-kyc-edd'; - - event-log row ids ('src-credit-underwriting:CU-2026-1003') reduce to the - process stem 'credit-underwriting', so they join the CSV the registry cites. - - Preserves the same-document anti-gaming property of matches(): it still keys - on which source document a finding cites — just independent of directory - prefix, file extension, and case, so one registry scores every run. - """ - s = locator.split("#")[0] # drop the locator fragment (#page=N, #anchor) - s = s.rsplit("/", 1)[-1] # basename — strip any directory prefix - if s.startswith("src-") and ":" in s: # event-log row id: src-: - return s.split(":", 1)[0][len("src-") :].lower() - if "." in s: # strip a trailing file extension - s = s.rsplit(".", 1)[0] - return s.lower() - - -def _finding_sources(finding: dict, evidence_index: dict[str, dict]) -> set[str]: - """Return the set of normalized source-document stems cited by a finding.""" - sources: set[str] = set() - for ref in finding.get("evidence_refs", []): - ev = evidence_index.get(ref.get("evidence_id", "")) - if ev: - stem = source_stem(ev.get("locator", "")) - if stem: - sources.add(stem) - return sources - - -def shares_source(finding: dict, item, evidence_index: dict[str, dict]) -> bool: - """True iff the finding cites at least one source document the item lists as evidence. 
- - Source documents are compared by normalized stem (source_stem) so one registry - scores every run regardless of locator convention. This is the anti-gaming - anchor reused by both the lexical predicate (matches) and the semantic path - (semantic_hits): a finding on a different document cannot satisfy this item. - - Spec-style NC items list their mirror source (§4.1); legacy NC items carry - evidence=[], which makes this always False for them. - - Args: - finding: dict from DiscoveryResult.findings[i] (model_dump output). - item: RegistryItem dataclass from registry.py. - evidence_index: {evidence_id: Evidence dict} built from result['evidence_index']. - """ - finding_sources = _finding_sources(finding, evidence_index) - item_sources = {source_stem(e) for e in item.evidence} - return bool(finding_sources & item_sources) - - -def _keyword_anchored(desc: str, keywords: list[str]) -> bool: - """True iff any keyword appears as a whole word in desc (case-insensitive). - - Keyword rail: exempt from the 5-char token floor so short banking terms - (KYC, PEP, AML) can anchor a match even though they are too short for the - token rail. Whole-word matching prevents false substring hits (e.g. "risk" - inside "enterprise-risk-management"). - """ - if not keywords: - return False - desc_lower = desc.lower() - return any( - re.search(r"\b" + re.escape(kw.lower()) + r"\b", desc_lower) for kw in keywords - ) - - -def candidate_text(candidate: dict, scope: str) -> str: - """Extract the searchable text from a candidate on the given scope surface (§4.3). 
- - Each scope surface uses different fields as the match text: - - finding / action : title + description - - process / decision : name + description - - activity : name + notes + regulatory_links - - persona : name + role + goals + pain_points - - system : name + description - - informal_channel : name + usage_context + notes - - dependency_graph : name + description (diagnostic nodes; relation items bypass this) - """ - if scope in ("finding", "action"): - return " ".join(filter(None, [candidate.get("title", ""), candidate.get("description", "")])) - if scope == "activity": - rl = candidate.get("regulatory_links") or [] - rl_str = " ".join(rl) if isinstance(rl, list) else str(rl or "") - return " ".join(filter(None, [candidate.get("name", ""), candidate.get("notes", ""), rl_str])) - if scope == "persona": - goals = candidate.get("goals") or [] - pain = candidate.get("pain_points") or [] - goals_str = " ".join(goals) if isinstance(goals, list) else str(goals) - pain_str = " ".join(pain) if isinstance(pain, list) else str(pain) - return " ".join(filter(None, [ - candidate.get("name", ""), - candidate.get("role", ""), - goals_str, - pain_str, - ])) - if scope == "informal_channel": - return " ".join(filter(None, [ - candidate.get("name", ""), - candidate.get("usage_context", ""), - candidate.get("notes", ""), - ])) - # process, decision, system, dependency_graph (diagnostic nodes) - return " ".join(filter(None, [candidate.get("name", ""), candidate.get("description", "")])) - - -INSIGHT_ITEM_SCOPES = ("finding", "action") -INSIGHT_MATCH_SURFACES = ("finding", "action", "activity", "decision") - - -def allowed_scopes(item) -> tuple[str, ...]: - """Candidate surfaces that may satisfy a registry item. 
- - Insight items (finding / action) may be satisfied by any insight or process-graph - *leaf* surface (activity / decision): a run often grounds the same operational fact - on a different surface than the registry's scope tag anticipates (the BBVA case — - pain points the registry tags 'finding' that the run emitted as decision/activity - nodes). shares_source is still REQUIRED on every candidate (see matches / - semantic_hits), so a candidate on the wrong document never counts — cross-scope - widens WHERE we look, never the source anchor. - - Structural items (process / activity / decision) stay on their own surface: a - structural must-find requires the run to have actually built that node, not merely - mentioned the fact in a finding (test_process_scope_miss_when_no_matching_process). - NC items are likewise scope-strict — widening a negative control's pool could only - make it easier to trip (a specificity regression), never recover a legitimate hit. - - `process` is never a match surface for an insight item: _candidates_by_scope folds - every child's evidence_refs into the process node, so its citation set is a union of - many documents and shares_source goes vacuous (hence its exclusion from - INSIGHT_MATCH_SURFACES). - """ - if item.tier == "NC": - return (item.scope,) - if item.scope in INSIGHT_ITEM_SCOPES: - return INSIGHT_MATCH_SURFACES - return (item.scope,) - - -def matches( - candidate: dict, - item, - evidence_index: dict[str, dict], - scope: str = "finding", -) -> bool: - """True iff candidate cites a shared source document AND is topic-anchored to item. - - Two-rail anchor (either rail suffices): - - Token rail: ≥1 shared token of ≥5 chars between candidate text and item description. - - Keyword rail: ≥1 item keyword appears as a whole word in the candidate text. - Exempt from the 5-char floor so short banking terms (KYC, PEP, AML) can anchor. 
- - The ``scope`` controls which fields are read as the candidate's match text (§4.3): - findings and actions use ``title + description``; processes and decisions use - ``name + description``; activities use ``name + notes + regulatory_links``. - - Anti-gaming guard: a candidate on a different document cannot satisfy this item - even if its text happens to match. Source documents are compared by - normalized stem (source_stem) so one registry scores every run regardless of - locator convention. - - Args: - candidate: dict from the DiscoveryResult surface matching ``scope``. - item: RegistryItem dataclass from registry.py. - evidence_index: {evidence_id: Evidence dict} built from result['evidence_index']. - scope: surface the candidate was drawn from (default "finding"). - """ - if not shares_source(candidate, item, evidence_index): - return False - desc = candidate_text(candidate, scope) - return _keyword_anchored(desc, list(item.keywords or [])) or anchored(desc, item.description) - - -def matches_dependency_graph_relation( - item, - result: dict, - evidence_index: dict[str, dict], -) -> bool: - """Endpoint matcher for dependency_graph relation items (§5.3b). - - Stage 1: Anchor both endpoints to activity nodes via token rail. - Stage 2: Verify a directed edge or path connects them in the asserted direction, - behind the shared-source guard on the edge's/path's evidence_refs. - - Returns False when either endpoint anchors to no activity, or when no connecting - edge/path shares a source document with the item. 
- """ - if not item.from_node or not item.to_node: - return False - - processes = result.get("process_graph", {}).get("processes", []) - all_activities = [a for p in processes for a in p.get("activities", [])] - - def _anchor(endpoint_text: str) -> set[str]: - return { - a["id"] - for a in all_activities - if a.get("id") and anchored(candidate_text(a, "activity"), endpoint_text) - } - - from_ids = _anchor(item.from_node) - to_ids = _anchor(item.to_node) - if not from_ids or not to_ids: - return False - - item_stems = {source_stem(e) for e in item.evidence} - - def _node_stems(node: dict) -> set[str]: - return { - source_stem(evidence_index[r["evidence_id"]].get("locator", "")) - for r in node.get("evidence_refs", []) - if r.get("evidence_id") in evidence_index - } - - dg = result.get("dependency_graph", {}) - - for edge in dg.get("activity_edges", []): - if edge.get("from_node") in from_ids and edge.get("to_node") in to_ids: - if _node_stems(edge) & item_stems: - return True - - for path in dg.get("critical_paths", []): - if not (_node_stems(path) & item_stems): - continue - node_ids = path.get("node_ids", []) - from_pos = [i for i, nid in enumerate(node_ids) if nid in from_ids] - to_pos = [i for i, nid in enumerate(node_ids) if nid in to_ids] - if any(fp < tp for fp in from_pos for tp in to_pos): - return True - - return False - - -def semantic_hits( - candidates: dict[str, list[dict]], - items, - evidence_index: dict[str, dict], - embed_fn, - tau: float = 0.70, - tau_nc: float = 0.85, -) -> dict[str, bool]: - """Opt-in embedding-semantic recall: {item.id: found-by-some-shared-source candidate}. - - Scope-aware: each registry item is evaluated against candidates from its own - scope surface (finding, process, activity, decision, action) using the same - per-scope field extraction as the lexical path (candidate_text). 
Passing only - the findings list (the previous behaviour) would leave process/activity/decision/ - action items with an empty candidate pool and a guaranteed False result. - - Real items (L0–L3): hit iff some scope-matching candidate shares a source - document with the item (shares_source) AND is embedding-similar (cosine >= tau). - Source anchor is preserved — a candidate on a different document cannot recover - a real item. - - NC items (tier=="NC"): hit iff some scope-matching candidate is embedding-similar - (cosine >= tau_nc). When the NC lists its mirror source (§4.1) the shared-source - guard applies; legacy NC items with evidence=[] skip the anchor, with the higher - threshold (default 0.85) compensating. - - Cost is two embed_fn calls — all scope-appropriate candidate texts once and all - item texts once — not O(n*m) per-pair embeddings. - - Args: - candidates: {scope: [candidate dicts]} from _candidates_by_scope(). - items: iterable of RegistryItem dataclasses. - evidence_index: {evidence_id: Evidence dict}. - embed_fn: callable(list[str]) -> array-like of row vectors. - tau: cosine threshold for real items (inclusive). - tau_nc: cosine threshold for NC items (inclusive; higher to compensate for no source anchor). - """ - items = list(items) - - # Flatten all candidates across scopes, preserving their scope tag for - # text extraction and per-item filtering. 
- scoped: list[tuple[str, dict]] = [ - (scope, cand) - for scope, cands in candidates.items() - for cand in cands - ] - - if not scoped: - return {item.id: False for item in items} - - cand_texts = [candidate_text(cand, scope) for scope, cand in scoped] - item_texts = [ - " ".join([item.description or ""] + list(item.keywords or [])).strip() for item in items - ] - - cand_vecs = np.asarray(embed_fn(cand_texts)) - item_vecs = np.asarray(embed_fn(item_texts)) - - hits: dict[str, bool] = {} - for i, item in enumerate(items): - item_vec = item_vecs[i] - allowed = allowed_scopes(item) - hit = False - for k, (scope, cand) in enumerate(scoped): - if scope not in allowed: - continue - if item.tier == "NC": - # Shared-source guard applies when the NC lists its mirror source - # (§4.2/§6.2); legacy evidence=[] NCs stay unanchored, with the - # higher tau_nc compensating. - if item.evidence and not shares_source(cand, item, evidence_index): - continue - if cosine(cand_vecs[k], item_vec) >= tau_nc: - hit = True - break - elif ( - shares_source(cand, item, evidence_index) - and cosine(cand_vecs[k], item_vec) >= tau - ): - hit = True - break - hits[item.id] = hit - return hits From 9c43a323da1f9929216593be23ae9366bcb67de2 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:41 +0200 Subject: [PATCH 20/67] chore(evaluation): delete scorecard.py --- .../evaluation/scorecard.py | 489 ------------------ 1 file changed, 489 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/scorecard.py diff --git a/fireflyframework_agentic/evaluation/scorecard.py b/fireflyframework_agentic/evaluation/scorecard.py deleted file mode 100644 index b34885e8..00000000 --- a/fireflyframework_agentic/evaluation/scorecard.py +++ /dev/null @@ -1,489 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Scorecard renderer: gate results -> Markdown report. - -Every scorecard states whether it is self-graded. Until Phase 3 independent -re-annotation lands, all Lean-Core PROMOTE verdicts are self-graded against -team-authored ground truth. See EVALUATION_FRAMEWORK.md. -""" - -from __future__ import annotations - -import json - -VERDICT_PROMOTE = "PROMOTE" -VERDICT_HOLD = "HOLD" - - -def verdict(gate_results: list) -> str: - """PROMOTE iff all gates passed and G5 is in the list; HOLD otherwise.""" - if not gate_results: - return VERDICT_HOLD - if not all(g.passed for g in gate_results): - return VERDICT_HOLD - gate_names = {g.gate for g in gate_results} - if "G5" not in gate_names: - return VERDICT_HOLD - return VERDICT_PROMOTE - - -def render_scorecard( - gate_results: list, - *, - corpus: str = "unknown", - model_id: str = "unknown", - run_id: str = "run", - is_self_graded: bool = True, - kappa_advisory: bool = False, - evidence_unverified: bool = False, - bpi2017_f1: float | None = None, - advisory=None, - config: dict | None = None, - experiment_config: dict | None = None, -) -> str: - """Render a Markdown evaluation scorecard. - - The scorecard always discloses self-graded status and advisory flags. 
- """ - v = verdict(gate_results) - lines = [ - "# FlyRadar Evaluation Scorecard", - "", - f"**Corpus**: {corpus}", - f"**Model**: {model_id}", - f"**Run**: {run_id}", - f"**Verdict**: **{v}**", - "", - ] - - if is_self_graded: - lines += [ - "> **SELF-GRADED**: All ground truth (must-find, gold, DILO, human sign-off) is", - "> authored by the FlyRadar team. This PROMOTE has no contamination-free signal", - "> until Phase 3. See EVALUATION_FRAMEWORK.md.", - "", - ] - - if kappa_advisory: - lines += [ - "> **ADVISORY**: Registry kappa < 0.70 — a second independent annotator has not", - "> verified the must-find items. Promotion is advisory for this corpus until", - "> kappa >= 0.70 from an independent re-annotation.", - "", - ] - - if evidence_unverified: - lines += [ - "> **EVIDENCE UNVERIFIED**: no corpus supplied (--corpus) — evidence locators", - "> and excerpts are taken at face value from the run's own evidence_index.", - "> Grounding certifies self-consistency, not corpus reality. Supply the run's", - "> input.json to enable deterministic excerpt verification (G3, §6.3).", - "", - ] - - if experiment_config is not None: - lines += [ - "## Experiment configuration", - "How this run was generated. Recorded fields (cost, tokens, latency, agents) are " - "read from the run's output.json; `model` is the value passed to the harness via " - "--model-id. 
Generation params (temperature, prompt/pipeline version, seed) are not " - "captured in output.json.", - "", - "```json", - json.dumps(experiment_config, indent=2, default=str), - "```", - "", - ] - - if config is not None: - lines += [ - "## Evaluation configuration", - "These are the parameters used to compute the evaluation.", - "", - "```json", - json.dumps(config, indent=2, default=str), - "```", - "", - ] - - lines += ["## Gate Results", ""] - g5_result = None - for g in gate_results: - if g.gate == "G5": - g5_result = g - continue - status = "PASS" if g.passed else f"FLAG ({g.reason_code})" - lines.append(f"### {g.gate}: {status}") - if g.details: - lines.append("```json") - lines.append(json.dumps(g.details, indent=2, default=str)) - lines.append("```") - lines.append("") - - if bpi2017_f1 is not None: - ok = bpi2017_f1 >= 0.60 - anchor_status = "PASS (>= 0.60)" if ok else "BELOW THRESHOLD (< 0.60)" - lines += [ - "## External Sanity Anchor (non-blocking)", - f"BPI-2017 variant-recovery F1: **{bpi2017_f1:.3f}** — {anchor_status}", - "_One non-self-graded signal. Non-blocking; informational only._", - "", - ] - - if advisory is not None: - lines += _render_advisory(advisory) - - if g5_result is not None: - status = "PASS" if g5_result.passed else f"FLAG ({g5_result.reason_code})" - lines.append(f"### G5: {status}") - if g5_result.details: - lines.append("```json") - lines.append(json.dumps(g5_result.details, indent=2, default=str)) - lines.append("```") - lines.append("") - - lines += _render_analysis(gate_results, advisory) - - return "\n".join(lines) - - -def _num(x) -> str: - """Format a metric leaf: None -> 'n/a', float -> 3dp, else str.""" - if x is None: - return "n/a" - if isinstance(x, float): - return f"{x:.3f}" - return str(x) - - -def _render_advisory(report) -> list[str]: - """Render the non-blocking G4 LLM-as-a-Judge section from an AdvisoryReport. - - Best-effort: only metrics present in report.metrics are shown. 
G4 never - affects the PROMOTE/HOLD verdict; this section is decision-support for the - G5 human sign-off, and is advisory until LLM-as-a-Judge calibration (§10). - """ - m = report.metrics - cal = "calibrated" if report.calibrated else "uncalibrated" - lines = [ - "## G4 — LLM-as-a-Judge (non-blocking — does NOT affect the PROMOTE/HOLD verdict)", - f"Judge: {report.judge_model} · {cal} · {report.runs}-run median", - ] - if report.same_provider_caveat: - lines.append("> same-provider as the pipeline — results may share blind spots.") - lines.append("```text") - - if "faithfulness" in m: - d = m["faithfulness"] - u = d.get("unsupported_ids", []) - extra = f" (unsupported: {', '.join(u)})" if u else "" - lines.append( - f"Faithfulness (entailment): {d.get('supported')}/{d.get('total')} supported{extra}" - ) - if "numeric_temporal_fidelity" in m: - lines.append( - f"Numeric/temporal fidelity: {m['numeric_temporal_fidelity'].get('count', 0)} mismatch(es)" - ) - if "citation_relevance" in m: - d = m["citation_relevance"] - lines.append( - f"Citation relevance (ctx-prec): {_num(d.get('precision'))} ({d.get('relevant')}/{d.get('total')})" - ) - if "semantic_recovery" in m: - d = m["semantic_recovery"] - rec = d.get("recovered", []) - rids = ", ".join(r.get("id", "") for r in rec) if rec else "none" - lines.append( - f"Semantic recovery (ctx-recall): lexical {_num(d.get('lexical_recall'))} -> {_num(d.get('recovered_recall'))} (recovered: {rids})" - ) - if "nc_semantic_precision" in m: - d = m["nc_semantic_precision"] - a = d.get("asserted_ids", []) - extra = f" ({', '.join(a)})" if a else "" - lines.append(f"NC semantic precision: {d.get('asserted', 0)} asserted{extra}") - if "fabricated_entity" in m: - lines.append(f"Fabricated-entity check: {m['fabricated_entity'].get('count', 0)}") - if "contradiction" in m: - lines.append(f"Contradiction detection: {m['contradiction'].get('count', 0)}") - if "actionability" in m: - d = m["actionability"] - lines.append( - 
f"Actionability: {_num(d.get('score'))} (rated {d.get('rated', 0)})" - ) - if "severity_calibration" in m: - d = m["severity_calibration"] - lines.append( - f"Severity calibration: {d.get('miscalibrated', 0)}/{d.get('total', 0)} miscalibrated" - ) - if "answer_relevancy" in m: - lines.append(f"Answer relevancy: {_num(m['answer_relevancy'].get('score'))}") - if "comparative_vs_champion" in m: - lines.append( - f"Comparative vs champion: more consistent -> {m['comparative_vs_champion'].get('more_consistent', 'n/a')}" - ) - if "source_coverage" in m: - d = m["source_coverage"] - o = d.get("orphaned", []) - extra = f" (orphaned: {', '.join(o)})" if o else "" - lines.append( - f"Source coverage [D]: {d.get('cited')}/{d.get('total')} documents cited{extra}" - ) - if "excerpt_fill_rate" in m: - d = m["excerpt_fill_rate"] - lines.append( - f"Evidence-excerpt fill [D]: {d.get('populated')}/{d.get('total')} populated" - ) - if "open_gap" in m: - gap = (m["open_gap"].get("gap") or "").strip() - if gap: - lines.append(f"Open gap probe: {gap}") - if report.errors: - lines.append(f"(errors: {len(report.errors)} metric(s) failed: {'; '.join(report.errors)})") - lines.append("```") - # Full detail — nothing truncated: every id, pair, verdict, and complete text. - lines += [ - "", - "**G4 — full metric detail:**", - "```json", - json.dumps({"metrics": report.metrics, "details": report.details}, indent=2, default=str), - "```", - ] - lines.append( - "> Decision support for the G5 human sign-off; advisory until LLM-as-a-Judge calibration (§10)." 
- ) - lines.append("") - return lines - - -def _render_analysis(gate_results: list, advisory=None) -> list[str]: - """Render a plain-language interpretation of all evaluation signals.""" - g2 = next((g for g in gate_results if g.gate == "G2"), None) - g3 = next((g for g in gate_results if g.gate == "G3"), None) - - lines = ["## Analysis", ""] - - # ── Topic coverage (G2) ────────────────────────────────────────────────── - lines.append("### Topic coverage (G2)") - if g2 and g2.details: - d = g2.details - recall = d.get("recall", 0.0) - tiers = d.get("per_tier", {}) - finding_count = d.get("finding_count", 0) - redundancy = d.get("finding_redundancy_rate", 0.0) - matched = d.get("findings_matched_to_registry", {}).get("fraction", 0.0) - - tier_summary = ", ".join( - f"{t} {v['hit']}/{v['total']}" - for t, v in tiers.items() - if "hit" in v and "total" in v - ) - lines.append( - f"Lexical recall is **{recall:.3f}** ({tier_summary}). " - f"The run produced {finding_count} findings, " - f"all of which map to a registry item (match rate {matched:.0%}). " - ) - if redundancy > 0.15: - lines.append( - f"Finding redundancy is **{redundancy:.0%}** — a meaningful share of " - "findings are near-duplicates of each other (Jaccard ≥ 0.6). " - "The run is covering the same ground multiple times rather than broadening coverage." - ) - else: - lines.append( - f"Finding redundancy is low ({redundancy:.0%}): each finding addresses a distinct topic." - ) - lines.append( - "_G2 is a topic-level test. A recall of 1.000 means every required topic was " - "mentioned somewhere — it does not verify that the specific claims about those " - "topics are accurate. 
Claim accuracy is G4 Faithfulness._" - ) - else: - lines.append("G2 result unavailable.") - lines.append("") - - # ── Evidence quality (G3) ──────────────────────────────────────────────── - lines.append("### Evidence quality (G3)") - if g3 and g3.details: - d = g3.details - grounding = d.get("grounding_pct", 0.0) - ev = d.get("evidence_verification", {}) - verified = ev.get("verified", 0) - entries = ev.get("entries", 0) - fabricated = ev.get("fabricated", []) - unknown = ev.get("source_unknown", []) - orphaned = d.get("orphaned_sources", []) - source_cov = d.get("source_coverage", "") - - lines.append( - f"Grounding is **{grounding:.0%}**: every finding cites at least one " - "corpus document, and all excerpts are populated. " - f"Evidence verification checked {entries} entries against the raw corpus: " - f"{verified} verified" - + (f", **{len(fabricated)} fabricated** (locators that do not exist in the corpus)" if fabricated else "") - + (f", **{len(unknown)} source-unknown** (locators that resolve to no corpus file)" if unknown else "") - + "." - ) - if unknown: - lines.append( - f"The source-unknown locator(s) are: `{'`, `'.join(unknown)}`. " - "This is most likely a corpus bundle gap rather than a hallucinated source — " - "verify that all expected files are included in `input.json`." - ) - if orphaned: - lines.append( - f"**{len(orphaned)} corpus documents were never cited** by this run " - f"({', '.join(orphaned)}). These are blind spots: the run extracted nothing " - "from these sources, so any findings they contain are silently missed." - ) - if source_cov: - cited, total = (int(x) for x in source_cov.split("/")) - if cited < total: - lines.append( - f"Overall source coverage is {cited}/{total} — " - f"{total - cited} corpus file(s) left entirely uncited." 
- ) - else: - lines.append("G3 result unavailable.") - lines.append("") - - # ── Claim accuracy (G4) ────────────────────────────────────────────────── - if advisory is not None: - m = advisory.metrics - lines.append("### Claim accuracy (G4 — advisory)") - - faith = m.get("faithfulness", {}) - supported = faith.get("supported", 0) - total_f = faith.get("total", 0) - if total_f: - faith_pct = supported / total_f - lines.append( - f"**Faithfulness: {supported}/{total_f} findings ({faith_pct:.0%}) are entailed by their cited evidence.** " - ) - if faith_pct < 0.5: - lines.append( - "This is a critical signal: the majority of findings contain claims " - "that the judge cannot verify from the cited sources. " - "The run is presenting inferences, extrapolations, or hallucinated details " - "as if they were directly evidenced. " - "Each unsupported finding should be reviewed against its cited document before use." - ) - elif faith_pct < 0.8: - lines.append( - "A significant minority of findings contain claims not traceable to cited sources. " - "These may be reasonable inferences, but they should be flagged for human verification." - ) - else: - lines.append("Most findings are directly supported by their cited evidence.") - - ntf = m.get("numeric_temporal_fidelity", {}) - mismatch_count = ntf.get("count", 0) - if mismatch_count: - lines.append( - f"**Numeric/temporal fidelity: {mismatch_count} mismatches detected.** " - "Specific figures — FTE costs, durations, timestamps, percentages, case IDs — " - "appear in findings but cannot be traced to the cited evidence. " - "These numbers should be treated as estimates or fabrications until verified " - "against the source documents." 
- ) - - fab = m.get("fabricated_entity", {}) - fab_count = fab.get("count", 0) - fab_entities = fab.get("entities", []) - if fab_count: - lines.append( - f"**Fabricated entities: {fab_count}** — the following names/identifiers appear " - f"in the output but are absent from the corpus: " - f"{', '.join(f'`{e}`' for e in fab_entities)}. " - "These should be removed or verified before sharing the output." - ) - - sev = m.get("severity_calibration", {}) - misc = sev.get("miscalibrated", 0) - total_s = sev.get("total", 0) - verdicts = sev.get("verdicts", {}) - over_count = sum(1 for v in verdicts.values() if v == "over") - under_count = sum(1 for v in verdicts.values() if v == "under") - if misc and total_s: - direction = "" - if over_count > under_count: - direction = f" (predominantly over-rated: {over_count} findings rated too high)" - elif under_count > over_count: - direction = f" (predominantly under-rated: {under_count} findings rated too low)" - lines.append( - f"**Severity calibration: {misc}/{total_s} findings miscalibrated{direction}.** " - "Over-rated findings inflate perceived urgency and can cause the client to " - "prioritise the wrong items." - ) - - act = m.get("actionability", {}) - act_score = act.get("score") - if act_score is not None: - if act_score < 0.6: - lines.append( - f"**Actionability score: {act_score:.3f}** — proposed actions are below the " - "0.6 threshold for concrete, quantified recommendations. " - "Actions tend to be generic rather than specific enough to assign and execute." 
- ) - else: - lines.append(f"Actionability score: {act_score:.3f} — actions are sufficiently concrete.") - - og = m.get("open_gap", {}) - gap_text = (og.get("gap") or "").strip() - if gap_text: - lines.append(f"**Most important missed finding:** {gap_text}") - - lines.append("") - - # ── Bottom line ────────────────────────────────────────────────────────── - lines.append("### Bottom line") - g5 = next((g for g in gate_results if g.gate == "G5"), None) - g5_reason = (g5.details or {}).get("reason", "") if g5 else "" - flags = [g for g in gate_results if not g.passed] - flag_names = [g.gate for g in flags] - - if not flags: - lines.append( - "All deterministic gates pass. The run is ready for G5 human sign-off." - ) - else: - flag_str = ", ".join(flag_names) - lines.append( - f"The run is at **HOLD** due to flags on: {flag_str}. " - ) - for g in flags: - if g.gate == "G3" and g.reason_code == "EVIDENCE_SOURCE_UNKNOWN": - lines.append( - "- **G3**: One evidence locator points to a file not in the corpus bundle. " - "Regenerate `input.json` to include all corpus sources, then re-run." - ) - elif g.gate == "G5": - lines.append(f"- **G5**: {g5_reason}") - - if advisory is not None: - m = advisory.metrics - faith = m.get("faithfulness", {}) - supported = faith.get("supported", 0) - total_f = faith.get("total", 1) - ntf_count = m.get("numeric_temporal_fidelity", {}).get("count", 0) - fab_count = m.get("fabricated_entity", {}).get("count", 0) - lines.append( - f"\nG4 advisory signals (non-blocking but important for the G5 reviewer): " - f"faithfulness {supported}/{total_f}, " - f"{ntf_count} numeric mismatches, " - f"{fab_count} fabricated entities. " - "The G5 reviewer should focus on the unsupported findings and verify figures " - "against the source documents before certifying the output." 
- ) - lines.append("") - return lines From a3673b5c5a441192b99ba7ecd40ab2d5d4bdc57a Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:45 +0200 Subject: [PATCH 21/67] chore(evaluation): delete run_config_snapshot.py --- .../evaluation/run_config_snapshot.py | 160 ------------------ 1 file changed, 160 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/run_config_snapshot.py diff --git a/fireflyframework_agentic/evaluation/run_config_snapshot.py b/fireflyframework_agentic/evaluation/run_config_snapshot.py deleted file mode 100644 index db543129..00000000 --- a/fireflyframework_agentic/evaluation/run_config_snapshot.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Capture the effective flyradar run configuration into experiment_configuration.json. - -Non-invasive snapshot: it records how a run was generated by reading what flyradar -already exposes as data — the request options the caller sent, the ``/api/v1/version`` -endpoint, ``RadarSettings``, and the prompt catalog — without modifying flyradar. The -snapshot is written next to the run's ``output.json`` at generation time, which is the -moment the configuration is known. - -This is the bridge: the durable fix is for flyradar to stamp the same config into -``DiscoveryResult`` itself (the one place that knows the effective values and cannot -drift). See the "flyradar improvements" issue. 
``temperature`` and ``seed`` are not -exposed by ``RadarSettings`` and are recorded as ``null`` here. - -Usage: - cd flyradar_experiments - set -a && source .env && set +a - uv run python -m fireflyframework_agentic.evaluation.run_config_snapshot \ - --output-dir experiments/bbva_españa/runs/2026-06-12-sonnet-01 \ - --options request_options.json \ - --commit c107918 -""" -from __future__ import annotations - -import argparse -import json -import os -import urllib.request -from importlib.resources import files -from pathlib import Path - -try: - from flyradar.config import RadarSettings -except ImportError: # flyradar is an optional dependency of this snapshot. - RadarSettings = None - -#: Path of the flyradar version endpoint (whitelisted in the service middleware). -VERSION_PATH = "/api/v1/version" - -#: RadarSettings fields that define scoring / dedup behaviour, captured verbatim. -_SETTINGS_KEYS = ( - "model", - "fallback_model", - "duplicity_similarity_threshold", - "rootcause_cost_weight", - "rootcause_frequency_weight", - "rootcause_actionability_weight", -) - - -def fetch_version(base_url: str, *, timeout: float = 10.0) -> dict: - """GET the flyradar version endpoint; return ``{}`` on any failure.""" - url = base_url.rstrip("/") + VERSION_PATH - try: - with urllib.request.urlopen(url, timeout=timeout) as resp: - return json.loads(resp.read().decode("utf-8")) - except Exception: - return {} - - -def load_radar_settings() -> dict | None: - """Dump the scoring / dedup RadarSettings, or ``None`` if flyradar isn't importable.""" - if RadarSettings is None: - return None - settings = RadarSettings() - return {key: getattr(settings, key, None) for key in _SETTINGS_KEYS} - - -def load_prompt_versions() -> dict | None: - """Read each stage prompt's ``version`` from the flyradar prompt catalog, or ``None``.""" - try: - catalog = files("flyradar.resources.prompts") - except ModuleNotFoundError: - return None - versions: dict[str, str] = {} - for entry in 
catalog.iterdir(): - if not entry.name.endswith(".yaml"): - continue - for line in entry.read_text(encoding="utf-8").splitlines(): - if line.strip().startswith("version:"): - versions[entry.name[:-5]] = line.split(":", 1)[1].strip().strip('"') - break - return versions or None - - -def build_run_config( - options: dict, - *, - version: dict, - settings: dict | None, - prompt_versions: dict | None, - commit: str | None = None, -) -> dict: - """Assemble the experiment-configuration snapshot from its captured parts.""" - return { - "captured_by": "config-snapshot (non-invasive)", - "flyradar_version": version.get("version"), - "flyradar_commit": commit or version.get("commit"), - "options": options, - "settings": settings, - "prompt_versions": prompt_versions, - "temperature": None, - "seed": None, - "_note": ( - "Non-invasive snapshot captured at generation time. `options` is the request " - "the caller sent; `settings` and `prompt_versions` are read from flyradar when " - "importable at the deployed commit. `temperature` and `seed` are not exposed by " - "RadarSettings and are recorded as null. The durable fix is for flyradar to stamp " - "this config into DiscoveryResult (see the 'flyradar improvements' issue)." - ), - } - - -def write_snapshot(output_dir: str | Path, config: dict) -> Path: - """Write ``experiment_configuration.json`` into the run's output directory.""" - path = Path(output_dir) / "experiment_configuration.json" - path.write_text(json.dumps(config, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") - return path - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser(description="Capture the flyradar run configuration.") - parser.add_argument("--output-dir", required=True, help="Run directory holding output.json.") - parser.add_argument( - "--options", required=True, help="JSON file of the DiscoveryRequest options that were sent." 
- ) - parser.add_argument( - "--base-url", default=None, help="flyradar base URL (default: $FLYRADAR_BASE_URL)." - ) - parser.add_argument("--commit", default=None, help="Deployed flyradar git commit, if known.") - args = parser.parse_args(argv) - - base_url = args.base_url or os.environ.get("FLYRADAR_BASE_URL", "") - options = json.loads(Path(args.options).read_text(encoding="utf-8")) - config = build_run_config( - options, - version=fetch_version(base_url) if base_url else {}, - settings=load_radar_settings(), - prompt_versions=load_prompt_versions(), - commit=args.commit, - ) - path = write_snapshot(args.output_dir, config) - print(f"Wrote {path}") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) From a51115e8e933c3fe6acecd75ea3610995251644c Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:50 +0200 Subject: [PATCH 22/67] chore(evaluation): delete models.py --- fireflyframework_agentic/evaluation/models.py | 70 ------------------- 1 file changed, 70 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/models.py diff --git a/fireflyframework_agentic/evaluation/models.py b/fireflyframework_agentic/evaluation/models.py deleted file mode 100644 index a98cdf20..00000000 --- a/fireflyframework_agentic/evaluation/models.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Shared config and model classes for the evaluation framework. 
- -EvalConfig captures the parameters of a single evaluation run: which model -is being tested, which corpus it runs against, and where the supporting -artefacts (registry, baseline, judge config) live. - -GateVerdict constants define the two possible outcomes of the promotion gate: -PROMOTE (the challenger beats or ties the champion and is safe to deploy) -or HOLD (the challenger does not meet the bar and must be iterated on). -""" - -from __future__ import annotations - -from typing import Any - -from pydantic import BaseModel - - -class EvalConfig(BaseModel): - """Configuration for a single evaluation run. - - Parameters: - model_id: Identifier of the model under evaluation. - corpus: Name of the evaluation corpus (e.g. "ms_marco_mini", "finance_bench"). - run_id: Unique identifier for this run (e.g. a timestamp or git SHA). - registry_path: Path to the must-find / golden registry JSON file. - corpus_path: Path to the corpus directory or bundle. - baseline_path: Path to a baseline results file for regression comparison. - judge_model: Model identifier used for the LLM-as-judge advisory pass. - judge_runs: Number of independent judge calls to aggregate (majority vote). - embed_model: Model identifier used for embedding-based retrieval metrics. - metadata: Arbitrary key/value pairs for run bookkeeping. - """ - - model_id: str - corpus: str - run_id: str - registry_path: str = "" - corpus_path: str = "" - baseline_path: str = "" - judge_model: str = "" - judge_runs: int = 3 - embed_model: str = "" - metadata: dict[str, Any] = {} - - -class GateVerdict: - """Promotion gate verdict constants. - - Use ``GateVerdict.PROMOTE`` when the challenger meets the quality bar and - is safe to become the new champion. Use ``GateVerdict.HOLD`` when the - challenger does not meet the bar and must be iterated on. 
- """ - - PROMOTE: str = "PROMOTE" - HOLD: str = "HOLD" From 5074d14eb91506d4d9367808d0d3196775571760 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:18:56 +0200 Subject: [PATCH 23/67] chore(evaluation): delete stats.py --- fireflyframework_agentic/evaluation/stats.py | 110 ------------------- 1 file changed, 110 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/stats.py diff --git a/fireflyframework_agentic/evaluation/stats.py b/fireflyframework_agentic/evaluation/stats.py deleted file mode 100644 index e70c629a..00000000 --- a/fireflyframework_agentic/evaluation/stats.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Statistics helpers: A/A noise band + fixed aggregate_grounding. - -The A/A band replaces McNemar, Wilcoxon, BCa bootstrap, Cliff's delta, Holm -correction, and MCID power analysis. Four self-authored corpora with ~30-70 -non-independent items each cannot power those tests; gating on unpowered tests -is false precision. See EVALUATION_FRAMEWORK.md (regression statistics). - -This module also provides the fixed aggregate_grounding() that closes a prior -aggregation bug where the previous runner inherited run 0's grounding report -unchanged instead of merging across all runs. 
-""" -from __future__ import annotations - -import statistics -from typing import Sequence - - -def aa_band(scores: Sequence[float], *, percentile: int = 95) -> float: - """95th-percentile pairwise delta from champion reruns — the noise floor. - - Rerun the champion ~10 times; the 95th-percentile of all pairwise absolute - differences is the A/A noise floor. A candidate must beat the champion by - more than this number on EVERY seed to count as a real improvement. - - This single number replaces MCID, power analysis, McNemar, Wilcoxon, - bootstrap CIs, and Holm correction. See EVALUATION_FRAMEWORK.md (the A/A noise band). - - Args: - scores: Per-run primary metric scores from champion reruns (>= 2 required). - percentile: Which percentile (default 95). - - Returns: - Noise floor as a float in the same units as the input scores. - """ - scores = list(scores) - if len(scores) < 2: - raise ValueError(f"aa_band requires >= 2 reruns; got {len(scores)}") - deltas = [ - abs(x - y) - for i, x in enumerate(scores) - for y in scores[i + 1:] - ] - sorted_deltas = sorted(deltas) - # Index for the requested percentile; clamp to valid range - idx = max(0, min(len(sorted_deltas) - 1, int(len(sorted_deltas) * percentile / 100))) - return sorted_deltas[idx] - - -def aggregate_grounding(grounding_dicts: list[dict]) -> dict: - """Merge per-run grounding reports into a conservative aggregate. - - Fixes a prior aggregation bug where the previous runner inherited run 0's grounding - report unchanged. Correct behaviour: - - support_pct: mean across runs - - unsupported_ids: UNION across all runs (anything flagged in any run stays flagged) - - Args: - grounding_dicts: List of grounding report dicts, one per evaluation run. - Each must have 'support_pct' (float 0-100) and optionally - 'unsupported_ids' (list[str]). - - Returns: - Merged grounding dict. 
- """ - if not grounding_dicts: - return {"support_pct": 0.0, "unsupported_ids": []} - - support_pcts = [float(g.get("support_pct", 0.0)) for g in grounding_dicts] - mean_pct = statistics.mean(support_pcts) - - unsupported: set[str] = set() - for g in grounding_dicts: - unsupported.update(g.get("unsupported_ids", [])) - - first = grounding_dicts[0] - return { - **first, - "support_pct": round(mean_pct, 2), - "unsupported_ids": sorted(unsupported), - "_aggregate_runs": len(grounding_dicts), - "_support_pct_per_run": [round(p, 2) for p in support_pcts], - } - - -def left_skew_flag(scores: Sequence[float]) -> bool: - """True if min < median - 0.10 (HIGH_VARIANCE sentinel). - - A single catastrophic run cannot hide inside a decent mean. - True => HIGH_VARIANCE; block the run until investigated. - See EVALUATION_FRAMEWORK.md (anti-flakiness). - """ - scores = list(scores) - if len(scores) < 2: - return False - med = statistics.median(scores) - return min(scores) < med - 0.10 From 8716be93d143f846920a6ad85820c082eccb6ccf Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:19:01 +0200 Subject: [PATCH 24/67] chore(evaluation): delete champion.py --- .../evaluation/champion.py | 169 ------------------ 1 file changed, 169 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/champion.py diff --git a/fireflyframework_agentic/evaluation/champion.py b/fireflyframework_agentic/evaluation/champion.py deleted file mode 100644 index 239429eb..00000000 --- a/fireflyframework_agentic/evaluation/champion.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Per-corpus champion management. - -Champions are per-corpus — mode 2A (conformance) and mode 2B (extraction) -metrics live in incommensurable spaces. There is no global champion. -See EVALUATION_FRAMEWORK.md (per-corpus champions). - -The historical fake-100% incident: banca-cordobesa/baseline.json was populated -with a champion scored against an EMPTY must-find registry. The EMPTY_MUST_FIND -guard in G1 prevents a recurrence; the invalidate_champion() function provides -the corrective action when it does happen. -""" - -from __future__ import annotations - -import hashlib -import json -from dataclasses import dataclass, field -from pathlib import Path - - -@dataclass -class ChampionRecord: - """Per-corpus champion, stored as 'champion' key in baseline.json.""" - - corpus: str - run_id: str - model_id: str - registry_sha256: str - scores: dict # {metric_name: float} - aa_noise: dict = field(default_factory=dict) # {metric_name: noise_floor} - is_day_zero: bool = False - human_sign_offs: list[str] = field(default_factory=list) - config: dict = field(default_factory=dict) # evaluation config snapshot - corpus_sha256: str = "" # pin of the evidence corpus the champion was verified against - - def primary_metric(self) -> str: - return next(iter(self.scores)) if self.scores else "" - - def primary_score(self) -> float: - return float(self.scores.get(self.primary_metric(), 0.0)) - - -def load_champion(baseline_path: str | Path) -> ChampionRecord | None: - """Load the current per-corpus champion from baseline.json. 
- - Returns None when: - - The file does not exist (normal Day-Zero state). - - The file exists but 'champion' is null (post-invalidation state). - """ - path = Path(baseline_path) - if not path.exists(): - return None - raw = json.loads(path.read_text(encoding="utf-8")) - champ_raw = raw.get("champion") - if champ_raw is None: - return None - return ChampionRecord( - corpus=champ_raw["corpus"], - run_id=champ_raw["run_id"], - model_id=champ_raw["model_id"], - registry_sha256=champ_raw["registry_sha256"], - scores=champ_raw.get("scores", {}), - aa_noise=champ_raw.get("aa_noise", {}), - is_day_zero=champ_raw.get("is_day_zero", False), - human_sign_offs=champ_raw.get("human_sign_offs", []), - config=champ_raw.get("config", {}), - corpus_sha256=champ_raw.get("corpus_sha256", ""), - ) - - -def save_champion( - baseline_path: str | Path, - champion: ChampionRecord, - *, - summary: str = "", - date: str = "", -) -> None: - """Persist a new champion and append a promotion log entry. - - Reads the existing file if it exists (to preserve the log), then writes - the new champion. The promotion log is append-only. 
- """ - path = Path(baseline_path) - if path.exists(): - raw = json.loads(path.read_text(encoding="utf-8")) - log = raw.get("promotion_log", []) - prev_run = raw.get("champion", {}) - prev_run_id = prev_run.get("run_id") if isinstance(prev_run, dict) else None - else: - log = [] - prev_run_id = None - - log.append( - { - "date": date or "unknown", - "from": prev_run_id, - "to": champion.run_id, - "label": "day-zero" if champion.is_day_zero else "promotion", - "summary": summary, - } - ) - - payload = { - "champion": { - "corpus": champion.corpus, - "run_id": champion.run_id, - "model_id": champion.model_id, - "registry_sha256": champion.registry_sha256, - "scores": champion.scores, - "aa_noise": champion.aa_noise, - "is_day_zero": champion.is_day_zero, - "human_sign_offs": champion.human_sign_offs, - "config": champion.config, - "corpus_sha256": champion.corpus_sha256, - }, - "promotion_log": log, - } - path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") - - -def invalidate_champion( - baseline_path: str | Path, - *, - reason: str, - date: str = "", -) -> None: - """Null out the current champion and record the invalidation reason. - - Used when a champion was locked in against an empty or tampered registry - (the banca-cordobesa fake-100% incident). 
- """ - path = Path(baseline_path) - if not path.exists(): - return - raw = json.loads(path.read_text(encoding="utf-8")) - log = raw.get("promotion_log", []) - prev_run = raw.get("champion", {}) - prev_run_id = prev_run.get("run_id") if isinstance(prev_run, dict) else None - log.append( - { - "date": date or "unknown", - "from": prev_run_id, - "to": None, - "label": "INVALIDATED", - "summary": reason, - } - ) - raw["champion"] = None - raw["promotion_log"] = log - path.write_text(json.dumps(raw, indent=2, ensure_ascii=False), encoding="utf-8") - - -def input_hash(result_dict: dict) -> str: - """Stable 16-char SHA-256 prefix of the DiscoveryResult for provenance.""" - canonical = json.dumps(result_dict, sort_keys=True, ensure_ascii=False) - return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16] From 5c8fe8e4450013f47754803e99e19cac3a4cb1bd Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:19:07 +0200 Subject: [PATCH 25/67] chore(evaluation): delete test_champion.py --- tests/unit/evaluation/test_champion.py | 199 ------------------------- 1 file changed, 199 deletions(-) delete mode 100644 tests/unit/evaluation/test_champion.py diff --git a/tests/unit/evaluation/test_champion.py b/tests/unit/evaluation/test_champion.py deleted file mode 100644 index 948a9639..00000000 --- a/tests/unit/evaluation/test_champion.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Unit tests for evaluation.champion: ChampionRecord, load/save/invalidate_champion, input_hash.""" - -from __future__ import annotations - -import json - -import pytest - -from fireflyframework_agentic.evaluation.champion import ( - ChampionRecord, - input_hash, - invalidate_champion, - load_champion, - save_champion, -) - - -def _make_champion(**overrides) -> ChampionRecord: - defaults = dict( - corpus="test-corpus", - run_id="run-2026-01", - model_id="claude-sonnet-4-5", - registry_sha256="abc123", - scores={"recall": 0.85, "grounding_pct": 0.92}, - aa_noise={"recall": 0.02}, - is_day_zero=False, - human_sign_offs=["reviewer-1"], - ) - defaults.update(overrides) - return ChampionRecord(**defaults) - - -# ── load_champion ───────────────────────────────────────────────────────────── - - -def test_load_champion_nonexistent_file_returns_none(tmp_path): - result = load_champion(tmp_path / "baseline.json") - assert result is None - - -def test_load_champion_file_with_null_champion_returns_none(tmp_path): - baseline = tmp_path / "baseline.json" - baseline.write_text(json.dumps({"champion": None, "promotion_log": []}), encoding="utf-8") - assert load_champion(baseline) is None - - -# ── save_champion / load_champion round-trip ────────────────────────────────── - - -def test_save_then_load_round_trips_all_fields(tmp_path): - baseline = tmp_path / "baseline.json" - champ = _make_champion() - save_champion(baseline, champ, summary="initial champion", date="2026-01-01") - - loaded = load_champion(baseline) - assert loaded is not None - assert loaded.corpus == champ.corpus - assert loaded.run_id == champ.run_id - assert loaded.model_id == champ.model_id - assert loaded.registry_sha256 == champ.registry_sha256 - assert loaded.scores == champ.scores - assert loaded.aa_noise == champ.aa_noise - assert loaded.is_day_zero == champ.is_day_zero - assert loaded.human_sign_offs == champ.human_sign_offs - - -def test_save_champion_appends_promotion_log_entry(tmp_path): - 
baseline = tmp_path / "baseline.json" - champ = _make_champion() - save_champion(baseline, champ, summary="first", date="2026-01-01") - - champ2 = _make_champion(run_id="run-2026-02", scores={"recall": 0.90}) - save_champion(baseline, champ2, summary="second", date="2026-02-01") - - raw = json.loads(baseline.read_text(encoding="utf-8")) - log = raw["promotion_log"] - assert len(log) == 2 - assert log[0]["to"] == "run-2026-01" - assert log[1]["to"] == "run-2026-02" - assert log[1]["from"] == "run-2026-01" - - -def test_save_champion_creates_file_when_missing(tmp_path): - baseline = tmp_path / "baseline.json" - assert not baseline.exists() - save_champion(baseline, _make_champion()) - assert baseline.exists() - - -def test_save_champion_day_zero_flag_preserved(tmp_path): - baseline = tmp_path / "baseline.json" - champ = _make_champion(is_day_zero=True) - save_champion(baseline, champ) - loaded = load_champion(baseline) - assert loaded.is_day_zero is True - - -def test_save_champion_label_is_day_zero_when_flag_set(tmp_path): - baseline = tmp_path / "baseline.json" - champ = _make_champion(is_day_zero=True) - save_champion(baseline, champ) - raw = json.loads(baseline.read_text(encoding="utf-8")) - assert raw["promotion_log"][0]["label"] == "day-zero" - - -def test_save_champion_label_is_promotion_when_flag_not_set(tmp_path): - baseline = tmp_path / "baseline.json" - save_champion(baseline, _make_champion(is_day_zero=False)) - raw = json.loads(baseline.read_text(encoding="utf-8")) - assert raw["promotion_log"][0]["label"] == "promotion" - - -# ── invalidate_champion ─────────────────────────────────────────────────────── - - -def test_invalidate_champion_sets_champion_to_null(tmp_path): - baseline = tmp_path / "baseline.json" - save_champion(baseline, _make_champion()) - invalidate_champion(baseline, reason="EMPTY_MUST_FIND fake champion", date="2026-03-01") - - loaded = load_champion(baseline) - assert loaded is None - - raw = 
json.loads(baseline.read_text(encoding="utf-8")) - assert raw["champion"] is None - - -def test_invalidate_champion_appends_invalidation_log(tmp_path): - baseline = tmp_path / "baseline.json" - save_champion(baseline, _make_champion(), date="2026-01-01") - invalidate_champion(baseline, reason="fake champion", date="2026-03-01") - - raw = json.loads(baseline.read_text(encoding="utf-8")) - log = raw["promotion_log"] - assert log[-1]["label"] == "INVALIDATED" - assert "fake champion" in log[-1]["summary"] - assert log[-1]["to"] is None - - -def test_invalidate_champion_noop_when_file_missing(tmp_path): - # Should not raise when file does not exist. - invalidate_champion(tmp_path / "no-file.json", reason="test") - - -# ── ChampionRecord helpers ──────────────────────────────────────────────────── - - -def test_primary_metric_returns_first_key(): - champ = _make_champion(scores={"recall": 0.85, "grounding_pct": 0.92}) - assert champ.primary_metric() == "recall" - - -def test_primary_score_returns_first_value(): - champ = _make_champion(scores={"recall": 0.85, "grounding_pct": 0.92}) - assert champ.primary_score() == 0.85 - - -def test_primary_metric_empty_scores(): - champ = _make_champion(scores={}) - assert champ.primary_metric() == "" - assert champ.primary_score() == 0.0 - - -# ── input_hash ──────────────────────────────────────────────────────────────── - - -def test_input_hash_is_16_chars(): - result = input_hash({"key": "value"}) - assert len(result) == 16 - - -def test_input_hash_is_deterministic(): - data = {"process_graph": {"processes": []}, "findings": []} - h1 = input_hash(data) - h2 = input_hash(data) - assert h1 == h2 - - -def test_input_hash_differs_for_different_inputs(): - assert input_hash({"a": 1}) != input_hash({"a": 2}) - - -def test_input_hash_key_order_independent(): - # sort_keys=True in input_hash should make {"a":1, "b":2} == {"b":2, "a":1}. 
- assert input_hash({"a": 1, "b": 2}) == input_hash({"b": 2, "a": 1}) From fdc02771d8b3352a1031fc7ba5b3e1646b32f041 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:19:13 +0200 Subject: [PATCH 26/67] chore(evaluation): delete test_gates.py --- tests/unit/evaluation/test_gates.py | 219 ---------------------------- 1 file changed, 219 deletions(-) delete mode 100644 tests/unit/evaluation/test_gates.py diff --git a/tests/unit/evaluation/test_gates.py b/tests/unit/evaluation/test_gates.py deleted file mode 100644 index 2edc3b99..00000000 --- a/tests/unit/evaluation/test_gates.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Unit tests for evaluation.gates: GateResult, verdict, render_scorecard, g5_no_regression.""" - -from __future__ import annotations - -from fireflyframework_agentic.evaluation.gates import ( - GateResult, - Verdict, - g5_no_regression, - render_scorecard, -) -from fireflyframework_agentic.evaluation.scorecard import verdict - - -# ── GateResult ──────────────────────────────────────────────────────────────── - - -def test_gate_result_str_pass(): - gr = GateResult(gate="G1", passed=True) - assert str(gr) == "[G1] PASS" - - -def test_gate_result_str_flag(): - gr = GateResult(gate="G2", passed=False, reason_code="RECALL_BELOW_FLOOR") - assert str(gr) == "[G2] FLAG:RECALL_BELOW_FLOOR" - - -def test_gate_result_flag_without_reason_code(): - gr = GateResult(gate="G3", passed=False, reason_code="") - assert str(gr) == "[G3] FLAG:" - - -def test_gate_result_passed_true(): - gr = GateResult(gate="G5", passed=True, details={"note": "ok"}) - assert gr.passed is True - assert gr.details["note"] == "ok" - - -def test_gate_result_default_details_is_empty_dict(): - gr = GateResult(gate="G1", passed=True) - assert gr.details == {} - - -# ── verdict ─────────────────────────────────────────────────────────────────── - - -def test_verdict_promote_when_all_pass_and_g5_present(): - gates = [ - GateResult(gate="G1", passed=True), - GateResult(gate="G2", passed=True), - GateResult(gate="G3", passed=True), - GateResult(gate="G5", passed=True), - ] - assert verdict(gates) == "PROMOTE" - - -def test_verdict_hold_when_any_gate_fails(): - gates = [ - GateResult(gate="G1", passed=True), - GateResult(gate="G2", passed=False, reason_code="RECALL_BELOW_FLOOR"), - GateResult(gate="G3", passed=True), - GateResult(gate="G5", passed=True), - ] - assert verdict(gates) == "HOLD" - - -def test_verdict_hold_when_g5_missing(): - # All G1/G2/G3 pass but G5 is absent — no promotion without sign-off. 
- gates = [ - GateResult(gate="G1", passed=True), - GateResult(gate="G2", passed=True), - GateResult(gate="G3", passed=True), - ] - assert verdict(gates) == "HOLD" - - -def test_verdict_hold_on_empty_list(): - assert verdict([]) == "HOLD" - - -def test_verdict_hold_when_g5_fails(): - gates = [ - GateResult(gate="G1", passed=True), - GateResult(gate="G2", passed=True), - GateResult(gate="G3", passed=True), - GateResult(gate="G5", passed=False, reason_code="HOLD"), - ] - assert verdict(gates) == "HOLD" - - -# ── render_scorecard (from gates module) ────────────────────────────────────── - - -def test_render_scorecard_contains_verdict_line(): - gates = [ - GateResult(gate="G1", passed=True), - GateResult(gate="G2", passed=True), - GateResult(gate="G3", passed=True), - GateResult(gate="G5", passed=True), - ] - output = render_scorecard(gates) - assert "VERDICT: PROMOTE" in output - - -def test_render_scorecard_hold_when_flag(): - gates = [ - GateResult(gate="G1", passed=False, reason_code="SCHEMA_INVALID"), - GateResult(gate="G2", passed=True), - GateResult(gate="G3", passed=True), - GateResult(gate="G5", passed=True), - ] - output = render_scorecard(gates) - assert "VERDICT: HOLD" in output - - -def test_render_scorecard_includes_all_gate_lines(): - gates = [ - GateResult(gate="G1", passed=True), - GateResult(gate="G2", passed=True), - GateResult(gate="G3", passed=True), - GateResult(gate="G5", passed=True), - ] - output = render_scorecard(gates) - for gate_label in ("[G1]", "[G2]", "[G3]", "[G5]"): - assert gate_label in output - - -# ── g5_no_regression ────────────────────────────────────────────────────────── - - -def test_g5_day_zero_insufficient_signoffs(): - result = g5_no_regression( - candidate_scores={"recall": 0.85}, - champion_scores=None, - aa_noise=None, - is_day_zero=True, - human_signed_off=False, - signoff_count=1, - ) - assert result.passed is False - assert result.reason_code == "HOLD" - - -def test_g5_day_zero_sufficient_signoffs(): - result = 
g5_no_regression( - candidate_scores={"recall": 0.85}, - champion_scores=None, - aa_noise=None, - is_day_zero=True, - human_signed_off=False, - signoff_count=2, - ) - assert result.passed is True - assert result.details["day_zero"] is True - - -def test_g5_hold_when_no_human_signoff(): - result = g5_no_regression( - candidate_scores={"recall": 0.90}, - champion_scores={"recall": 0.80}, - aa_noise={"recall": 0.02}, - human_signed_off=False, - ) - assert result.passed is False - assert result.reason_code == "HOLD" - - -def test_g5_hold_when_regression_beyond_band(): - # Candidate recall 0.75 vs champion 0.80; delta=-0.05 < -band=-0.02. - result = g5_no_regression( - candidate_scores={"recall": 0.75}, - champion_scores={"recall": 0.80}, - aa_noise={"recall": 0.02}, - human_signed_off=True, - ) - assert result.passed is False - assert result.reason_code == "HOLD" - assert any("recall" in r for r in result.details["regressions"]) - - -def test_g5_promote_when_candidate_beats_champion(): - result = g5_no_regression( - candidate_scores={"recall": 0.90}, - champion_scores={"recall": 0.80}, - aa_noise={"recall": 0.02}, - human_signed_off=True, - ) - assert result.passed is True - assert result.details["improvements"] - - -def test_g5_promote_when_within_noise_band(): - # delta = 0.01 — positive but within band of 0.02; counts as no regression, no improvement. 
- result = g5_no_regression( - candidate_scores={"recall": 0.81}, - champion_scores={"recall": 0.80}, - aa_noise={"recall": 0.02}, - human_signed_off=True, - ) - assert result.passed is True - assert result.details["improvements"] == [] - - -def test_g5_verdict_constants(): - assert Verdict.PROMOTE == "PROMOTE" - assert Verdict.HOLD == "HOLD" From 0732f8582e9e40818e9a4d05da2dff00220652b4 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:19:17 +0200 Subject: [PATCH 27/67] chore(evaluation): delete test_matcher.py --- tests/unit/evaluation/test_matcher.py | 221 -------------------------- 1 file changed, 221 deletions(-) delete mode 100644 tests/unit/evaluation/test_matcher.py diff --git a/tests/unit/evaluation/test_matcher.py b/tests/unit/evaluation/test_matcher.py deleted file mode 100644 index cc87564b..00000000 --- a/tests/unit/evaluation/test_matcher.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Unit tests for evaluation.matcher: anchored, source_stem, tokens, matches.""" - -from __future__ import annotations - -import pytest - -from fireflyframework_agentic.evaluation.matcher import ( - anchored, - matches, - source_stem, - tokens, -) -from fireflyframework_agentic.evaluation.registry import RegistryItem - - -# ── tokens ─────────────────────────────────────────────────────────────────── - - -def test_tokens_basic(): - result = tokens("Hello World") - assert result == ["hello", "world"] - - -def test_tokens_lowercases(): - result = tokens("KYC AML PEP") - assert result == ["kyc", "aml", "pep"] - - -def test_tokens_strips_punctuation(): - result = tokens("risk-management: cost (FTE).") - assert "risk" in result - assert "management" in result - assert "cost" in result - assert "fte" in result - - -def test_tokens_empty_string(): - assert tokens("") == [] - - -def test_tokens_numbers_included(): - result = tokens("case-id CU-2026-1003") - assert "2026" in result or "cu" in result - - -def test_tokens_unicode(): - result = tokens("análisis de crédito") - assert "análisis" in result or "an" in result - - -# ── anchored ───────────────────────────────────────────────────────────────── - - -def test_anchored_overlapping_long_token(): - # "underwriting" is 12 chars — well above the 5-char floor. - assert anchored("credit underwriting risk", "underwriting process steps") is True - - -def test_anchored_no_overlap(): - # No token >= 5 chars shared between claim and evidence. - assert anchored("cat sat", "dog ran") is False - - -def test_anchored_short_tokens_ignored(): - # All tokens in both strings are < 5 chars; no overlap counts. - assert anchored("a big cat", "a big dog") is False - - -def test_anchored_mixed_lengths_match(): - # "kyc" is < 5, but "compliance" is long enough. - assert anchored("kyc compliance review", "compliance framework") is True - - -def test_anchored_custom_min_token(): - # Lower the floor so short tokens can anchor. 
- assert anchored("kyc check", "kyc process", min_token=3) is True - - -def test_anchored_both_empty(): - assert anchored("", "") is False - - -def test_anchored_partial_token_no_match(): - # "risk" (4 chars) is below the default 5-char floor. - assert anchored("risk alert", "risk factor") is False - - -def test_anchored_returns_bool(): - result = anchored("credit underwriting", "underwriting model") - assert isinstance(result, bool) - - -# ── source_stem ─────────────────────────────────────────────────────────────── - - -def test_source_stem_bare_filename_with_extension(): - assert source_stem("SOP-002-kyc-edd.md") == "sop-002-kyc-edd" - - -def test_source_stem_directory_prefixed(): - assert source_stem("sops/SOP-002-kyc-edd.md") == "sop-002-kyc-edd" - - -def test_source_stem_deep_path_prefix(): - assert source_stem("docs/policies/SOP-002-kyc-edd.md") == "sop-002-kyc-edd" - - -def test_source_stem_lowercase(): - # Stems are always lowercased. - assert source_stem("REPORT-FINAL.pdf") == "report-final" - - -def test_source_stem_event_log_row_id(): - # src-: → process stem. - assert source_stem("src-credit-underwriting:CU-2026-1003") == "credit-underwriting" - - -def test_source_stem_event_log_row_id_preserves_hyphens(): - assert source_stem("src-kyc-onboarding:KYC-001") == "kyc-onboarding" - - -def test_source_stem_strips_fragment(): - # #page=N should be removed before stemming. - assert source_stem("docs/report.pdf#page=5") == "report" - - -def test_source_stem_strips_anchor(): - assert source_stem("sops/SOP-001.md#section-3") == "sop-001" - - -def test_source_stem_bare_no_extension(): - # No extension, no directory — stem is just the lowercase name. 
- assert source_stem("my-document") == "my-document" - - -def test_source_stem_no_directory_no_extension_lowercase(): - assert source_stem("Signal") == "signal" - - -def test_source_stem_csv_extension(): - assert source_stem("activity-cost-fte.csv") == "activity-cost-fte" - - -# ── matches ─────────────────────────────────────────────────────────────────── - - -def _make_item(description: str, evidence: list[str], keywords: list[str] | None = None) -> RegistryItem: - """Construct a minimal RegistryItem for matching tests.""" - return RegistryItem( - id="test-item", - tier="L1", - description=description, - evidence=evidence, - scope="finding", - keywords=keywords or [], - ) - - -def _make_finding(title: str, description: str, evidence_id: str) -> dict: - return { - "title": title, - "description": description, - "evidence_refs": [{"evidence_id": evidence_id}], - } - - -def _make_evidence_index(evidence_id: str, locator: str, excerpt: str = "") -> dict: - return {evidence_id: {"id": evidence_id, "locator": locator, "excerpt": excerpt}} - - -def test_matches_true_when_source_and_topic_match(): - # Finding title shares a long token with item description and cites the same source. - item = _make_item("credit underwriting process", ["sop-kyc-credit.md"]) - finding = _make_finding("credit underwriting review", "credit underwriting risk assessment", "ev-1") - evidence_index = _make_evidence_index("ev-1", "sop-kyc-credit.md") - assert matches(finding, item, evidence_index, scope="finding") is True - - -def test_matches_false_when_source_differs(): - # Token match exists but sources don't overlap — anti-gaming guard fires. 
- item = _make_item("credit underwriting process", ["sop-credit.md"]) - finding = _make_finding("credit underwriting review", "credit underwriting details", "ev-1") - evidence_index = _make_evidence_index("ev-1", "other-document.md") - assert matches(finding, item, evidence_index, scope="finding") is False - - -def test_matches_false_when_no_token_overlap(): - # Same source, but no shared long token between finding text and item description. - item = _make_item("regulatory capital requirement", ["sop-capital.md"]) - finding = _make_finding("kyc identity check", "client onboarding steps", "ev-1") - evidence_index = _make_evidence_index("ev-1", "sop-capital.md") - assert matches(finding, item, evidence_index, scope="finding") is False - - -def test_matches_keyword_rail_short_token(): - # "KYC" is 3 chars — below the 5-char token floor but valid as a keyword. - item = _make_item("some description about identity", ["sop-kyc.md"], keywords=["KYC"]) - finding = _make_finding("KYC onboarding", "KYC onboarding process", "ev-1") - evidence_index = _make_evidence_index("ev-1", "sop-kyc.md") - assert matches(finding, item, evidence_index, scope="finding") is True - - -def test_matches_empty_evidence_refs_returns_false(): - # Finding with no evidence refs cannot share a source with any item. 
- item = _make_item("credit underwriting", ["sop-credit.md"]) - finding = {"title": "credit underwriting", "description": "credit underwriting risk", "evidence_refs": []} - assert matches(finding, item, {}, scope="finding") is False From f769ef1c40d28067040faa8a5f662038d4765eb0 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:19:23 +0200 Subject: [PATCH 28/67] chore(evaluation): delete test_stats.py --- tests/unit/evaluation/test_stats.py | 183 ---------------------------- 1 file changed, 183 deletions(-) delete mode 100644 tests/unit/evaluation/test_stats.py diff --git a/tests/unit/evaluation/test_stats.py b/tests/unit/evaluation/test_stats.py deleted file mode 100644 index 9523be8c..00000000 --- a/tests/unit/evaluation/test_stats.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Unit tests for evaluation.stats: aa_band, aggregate_grounding, left_skew_flag.""" - -from __future__ import annotations - -import pytest - -from fireflyframework_agentic.evaluation.stats import ( - aa_band, - aggregate_grounding, - left_skew_flag, -) - - -# ── aa_band ────────────────────────────────────────────────────────────────── - - -def test_aa_band_two_identical_scores(): - # Two identical scores produce zero pairwise delta. 
- assert aa_band([0.80, 0.80]) == 0.0 - - -def test_aa_band_two_different_scores(): - # Single delta = |0.90 - 0.80| = 0.10; 95th percentile of one value is that value. - result = aa_band([0.80, 0.90]) - assert abs(result - 0.10) < 1e-9 - - -def test_aa_band_three_scores_known_deltas(): - # Scores: 0.70, 0.80, 0.90 - # Pairwise deltas: |0.70-0.80|=0.10, |0.70-0.90|=0.20, |0.80-0.90|=0.10 - # Sorted: [0.10, 0.10, 0.20] → 95th pct index = int(3 * 95 / 100) = 2 → 0.20 - result = aa_band([0.70, 0.80, 0.90]) - assert abs(result - 0.20) < 1e-9 - - -def test_aa_band_large_spread(): - # Max delta in [0.0, 1.0] is 1.0. - result = aa_band([0.0, 1.0]) - assert abs(result - 1.0) < 1e-9 - - -def test_aa_band_requires_at_least_two_scores(): - with pytest.raises(ValueError, match="aa_band requires >= 2 reruns"): - aa_band([0.80]) - - -def test_aa_band_empty_raises(): - with pytest.raises(ValueError, match="aa_band requires >= 2 reruns"): - aa_band([]) - - -def test_aa_band_custom_percentile(): - # 50th percentile of [0.10, 0.10, 0.20] at idx=1 → 0.10. 
- result = aa_band([0.70, 0.80, 0.90], percentile=50) - assert abs(result - 0.10) < 1e-9 - - -def test_aa_band_returns_float(): - result = aa_band([0.80, 0.85, 0.90]) - assert isinstance(result, float) - - -# ── aggregate_grounding ─────────────────────────────────────────────────────── - - -def test_aggregate_grounding_single_dict(): - g = {"support_pct": 90.0, "unsupported_ids": ["ev-1"]} - result = aggregate_grounding([g]) - assert result["support_pct"] == 90.0 - assert result["unsupported_ids"] == ["ev-1"] - assert result["_aggregate_runs"] == 1 - - -def test_aggregate_grounding_mean_support_pct(): - dicts = [ - {"support_pct": 80.0, "unsupported_ids": []}, - {"support_pct": 100.0, "unsupported_ids": []}, - ] - result = aggregate_grounding(dicts) - assert result["support_pct"] == 90.0 - - -def test_aggregate_grounding_union_of_unsupported_ids(): - dicts = [ - {"support_pct": 90.0, "unsupported_ids": ["ev-1", "ev-2"]}, - {"support_pct": 85.0, "unsupported_ids": ["ev-2", "ev-3"]}, - ] - result = aggregate_grounding(dicts) - assert set(result["unsupported_ids"]) == {"ev-1", "ev-2", "ev-3"} - - -def test_aggregate_grounding_union_sorted(): - dicts = [ - {"support_pct": 90.0, "unsupported_ids": ["ev-b"]}, - {"support_pct": 90.0, "unsupported_ids": ["ev-a"]}, - ] - result = aggregate_grounding(dicts) - assert result["unsupported_ids"] == ["ev-a", "ev-b"] - - -def test_aggregate_grounding_empty_input(): - result = aggregate_grounding([]) - assert result["support_pct"] == 0.0 - assert result["unsupported_ids"] == [] - - -def test_aggregate_grounding_records_run_count(): - dicts = [ - {"support_pct": 80.0, "unsupported_ids": []}, - {"support_pct": 90.0, "unsupported_ids": []}, - {"support_pct": 100.0, "unsupported_ids": []}, - ] - result = aggregate_grounding(dicts) - assert result["_aggregate_runs"] == 3 - - -def test_aggregate_grounding_per_run_pct_recorded(): - dicts = [ - {"support_pct": 80.0, "unsupported_ids": []}, - {"support_pct": 100.0, "unsupported_ids": []}, 
- ] - result = aggregate_grounding(dicts) - assert result["_support_pct_per_run"] == [80.0, 100.0] - - -def test_aggregate_grounding_missing_unsupported_ids_treated_as_empty(): - dicts = [ - {"support_pct": 90.0}, # no unsupported_ids key - {"support_pct": 80.0, "unsupported_ids": ["ev-1"]}, - ] - result = aggregate_grounding(dicts) - assert result["unsupported_ids"] == ["ev-1"] - - -# ── left_skew_flag ──────────────────────────────────────────────────────────── - - -def test_left_skew_flag_true_when_catastrophic_run(): - # median([0.80, 0.80, 0.80]) = 0.80; min = 0.60 < 0.80 - 0.10 = 0.70. - assert left_skew_flag([0.60, 0.80, 0.80]) is True - - -def test_left_skew_flag_false_when_min_close_to_median(): - # median = 0.80; min = 0.75; 0.75 >= 0.80 - 0.10 = 0.70 → no flag. - assert left_skew_flag([0.75, 0.80, 0.85]) is False - - -def test_left_skew_flag_false_when_all_equal(): - assert left_skew_flag([0.85, 0.85, 0.85]) is False - - -def test_left_skew_flag_boundary_just_above_threshold(): - # min = 0.71, median = 0.80; 0.71 >= 0.80 - 0.10 = 0.70 → no flag. - assert left_skew_flag([0.71, 0.80, 0.80]) is False - - -def test_left_skew_flag_single_score_always_false(): - # A single score has no meaningful distribution; function returns False. - assert left_skew_flag([0.50]) is False - - -def test_left_skew_flag_two_scores_with_large_gap(): - # median([0.50, 0.90]) = 0.70; min = 0.50 < 0.70 - 0.10 = 0.60. 
- assert left_skew_flag([0.50, 0.90]) is True - - -def test_left_skew_flag_returns_bool(): - result = left_skew_flag([0.80, 0.85, 0.90]) - assert isinstance(result, bool) From 251605211cff33f0e89fcbe26f8aefdbfab0fa72 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:20:17 +0200 Subject: [PATCH 29/67] feat(evaluation): rewrite judge_client.py as async (httpx.AsyncClient) --- .../evaluation/judge_client.py | 382 +++++------------- 1 file changed, 91 insertions(+), 291 deletions(-) diff --git a/fireflyframework_agentic/evaluation/judge_client.py b/fireflyframework_agentic/evaluation/judge_client.py index 1af17f53..7f050d16 100644 --- a/fireflyframework_agentic/evaluation/judge_client.py +++ b/fireflyframework_agentic/evaluation/judge_client.py @@ -1,60 +1,24 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +"""Async LLM scoring client for judge metrics. -"""Provider-agnostic LLM-as-a-Judge client for the G4 advisory gate. - -Zero new dependencies: stdlib (urllib.request, json, os, time, re) + numpy. -The client is a thin POST wrapper over four chat providers (Anthropic, OpenAI, -Azure OpenAI, Ollama) plus an Ollama embedder. It is deliberately tolerant: -chat_json extracts the FIRST JSON object from the model text (models wrap JSON -in prose / code fences), and retries transient HTTP errors with backoff. - -This module is import-safe: importing it touches NO network and reads NO API -key. 
Keys are read lazily, per-call, only when a real request is made — so the -judge tests can import and inject stubs without any secret present. - -Provider/model spec format: ":", e.g. "anthropic:claude-sonnet-4-6", -"openai:gpt-4o", "azure:gpt-4o", "ollama:llama3". A bare model with no prefix is -treated as provider "unknown" (see parse_model / same_provider). +Thin httpx-based wrapper over Anthropic / OpenAI / Azure OpenAI / Ollama. +Reads API keys lazily (per-call) from env so importing never requires secrets. +Provider/model spec: ":", e.g. "anthropic:claude-sonnet-4-6". """ from __future__ import annotations +import asyncio import json import os import re -import time -import urllib.error -import urllib.request -import numpy as np +import httpx -# Transient HTTP status codes worth retrying (rate limit + 5xx). _RETRY_STATUS = (429, 500, 502, 503, 504) - -# Hard cap on a honoured Retry-After sleep (a hostile header should not stall us). _MAX_RETRY_AFTER = 30.0 -def _env(name, default=None): - """Read an env var, stripping surrounding whitespace; empty-after-strip -> default. - - Defensive against a ``.env`` value that arrives with a trailing ``\\r`` / - whitespace (CRLF), which would otherwise corrupt a request URL or header. - An unset OR blank value falls back to ``default`` so the existing - missing-key -> RuntimeError behaviour is preserved. - """ +def _env(name: str, default: str | None = None) -> str | None: value = os.environ.get(name) if value is None: return default @@ -62,30 +26,8 @@ def _env(name, default=None): return value if value else default -def _retry_delay(exc: urllib.error.HTTPError, attempt: int) -> float: - """Seconds to sleep before retrying an HTTPError. - - On 429 honour the ``Retry-After`` header (capped at 30s) when it is present - and numeric; otherwise fall back to exponential backoff (2 ** attempt). 
- """ - if exc.code == 429: - headers = getattr(exc, "headers", None) - retry_after = headers.get("retry-after") if headers is not None else None - if retry_after is not None: - try: - return min(float(retry_after), _MAX_RETRY_AFTER) - except (TypeError, ValueError): - pass - return 2.0**attempt - - def parse_model(spec: str) -> tuple[str, str]: - """Split a "provider:model" spec into (provider, model). - - A bare spec with no ':' is returned as provider "unknown" with the whole - string as the model, e.g. "claude-sonnet-4-6" -> ("unknown", "claude-sonnet-4-6"). - The provider is lower-cased; the model keeps its original case. - """ + """Split "provider:model" -> (provider, model). Bare spec -> ("unknown", spec).""" spec = (spec or "").strip() if ":" not in spec: return "unknown", spec @@ -94,28 +36,16 @@ def parse_model(spec: str) -> tuple[str, str]: def same_provider(pipeline_model: str, judge_model: str) -> bool: - """True iff both specs name the SAME known provider prefix. - - A missing or "unknown" provider on either side -> not-same (False). This is - the same-provider caveat signal: when the judge and the pipeline share a - provider the judged metrics are advisory (no cross-provider isolation). - """ - p_provider, _ = parse_model(pipeline_model) - j_provider, _ = parse_model(judge_model) - if p_provider == "unknown" or j_provider == "unknown": + """True iff both specs share the same known provider prefix.""" + p, _ = parse_model(pipeline_model) + j, _ = parse_model(judge_model) + if p == "unknown" or j == "unknown": return False - return p_provider == j_provider + return p == j def _first_json_object(text: str) -> dict: - """Extract and parse the FIRST balanced JSON object embedded in text. - - Models wrap JSON in prose, preambles, or ```json code fences. This scans - for the first '{' and walks the string tracking brace depth (string-aware, - so braces inside quoted values do not confuse the matcher) to find its - matching '}'. 
Falls back to a greedy regex span if no balanced object is - found. Raises ValueError when nothing parses. - """ + """Extract the first balanced JSON object from text (handles prose/code-fence wrapping).""" if not text: raise ValueError("empty model response") @@ -165,38 +95,12 @@ def _first_json_object(text: str) -> dict: raise ValueError("no JSON object found in model response") -def _http_post_json(url: str, headers: dict, body: dict, timeout: int) -> dict: - """POST a JSON body and return the parsed JSON response (single attempt).""" - data = json.dumps(body).encode("utf-8") - req_headers = {"content-type": "application/json", **headers} - req = urllib.request.Request(url, data=data, headers=req_headers, method="POST") - with urllib.request.urlopen(req, timeout=timeout) as resp: - return json.loads(resp.read().decode("utf-8")) - - -def _extract_openai_text(resp: dict) -> str: - """Pull the assistant text from an OpenAI/Azure chat-completions response. - - Guards an empty ``choices`` list and a null ``message.content`` and raises a - descriptive RuntimeError (not a KeyError) when no text is present, so the - judge layer records a clean dropped-vote reason instead of a stack trace. - """ - choices = resp.get("choices") or [] - if choices: - text = (choices[0].get("message") or {}).get("content") - if text: - return text - raise RuntimeError(f"judge returned no text: {resp}") - - class JudgeClient: - """Minimal multi-provider chat client returning parsed JSON dicts. + """Async multi-provider chat client returning parsed JSON dicts. Dispatch is by the provider prefix of the model spec. temperature is pinned - to 0.0 for deterministic verdicts. Transient HTTP errors (429/5xx) and URL - errors are retried up to max_retries: a 429 honours the ``Retry-After`` - header (capped at 30s) when present, otherwise backoff is exponential - (2 ** attempt seconds). + to 0.0 for deterministic verdicts. 
Transient HTTP errors (429/5xx) and network + errors are retried up to max_retries with backoff. The API key / endpoint env vars are read lazily inside chat_json, so constructing a JudgeClient never requires a secret. @@ -208,48 +112,49 @@ def __init__(self, model: str, timeout: int = 120, max_retries: int = 3) -> None self.timeout = timeout self.max_retries = max_retries - def chat_json(self, system: str, user: str, max_tokens: int = 1024) -> dict: + async def chat_json(self, system: str, user: str, max_tokens: int = 1024) -> dict: """Send (system, user) to the provider and parse the first JSON object. Raises on exhausted retries / unknown provider / unparseable output. - The judge module wraps every call in try/except, so a raise here becomes - a dropped vote rather than a crash. """ last_exc: Exception | None = None for attempt in range(self.max_retries): try: - text = self._dispatch(system, user, max_tokens) - return _first_json_object(text) - except urllib.error.HTTPError as exc: + if self.provider == "anthropic": + return await self._anthropic(system, user, max_tokens) + if self.provider == "openai": + return await self._openai(system, user, max_tokens) + if self.provider == "azure": + return await self._azure(system, user, max_tokens) + if self.provider == "ollama": + return await self._ollama(system, user, max_tokens) + raise ValueError( + f"unknown judge provider {self.provider!r} in {self.model_spec!r}; " + "use anthropic:/openai:/azure:/ollama:" + ) + except httpx.HTTPStatusError as exc: last_exc = exc - if exc.code not in _RETRY_STATUS or attempt == self.max_retries - 1: + if exc.response.status_code not in _RETRY_STATUS or attempt == self.max_retries - 1: raise - time.sleep(_retry_delay(exc, attempt)) - except (urllib.error.URLError, TimeoutError, ConnectionError) as exc: + retry_after_header = exc.response.headers.get("retry-after") + if retry_after_header is not None: + try: + delay = min(float(retry_after_header), _MAX_RETRY_AFTER) + except 
(TypeError, ValueError): + delay = 2.0**attempt + else: + delay = 2.0**attempt + await asyncio.sleep(delay) + except httpx.RequestError as exc: last_exc = exc if attempt == self.max_retries - 1: raise - time.sleep(2**attempt) + await asyncio.sleep(2.0) if last_exc is not None: raise last_exc raise RuntimeError("chat_json exhausted retries without a response") - def _dispatch(self, system: str, user: str, max_tokens: int) -> str: - """Route to the per-provider call and return the raw model text.""" - if self.provider == "anthropic": - return self._anthropic(system, user, max_tokens) - if self.provider == "openai": - return self._openai(system, user, max_tokens) - if self.provider == "azure": - return self._azure(system, user, max_tokens) - if self.provider == "ollama": - return self._ollama(system, user, max_tokens) - raise ValueError( - f"unknown judge provider {self.provider!r} in {self.model_spec!r}; " - "use anthropic:/openai:/azure:/ollama:" - ) - - def _anthropic(self, system: str, user: str, max_tokens: int) -> str: + async def _anthropic(self, system: str, user: str, max_tokens: int) -> dict: api_key = _env("ANTHROPIC_API_KEY") if not api_key: raise RuntimeError("ANTHROPIC_API_KEY not set") @@ -260,16 +165,21 @@ def _anthropic(self, system: str, user: str, max_tokens: int) -> str: "system": system, "messages": [{"role": "user", "content": user}], } - headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01"} - resp = _http_post_json("https://api.anthropic.com/v1/messages", headers, body, self.timeout) - text = next( - (b.get("text") for b in resp.get("content", []) if b.get("type") == "text"), None - ) + headers = { + "x-api-key": api_key, + "anthropic-version": "2023-06-01", + "content-type": "application/json", + } + async with httpx.AsyncClient(timeout=self.timeout) as client: + resp = await client.post("https://api.anthropic.com/v1/messages", json=body, headers=headers) + resp.raise_for_status() + data = resp.json() + text = next((b.get("text") 
for b in data.get("content", []) if b.get("type") == "text"), None) if not text: - raise RuntimeError(f"judge returned no text: {resp}") - return text + raise RuntimeError(f"judge returned no text: {data}") + return _first_json_object(text) - def _openai(self, system: str, user: str, max_tokens: int) -> str: + async def _openai(self, system: str, user: str, max_tokens: int) -> dict: api_key = _env("OPENAI_API_KEY") if not api_key: raise RuntimeError("OPENAI_API_KEY not set") @@ -282,25 +192,27 @@ def _openai(self, system: str, user: str, max_tokens: int) -> str: {"role": "user", "content": user}, ], } - headers = {"Authorization": f"Bearer {api_key}"} - resp = _http_post_json( - "https://api.openai.com/v1/chat/completions", headers, body, self.timeout - ) - return _extract_openai_text(resp) + headers = {"Authorization": f"Bearer {api_key}", "content-type": "application/json"} + async with httpx.AsyncClient(timeout=self.timeout) as client: + resp = await client.post("https://api.openai.com/v1/chat/completions", json=body, headers=headers) + resp.raise_for_status() + data = resp.json() + choices = data.get("choices") or [] + if choices: + text = (choices[0].get("message") or {}).get("content") + if text: + return _first_json_object(text) + raise RuntimeError(f"judge returned no text: {data}") - def _azure(self, system: str, user: str, max_tokens: int) -> str: + async def _azure(self, system: str, user: str, max_tokens: int) -> dict: endpoint = _env("AZURE_OPENAI_ENDPOINT") api_key = _env("AZURE_OPENAI_API_KEY") if not endpoint: raise RuntimeError("AZURE_OPENAI_ENDPOINT not set") if not api_key: raise RuntimeError("AZURE_OPENAI_API_KEY not set") - api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" - # Azure deployment lives in the URL path, not the JSON body. 
- url = ( - f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions" - f"?api-version={api_version}" - ) + api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-02-01" + url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions?api-version={api_version}" body = { "max_tokens": max_tokens, "temperature": 0.0, @@ -309,146 +221,34 @@ def _azure(self, system: str, user: str, max_tokens: int) -> str: {"role": "user", "content": user}, ], } - headers = {"api-key": api_key} - resp = _http_post_json(url, headers, body, self.timeout) - return _extract_openai_text(resp) + headers = {"api-key": api_key, "content-type": "application/json"} + async with httpx.AsyncClient(timeout=self.timeout) as client: + resp = await client.post(url, json=body, headers=headers) + resp.raise_for_status() + data = resp.json() + choices = data.get("choices") or [] + if choices: + text = (choices[0].get("message") or {}).get("content") + if text: + return _first_json_object(text) + raise RuntimeError(f"judge returned no text: {data}") - def _ollama(self, system: str, user: str, max_tokens: int) -> str: + async def _ollama(self, system: str, user: str, max_tokens: int) -> dict: # noqa: ARG002 host = _env("OLLAMA_HOST") or "http://localhost:11434" body = { "model": self.model, "stream": False, - "options": {"temperature": 0.0, "num_predict": max_tokens}, + "options": {"temperature": 0.0}, "messages": [ {"role": "system", "content": system}, {"role": "user", "content": user}, ], } - resp = _http_post_json(f"{host.rstrip('/')}/api/chat", {}, body, self.timeout) - text = (resp.get("message") or {}).get("content") + async with httpx.AsyncClient(timeout=self.timeout) as client: + resp = await client.post(f"{host.rstrip('/')}/api/chat", json=body) + resp.raise_for_status() + data = resp.json() + text = (data.get("message") or {}).get("content") if not text: - raise RuntimeError(f"judge returned no text: {resp}") - return text - - -class OpenAIEmbedder: - """OpenAI 
embeddings client over /v1/embeddings. - - Reads OPENAI_API_KEY from the environment. Default model: text-embedding-3-small. - """ - - def __init__(self, model: str = "text-embedding-3-small", timeout: int = 60) -> None: - self.model = model - self.timeout = timeout - - def embed(self, texts: list[str]) -> np.ndarray: - api_key = _env("OPENAI_API_KEY") - if not api_key: - raise RuntimeError("OPENAI_API_KEY not set") - headers = {"Authorization": f"Bearer {api_key}"} - body = {"model": self.model, "input": texts} - resp = _http_post_json("https://api.openai.com/v1/embeddings", headers, body, self.timeout) - data = resp.get("data", []) - vectors = [item["embedding"] for item in sorted(data, key=lambda x: x["index"])] - return np.asarray(vectors, dtype=np.float32) - - -class AzureOpenAIEmbedder: - """Azure OpenAI embeddings client. - - Reads AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, and optionally - AZURE_OPENAI_API_VERSION from the environment. The model name is the - deployment name. Default model: text-embedding-3-small. 
- """ - - def __init__(self, model: str = "text-embedding-3-small", timeout: int = 60) -> None: - self.model = model - self.timeout = timeout - - def embed(self, texts: list[str]) -> np.ndarray: - endpoint = _env("AZURE_OPENAI_ENDPOINT") - api_key = _env("AZURE_OPENAI_API_KEY") - if not endpoint: - raise RuntimeError("AZURE_OPENAI_ENDPOINT not set") - if not api_key: - raise RuntimeError("AZURE_OPENAI_API_KEY not set") - api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-06-01" - url = ( - f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/embeddings" - f"?api-version={api_version}" - ) - headers = {"api-key": api_key} - vectors = self._embed_with_split(texts, url, headers) - return np.asarray(vectors, dtype=np.float32) - - def _embed_with_split(self, texts: list[str], url: str, headers: dict) -> list[list[float]]: - """Send texts in one request; on HTTP 400 split in half and retry each half.""" - try: - resp = _http_post_json(url, headers, {"input": texts}, self.timeout) - data = resp.get("data", []) - return [item["embedding"] for item in sorted(data, key=lambda x: x["index"])] - except urllib.error.HTTPError as exc: - if exc.code == 400 and len(texts) > 1: - mid = len(texts) // 2 - left = self._embed_with_split(texts[:mid], url, headers) - right = self._embed_with_split(texts[mid:], url, headers) - return left + right - raise - - -class OllamaEmbedder: - """Local Ollama embedding client (default model bge-m3) over /api/embeddings. - - Posts one prompt per call (the stable single-prompt form) and stacks the - returned vectors into a 2-D numpy array. Constructing it touches no network; - the host is resolved from $OLLAMA_HOST at call time. 
- """ - - def __init__(self, model: str = "bge-m3", host: str | None = None, timeout: int = 60) -> None: - self.model = model - self.host = (host or _env("OLLAMA_HOST") or "http://localhost:11434").rstrip("/") - self.timeout = timeout - - def embed(self, texts: list[str]) -> np.ndarray: - """Embed a list of strings -> float32 ndarray of shape (len(texts), dim).""" - vectors: list[list[float]] = [] - for text in texts: - body = {"model": self.model, "prompt": text} - resp = _http_post_json(f"{self.host}/api/embeddings", {}, body, self.timeout) - vectors.append(resp["embedding"]) - return np.asarray(vectors, dtype=np.float32) - - -def build_embedder(spec: str): - """Return an ``embed_fn(list[str]) -> np.ndarray`` for an embedder spec. - - Dispatch is on the provider prefix of a ":" spec: - - "ollama" / "ollama:" -> OllamaEmbedder(model or "bge-m3").embed. - - a bare "" with no ':' -> treated as an Ollama model. - - any other provider -> NotImplementedError (the extension point). - - Add a new backend by adding a branch here. 
- """ - if (spec or "").strip() == "ollama": # bare provider, no model -> default model - return OllamaEmbedder("bge-m3").embed - provider, model = parse_model(spec) - if provider in ("unknown", "ollama"): # bare "" or "ollama:" - return OllamaEmbedder(model or "bge-m3").embed - if provider == "openai": - return OpenAIEmbedder(model or "text-embedding-3-small").embed - if provider == "azure": - return AzureOpenAIEmbedder(model or "text-embedding-3-small").embed - raise NotImplementedError( - f"embedder backend {provider!r} not implemented yet; add it in build_embedder()" - ) - - -def cosine(a, b) -> float: - """Cosine similarity between two 1-D vectors; 0.0 if either is the zero vector.""" - a = np.asarray(a, dtype=np.float64).ravel() - b = np.asarray(b, dtype=np.float64).ravel() - na = float(np.linalg.norm(a)) - nb = float(np.linalg.norm(b)) - if na == 0.0 or nb == 0.0: - return 0.0 - return float(np.dot(a, b) / (na * nb)) + raise RuntimeError(f"judge returned no text: {data}") + return _first_json_object(text) From 5609ab67d43af810c0bf29b8f7b9f7100d50de15 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:24:09 +0200 Subject: [PATCH 30/67] =?UTF-8?q?feat(evaluation):=20rewrite=20judge.py=20?= =?UTF-8?q?=E2=80=94=20async=20metrics=20+=20EvalContext=20+=20flycanon=20?= =?UTF-8?q?+=20RAGAS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fireflyframework_agentic/evaluation/judge.py | 769 ++++++++++--------- 1 file changed, 415 insertions(+), 354 deletions(-) diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index a347c8e1..9f24dc26 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -1,61 +1,48 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""G4 — LLM-as-a-Judge: an opt-in, NON-BLOCKING, NON-DETERMINISTIC advisory gate. - -G4 NEVER affects the PROMOTE/HOLD verdict and NEVER raises into the caller. -run_judge() wraps every metric in try/except; a failing metric appends to -report.errors and the run continues (best-effort). The result is an -AdvisoryReport, NOT a GateResult — it is carried separately so it can never -enter verdict() or the Skipped tuple (see scorecard / verdict_unaffected_note). - -Three families of metric (matching the flyradar contracts): -- [D] DETERMINISTIC — pure python, no LLM, printed even when the judge is off: - source_coverage, excerpt_fill_rate. -- [E] EMBEDDING — needs an embed_fn (local Ollama BGE by default): - semantic_recovery (context recall). -- [J] JUDGE — needs a chat_fn(system, user) -> dict; each [J] metric instructs - the model to reply with ONLY JSON: faithfulness, numeric_temporal_fidelity, - citation_relevance, nc_semantic_precision, fabricated_entity, contradiction, - open_gap, actionability, severity_calibration, answer_relevancy, - comparative_vs_champion. - -Aggregation follows the flycanon custom-judge design: run each [J] metric `runs` -times and take the MEDIAN of its numeric scores (robust to an outlier vote). - -Zero new dependencies: stdlib (json, statistics) + numpy. All imports at top. -calibrated is ALWAYS False for now (LLM-as-a-Judge calibration is §14, future work). +"""Evaluation judge — async metrics for flyradar and flycanon pipelines. 
+ +Every metric: async def metric_name(item: dict, ctx: EvalContext) -> dict | float | None + +Flyradar item keys: findings, evidence_index, process_graph, proposed_actions, + workspace, reports, lexical_missed_ids, nc_items, champion +Flycanon item keys: question, answer, reference, contexts """ from __future__ import annotations -import concurrent.futures +import asyncio +import math +import os import statistics +from collections.abc import Awaitable, Callable from dataclasses import dataclass, field -import numpy as np +from pydantic import BaseModel, ConfigDict -from fireflyframework_agentic.evaluation.judge_client import ( - JudgeClient, - OllamaEmbedder, - cosine, - same_provider, -) -from fireflyframework_agentic.evaluation.matcher import source_stem +from fireflyframework_agentic.embeddings.providers.ollama import OllamaEmbedder +from fireflyframework_agentic.embeddings.similarity import cosine_similarity +from fireflyframework_agentic.evaluation.judge_client import JudgeClient, same_provider + +Metric = Callable[["dict", "EvalContext"], Awaitable["dict | float | None"]] SYSTEM = "You are a meticulous evaluator of a process-mining discovery report. Return ONLY a JSON object." +SYSTEM_RAG = "You are an evaluator of a RAG system's answers. Return ONLY a JSON object." + +RUBRIC = ( + "Score the ANSWER on two metrics:\n" + "- contains_answer (0.0-1.0): Does the answer contain the correct information from the REFERENCE?\n" + "- addresses_question (0.0-1.0): Does the answer directly address what the QUESTION is asking?\n" + 'Reply with ONLY {"contains_answer": , "addresses_question": }.' 
+) + + +class EvalContext(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + client: JudgeClient + embedder: OllamaEmbedder | None = None + runs: int = 3 + @dataclass class AdvisoryReport: @@ -68,7 +55,7 @@ class AdvisoryReport: judge_model: str same_provider_caveat: bool - calibrated: bool # ALWAYS False for now (§14) + calibrated: bool # ALWAYS False for now runs: int metrics: dict = field(default_factory=dict) details: dict = field(default_factory=dict) @@ -78,8 +65,8 @@ class AdvisoryReport: # ── shared accessors ─────────────────────────────────────────────────────────── -def _evidence_index(result: dict) -> dict[str, dict]: - return {ev.get("id"): ev for ev in result.get("evidence_index", []) if ev.get("id")} +def _evidence_index(item: dict) -> dict[str, dict]: + return {ev.get("id"): ev for ev in item.get("evidence_index", []) if ev.get("id")} def _cited_excerpts(finding: dict, evidence_index: dict[str, dict]) -> list[str]: @@ -94,143 +81,116 @@ def _cited_excerpts(finding: dict, evidence_index: dict[str, dict]) -> list[str] return out -def _output_text(result: dict) -> str: +def _output_text(item: dict) -> str: """All free text the model emitted: finding titles+descriptions + reports.""" parts: list[str] = [] - for f in result.get("findings", []): + for f in item.get("findings", []): parts.append(f.get("title", "")) parts.append(f.get("description", "")) - for r in result.get("reports", []): + for r in item.get("reports", []): parts.append(str(r)) return "\n".join(p for p in parts if p) -def _workspace_intention(result: dict) -> str: - ws = result.get("workspace") or {} +def _workspace_intention(item: dict) -> str: + ws = item.get("workspace") or {} return f"{ws.get('name', '')}\n{ws.get('description', '')}".strip() def _coerce_float(value, default=None): - """Coerce a model-returned number/numeric-string to float; total (never raises). 
- - Returns ``default`` (None) on junk so one malformed vote drops that single - vote instead of discarding the whole metric. - """ + """Coerce a model-returned number/numeric-string to float; total (never raises).""" try: return float(value) except (TypeError, ValueError): return default -def _map_chat(chat_fn, prompts, workers=1): - """Run a list of (system, user) chat prompts, returning ordered result dicts. +def _source_stem(locator: str) -> str: + """Return the part before the first '#', or the full string if no '#'.""" + idx = locator.find("#") + return locator[:idx] if idx != -1 else locator - ``workers <= 1`` calls ``chat_fn`` SEQUENTIALLY — byte-for-byte identical to - the in-line loops it replaces, INCLUDING letting a raise propagate (so - run_judge's per-metric try/except still drops that whole metric, the - behaviour the suite locks in). - ``workers >= 2`` fans the calls out across a ThreadPoolExecutor while - PRESERVING input order in the returned list. Concurrency cannot let one - raising future poison the batch, so in that path a raising call's slot - becomes ``{}`` — the metric's aggregation degrades for that one vote but - never raises (the same best-effort contract as run_judge). 
- """ - prompts = list(prompts) - if workers <= 1: - return [chat_fn(system, user) for system, user in prompts] - - results: list[dict] = [{} for _ in prompts] - with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: - futures = { - executor.submit(chat_fn, system, user): idx - for idx, (system, user) in enumerate(prompts) - } - for future in concurrent.futures.as_completed(futures): - idx = futures[future] - try: - results[idx] = future.result() - except Exception: # best-effort: a dropped vote, never a raise - results[idx] = {} - return results +async def _gather_chat(chat_fn, prompts: list[tuple[str, str]]) -> list[dict]: + """Run a list of (system, user) prompts concurrently, returning ordered results.""" + results = await asyncio.gather(*[chat_fn(s, u) for s, u in prompts], return_exceptions=True) + return [r if isinstance(r, dict) else {} for r in results] # ── [D] DETERMINISTIC — no LLM, always available ──────────────────────────────── -def source_coverage(result: dict) -> dict: +async def source_coverage(item: dict, ctx: EvalContext) -> dict: # noqa: ARG001 """Distinct source documents cited by >=1 finding vs all source documents. Returns {cited, total, orphaned} where orphaned is the sorted list of source stems present in evidence_index but cited by no finding. 
""" - evidence_index = _evidence_index(result) - all_stems = { - source_stem(ev.get("locator", "")) - for ev in result.get("evidence_index", []) - if ev.get("locator") - } + ev_idx = _evidence_index(item) + all_stems = {_source_stem(ev.get("locator", "")) for ev in item.get("evidence_index", []) if ev.get("locator")} cited_stems: set[str] = set() - for f in result.get("findings", []): + for f in item.get("findings", []): for ref in f.get("evidence_refs", []): - ev = evidence_index.get(ref.get("evidence_id", "")) + ev = ev_idx.get(ref.get("evidence_id", "")) if ev and ev.get("locator"): - cited_stems.add(source_stem(ev["locator"])) + cited_stems.add(_source_stem(ev["locator"])) cited_stems &= all_stems orphaned = sorted(all_stems - cited_stems) return {"cited": len(cited_stems), "total": len(all_stems), "orphaned": orphaned} -def excerpt_fill_rate(result: dict) -> dict: +async def excerpt_fill_rate(item: dict, ctx: EvalContext) -> dict: # noqa: ARG001 """Fraction of evidence_index entries with a non-empty excerpt. - Returns {populated, total}. This is the signal behind older runs' low G3 - grounding: empty excerpts cannot ground anything. + Returns {populated, total}. """ - entries = result.get("evidence_index", []) + entries = item.get("evidence_index", []) populated = sum(1 for ev in entries if (ev.get("excerpt") or "").strip()) return {"populated": populated, "total": len(entries)} -# ── [E] EMBEDDING — needs embed_fn ─────────────────────────────────────────────── - +# ── [E] EMBEDDING — needs embedder ─────────────────────────────────────────────── -def semantic_recovery( - result: dict, - registry, - lexical_missed_ids: list[str], - embed_fn, - tau: float = 0.70, -) -> dict: - """Context-recall: recover G2 lexical misses by embedding similarity. - For each registry item flagged a LEXICAL MISS by G2, embed its - description+keywords and take the max cosine against the embeddings of every - finding description (and their cited excerpts). 
If max cosine >= tau the - item is counted semantically present (recovered). +async def semantic_recovery(item: dict, ctx: EvalContext, tau: float = 0.70) -> dict | None: + """Context-recall: recover lexical misses by embedding similarity. - recovered_recall = (lexical_hits + recovered) / scored_denominator, where - the scored denominator is the count of non-NC items scored by G2 (real - items, matching G2's recall denominator family). Returns the lexical recall, - the recovered recall, the recovered item list (with cosine), and tau. + Reads item["lexical_missed_ids"] (list of str). + Returns None if ctx.embedder is None. """ + if ctx.embedder is None: + return None + + lexical_missed_ids: list[str] = item.get("lexical_missed_ids", []) missed = set(lexical_missed_ids or []) - real_items = registry.real_items - scored_items = [i for i in real_items if i.tier != "L3"] - denom = len(scored_items) or 1 - lexical_hits = sum(1 for i in scored_items if i.id not in missed) - # Candidate texts the findings actually surfaced. - evidence_index = _evidence_index(result) + # Build the scored items from nc_items (non-NC = real items for recall) + # In the new EvalContext model, nc_items is a list of {"id": ..., "description": ...} + # We treat all item findings as the candidate surface; nc_items stay separate. + # Recompute as: all items scored = those not in nc_items ids. + # If there's no registry concept, we use findings as the denominator proxy. + # But keep the logic simple: just score the missed items against finding descriptions. + ev_idx = _evidence_index(item) candidate_texts: list[str] = [] - for f in result.get("findings", []): + for f in item.get("findings", []): desc = f.get("description", "") if desc: candidate_texts.append(desc) - candidate_texts.extend(_cited_excerpts(f, evidence_index)) + candidate_texts.extend(_cited_excerpts(f, ev_idx)) + + # missed_items: we only know their IDs; we need descriptions to embed. 
+ # In the new design, if no descriptions available, return minimal result. + all_findings = item.get("findings", []) + denom = max(len(all_findings), 1) + lexical_hits = sum(1 for f in all_findings if f.get("id") not in missed) + + missed_descs: list[tuple[str, str]] = [ + (f.get("id", ""), f.get("description", "")) + for f in all_findings + if f.get("id") in missed and f.get("description") + ] - missed_items = [i for i in scored_items if i.id in missed] - if not missed_items or not candidate_texts: + if not missed_descs or not candidate_texts: recovered_recall = lexical_hits / denom return { "lexical_recall": round(lexical_hits / denom, 4), @@ -240,15 +200,15 @@ def semantic_recovery( "scored_denominator": denom, } - item_texts = [f"{i.description} {' '.join(i.keywords)}".strip() for i in missed_items] - item_vecs = np.asarray(embed_fn(item_texts), dtype=np.float64) - cand_vecs = np.asarray(embed_fn(candidate_texts), dtype=np.float64) + item_texts = [desc for _fid, desc in missed_descs] + item_vecs = await ctx.embedder._embed_batch(item_texts) + cand_vecs = await ctx.embedder._embed_batch(candidate_texts) recovered: list[dict] = [] - for item, ivec in zip(missed_items, item_vecs): - best = max((cosine(ivec, cvec) for cvec in cand_vecs), default=0.0) + for (fid, _desc), ivec in zip(missed_descs, item_vecs, strict=False): + best = max((cosine_similarity(ivec, cvec) for cvec in cand_vecs), default=0.0) if best >= tau: - recovered.append({"id": item.id, "cosine": round(best, 4)}) + recovered.append({"id": fid, "cosine": round(best, 4)}) recovered_recall = (lexical_hits + len(recovered)) / denom return { @@ -263,16 +223,14 @@ def semantic_recovery( # ── [J] JUDGE — needs chat_fn(system, user) -> dict ────────────────────────────── -def faithfulness(result: dict, chat_fn, *, workers: int = 1) -> dict: +async def faithfulness(item: dict, ctx: EvalContext) -> dict: """Entailment: does each finding's cited evidence SUPPORT its claim? 
- Per (finding, cited-excerpts) pair, ask SUPPORTED / NOT_SUPPORTED. Returns - {supported, total, unsupported_ids}. Findings with no cited evidence are - counted as not-supported (nothing to entail against). + Returns {supported, total, unsupported_ids}. """ - evidence_index = _evidence_index(result) - findings = result.get("findings", []) - cited = [(f, _cited_excerpts(f, evidence_index)) for f in findings] + ev_idx = _evidence_index(item) + findings = item.get("findings", []) + cited = [(f, _cited_excerpts(f, ev_idx)) for f in findings] prompts = [ ( SYSTEM, @@ -284,7 +242,7 @@ def faithfulness(result: dict, chat_fn, *, workers: int = 1) -> dict: for f, excerpts in cited if excerpts ] - answers = iter(_map_chat(chat_fn, prompts, workers)) + answers = iter(await _gather_chat(ctx.client.chat_json, prompts)) supported = 0 unsupported_ids: list[str] = [] for f, excerpts in cited: @@ -300,18 +258,13 @@ def faithfulness(result: dict, chat_fn, *, workers: int = 1) -> dict: return {"supported": supported, "total": len(findings), "unsupported_ids": unsupported_ids} -def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dict: +async def numeric_temporal_fidelity(item: dict, ctx: EvalContext) -> dict: """Flag numbers/dates asserted in a finding that do NOT match its evidence. - Closes the 45-days-vs-3-days gap. Returns {mismatches: [{finding_id, value, - source}], count}. + Returns {mismatches: [{finding_id, value, source}], count}. 
""" - evidence_index = _evidence_index(result) - scored = [ - (f, excerpts) - for f in result.get("findings", []) - if (excerpts := _cited_excerpts(f, evidence_index)) - ] + ev_idx = _evidence_index(item) + scored = [(f, excerpts) for f in item.get("findings", []) if (excerpts := _cited_excerpts(f, ev_idx))] prompts = [ ( SYSTEM, @@ -324,9 +277,9 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic ) for f, excerpts in scored ] - answers = _map_chat(chat_fn, prompts, workers) + answers = await _gather_chat(ctx.client.chat_json, prompts) mismatches: list[dict] = [] - for (f, _excerpts), answer in zip(scored, answers): + for (f, _excerpts), answer in zip(scored, answers, strict=False): for m in answer.get("mismatches", []) or []: mismatches.append( { @@ -338,20 +291,17 @@ def numeric_temporal_fidelity(result: dict, chat_fn, *, workers: int = 1) -> dic return {"mismatches": mismatches, "count": len(mismatches)} -def citation_relevance(result: dict, chat_fn, *, workers: int = 1) -> dict: +async def citation_relevance(item: dict, ctx: EvalContext) -> dict: """Context precision: fraction of cited passages actually relevant to the claim. - Per evidence_ref, ask yes/no relevance. precision = relevant / total_refs. - Returns {precision, relevant, total}; when total == 0 (no cited passages with - excerpts) precision is None — the kept ``total`` lets a reader tell "perfect" - apart from "nothing to score". + Returns {precision, relevant, total}. 
""" - evidence_index = _evidence_index(result) + ev_idx = _evidence_index(item) prompts: list[tuple[str, str]] = [] - for f in result.get("findings", []): + for f in item.get("findings", []): desc = f.get("description", "") for ref in f.get("evidence_refs", []): - ev = evidence_index.get(ref.get("evidence_id", "")) + ev = ev_idx.get(ref.get("evidence_id", "")) if not ev: continue excerpt = ev.get("excerpt") or "" @@ -366,7 +316,7 @@ def citation_relevance(result: dict, chat_fn, *, workers: int = 1) -> dict: f"CITED PASSAGE: {excerpt}", ) ) - answers = _map_chat(chat_fn, prompts, workers) + answers = await _gather_chat(ctx.client.chat_json, prompts) total = len(prompts) relevant = sum(1 for a in answers if str(a.get("relevant", "")).lower() == "yes") if not total: @@ -374,43 +324,40 @@ def citation_relevance(result: dict, chat_fn, *, workers: int = 1) -> dict: return {"precision": round(relevant / total, 4), "relevant": relevant, "total": total} -def nc_semantic_precision(result: dict, registry, chat_fn, *, workers: int = 1) -> dict: +async def nc_semantic_precision(item: dict, ctx: EvalContext) -> dict: """Count negative-control falsehoods the output asserts or endorses. - Per NC item, ask yes/no whether the output endorses the false statement. - Returns {asserted, total, asserted_ids}. The semantic counterpart to the - inert deterministic NC check. + Reads item["nc_items"] as list of {"id": ..., "description": ...} dicts. + Returns {asserted, total, asserted_ids}. 
""" - output_text = _output_text(result) - nc_items = registry.nc_items + output_text = _output_text(item) + nc_items: list[dict] = item.get("nc_items", []) prompts = [ ( SYSTEM, "Does the OUTPUT assert or endorse the following FALSE statement?\n" 'Reply with ONLY {"asserted": "yes" or "no"}.\n\n' - f"FALSE STATEMENT: {item.description}\n" + f"FALSE STATEMENT: {nc.get('description', '')}\n" f"OUTPUT:\n{output_text}", ) - for item in nc_items + for nc in nc_items ] - answers = _map_chat(chat_fn, prompts, workers) + answers = await _gather_chat(ctx.client.chat_json, prompts) asserted_ids = [ - item.id for item, a in zip(nc_items, answers) if str(a.get("asserted", "")).lower() == "yes" + nc.get("id", "?") + for nc, a in zip(nc_items, answers, strict=False) + if str(a.get("asserted", "")).lower() == "yes" ] return {"asserted": len(asserted_ids), "total": len(nc_items), "asserted_ids": asserted_ids} -def fabricated_entity(result: dict, chat_fn) -> dict: +async def fabricated_entity(item: dict, ctx: EvalContext) -> dict: """Count systems/orgs/metrics named in the output but absent from the corpus. - Returns {count, entities}. The corpus universe is the set of evidence - excerpts + locators. + Returns {count, entities}. 
""" - output_text = _output_text(result) - corpus = "\n".join( - f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" - for ev in result.get("evidence_index", []) - ) + output_text = _output_text(item) + corpus = "\n".join(f"{ev.get('locator', '')} :: {ev.get('excerpt', '')}" for ev in item.get("evidence_index", [])) user = ( "List any system, organization, or metric NAMED in the OUTPUT that does NOT " "appear anywhere in the CORPUS EVIDENCE.\n" @@ -418,54 +365,54 @@ def fabricated_entity(result: dict, chat_fn) -> dict: f"OUTPUT:\n{output_text}\n\n" f"CORPUS EVIDENCE:\n{corpus}" ) - entities = chat_fn(SYSTEM, user).get("fabricated", []) or [] + answer = await ctx.client.chat_json(SYSTEM, user) + entities = answer.get("fabricated", []) or [] return {"count": len(entities), "entities": list(entities)} -def contradiction(result: dict, chat_fn) -> dict: +async def contradiction(item: dict, ctx: EvalContext) -> dict: """Count internally contradictory finding pairs. - Returns {count, pairs}. pairs is the list of contradicting finding-id pairs - the judge reports. + Returns {count, pairs}. """ lines = [] - for f in result.get("findings", []): + for f in item.get("findings", []): lines.append(f"{f.get('id', '?')}: {f.get('title', '')} — {f.get('description', '')}") user = ( "Are any two of these FINDINGS mutually contradictory? List each contradicting pair.\n" - 'Reply with ONLY {"pairs": [["", ""], ...]}. Empty list if none.\n\n' - + "\n".join(lines) + 'Reply with ONLY {"pairs": [["", ""], ...]}. Empty list if none.\n\n' + "\n".join(lines) ) - pairs = chat_fn(SYSTEM, user).get("pairs", []) or [] + answer = await ctx.client.chat_json(SYSTEM, user) + pairs = answer.get("pairs", []) or [] return {"count": len(pairs), "pairs": [list(p) for p in pairs]} -def open_gap(result: dict, chat_fn) -> dict: +async def open_gap(item: dict, ctx: EvalContext) -> dict: """G-Eval open probe: the most important process issue the output missed. 
Returns {gap} — a free-text advisory narrative (no score). """ - pg = result.get("process_graph") or {} + pg = item.get("process_graph") or {} pg_summary = f"process_graph has {len(pg.get('processes', []))} processes" user = ( "Given this corpus scope and output, what important process issue did the " "output FAIL to surface?\n" 'Reply with ONLY {"gap": ""}.\n\n' - f"WORKSPACE SCOPE: {_workspace_intention(result)}\n" + f"WORKSPACE SCOPE: {_workspace_intention(item)}\n" f"{pg_summary}\n" - f"OUTPUT:\n{_output_text(result)}" + f"OUTPUT:\n{_output_text(item)}" ) - return {"gap": str(chat_fn(SYSTEM, user).get("gap", ""))} + answer = await ctx.client.chat_json(SYSTEM, user) + return {"gap": str(answer.get("gap", ""))} -def actionability(result: dict, chat_fn, *, workers: int = 1) -> dict: +async def actionability(item: dict, ctx: EvalContext) -> dict: """Average 0-1 rating of whether proposed actions are specific+quantified+linked. - Returns {score, rated}. Each action is rated against whether it is specific, - quantified, and linked to a finding. + Returns {score, rated}. 
""" - actions = result.get("proposed_actions", []) or [] - finding_ids = {f.get("id") for f in result.get("findings", [])} + actions = item.get("proposed_actions", []) or [] + finding_ids = {f.get("id") for f in item.get("findings", [])} prompts = [ ( SYSTEM, @@ -482,24 +429,24 @@ def actionability(result: dict, chat_fn, *, workers: int = 1) -> dict: ) for a in actions ] - answers = _map_chat(chat_fn, prompts, workers) + answers = await _gather_chat(ctx.client.chat_json, prompts) scores: list[float] = [] for a in answers: value = _coerce_float(a.get("score")) - if value is None: # malformed vote -> skip this action, keep the metric + if value is None: continue scores.append(value) score = round(sum(scores) / len(scores), 4) if scores else None return {"score": score, "rated": len(scores)} -def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict: +async def severity_calibration(item: dict, ctx: EvalContext) -> dict: """Per-finding judgment of whether stated severity matches the evidence. Returns {miscalibrated, total, verdicts: {finding_id: under|over|calibrated}}. 
""" - evidence_index = _evidence_index(result) - findings = result.get("findings", []) + ev_idx = _evidence_index(item) + findings = item.get("findings", []) prompts = [ ( SYSTEM, @@ -507,14 +454,14 @@ def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict: 'Reply with ONLY {"calibration": "under" or "over" or "calibrated"}.\n\n' f"STATED SEVERITY: {f.get('severity', '')} SCORE: {f.get('score', '')}\n" f"FINDING: {f.get('description', '')}\n" - f"CITED EVIDENCE: {' || '.join(_cited_excerpts(f, evidence_index))}", + f"CITED EVIDENCE: {' || '.join(_cited_excerpts(f, ev_idx))}", ) for f in findings ] - answers = _map_chat(chat_fn, prompts, workers) + answers = await _gather_chat(ctx.client.chat_json, prompts) verdicts: dict[str, str] = {} miscalibrated = 0 - for f, a in zip(findings, answers): + for f, a in zip(findings, answers, strict=False): verdict = str(a.get("calibration", "calibrated")).lower() verdicts[f.get("id", "?")] = verdict if verdict in ("under", "over"): @@ -522,7 +469,7 @@ def severity_calibration(result: dict, chat_fn, *, workers: int = 1) -> dict: return {"miscalibrated": miscalibrated, "total": len(findings), "verdicts": verdicts} -def answer_relevancy(result: dict, chat_fn) -> dict: +async def answer_relevancy(item: dict, ctx: EvalContext) -> dict: """RAGAS-style: does the output address the stated workspace intention? Returns {score} in [0,1], or {"score": None} when the vote fails to coerce. 
@@ -530,38 +477,27 @@ def answer_relevancy(result: dict, chat_fn) -> dict: user = ( "Does the OUTPUT address the stated WORKSPACE INTENTION (on-topic, responsive)?\n" 'Reply with ONLY {"score": }.\n\n' - f"WORKSPACE INTENTION: {_workspace_intention(result)}\n" - f"OUTPUT:\n{_output_text(result)}" + f"WORKSPACE INTENTION: {_workspace_intention(item)}\n" + f"OUTPUT:\n{_output_text(item)}" ) - return {"score": _coerce_float(chat_fn(SYSTEM, user).get("score"))} + answer = await ctx.client.chat_json(SYSTEM, user) + return {"score": _coerce_float(answer.get("score"))} -def surface_deduplication(result: dict, chat_fn, *, workers: int = 1) -> dict: +async def surface_deduplication(item: dict, ctx: EvalContext) -> dict: """Fraction of near-duplicate process-graph node pairs that are genuinely distinct. - Scoping rules: - - Processes: all pairs compared (cross-process is valid at this level). - - Activities and decisions: ONLY within the same parent process. The same - activity name appearing in two different processes is a legitimate repetition - (e.g. "Approve Request" in both Loan and Credit-Card flows), not a duplicate. - - For each surface, the top-10 most name-similar pairs (token-Jaccard >= 0.30) - are selected. For activities/decisions the parent process name is included in - the judge prompt so it can reason about intra-process context. 30 pairs total. - Returns {distinct, redundant, total, distinct_rate, redundant_pairs}. 
""" - pg = result.get("process_graph", {}) + pg = item.get("process_graph", {}) procs = pg.get("processes", []) def _toks(node: dict) -> frozenset[str]: return frozenset(node.get("name", "").lower().split()) - PER_SURFACE_CAP = 10 - # candidates: (surface, node_a, node_b, parent_process_name) + per_surface_cap = 10 candidates: list[tuple[str, dict, dict, str]] = [] - # Processes: compare all pairs if len(procs) >= 2: pairs: list[tuple[float, dict, dict]] = [] for i in range(len(procs)): @@ -574,10 +510,9 @@ def _toks(node: dict) -> frozenset[str]: if jac >= 0.30: pairs.append((jac, procs[i], procs[j])) pairs.sort(key=lambda x: x[0], reverse=True) - for _jac, a, b in pairs[:PER_SURFACE_CAP]: + for _jac, a, b in pairs[:per_surface_cap]: candidates.append(("process", a, b, "")) - # Activities and decisions: within the same parent process only for surface_key, attr in (("activity", "activities"), ("decision", "decisions")): all_pairs: list[tuple[float, dict, dict, str]] = [] for proc in procs: @@ -595,7 +530,7 @@ def _toks(node: dict) -> frozenset[str]: if jac >= 0.30: all_pairs.append((jac, nodes[i], nodes[j], proc_name)) all_pairs.sort(key=lambda x: x[0], reverse=True) - for _jac, a, b, proc_name in all_pairs[:PER_SURFACE_CAP]: + for _jac, a, b, proc_name in all_pairs[:per_surface_cap]: candidates.append((surface_key, a, b, proc_name)) if not candidates: @@ -603,34 +538,38 @@ def _toks(node: dict) -> frozenset[str]: prompts = [] for surface, a, b, parent_proc in candidates: - ctx = f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else "" - prompts.append(( - SYSTEM, - f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a " - f"duplicate / sub-case / restatement of the other?\n" - f"{ctx}" - 'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n' - f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n" - f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}", - )) + ctx_line = 
f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else "" + prompts.append( + ( + SYSTEM, + f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a " + f"duplicate / sub-case / restatement of the other?\n" + f"{ctx_line}" + 'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n' + f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n" + f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}", + ) + ) - answers = _map_chat(chat_fn, prompts, workers) + answers = await _gather_chat(ctx.client.chat_json, prompts) distinct = 0 redundant = 0 redundant_pairs: list[dict] = [] - for (surface, a, b, _parent), answer in zip(candidates, answers): + for (surface, a, b, _parent), answer in zip(candidates, answers, strict=False): verdict = str(answer.get("verdict", "")).upper() if verdict == "DISTINCT": distinct += 1 else: redundant += 1 - redundant_pairs.append({ - "surface": surface, - "a": a.get("name", ""), - "b": b.get("name", ""), - "reason": str(answer.get("reason", "")), - }) + redundant_pairs.append( + { + "surface": surface, + "a": a.get("name", ""), + "b": b.get("name", ""), + "reason": str(answer.get("reason", "")), + } + ) total = distinct + redundant return { @@ -642,13 +581,15 @@ def _toks(node: dict) -> frozenset[str]: } -def comparative_vs_champion(result: dict, champion_result: dict, chat_fn) -> dict: +async def comparative_vs_champion(item: dict, ctx: EvalContext) -> dict | None: """Pairwise MT-Bench-style review of candidate vs champion (advisory only). - Returns {candidate, champion, more_consistent} where candidate/champion are - 1-5 ratings on Coverage/Quality/Evidence/Actionability/Regression. Never - feeds G5. + Returns None if item["champion"] is not present. + Returns {candidate, champion, more_consistent}. 
""" + champion = item.get("champion") + if champion is None: + return None user = ( "Score the CANDIDATE and the CHAMPION outputs on five axes (1-5 each): " "Coverage, Quality, Evidence, Actionability, Regression. Then say which is " @@ -657,10 +598,10 @@ def comparative_vs_champion(result: dict, champion_result: dict, chat_fn) -> dic '{"candidate": {"coverage": x, "quality": x, "evidence": x, "actionability": x, "regression": x}, ' '"champion": {"coverage": x, "quality": x, "evidence": x, "actionability": x, "regression": x}, ' '"more_consistent": "candidate" or "champion"}.\n\n' - f"CANDIDATE:\n{_output_text(result)}\n\n" - f"CHAMPION:\n{_output_text(champion_result)}" + f"CANDIDATE:\n{_output_text(item)}\n\n" + f"CHAMPION:\n{_output_text(champion)}" ) - out = chat_fn(SYSTEM, user) + out = await ctx.client.chat_json(SYSTEM, user) return { "candidate": out.get("candidate", {}), "champion": out.get("champion", {}), @@ -668,18 +609,175 @@ def comparative_vs_champion(result: dict, champion_result: dict, chat_fn) -> dic } -# ── median-of-N for [J] metrics ────────────────────────────────────────────────── +# ── flycanon custom metrics ─────────────────────────────────────────────────────── -def _numeric_leaves(d: dict) -> dict[tuple, float]: - """Flatten a metric dict to {path: float} over its FLOAT score-leaves only. +async def _rag_score_once(item: dict, ctx: EvalContext) -> dict | None: + """Single RAG scoring call: returns {"contains_answer": float, "addresses_question": float}.""" + question = item.get("question", "") + reference = item.get("reference", "") + answer = item.get("answer", "") + if not question or not answer: + return None + user = f"QUESTION: {question}\nREFERENCE: {reference}\nANSWER: {answer}\n\n{RUBRIC}" + result = await ctx.client.chat_json(SYSTEM_RAG, user) + return result + + +async def contains_answer(item: dict, ctx: EvalContext) -> float | None: + """Flycanon: does the answer contain the correct information from the reference? 
- Median applies to continuous scores only. A leaf counts as numeric-for-median - only when its value is a ``float``; ``bool`` and ``int`` leaves (counts, - denominators, 1-5 axes, and other bookkeeping) are deliberately skipped and - taken from the first run unchanged — this avoids fractional counts (rated=0.5) - and count/len(list) disagreement under runs>1 with an even N. + Runs ctx.runs times and returns the median score. + Returns None if the item lacks question/answer. + """ + scores: list[float] = [] + for _ in range(max(1, ctx.runs)): + result = await _rag_score_once(item, ctx) + if result is None: + return None + val = _coerce_float(result.get("contains_answer")) + if val is not None: + scores.append(val) + if not scores: + return None + return round(statistics.median(scores), 4) + + +async def addresses_question(item: dict, ctx: EvalContext) -> float | None: + """Flycanon: does the answer directly address what the question is asking? + + Runs ctx.runs times and returns the median score. + Returns None if the item lacks question/answer. """ + scores: list[float] = [] + for _ in range(max(1, ctx.runs)): + result = await _rag_score_once(item, ctx) + if result is None: + return None + val = _coerce_float(result.get("addresses_question")) + if val is not None: + scores.append(val) + if not scores: + return None + return round(statistics.median(scores), 4) + + +# ── RAGAS metrics ───────────────────────────────────────────────────────────────── +# ragas/langchain imports are inline inside _sync() since ragas is optional. 
+ + +def _make_ragas_sample(item: dict): + """Build a RAGAS SingleTurnSample from an item dict (ragas import inline).""" + from ragas import SingleTurnSample # type: ignore[import] # noqa: PLC0415 + + return SingleTurnSample( + user_input=item.get("question", ""), + response=item.get("answer", ""), + reference=item.get("reference", ""), + retrieved_contexts=item.get("contexts", []), + ) + + +def _make_ragas_llm(ctx: EvalContext): + """Build a LangChain LLM wrapper for RAGAS (langchain import inline).""" + provider, model = ctx.client.provider, ctx.client.model + if provider == "anthropic": + from langchain_anthropic import ChatAnthropic # type: ignore[import] # noqa: PLC0415 + + api_key = os.environ.get("ANTHROPIC_API_KEY", "") + return ChatAnthropic(model=model, api_key=api_key, temperature=0.0) + if provider in ("openai", "azure"): + from langchain_openai import ChatOpenAI # type: ignore[import] # noqa: PLC0415 + + api_key = os.environ.get("OPENAI_API_KEY", "") + return ChatOpenAI(model=model, api_key=api_key, temperature=0.0) + if provider == "ollama": + from langchain_ollama import ChatOllama # type: ignore[import] # noqa: PLC0415 + + return ChatOllama(model=model, temperature=0.0) + raise ValueError(f"RAGAS: unsupported provider {provider!r}") + + +def _make_ragas_embeddings(ctx: EvalContext): + """Build LangChain embeddings for RAGAS (langchain import inline).""" + if ctx.embedder is not None: + from langchain_ollama import OllamaEmbeddings # type: ignore[import] # noqa: PLC0415 + + return OllamaEmbeddings(model=ctx.embedder._model) + from langchain_anthropic import AnthropicEmbeddings # type: ignore[import] # noqa: PLC0415 + + return AnthropicEmbeddings() + + +async def _ragas_score(metric_name: str, item: dict, ctx: EvalContext) -> float | None: + """Run a single named RAGAS metric and return its float score (or None).""" + + def _sync(): + from ragas import evaluate # type: ignore[import] # noqa: PLC0415 + from ragas.dataset_schema import EvaluationDataset 
# type: ignore[import] # noqa: PLC0415 + from ragas.metrics import ( # type: ignore[import] # noqa: PLC0415 + AnswerCorrectness, + AnswerRelevancy, + ContextPrecision, + ContextRecall, + Faithfulness, + ) + + _metrics_map = { + "answer_correctness": AnswerCorrectness, + "answer_relevancy_ragas": AnswerRelevancy, + "ragas_faithfulness": Faithfulness, + "context_recall": ContextRecall, + "context_precision": ContextPrecision, + } + metric_cls = _metrics_map.get(metric_name) + if metric_cls is None: + return None + + llm = _make_ragas_llm(ctx) + embeddings = _make_ragas_embeddings(ctx) + metric = metric_cls(llm=llm, embeddings=embeddings) + sample = _make_ragas_sample(item) + dataset = EvaluationDataset(samples=[sample]) + result = evaluate(dataset=dataset, metrics=[metric]) + df = result.to_pandas() + col = df.columns[df.columns.str.contains(metric_name.replace("_ragas", ""), case=False)] + if col.empty: + return None + val = df[col[0]].iloc[0] + if val is None or (isinstance(val, float) and math.isnan(val)): + return None + return round(float(val), 4) + + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, _sync) + + +async def answer_correctness(item: dict, ctx: EvalContext) -> float | None: + """RAGAS answer correctness (semantic F1 against reference).""" + return await _ragas_score("answer_correctness", item, ctx) + + +async def ragas_faithfulness(item: dict, ctx: EvalContext) -> float | None: + """RAGAS faithfulness (answer grounded in retrieved contexts).""" + return await _ragas_score("ragas_faithfulness", item, ctx) + + +async def context_recall(item: dict, ctx: EvalContext) -> float | None: + """RAGAS context recall (reference coverage by retrieved contexts).""" + return await _ragas_score("context_recall", item, ctx) + + +async def context_precision(item: dict, ctx: EvalContext) -> float | None: + """RAGAS context precision (retrieved contexts relevant to the question).""" + return await _ragas_score("context_precision", item, ctx) + + 
+# ── median-of-N helpers ────────────────────────────────────────────────────────── + + +def _numeric_leaves(d: dict) -> dict[tuple, float]: + """Flatten a metric dict to {path: float} over its FLOAT score-leaves only.""" out: dict[tuple, float] = {} def walk(node, path: tuple) -> None: @@ -701,11 +799,7 @@ def _set_leaf(d: dict, path: tuple, value: float) -> None: def _median_runs(samples: list[dict]) -> dict: - """Median across N metric-dicts: FLOAT score-leaves -> per-key median; rest = first. - - Only continuous float scores are medianed; integer bookkeeping (counts, - denominators, 1-5 axes) and all non-numeric fields are taken from the first run. - """ + """Median across N metric-dicts: FLOAT score-leaves -> per-key median; rest = first.""" samples = [s for s in samples if isinstance(s, dict)] if not samples: return {} @@ -728,102 +822,69 @@ def _median_runs(samples: list[dict]) -> dict: # ── orchestrator ───────────────────────────────────────────────────────────────── -def run_judge( - result: dict, - registry, +async def run_judge( + item: dict, + ctx: EvalContext, *, - judge_model: str, - runs: int = 1, - concurrency: int = 1, pipeline_model: str = "", - champion_result: dict | None = None, - chat_fn=None, - embed_fn=None, - tau: float = 0.70, - lexical_missed_ids: list[str] | None = None, ) -> AdvisoryReport: - """Run the G4 advisory gate, best-effort. NEVER raises; NEVER affects verdict. - - If chat_fn / embed_fn are None, real ones are built from JudgeClient / - OllamaEmbedder (tests inject stubs instead). Each [J] metric runs `runs` - times and the median of its numeric scores is kept. Every metric is wrapped - in try/except: a failure appends to report.errors and the run continues. - - ``concurrency`` (opt-in, default 1) bounds the per-item [J] metrics' internal - fan-out: 1 keeps the sequential per-item loops; >=2 runs each metric's items - across a thread pool (order preserved). 
The median-of-N ``runs`` loop stays - sequential and the single-call metrics are unaffected. The result is - byte-for-byte identical at concurrency=1. + """Run all metrics concurrently and return an AdvisoryReport. - Returns an AdvisoryReport (a plain dict carrier) with calibrated=False and - same_provider_caveat = same_provider(pipeline_model, judge_model). + Best-effort: never raises. Failing metrics append to report.errors. """ - if chat_fn is None: - client = JudgeClient(judge_model) - chat_fn = client.chat_json - if embed_fn is None: - embed_fn = OllamaEmbedder().embed - report = AdvisoryReport( - judge_model=judge_model, - same_provider_caveat=same_provider(pipeline_model, judge_model), + judge_model=ctx.client.model_spec, + same_provider_caveat=same_provider(pipeline_model, ctx.client.model_spec), calibrated=False, - runs=runs, + runs=ctx.runs, ) - def _run_det(name: str, fn) -> None: - try: - report.metrics[name] = fn() - except Exception as exc: # best-effort: never raise - report.errors.append(f"{name}: {type(exc).__name__}: {exc}") + # [D] deterministic (no LLM) + det_metrics: list[tuple[str, Metric]] = [ + ("source_coverage", source_coverage), + ("excerpt_fill_rate", excerpt_fill_rate), + ] + # [E] embedding + emb_metrics: list[tuple[str, Metric]] = [ + ("semantic_recovery", semantic_recovery), + ] + # [J] judge metrics (median-of-runs handled externally for single-call ones) + judge_metrics: list[tuple[str, Metric]] = [ + ("faithfulness", faithfulness), + ("numeric_temporal_fidelity", numeric_temporal_fidelity), + ("citation_relevance", citation_relevance), + ("nc_semantic_precision", nc_semantic_precision), + ("fabricated_entity", fabricated_entity), + ("contradiction", contradiction), + ("open_gap", open_gap), + ("actionability", actionability), + ("severity_calibration", severity_calibration), + ("answer_relevancy", answer_relevancy), + ("surface_deduplication", surface_deduplication), + ("comparative_vs_champion", comparative_vs_champion), + ] + # 
flycanon custom + flycanon_metrics: list[tuple[str, Metric]] = [ + ("contains_answer", contains_answer), + ("addresses_question", addresses_question), + ] + # RAGAS + ragas_metrics: list[tuple[str, Metric]] = [ + ("answer_correctness", answer_correctness), + ("ragas_faithfulness", ragas_faithfulness), + ("context_recall", context_recall), + ("context_precision", context_precision), + ] - def _run_judge_metric(name: str, fn) -> None: + all_metrics = det_metrics + emb_metrics + judge_metrics + flycanon_metrics + ragas_metrics + + async def _run_one(name: str, fn: Metric) -> None: try: - samples = [fn() for _ in range(max(1, runs))] - report.metrics[name] = _median_runs(samples) - except Exception as exc: # best-effort: never raise + result = await fn(item, ctx) + if result is not None: + report.metrics[name] = result + except Exception as exc: report.errors.append(f"{name}: {type(exc).__name__}: {exc}") - # [D] deterministic — always computed, no LLM. - _run_det("source_coverage", lambda: source_coverage(result)) - _run_det("excerpt_fill_rate", lambda: excerpt_fill_rate(result)) - - # [E] embedding — context recall. - _run_det( - "semantic_recovery", - lambda: semantic_recovery(result, registry, lexical_missed_ids or [], embed_fn, tau=tau), - ) - - # [J] judge — median-of-N. Per-item metrics fan out at workers=concurrency. 
- _run_judge_metric("faithfulness", lambda: faithfulness(result, chat_fn, workers=concurrency)) - _run_judge_metric( - "numeric_temporal_fidelity", - lambda: numeric_temporal_fidelity(result, chat_fn, workers=concurrency), - ) - _run_judge_metric( - "citation_relevance", lambda: citation_relevance(result, chat_fn, workers=concurrency) - ) - _run_judge_metric( - "nc_semantic_precision", - lambda: nc_semantic_precision(result, registry, chat_fn, workers=concurrency), - ) - _run_judge_metric("fabricated_entity", lambda: fabricated_entity(result, chat_fn)) - _run_judge_metric("contradiction", lambda: contradiction(result, chat_fn)) - _run_judge_metric("open_gap", lambda: open_gap(result, chat_fn)) - _run_judge_metric("actionability", lambda: actionability(result, chat_fn, workers=concurrency)) - _run_judge_metric( - "severity_calibration", - lambda: severity_calibration(result, chat_fn, workers=concurrency), - ) - _run_judge_metric("answer_relevancy", lambda: answer_relevancy(result, chat_fn)) - _run_judge_metric( - "surface_deduplication", - lambda: surface_deduplication(result, chat_fn, workers=concurrency), - ) - if champion_result is not None: - _run_judge_metric( - "comparative_vs_champion", - lambda: comparative_vs_champion(result, champion_result, chat_fn), - ) - + await asyncio.gather(*[_run_one(name, fn) for name, fn in all_metrics]) return report From 7799185bf69777b8f680fe04667129e42fddddf1 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:24:58 +0200 Subject: [PATCH 31/67] feat(evaluation): slim __init__.py to 3-file exports --- .../evaluation/__init__.py | 179 +++++++++--------- 1 file changed, 90 insertions(+), 89 deletions(-) diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index ad01980c..c2005e7a 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -1,89 +1,90 @@ -# Copyright 2026 Firefly Software 
Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Evaluation subpackage -- gate-based quality gates, LLM-as-judge advisory, champion/challenger tracking, and retrieval metrics. - -Gate pipeline (flags, not vetoes): - G1 -- Structural & Safe (schema + PII + empty-registry guard) - G2 -- Must-finds & negative controls (recall + NC precision) - G3 -- Evidence (grounding / token-anchoring) - G4 -- LLM-as-a-Judge (advisory, opt-in, never decides promotion) - G5 -- No-regression / promotion (champion/challenger comparison) - -Retrieval metrics: - Precision@k, Recall@k, MRR, NDCG -- computed over ranked retrieval results. - -Champion tracking: - Persists the best-known run record so that promotion decisions can be made - against a stable baseline rather than the most recent run. 
-""" - -from importlib.metadata import PackageNotFoundError, version - -from fireflyframework_agentic.evaluation.corpus import EMPTY, FABRICATED, SOURCE_UNKNOWN, VERIFIED, corpus_sha256, load_corpus, verify_evidence_index -from fireflyframework_agentic.evaluation.gates import GateResult, Verdict, g2_recall_precision, run_gates -from fireflyframework_agentic.evaluation.scorecard import render_scorecard, verdict, VERDICT_PROMOTE, VERDICT_HOLD -from fireflyframework_agentic.evaluation.champion import ChampionRecord, invalidate_champion, load_champion, save_champion -from fireflyframework_agentic.evaluation.judge import AdvisoryReport, run_judge -from fireflyframework_agentic.evaluation.judge_client import JudgeClient, OllamaEmbedder, build_embedder, cosine -from fireflyframework_agentic.evaluation.matcher import anchored, matches, source_stem, tokens -from fireflyframework_agentic.evaluation.registry import Registry, RegistryItem, load_registry, registry_sha256 -from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics -from fireflyframework_agentic.evaluation.stats import aa_band, aggregate_grounding, left_skew_flag - -try: - __version__ = version("fireflyframework-agentic") -except PackageNotFoundError: - __version__ = "0.0.0+dev" - -__all__ = [ - "EMPTY", - "FABRICATED", - "SOURCE_UNKNOWN", - "VERIFIED", - "corpus_sha256", - "load_corpus", - "verify_evidence_index", - "GateResult", - "Verdict", - "g2_recall_precision", - "run_gates", - "render_scorecard", - "verdict", - "VERDICT_PROMOTE", - "VERDICT_HOLD", - "ChampionRecord", - "load_champion", - "save_champion", - "invalidate_champion", - "AdvisoryReport", - "run_judge", - "JudgeClient", - "OllamaEmbedder", - "build_embedder", - "cosine", - "Registry", - "RegistryItem", - "load_registry", - "registry_sha256", - "RetrieverMetrics", - "compute_retrieval_metrics", - "anchored", - "matches", - "source_stem", - "tokens", - "aa_band", - "aggregate_grounding", - 
"left_skew_flag", -] +from fireflyframework_agentic.evaluation.judge import ( + AdvisoryReport as AdvisoryReport, +) +from fireflyframework_agentic.evaluation.judge import ( + EvalContext as EvalContext, +) +from fireflyframework_agentic.evaluation.judge import ( + Metric as Metric, +) +from fireflyframework_agentic.evaluation.judge import ( + actionability as actionability, +) +from fireflyframework_agentic.evaluation.judge import ( + addresses_question as addresses_question, +) +from fireflyframework_agentic.evaluation.judge import ( + answer_correctness as answer_correctness, +) +from fireflyframework_agentic.evaluation.judge import ( + answer_relevancy as answer_relevancy, +) +from fireflyframework_agentic.evaluation.judge import ( + citation_relevance as citation_relevance, +) +from fireflyframework_agentic.evaluation.judge import ( + comparative_vs_champion as comparative_vs_champion, +) +from fireflyframework_agentic.evaluation.judge import ( + contains_answer as contains_answer, +) +from fireflyframework_agentic.evaluation.judge import ( + context_precision as context_precision, +) +from fireflyframework_agentic.evaluation.judge import ( + context_recall as context_recall, +) +from fireflyframework_agentic.evaluation.judge import ( + contradiction as contradiction, +) +from fireflyframework_agentic.evaluation.judge import ( + excerpt_fill_rate as excerpt_fill_rate, +) +from fireflyframework_agentic.evaluation.judge import ( + fabricated_entity as fabricated_entity, +) +from fireflyframework_agentic.evaluation.judge import ( + faithfulness as faithfulness, +) +from fireflyframework_agentic.evaluation.judge import ( + nc_semantic_precision as nc_semantic_precision, +) +from fireflyframework_agentic.evaluation.judge import ( + numeric_temporal_fidelity as numeric_temporal_fidelity, +) +from fireflyframework_agentic.evaluation.judge import ( + open_gap as open_gap, +) +from fireflyframework_agentic.evaluation.judge import ( + ragas_faithfulness as 
ragas_faithfulness, +) +from fireflyframework_agentic.evaluation.judge import ( + run_judge as run_judge, +) +from fireflyframework_agentic.evaluation.judge import ( + semantic_recovery as semantic_recovery, +) +from fireflyframework_agentic.evaluation.judge import ( + severity_calibration as severity_calibration, +) +from fireflyframework_agentic.evaluation.judge import ( + source_coverage as source_coverage, +) +from fireflyframework_agentic.evaluation.judge import ( + surface_deduplication as surface_deduplication, +) +from fireflyframework_agentic.evaluation.judge_client import ( + JudgeClient as JudgeClient, +) +from fireflyframework_agentic.evaluation.judge_client import ( + parse_model as parse_model, +) +from fireflyframework_agentic.evaluation.judge_client import ( + same_provider as same_provider, +) +from fireflyframework_agentic.lab.retrieval_metrics import ( + RetrieverMetrics as RetrieverMetrics, +) +from fireflyframework_agentic.lab.retrieval_metrics import ( + compute_retrieval_metrics as compute_retrieval_metrics, +) From 9526f43315f56324aba3173b75ceebd87d9c3d71 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:25:15 +0200 Subject: [PATCH 32/67] =?UTF-8?q?chore(evaluation):=20update=20pyproject.t?= =?UTF-8?q?oml=20=E2=80=94=20drop=20scipy,=20add=20ragas=20deps,=20remove?= =?UTF-8?q?=20flyeval=20entrypoint?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bb74201f..72a04fad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,8 +120,10 @@ all = [ 
"fireflyframework-agentic[postgres,mongodb,security,embeddings,openai-embeddings,cohere-embeddings,google-embeddings,mistral-embeddings,voyage-embeddings,azure-embeddings,bedrock-embeddings,ollama-embeddings,vectorstores-chroma,vectorstores-pinecone,vectorstores-qdrant,vectorstores-pgvector,vectorstores-sqlite-vec,watch,binary]", ] evaluation = [ - "scipy>=1.11", "numpy>=1.26.0", + "ragas>=0.2", + "langchain-anthropic>=0.3", + "langchain-ollama>=0.3", ] dev = [ "pytest>=8.3.0", @@ -136,9 +138,6 @@ dev = [ "pre-commit>=3.8.0", ] -[project.scripts] -flyeval = "fireflyframework_agentic.evaluation.cli:main" - [project.urls] Homepage = "https://fireflyframework.org/" Documentation = "https://github.com/fireflyframework/fireflyframework-agentic/tree/main/docs" From d56755228af64f4f5d0d24e5edbf5426853e6929 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:26:17 +0200 Subject: [PATCH 33/67] test(evaluation): add unit tests for judge.py metrics --- tests/unit/evaluation/test_judge.py | 248 ++++++++++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 tests/unit/evaluation/test_judge.py diff --git a/tests/unit/evaluation/test_judge.py b/tests/unit/evaluation/test_judge.py new file mode 100644 index 00000000..7f27c125 --- /dev/null +++ b/tests/unit/evaluation/test_judge.py @@ -0,0 +1,248 @@ +from unittest.mock import MagicMock + +import pytest + +from fireflyframework_agentic.evaluation.judge import ( + EvalContext, + addresses_question, + contains_answer, + excerpt_fill_rate, + faithfulness, + source_coverage, +) +from fireflyframework_agentic.evaluation.judge_client import JudgeClient + + +def make_ctx(responses: list[dict]) -> EvalContext: + client = MagicMock(spec=JudgeClient) + client.model_spec = "anthropic:claude-sonnet-4-6" + client.provider = "anthropic" + client.model = "claude-sonnet-4-6" + call_iter = iter(responses) + + async def mock_chat_json(system, user, max_tokens=1024): + return next(call_iter) + + client.chat_json = 
mock_chat_json + return EvalContext(client=client, runs=1) + + +# ── contains_answer ────────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_contains_answer_present(): + ctx = make_ctx([{"contains_answer": 1.0, "addresses_question": 1.0}]) + item = {"question": "Q", "reference": "R", "answer": "A"} + score = await contains_answer(item, ctx) + assert score == 1.0 + + +@pytest.mark.asyncio +async def test_contains_answer_absent(): + ctx = make_ctx([{"contains_answer": 0.0, "addresses_question": 0.5}]) + item = {"question": "Q", "reference": "R", "answer": "wrong"} + score = await contains_answer(item, ctx) + assert score == 0.0 + + +@pytest.mark.asyncio +async def test_contains_answer_partial(): + ctx = make_ctx([{"contains_answer": 0.5, "addresses_question": 0.8}]) + item = {"question": "Q", "reference": "R", "answer": "partial"} + score = await contains_answer(item, ctx) + assert score == 0.5 + + +@pytest.mark.asyncio +async def test_contains_answer_missing_question_returns_none(): + ctx = make_ctx([]) + item = {"reference": "R", "answer": "A"} + score = await contains_answer(item, ctx) + assert score is None + + +# ── addresses_question ─────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_addresses_question_yes(): + ctx = make_ctx([{"contains_answer": 0.5, "addresses_question": 1.0}]) + item = {"question": "Q", "reference": "R", "answer": "A"} + score = await addresses_question(item, ctx) + assert score == 1.0 + + +@pytest.mark.asyncio +async def test_addresses_question_no(): + ctx = make_ctx([{"contains_answer": 0.0, "addresses_question": 0.0}]) + item = {"question": "Q", "reference": "R", "answer": "irrelevant"} + score = await addresses_question(item, ctx) + assert score == 0.0 + + +@pytest.mark.asyncio +async def test_addresses_question_missing_answer_returns_none(): + ctx = make_ctx([]) + item = {"question": "Q", "reference": "R"} + score = await 
addresses_question(item, ctx) + assert score is None + + +# ── faithfulness ───────────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_faithfulness_all_supported(): + # One finding with cited evidence, judge says SUPPORTED. + ctx = make_ctx([{"verdict": "SUPPORTED", "reason": "matches"}]) + item = { + "findings": [ + { + "id": "F1", + "description": "The process takes 3 days.", + "evidence_refs": [{"evidence_id": "E1"}], + } + ], + "evidence_index": [{"id": "E1", "locator": "doc.pdf#1", "excerpt": "The process takes 3 days as documented."}], + } + result = await faithfulness(item, ctx) + assert result["supported"] == 1 + assert result["total"] == 1 + assert result["unsupported_ids"] == [] + + +@pytest.mark.asyncio +async def test_faithfulness_not_supported(): + ctx = make_ctx([{"verdict": "NOT_SUPPORTED", "reason": "contradicts"}]) + item = { + "findings": [ + { + "id": "F1", + "description": "The process takes 45 days.", + "evidence_refs": [{"evidence_id": "E1"}], + } + ], + "evidence_index": [{"id": "E1", "locator": "doc.pdf#1", "excerpt": "The process takes 3 days."}], + } + result = await faithfulness(item, ctx) + assert result["supported"] == 0 + assert result["total"] == 1 + assert "F1" in result["unsupported_ids"] + + +@pytest.mark.asyncio +async def test_faithfulness_no_cited_evidence(): + # Finding with no evidence_refs -> counted as unsupported without LLM call. 
+ ctx = make_ctx([]) + item = { + "findings": [{"id": "F1", "description": "Something.", "evidence_refs": []}], + "evidence_index": [], + } + result = await faithfulness(item, ctx) + assert result["supported"] == 0 + assert result["total"] == 1 + assert "F1" in result["unsupported_ids"] + + +# ── source_coverage ─────────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_source_coverage_all_cited(): + ctx = make_ctx([]) + item = { + "findings": [ + { + "id": "F1", + "description": "X", + "evidence_refs": [{"evidence_id": "E1"}], + } + ], + "evidence_index": [{"id": "E1", "locator": "doc.pdf#section1", "excerpt": "text"}], + } + result = await source_coverage(item, ctx) + assert result["cited"] == 1 + assert result["total"] == 1 + assert result["orphaned"] == [] + + +@pytest.mark.asyncio +async def test_source_coverage_orphaned(): + ctx = make_ctx([]) + item = { + "findings": [{"id": "F1", "description": "X", "evidence_refs": []}], + "evidence_index": [ + {"id": "E1", "locator": "doc1.pdf#p1", "excerpt": "text"}, + {"id": "E2", "locator": "doc2.pdf#p2", "excerpt": "text2"}, + ], + } + result = await source_coverage(item, ctx) + assert result["cited"] == 0 + assert result["total"] == 2 + assert len(result["orphaned"]) == 2 + + +@pytest.mark.asyncio +async def test_source_coverage_stem_dedup(): + # Two evidence items from the same file (different fragments) -> 1 source stem. + ctx = make_ctx([]) + item = { + "findings": [ + { + "id": "F1", + "description": "X", + "evidence_refs": [{"evidence_id": "E1"}], + } + ], + "evidence_index": [ + {"id": "E1", "locator": "doc.pdf#section1", "excerpt": "text1"}, + {"id": "E2", "locator": "doc.pdf#section2", "excerpt": "text2"}, + ], + } + result = await source_coverage(item, ctx) + # Both E1 and E2 share "doc.pdf" stem -> 1 total stem. + assert result["total"] == 1 + # E1 is cited -> that stem is covered. 
+ assert result["cited"] == 1 + + +# ── excerpt_fill_rate ────────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_excerpt_fill_rate_full(): + ctx = make_ctx([]) + item = { + "evidence_index": [ + {"id": "E1", "excerpt": "has content"}, + {"id": "E2", "excerpt": "also has content"}, + ] + } + result = await excerpt_fill_rate(item, ctx) + assert result["populated"] == 2 + assert result["total"] == 2 + + +@pytest.mark.asyncio +async def test_excerpt_fill_rate_partial(): + ctx = make_ctx([]) + item = { + "evidence_index": [ + {"id": "E1", "excerpt": "has content"}, + {"id": "E2", "excerpt": ""}, + {"id": "E3", "excerpt": " "}, + ] + } + result = await excerpt_fill_rate(item, ctx) + assert result["populated"] == 1 + assert result["total"] == 3 + + +@pytest.mark.asyncio +async def test_excerpt_fill_rate_empty(): + ctx = make_ctx([]) + item = {"evidence_index": []} + result = await excerpt_fill_rate(item, ctx) + assert result["populated"] == 0 + assert result["total"] == 0 From 564697405a176b36b418283bee9bc1bf18a4c918 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:52:05 +0200 Subject: [PATCH 34/67] fix(lab): type-annotate out dict, remove quoted return type in retrieval_metrics --- fireflyframework_agentic/lab/retrieval_metrics.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fireflyframework_agentic/lab/retrieval_metrics.py b/fireflyframework_agentic/lab/retrieval_metrics.py index 5f3e2373..ee129eec 100644 --- a/fireflyframework_agentic/lab/retrieval_metrics.py +++ b/fireflyframework_agentic/lab/retrieval_metrics.py @@ -66,11 +66,7 @@ def _dedup(retrieved: list[dict]) -> list[dict]: def _ndcg(retrieved: list[dict], n_gold: int, k: int = 10) -> float: """Return nDCG@k for a single query.""" - dcg = sum( - 1.0 / math.log2(r["rank"] + 1) - for r in retrieved - if r.get("is_gold") and r["rank"] <= k - ) + dcg = sum(1.0 / math.log2(r["rank"] + 1) for r in retrieved if 
r.get("is_gold") and r["rank"] <= k) ideal = sum(1.0 / math.log2(i + 2) for i in range(min(n_gold, k))) return dcg / ideal if ideal else 0.0 @@ -140,7 +136,7 @@ def compute_retrieval_metrics(results: list[dict]) -> dict: if row.get("answer_ms") is not None: answer_ms.append(row["answer_ms"]) - out = {k: round(v / n, 4) for k, v in agg.items()} if n else {} + out: dict[str, object] = {k: round(v / n, 4) for k, v in agg.items()} if n else {} out["n_queries"] = n out["no_answer_rate"] = round(no_answer / n, 4) if n else None out["citation_precision"] = round(cite_num / cite_den, 4) if cite_den else None @@ -176,7 +172,7 @@ class RetrieverMetrics(BaseModel): mean_answer_ms: float | None = None @classmethod - def from_results(cls, results: list[dict]) -> "RetrieverMetrics": + def from_results(cls, results: list[dict]) -> RetrieverMetrics: """Compute metrics from raw retrieval result rows and return a model instance.""" m = compute_retrieval_metrics(results) return cls( From 582d1c044609fc0544bb74ab93bf65051dbc59e5 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:52:11 +0200 Subject: [PATCH 35/67] fix(lab): remove unused import math, fix import sort in test_retrieval_metrics --- tests/unit/lab/test_retrieval_metrics.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/unit/lab/test_retrieval_metrics.py b/tests/unit/lab/test_retrieval_metrics.py index a018a08b..1053c550 100644 --- a/tests/unit/lab/test_retrieval_metrics.py +++ b/tests/unit/lab/test_retrieval_metrics.py @@ -16,16 +16,11 @@ from __future__ import annotations -import math - -import pytest - from fireflyframework_agentic.lab.retrieval_metrics import ( RetrieverMetrics, compute_retrieval_metrics, ) - # ── helpers ─────────────────────────────────────────────────────────────────── @@ -37,11 +32,13 @@ def _row(gold_rank: int | None = None, total: int = 5, n_gold: int = 1) -> dict: """ retrieved = [] for rank in range(1, total + 1): - 
retrieved.append({ - "rank": rank, - "source_id": f"doc-{rank}", - "is_gold": rank == gold_rank, - }) + retrieved.append( + { + "rank": rank, + "source_id": f"doc-{rank}", + "is_gold": rank == gold_rank, + } + ) gold_ids = [f"doc-{gold_rank}"] if gold_rank is not None else [] return { "retrieved": retrieved, From 3e62b1f92697903909544350ab26d2eb69800f36 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 13:52:11 +0200 Subject: [PATCH 36/67] fix(evaluation): add type: ignore for pyright errors on RAGAS/langchain calls in judge.py --- fireflyframework_agentic/evaluation/judge.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index 9f24dc26..d5bcad66 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -685,12 +685,12 @@ def _make_ragas_llm(ctx: EvalContext): from langchain_anthropic import ChatAnthropic # type: ignore[import] # noqa: PLC0415 api_key = os.environ.get("ANTHROPIC_API_KEY", "") - return ChatAnthropic(model=model, api_key=api_key, temperature=0.0) + return ChatAnthropic(model=model, api_key=api_key, temperature=0.0) # type: ignore[call-arg,arg-type] if provider in ("openai", "azure"): from langchain_openai import ChatOpenAI # type: ignore[import] # noqa: PLC0415 api_key = os.environ.get("OPENAI_API_KEY", "") - return ChatOpenAI(model=model, api_key=api_key, temperature=0.0) + return ChatOpenAI(model=model, api_key=api_key, temperature=0.0) # type: ignore[call-arg,arg-type] if provider == "ollama": from langchain_ollama import ChatOllama # type: ignore[import] # noqa: PLC0415 @@ -740,7 +740,7 @@ def _sync(): sample = _make_ragas_sample(item) dataset = EvaluationDataset(samples=[sample]) result = evaluate(dataset=dataset, metrics=[metric]) - df = result.to_pandas() + df = result.to_pandas() # type: ignore[attr-defined] col = 
df.columns[df.columns.str.contains(metric_name.replace("_ragas", ""), case=False)] if col.empty: return None From 3679dbca4b2fea88cf9339b9c7aac279b0891def Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 14:58:02 +0200 Subject: [PATCH 37/67] refactor(evaluation): move retrieval_metrics.py from lab/ to evaluation/ --- fireflyframework_agentic/{lab => evaluation}/retrieval_metrics.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename fireflyframework_agentic/{lab => evaluation}/retrieval_metrics.py (100%) diff --git a/fireflyframework_agentic/lab/retrieval_metrics.py b/fireflyframework_agentic/evaluation/retrieval_metrics.py similarity index 100% rename from fireflyframework_agentic/lab/retrieval_metrics.py rename to fireflyframework_agentic/evaluation/retrieval_metrics.py From 6bce3748a7988907c3e39235cf96fda0b07b38ff Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 14:58:19 +0200 Subject: [PATCH 38/67] =?UTF-8?q?refactor(evaluation):=20update=20imports?= =?UTF-8?q?=20=E2=80=94=20retrieval=5Fmetrics=20now=20in=20evaluation/?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fireflyframework_agentic/evaluation/__init__.py | 4 ++-- fireflyframework_agentic/lab/__init__.py | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index c2005e7a..c68f5a19 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -82,9 +82,9 @@ from fireflyframework_agentic.evaluation.judge_client import ( same_provider as same_provider, ) -from fireflyframework_agentic.lab.retrieval_metrics import ( +from fireflyframework_agentic.evaluation.retrieval_metrics import ( RetrieverMetrics as RetrieverMetrics, ) -from fireflyframework_agentic.lab.retrieval_metrics import ( +from fireflyframework_agentic.evaluation.retrieval_metrics 
import ( compute_retrieval_metrics as compute_retrieval_metrics, ) diff --git a/fireflyframework_agentic/lab/__init__.py b/fireflyframework_agentic/lab/__init__.py index 8e127d8a..46cc08dc 100644 --- a/fireflyframework_agentic/lab/__init__.py +++ b/fireflyframework_agentic/lab/__init__.py @@ -18,7 +18,6 @@ from fireflyframework_agentic.lab.comparison import ComparisonEntry, ModelComparison from fireflyframework_agentic.lab.dataset import EvalCase, EvalDataset from fireflyframework_agentic.lab.evaluator import EvalOrchestrator, EvalReport, EvalResult -from fireflyframework_agentic.lab.retrieval_metrics import RetrieverMetrics, compute_retrieval_metrics from fireflyframework_agentic.lab.session import LabSession, SessionEntry __all__ = [ @@ -32,7 +31,5 @@ "EvalResult", "LabSession", "ModelComparison", - "RetrieverMetrics", "SessionEntry", - "compute_retrieval_metrics", ] From 9229c4348656c3e1e992780be6dc4fcdb06cea2f Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 14:58:46 +0200 Subject: [PATCH 39/67] refactor(evaluation): move test_retrieval_metrics.py to tests/unit/evaluation/ --- tests/unit/{lab => evaluation}/test_retrieval_metrics.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/unit/{lab => evaluation}/test_retrieval_metrics.py (100%) diff --git a/tests/unit/lab/test_retrieval_metrics.py b/tests/unit/evaluation/test_retrieval_metrics.py similarity index 100% rename from tests/unit/lab/test_retrieval_metrics.py rename to tests/unit/evaluation/test_retrieval_metrics.py From 6cdd3db11edda4e42c57791e632a2b594e8510cc Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:00:19 +0200 Subject: [PATCH 40/67] refactor(evaluation): replace RetrieverMetrics class with plain functions --- .../evaluation/retrieval_metrics.py | 270 +++++++++--------- 1 file changed, 140 insertions(+), 130 deletions(-) diff --git a/fireflyframework_agentic/evaluation/retrieval_metrics.py 
b/fireflyframework_agentic/evaluation/retrieval_metrics.py index ee129eec..5a318a2a 100644 --- a/fireflyframework_agentic/evaluation/retrieval_metrics.py +++ b/fireflyframework_agentic/evaluation/retrieval_metrics.py @@ -14,46 +14,48 @@ """Deterministic IR evaluation metrics for ranked retrieval results (no LLM, no network). -Industry-standard information-retrieval metrics computed over a ranked list of -retrieved chunks vs the gold set each result carries (``gold`` + per-hit -``is_gold``). Metrics are reported at cut-offs k ∈ {1, 5, 10}: - -* **Hit@k** -- at least one gold document appears in the top-k results. -* **Recall@k** -- fraction of gold documents found in top-k. -* **Precision@k** -- fraction of top-k results that are gold. -* **MRR@10** -- mean reciprocal rank of the first gold hit (up to k=10). -* **MAP@10** -- mean average precision (up to k=10). -* **nDCG@10** -- normalised discounted cumulative gain (up to k=10). - -Optional fields (populated when the raw result rows contain them): - -* ``no_answer_rate`` -- fraction of rows where the model produced no answer. -* ``citation_precision`` -- precision of in-answer citations vs gold set. -* ``mean_search_ms`` / ``mean_answer_ms`` -- mean retrieval and generation latencies. - -Ported from ``flycanon_experiments/scripts/deterministic_eval.py``. +Each metric is a plain function that takes a list of result rows and returns a +float — the same design as scikit-learn or MS MARCO evaluation scripts. 
+ +Result row schema (dict):: + + { + "retrieved": [{"rank": int, "source_id": str, "is_gold": bool}, ...], + "gold": [str, ...], # gold source identifiers + # optional: + "no_answer": bool, # model refused / produced no answer + "answer": str, # used for no_answer detection when no_answer absent + "citations": [{"is_gold": bool}, ...], + "search_ms": float, + "answer_ms": float, + } + +Individual metrics (recommended for composability):: + + hit_at_k(results, k) -> float + recall_at_k(results, k) -> float + precision_at_k(results, k) -> float + mrr(results, k=10) -> float + map_score(results, k=10) -> float + ndcg(results, k=10) -> float + no_answer_rate(results) -> float | None + citation_precision(results) -> float | None + mean_latency_ms(results, field) -> float | None + +Convenience aggregate (all metrics in one call):: + + compute_retrieval_metrics(results) -> dict """ from __future__ import annotations import math -from pydantic import BaseModel - KS = (1, 5, 10) def _dedup(retrieved: list[dict]) -> list[dict]: - """Return one entry per source, first chunk wins, preserving rank order. - - flycanon splits each ingested document into many chunks; a single gold - filing can therefore appear multiple times in the ranked list. Without - deduplication nDCG/MAP/Recall count every chunk separately, inflating - scores past 1.0 when a good embedding model retrieves several chunks from - the same filing. Taking only the first (highest-ranked) chunk per - source_id makes the list item-unique, matching the recommenders-library - contract that all IR formulae assume. 
- """ + """Return one entry per source, first chunk wins, preserving rank order.""" seen: set[str] = set() out: list[dict] = [] for r in sorted(retrieved, key=lambda x: x["rank"]): @@ -64,15 +66,13 @@ def _dedup(retrieved: list[dict]) -> list[dict]: return out -def _ndcg(retrieved: list[dict], n_gold: int, k: int = 10) -> float: - """Return nDCG@k for a single query.""" +def _ndcg_single(retrieved: list[dict], n_gold: int, k: int = 10) -> float: dcg = sum(1.0 / math.log2(r["rank"] + 1) for r in retrieved if r.get("is_gold") and r["rank"] <= k) ideal = sum(1.0 / math.log2(i + 2) for i in range(min(n_gold, k))) return dcg / ideal if ideal else 0.0 -def _ap(retrieved: list[dict], n_gold: int, k: int = 10) -> float: - """Return average precision@k for a single query.""" +def _ap_single(retrieved: list[dict], n_gold: int, k: int = 10) -> float: hits, precisions = 0, [] for r in sorted(retrieved, key=lambda x: x["rank"]): if r["rank"] > k: @@ -83,114 +83,124 @@ def _ap(retrieved: list[dict], n_gold: int, k: int = 10) -> float: return sum(precisions) / min(n_gold, k) if n_gold else 0.0 -def compute_retrieval_metrics(results: list[dict]) -> dict: - """Compute deterministic IR metrics over a list of retrieval result rows. 
+def hit_at_k(results: list[dict], k: int) -> float: + """Fraction of queries where at least one gold document appears in top-k.""" + if not results: + return 0.0 + hits = 0 + for row in results: + retrieved = _dedup(row["retrieved"]) + gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")] + if any(g <= k for g in gold_ranks): + hits += 1 + return round(hits / len(results), 4) + - Each element of *results* must be a dict with at least: +def recall_at_k(results: list[dict], k: int) -> float: + """Mean fraction of gold documents found in top-k.""" + if not results: + return 0.0 + total = 0.0 + for row in results: + retrieved = _dedup(row["retrieved"]) + n_gold = max(len(set(row["gold"])), 1) + gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")] + total += len([g for g in gold_ranks if g <= k]) / n_gold + return round(total / len(results), 4) - * ``retrieved`` -- list of dicts with ``rank`` (int, 1-based), ``source_id`` - (str) or ``identities`` (list[str]), and ``is_gold`` (bool). - * ``gold`` -- list of gold source identifiers (used to compute ``n_gold``). - Optional keys per row: +def precision_at_k(results: list[dict], k: int) -> float: + """Mean fraction of top-k results that are gold.""" + if not results: + return 0.0 + total = 0.0 + for row in results: + retrieved = _dedup(row["retrieved"]) + gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")] + total += len([g for g in gold_ranks if g <= k]) / k + return round(total / len(results), 4) - * ``no_answer`` (bool) / ``answer`` (str) -- used for ``no_answer_rate``. - * ``citations`` (list[dict]) -- each with ``is_gold`` (bool) for citation precision. - * ``search_ms`` (float) / ``answer_ms`` (float) -- latency in milliseconds. 
- Returns a flat dict with keys: ``n_queries``, ``hit@1``, ``hit@5``, - ``hit@10``, ``recall@1``, ``recall@5``, ``recall@10``, ``precision@1``, - ``precision@5``, ``precision@10``, ``mrr@10``, ``map@10``, ``ndcg@10``, - ``no_answer_rate``, ``citation_precision``, ``mean_search_ms``, - ``mean_answer_ms``. - """ - n = len(results) - agg = {f"{m}@{k}": 0.0 for k in KS for m in ("hit", "recall", "precision")} - agg.update({"mrr@10": 0.0, "map@10": 0.0, "ndcg@10": 0.0}) - no_answer = 0 - cite_num = cite_den = 0.0 - search_ms: list[float] = [] - answer_ms: list[float] = [] +def mrr(results: list[dict], k: int = 10) -> float: + """Mean reciprocal rank of the first gold hit (up to k).""" + if not results: + return 0.0 + total = 0.0 + for row in results: + retrieved = _dedup(row["retrieved"]) + gold_ranks = sorted(r["rank"] for r in retrieved if r.get("is_gold") and r["rank"] <= k) + total += 1.0 / gold_ranks[0] if gold_ranks else 0.0 + return round(total / len(results), 4) + +def map_score(results: list[dict], k: int = 10) -> float: + """Mean average precision at k.""" + if not results: + return 0.0 + total = 0.0 for row in results: retrieved = _dedup(row["retrieved"]) n_gold = max(len(set(row["gold"])), 1) - gold_ranks = [r["rank"] for r in retrieved if r.get("is_gold")] - for k in KS: - in_k = [g for g in gold_ranks if g <= k] - agg[f"hit@{k}"] += 1.0 if in_k else 0.0 - agg[f"recall@{k}"] += len(in_k) / n_gold - agg[f"precision@{k}"] += len(in_k) / k - agg["mrr@10"] += (1.0 / min(gold_ranks)) if gold_ranks else 0.0 - agg["map@10"] += _ap(retrieved, n_gold) - agg["ndcg@10"] += _ndcg(retrieved, n_gold) - - if row.get("no_answer") or not row.get("answer", "").strip(): - no_answer += 1 + total += _ap_single(retrieved, n_gold, k) + return round(total / len(results), 4) + + +def ndcg(results: list[dict], k: int = 10) -> float: + """Mean normalised discounted cumulative gain at k.""" + if not results: + return 0.0 + total = 0.0 + for row in results: + retrieved = 
_dedup(row["retrieved"]) + n_gold = max(len(set(row["gold"])), 1) + total += _ndcg_single(retrieved, n_gold, k) + return round(total / len(results), 4) + + +def no_answer_rate(results: list[dict]) -> float | None: + """Fraction of queries where the model produced no answer. None if no results.""" + if not results: + return None + count = sum( + 1 for row in results if row.get("no_answer") or not row.get("answer", "").strip() + ) + return round(count / len(results), 4) + + +def citation_precision(results: list[dict]) -> float | None: + """Precision of in-answer citations vs gold set. None if no citations present.""" + num = den = 0.0 + for row in results: cites = row.get("citations", []) if cites: - cite_num += sum(1 for c in cites if c.get("is_gold")) - cite_den += len(cites) - if row.get("search_ms") is not None: - search_ms.append(row["search_ms"]) - if row.get("answer_ms") is not None: - answer_ms.append(row["answer_ms"]) - - out: dict[str, object] = {k: round(v / n, 4) for k, v in agg.items()} if n else {} - out["n_queries"] = n - out["no_answer_rate"] = round(no_answer / n, 4) if n else None - out["citation_precision"] = round(cite_num / cite_den, 4) if cite_den else None - out["mean_search_ms"] = round(sum(search_ms) / len(search_ms)) if search_ms else None - out["mean_answer_ms"] = round(sum(answer_ms) / len(answer_ms)) if answer_ms else None - return out + num += sum(1 for c in cites if c.get("is_gold")) + den += len(cites) + return round(num / den, 4) if den else None -class RetrieverMetrics(BaseModel): - """Structured IR metrics for a retrieval evaluation run. +def mean_latency_ms(results: list[dict], field: str) -> float | None: + """Mean latency in ms for the given field (``search_ms`` or ``answer_ms``). None if absent.""" + values = [row[field] for row in results if row.get(field) is not None] + return round(sum(values) / len(values)) if values else None - Fields mirror the flat dict returned by :func:`compute_retrieval_metrics`. 
- Optional fields are ``None`` when the raw result rows lack the required data - (e.g. no latency timestamps, no citations). - """ - n_queries: int = 0 - hit_at_1: float = 0.0 - hit_at_5: float = 0.0 - hit_at_10: float = 0.0 - recall_at_1: float = 0.0 - recall_at_5: float = 0.0 - recall_at_10: float = 0.0 - precision_at_1: float = 0.0 - precision_at_5: float = 0.0 - precision_at_10: float = 0.0 - mrr_at_10: float = 0.0 - map_at_10: float = 0.0 - ndcg_at_10: float = 0.0 - no_answer_rate: float | None = None - citation_precision: float | None = None - mean_search_ms: float | None = None - mean_answer_ms: float | None = None - - @classmethod - def from_results(cls, results: list[dict]) -> RetrieverMetrics: - """Compute metrics from raw retrieval result rows and return a model instance.""" - m = compute_retrieval_metrics(results) - return cls( - n_queries=m.get("n_queries", 0), - hit_at_1=m.get("hit@1", 0.0), - hit_at_5=m.get("hit@5", 0.0), - hit_at_10=m.get("hit@10", 0.0), - recall_at_1=m.get("recall@1", 0.0), - recall_at_5=m.get("recall@5", 0.0), - recall_at_10=m.get("recall@10", 0.0), - precision_at_1=m.get("precision@1", 0.0), - precision_at_5=m.get("precision@5", 0.0), - precision_at_10=m.get("precision@10", 0.0), - mrr_at_10=m.get("mrr@10", 0.0), - map_at_10=m.get("map@10", 0.0), - ndcg_at_10=m.get("ndcg@10", 0.0), - no_answer_rate=m.get("no_answer_rate"), - citation_precision=m.get("citation_precision"), - mean_search_ms=m.get("mean_search_ms"), - mean_answer_ms=m.get("mean_answer_ms"), - ) +def compute_retrieval_metrics(results: list[dict]) -> dict: + """Compute all IR metrics over a list of retrieval result rows and return a flat dict. + + Convenience wrapper that calls each individual metric function. Prefer the + individual functions (``hit_at_k``, ``recall_at_k``, etc.) when you only + need a subset. 
+ """ + out: dict[str, object] = {"n_queries": len(results)} + for k in KS: + out[f"hit@{k}"] = hit_at_k(results, k) + out[f"recall@{k}"] = recall_at_k(results, k) + out[f"precision@{k}"] = precision_at_k(results, k) + out["mrr@10"] = mrr(results) + out["map@10"] = map_score(results) + out["ndcg@10"] = ndcg(results) + out["no_answer_rate"] = no_answer_rate(results) + out["citation_precision"] = citation_precision(results) + out["mean_search_ms"] = mean_latency_ms(results, "search_ms") + out["mean_answer_ms"] = mean_latency_ms(results, "answer_ms") + return out From 3a3c35fbb775340ddc3c05f5265849610a90bac2 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:00:34 +0200 Subject: [PATCH 41/67] =?UTF-8?q?refactor(evaluation):=20update=20=5F=5Fin?= =?UTF-8?q?it=5F=5F.py=20exports=20=E2=80=94=20replace=20RetrieverMetrics?= =?UTF-8?q?=20with=20individual=20functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../evaluation/__init__.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index c68f5a19..9f31ee7b 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -83,8 +83,32 @@ same_provider as same_provider, ) from fireflyframework_agentic.evaluation.retrieval_metrics import ( - RetrieverMetrics as RetrieverMetrics, + citation_precision as citation_precision, ) from fireflyframework_agentic.evaluation.retrieval_metrics import ( compute_retrieval_metrics as compute_retrieval_metrics, ) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + hit_at_k as hit_at_k, +) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + map_score as map_score, +) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + mean_latency_ms as mean_latency_ms, +) +from 
fireflyframework_agentic.evaluation.retrieval_metrics import ( + mrr as mrr, +) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + ndcg as ndcg, +) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + no_answer_rate as no_answer_rate, +) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + precision_at_k as precision_at_k, +) +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + recall_at_k as recall_at_k, +) From 26bfe3b0b56362039eb00c2d0859858ed52e542d Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:01:27 +0200 Subject: [PATCH 42/67] test(evaluation): rewrite test_retrieval_metrics for individual metric functions --- .../unit/evaluation/test_retrieval_metrics.py | 254 ++++++++---------- 1 file changed, 107 insertions(+), 147 deletions(-) diff --git a/tests/unit/evaluation/test_retrieval_metrics.py b/tests/unit/evaluation/test_retrieval_metrics.py index 1053c550..38fc07fe 100644 --- a/tests/unit/evaluation/test_retrieval_metrics.py +++ b/tests/unit/evaluation/test_retrieval_metrics.py @@ -12,233 +12,193 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Unit tests for lab.retrieval_metrics: compute_retrieval_metrics and RetrieverMetrics.""" +"""Unit tests for evaluation.retrieval_metrics.""" from __future__ import annotations -from fireflyframework_agentic.lab.retrieval_metrics import ( - RetrieverMetrics, +from fireflyframework_agentic.evaluation.retrieval_metrics import ( + citation_precision, compute_retrieval_metrics, + hit_at_k, + map_score, + mean_latency_ms, + mrr, + ndcg, + no_answer_rate, + precision_at_k, + recall_at_k, ) -# ── helpers ─────────────────────────────────────────────────────────────────── - def _row(gold_rank: int | None = None, total: int = 5, n_gold: int = 1) -> dict: - """Build one result row with ``total`` retrieved items. 
- - If ``gold_rank`` is not None, the item at that rank is marked as gold. - All items get a unique ``source_id`` so dedup leaves them all. - """ retrieved = [] for rank in range(1, total + 1): - retrieved.append( - { - "rank": rank, - "source_id": f"doc-{rank}", - "is_gold": rank == gold_rank, - } - ) + retrieved.append({"rank": rank, "source_id": f"doc-{rank}", "is_gold": rank == gold_rank}) gold_ids = [f"doc-{gold_rank}"] if gold_rank is not None else [] - return { - "retrieved": retrieved, - "gold": gold_ids * n_gold, - } + return {"retrieved": retrieved, "gold": gold_ids * n_gold} -# ── hit@k ───────────────────────────────────────────────────────────────────── +# ── hit_at_k ────────────────────────────────────────────────────────────────── -def test_hit_at_1_perfect_when_gold_is_rank1(): - results = [_row(gold_rank=1)] - m = compute_retrieval_metrics(results) - assert m["hit@1"] == 1.0 +def test_hit_at_k_gold_at_rank1(): + assert hit_at_k([_row(gold_rank=1)], k=1) == 1.0 -def test_hit_at_1_zero_when_gold_not_in_top1(): - results = [_row(gold_rank=2)] - m = compute_retrieval_metrics(results) - assert m["hit@1"] == 0.0 +def test_hit_at_k_miss_at_rank1(): + assert hit_at_k([_row(gold_rank=2)], k=1) == 0.0 + + +def test_hit_at_k_gold_at_rank5(): + assert hit_at_k([_row(gold_rank=5)], k=5) == 1.0 -def test_hit_at_5_one_when_gold_at_rank5(): - results = [_row(gold_rank=5)] - m = compute_retrieval_metrics(results) - assert m["hit@5"] == 1.0 +def test_hit_at_k_gold_at_rank10(): + assert hit_at_k([_row(gold_rank=10, total=10)], k=10) == 1.0 -def test_hit_at_5_zero_when_gold_not_in_top5(): - # Gold is at rank 10 — outside top-5 window with only 5 items, make 10. 
- results = [_row(gold_rank=None, total=10)] # no gold in retrieved - m = compute_retrieval_metrics(results) - assert m["hit@5"] == 0.0 +def test_hit_at_k_empty(): + assert hit_at_k([], k=5) == 0.0 -def test_hit_at_10_one_when_gold_at_rank10(): - results = [_row(gold_rank=10, total=10)] - m = compute_retrieval_metrics(results) - assert m["hit@10"] == 1.0 +# ── recall_at_k ─────────────────────────────────────────────────────────────── -# ── recall@k ────────────────────────────────────────────────────────────────── +def test_recall_at_k_full_when_gold_at_rank1(): + assert recall_at_k([_row(gold_rank=1, n_gold=1)], k=1) == 1.0 + + +def test_recall_at_k_zero_when_gold_outside_k(): + assert recall_at_k([_row(gold_rank=5)], k=1) == 0.0 def test_recall_at_k_increases_with_k(): - # Gold at rank 3: recall@1=0, recall@5>=recall@1. - results = [_row(gold_rank=3)] - m = compute_retrieval_metrics(results) - assert m["recall@1"] <= m["recall@5"] <= m["recall@10"] + rows = [_row(gold_rank=3)] + assert recall_at_k(rows, k=1) <= recall_at_k(rows, k=5) <= recall_at_k(rows, k=10) + + +# ── precision_at_k ──────────────────────────────────────────────────────────── -def test_recall_at_1_full_when_single_gold_at_rank1(): - results = [_row(gold_rank=1, n_gold=1)] - m = compute_retrieval_metrics(results) - assert m["recall@1"] == 1.0 +def test_precision_at_k_gold_at_rank1(): + assert precision_at_k([_row(gold_rank=1)], k=1) == 1.0 -def test_recall_at_1_zero_when_no_gold_in_rank1(): - results = [_row(gold_rank=5)] - m = compute_retrieval_metrics(results) - assert m["recall@1"] == 0.0 +def test_precision_at_k_decreases_when_k_larger(): + rows = [_row(gold_rank=1)] + assert precision_at_k(rows, k=5) < precision_at_k(rows, k=1) -# ── MRR ─────────────────────────────────────────────────────────────────────── +# ── mrr ─────────────────────────────────────────────────────────────────────── -def test_mrr_is_1_when_gold_at_rank1(): - results = [_row(gold_rank=1)] - m = 
compute_retrieval_metrics(results) - assert m["mrr@10"] == 1.0 +def test_mrr_gold_at_rank1(): + assert mrr([_row(gold_rank=1)]) == 1.0 -def test_mrr_is_half_when_gold_at_rank2(): - results = [_row(gold_rank=2)] - m = compute_retrieval_metrics(results) - assert abs(m["mrr@10"] - 0.5) < 1e-9 +def test_mrr_gold_at_rank2(): + assert abs(mrr([_row(gold_rank=2)]) - 0.5) < 1e-9 -def test_mrr_is_zero_when_no_gold(): - results = [_row(gold_rank=None)] - m = compute_retrieval_metrics(results) - assert m["mrr@10"] == 0.0 +def test_mrr_no_gold(): + assert mrr([_row(gold_rank=None)]) == 0.0 def test_mrr_average_across_queries(): - # Query 1: gold at rank 1 (MRR=1.0); Query 2: gold at rank 2 (MRR=0.5). - results = [_row(gold_rank=1), _row(gold_rank=2)] - m = compute_retrieval_metrics(results) - assert abs(m["mrr@10"] - 0.75) < 1e-3 + rows = [_row(gold_rank=1), _row(gold_rank=2)] + assert abs(mrr(rows) - 0.75) < 1e-3 -# ── nDCG ────────────────────────────────────────────────────────────────────── +# ── ndcg ────────────────────────────────────────────────────────────────────── -def test_ndcg_is_1_when_gold_at_rank1(): - results = [_row(gold_rank=1, n_gold=1)] - m = compute_retrieval_metrics(results) - assert abs(m["ndcg@10"] - 1.0) < 1e-9 +def test_ndcg_gold_at_rank1(): + assert abs(ndcg([_row(gold_rank=1, n_gold=1)]) - 1.0) < 1e-9 -def test_ndcg_is_less_than_1_when_gold_not_at_rank1(): - results = [_row(gold_rank=3, n_gold=1)] - m = compute_retrieval_metrics(results) - assert m["ndcg@10"] < 1.0 - assert m["ndcg@10"] > 0.0 +def test_ndcg_less_than_1_when_not_at_rank1(): + score = ndcg([_row(gold_rank=3, n_gold=1)]) + assert 0.0 < score < 1.0 -def test_ndcg_is_zero_when_no_gold(): - results = [_row(gold_rank=None)] - m = compute_retrieval_metrics(results) - assert m["ndcg@10"] == 0.0 +def test_ndcg_zero_when_no_gold(): + assert ndcg([_row(gold_rank=None)]) == 0.0 -# ── n_queries ───────────────────────────────────────────────────────────────── +# ── map_score 
───────────────────────────────────────────────────────────────── -def test_n_queries_matches_input_length(): - results = [_row(gold_rank=1), _row(gold_rank=2), _row(gold_rank=3)] - m = compute_retrieval_metrics(results) - assert m["n_queries"] == 3 +def test_map_score_perfect_when_gold_at_rank1(): + assert map_score([_row(gold_rank=1, n_gold=1)]) == 1.0 -def test_empty_results_returns_zero_n_queries(): - m = compute_retrieval_metrics([]) - assert m["n_queries"] == 0 +def test_map_score_zero_when_no_gold(): + assert map_score([_row(gold_rank=None)]) == 0.0 + + +# ── no_answer_rate ──────────────────────────────────────────────────────────── + +def test_no_answer_rate_zero_when_answer_present(): + rows = [{**_row(gold_rank=1), "answer": "some answer"}] + assert no_answer_rate(rows) == 0.0 -# ── optional fields ─────────────────────────────────────────────────────────── +def test_no_answer_rate_one_when_no_answer_field(): + assert no_answer_rate([_row(gold_rank=1)]) == 1.0 -def test_no_answer_rate_is_zero_when_answer_present(): - # Rows with a non-empty answer string are counted as answered. - results = [{**_row(gold_rank=1), "answer": "some answer text"}] - m = compute_retrieval_metrics(results) - assert m["no_answer_rate"] == 0.0 +def test_no_answer_rate_none_when_empty(): + assert no_answer_rate([]) is None -def test_no_answer_rate_is_one_when_no_answer_field(): - # Rows without an answer field are treated as no-answer by the implementation. 
- results = [_row(gold_rank=1)] - m = compute_retrieval_metrics(results) - assert m["no_answer_rate"] == 1.0 +# ── citation_precision ──────────────────────────────────────────────────────── -def test_citation_precision_is_none_when_no_citations(): - results = [_row(gold_rank=1)] - m = compute_retrieval_metrics(results) - assert m["citation_precision"] is None +def test_citation_precision_none_when_no_citations(): + assert citation_precision([_row(gold_rank=1)]) is None -def test_latency_fields_are_none_when_absent(): - results = [_row(gold_rank=1)] - m = compute_retrieval_metrics(results) - assert m["mean_search_ms"] is None - assert m["mean_answer_ms"] is None +def test_citation_precision_1_when_all_gold(): + rows = [{**_row(gold_rank=1), "citations": [{"is_gold": True}, {"is_gold": True}]}] + assert citation_precision(rows) == 1.0 -def test_mean_search_ms_computed_when_present(): - results = [{**_row(gold_rank=1), "search_ms": 100.0, "answer_ms": 200.0}] - m = compute_retrieval_metrics(results) - assert m["mean_search_ms"] == 100 - assert m["mean_answer_ms"] == 200 +def test_citation_precision_half_when_half_gold(): + rows = [{**_row(gold_rank=1), "citations": [{"is_gold": True}, {"is_gold": False}]}] + assert citation_precision(rows) == 0.5 -# ── RetrieverMetrics.from_results ───────────────────────────────────────────── +# ── mean_latency_ms ─────────────────────────────────────────────────────────── -def test_retriever_metrics_from_results_hit_at_1(): - results = [_row(gold_rank=1)] - rm = RetrieverMetrics.from_results(results) - assert rm.hit_at_1 == 1.0 +def test_mean_latency_none_when_field_absent(): + assert mean_latency_ms([_row(gold_rank=1)], "search_ms") is None -def test_retriever_metrics_from_results_n_queries(): - results = [_row(gold_rank=1), _row(gold_rank=2)] - rm = RetrieverMetrics.from_results(results) - assert rm.n_queries == 2 +def test_mean_latency_computed_when_present(): + rows = [{**_row(gold_rank=1), "search_ms": 100.0, "answer_ms": 
200.0}] + assert mean_latency_ms(rows, "search_ms") == 100 + assert mean_latency_ms(rows, "answer_ms") == 200 -def test_retriever_metrics_from_results_mrr(): - results = [_row(gold_rank=1)] - rm = RetrieverMetrics.from_results(results) - assert rm.mrr_at_10 == 1.0 +# ── compute_retrieval_metrics (aggregate) ───────────────────────────────────── -def test_retriever_metrics_from_results_defaults_on_empty(): - rm = RetrieverMetrics.from_results([]) - assert rm.n_queries == 0 - assert rm.hit_at_1 == 0.0 - assert rm.mrr_at_10 == 0.0 +def test_compute_retrieval_metrics_n_queries(): + assert compute_retrieval_metrics([_row(1), _row(2), _row(3)])["n_queries"] == 3 -def test_retriever_metrics_is_pydantic_model(): - rm = RetrieverMetrics() - assert rm.n_queries == 0 - assert rm.hit_at_1 == 0.0 - assert rm.no_answer_rate is None + +def test_compute_retrieval_metrics_empty(): + m = compute_retrieval_metrics([]) + assert m["n_queries"] == 0 + assert m["hit@1"] == 0.0 -def test_retriever_metrics_recall_increases_with_k(): - results = [_row(gold_rank=3)] - rm = RetrieverMetrics.from_results(results) - assert rm.recall_at_1 <= rm.recall_at_5 <= rm.recall_at_10 +def test_compute_retrieval_metrics_matches_individual_functions(): + rows = [_row(gold_rank=1), _row(gold_rank=2)] + m = compute_retrieval_metrics(rows) + assert m["hit@1"] == hit_at_k(rows, 1) + assert m["recall@5"] == recall_at_k(rows, 5) + assert m["mrr@10"] == mrr(rows) + assert m["ndcg@10"] == ndcg(rows) From feadcbdc28a70cc9bd0cf38b3793268108f845f3 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:09:06 +0200 Subject: [PATCH 43/67] Remove compute_retrieval_metrics() and KS constant from retrieval_metrics --- .../evaluation/retrieval_metrics.py | 29 +------------------ 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/fireflyframework_agentic/evaluation/retrieval_metrics.py b/fireflyframework_agentic/evaluation/retrieval_metrics.py index 5a318a2a..df42ab24 100644 --- 
a/fireflyframework_agentic/evaluation/retrieval_metrics.py +++ b/fireflyframework_agentic/evaluation/retrieval_metrics.py @@ -30,7 +30,7 @@ "answer_ms": float, } -Individual metrics (recommended for composability):: +Individual metrics:: hit_at_k(results, k) -> float recall_at_k(results, k) -> float @@ -41,19 +41,12 @@ no_answer_rate(results) -> float | None citation_precision(results) -> float | None mean_latency_ms(results, field) -> float | None - -Convenience aggregate (all metrics in one call):: - - compute_retrieval_metrics(results) -> dict """ from __future__ import annotations import math -KS = (1, 5, 10) - - def _dedup(retrieved: list[dict]) -> list[dict]: """Return one entry per source, first chunk wins, preserving rank order.""" seen: set[str] = set() @@ -184,23 +177,3 @@ def mean_latency_ms(results: list[dict], field: str) -> float | None: return round(sum(values) / len(values)) if values else None -def compute_retrieval_metrics(results: list[dict]) -> dict: - """Compute all IR metrics over a list of retrieval result rows and return a flat dict. - - Convenience wrapper that calls each individual metric function. Prefer the - individual functions (``hit_at_k``, ``recall_at_k``, etc.) when you only - need a subset. 
- """ - out: dict[str, object] = {"n_queries": len(results)} - for k in KS: - out[f"hit@{k}"] = hit_at_k(results, k) - out[f"recall@{k}"] = recall_at_k(results, k) - out[f"precision@{k}"] = precision_at_k(results, k) - out["mrr@10"] = mrr(results) - out["map@10"] = map_score(results) - out["ndcg@10"] = ndcg(results) - out["no_answer_rate"] = no_answer_rate(results) - out["citation_precision"] = citation_precision(results) - out["mean_search_ms"] = mean_latency_ms(results, "search_ms") - out["mean_answer_ms"] = mean_latency_ms(results, "answer_ms") - return out From d54814fa98f85f42c8ef20be5f6f74db3b111f81 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:09:12 +0200 Subject: [PATCH 44/67] Remove compute_retrieval_metrics export from evaluation __init__ --- fireflyframework_agentic/evaluation/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index 9f31ee7b..35dd32f7 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -85,9 +85,6 @@ from fireflyframework_agentic.evaluation.retrieval_metrics import ( citation_precision as citation_precision, ) -from fireflyframework_agentic.evaluation.retrieval_metrics import ( - compute_retrieval_metrics as compute_retrieval_metrics, -) from fireflyframework_agentic.evaluation.retrieval_metrics import ( hit_at_k as hit_at_k, ) From 08536982e27522f6c3ade60db4c6e2716942e46e Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:09:35 +0200 Subject: [PATCH 45/67] Remove test_compute_retrieval_metrics_* tests --- .../unit/evaluation/test_retrieval_metrics.py | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/tests/unit/evaluation/test_retrieval_metrics.py b/tests/unit/evaluation/test_retrieval_metrics.py index 38fc07fe..ef38467f 100644 --- a/tests/unit/evaluation/test_retrieval_metrics.py +++ 
b/tests/unit/evaluation/test_retrieval_metrics.py @@ -18,7 +18,6 @@ from fireflyframework_agentic.evaluation.retrieval_metrics import ( citation_precision, - compute_retrieval_metrics, hit_at_k, map_score, mean_latency_ms, @@ -182,23 +181,3 @@ def test_mean_latency_computed_when_present(): assert mean_latency_ms(rows, "answer_ms") == 200 -# ── compute_retrieval_metrics (aggregate) ───────────────────────────────────── - - -def test_compute_retrieval_metrics_n_queries(): - assert compute_retrieval_metrics([_row(1), _row(2), _row(3)])["n_queries"] == 3 - - -def test_compute_retrieval_metrics_empty(): - m = compute_retrieval_metrics([]) - assert m["n_queries"] == 0 - assert m["hit@1"] == 0.0 - - -def test_compute_retrieval_metrics_matches_individual_functions(): - rows = [_row(gold_rank=1), _row(gold_rank=2)] - m = compute_retrieval_metrics(rows) - assert m["hit@1"] == hit_at_k(rows, 1) - assert m["recall@5"] == recall_at_k(rows, 5) - assert m["mrr@10"] == mrr(rows) - assert m["ndcg@10"] == ndcg(rows) From a7b1b91843b8c1c848872375d0b01618ceb84143 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:10:32 +0200 Subject: [PATCH 46/67] Update flycanon_eval_example to use plain metric functions instead of RetrieverMetrics --- examples/flycanon_eval_example.py | 74 ++++++++++++++++--------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/examples/flycanon_eval_example.py b/examples/flycanon_eval_example.py index 856b520b..30e66bd1 100644 --- a/examples/flycanon_eval_example.py +++ b/examples/flycanon_eval_example.py @@ -26,7 +26,7 @@ The champion/challenger pattern mirrors the flycanon_experiments harness: each run writes metrics to a file; ``approve`` promotes it by repointing baseline.json. Here we replicate that flow using the framework's -``compute_retrieval_metrics`` / ``RetrieverMetrics`` API directly. +individual retrieval metric functions directly. 
Usage:: @@ -94,7 +94,17 @@ import sys from pathlib import Path -from fireflyframework_agentic.evaluation import RetrieverMetrics +from fireflyframework_agentic.evaluation import ( + citation_precision, + hit_at_k, + map_score, + mean_latency_ms, + mrr, + ndcg, + no_answer_rate, + precision_at_k, + recall_at_k, +) # --------------------------------------------------------------------------- # Helpers @@ -131,32 +141,31 @@ def _save_baseline(path: str, metrics: dict) -> None: Path(path).write_text(json.dumps(metrics, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") -def _metrics_to_flat(m: RetrieverMetrics) -> dict: - """Convert a RetrieverMetrics model to the flat dict stored in baseline.json.""" +def _compute_metrics(results: list[dict]) -> dict: + """Compute all IR metrics and return a flat dict.""" return { - "n_queries": m.n_queries, - "hit@1": m.hit_at_1, - "hit@5": m.hit_at_5, - "hit@10": m.hit_at_10, - "recall@1": m.recall_at_1, - "recall@5": m.recall_at_5, - "recall@10": m.recall_at_10, - "precision@1": m.precision_at_1, - "precision@5": m.precision_at_5, - "precision@10": m.precision_at_10, - "mrr@10": m.mrr_at_10, - "map@10": m.map_at_10, - "ndcg@10": m.ndcg_at_10, - "no_answer_rate": m.no_answer_rate, - "citation_precision": m.citation_precision, - "mean_search_ms": m.mean_search_ms, - "mean_answer_ms": m.mean_answer_ms, + "n_queries": len(results), + "hit@1": hit_at_k(results, 1), + "hit@5": hit_at_k(results, 5), + "hit@10": hit_at_k(results, 10), + "recall@1": recall_at_k(results, 1), + "recall@5": recall_at_k(results, 5), + "recall@10": recall_at_k(results, 10), + "precision@1": precision_at_k(results, 1), + "precision@5": precision_at_k(results, 5), + "precision@10": precision_at_k(results, 10), + "mrr@10": mrr(results), + "map@10": map_score(results), + "ndcg@10": ndcg(results), + "no_answer_rate": no_answer_rate(results), + "citation_precision": citation_precision(results), + "mean_search_ms": mean_latency_ms(results, "search_ms"), + 
"mean_answer_ms": mean_latency_ms(results, "answer_ms"), } -def _print_metrics_table(metrics: RetrieverMetrics, baseline: dict | None) -> None: +def _print_metrics_table(flat: dict, baseline: dict | None) -> None: """Print a formatted table comparing current metrics vs baseline.""" - flat = _metrics_to_flat(metrics) col_w = 22 num_w = 10 @@ -244,10 +253,6 @@ def run_evaluation(args: argparse.Namespace) -> int: # ------------------------------------------------------------------ # Step 2 — Compute deterministic IR metrics. # - # compute_retrieval_metrics() returns a flat dict of standard IR metrics. - # RetrieverMetrics.from_results() wraps that into a typed Pydantic model - # for convenient attribute access. - # # Metrics are computed at cut-offs k ∈ {1, 5, 10} and include: # hit@k -- at least one gold doc in top-k (binary) # recall@k -- fraction of gold docs found in top-k @@ -257,13 +262,13 @@ def run_evaluation(args: argparse.Namespace) -> int: # ndcg@10 -- normalised discounted cumulative gain # ------------------------------------------------------------------ print("\nComputing retrieval metrics ...") - metrics = RetrieverMetrics.from_results(results) + flat = _compute_metrics(results) - print(f" nDCG@10 : {metrics.ndcg_at_10:.4f}") - print(f" MRR@10 : {metrics.mrr_at_10:.4f}") - print(f" Recall@10 : {metrics.recall_at_10:.4f}") - print(f" Hit@10 : {metrics.hit_at_10:.4f}") - print(f" MAP@10 : {metrics.map_at_10:.4f}") + print(f" nDCG@10 : {flat['ndcg@10']:.4f}") + print(f" MRR@10 : {flat['mrr@10']:.4f}") + print(f" Recall@10 : {flat['recall@10']:.4f}") + print(f" Hit@10 : {flat['hit@10']:.4f}") + print(f" MAP@10 : {flat['map@10']:.4f}") # ------------------------------------------------------------------ # Step 3 — Load the baseline (champion) for regression detection. 
@@ -282,7 +287,7 @@ def run_evaluation(args: argparse.Namespace) -> int: print("\n" + "=" * 56) print("Retrieval Metrics") print("=" * 56) - _print_metrics_table(metrics, baseline) + _print_metrics_table(flat, baseline) # ------------------------------------------------------------------ # Step 5 — Regression check. @@ -291,7 +296,6 @@ def run_evaluation(args: argparse.Namespace) -> int: # promotion (exit code 1) unless --promote-if-better is set and the # run actually improved overall. # ------------------------------------------------------------------ - flat = _metrics_to_flat(metrics) if baseline: regressions = _detect_regressions(flat, baseline) From 0c911b3d5d0e02d8c47b829d63dedd133b0ed8f5 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:12:02 +0200 Subject: [PATCH 47/67] Apply ruff format to retrieval_metrics.py --- fireflyframework_agentic/evaluation/retrieval_metrics.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fireflyframework_agentic/evaluation/retrieval_metrics.py b/fireflyframework_agentic/evaluation/retrieval_metrics.py index df42ab24..7c9c5cfe 100644 --- a/fireflyframework_agentic/evaluation/retrieval_metrics.py +++ b/fireflyframework_agentic/evaluation/retrieval_metrics.py @@ -47,6 +47,7 @@ import math + def _dedup(retrieved: list[dict]) -> list[dict]: """Return one entry per source, first chunk wins, preserving rank order.""" seen: set[str] = set() @@ -154,9 +155,7 @@ def no_answer_rate(results: list[dict]) -> float | None: """Fraction of queries where the model produced no answer. 
None if no results.""" if not results: return None - count = sum( - 1 for row in results if row.get("no_answer") or not row.get("answer", "").strip() - ) + count = sum(1 for row in results if row.get("no_answer") or not row.get("answer", "").strip()) return round(count / len(results), 4) @@ -175,5 +174,3 @@ def mean_latency_ms(results: list[dict], field: str) -> float | None: """Mean latency in ms for the given field (``search_ms`` or ``answer_ms``). None if absent.""" values = [row[field] for row in results if row.get(field) is not None] return round(sum(values) / len(values)) if values else None - - From ef16882e83038856c182c67ad0818446c135ea2f Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 19 Jun 2026 15:15:10 +0200 Subject: [PATCH 48/67] Apply ruff format to test_retrieval_metrics.py --- tests/unit/evaluation/test_retrieval_metrics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/evaluation/test_retrieval_metrics.py b/tests/unit/evaluation/test_retrieval_metrics.py index ef38467f..fa453e2d 100644 --- a/tests/unit/evaluation/test_retrieval_metrics.py +++ b/tests/unit/evaluation/test_retrieval_metrics.py @@ -179,5 +179,3 @@ def test_mean_latency_computed_when_present(): rows = [{**_row(gold_rank=1), "search_ms": 100.0, "answer_ms": 200.0}] assert mean_latency_ms(rows, "search_ms") == 100 assert mean_latency_ms(rows, "answer_ms") == 200 - - From e9e97d1654340aa660f8113afd168562fa6dab72 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Thu, 25 Jun 2026 17:48:29 +0200 Subject: [PATCH 49/67] fix(evaluation): deepcopy base in _median_runs to prevent mutation of first sample --- fireflyframework_agentic/evaluation/judge.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index d5bcad66..c2dfe0c2 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -10,6 +10,7 @@ from 
__future__ import annotations import asyncio +import copy import math import os import statistics @@ -810,7 +811,7 @@ def _median_runs(samples: list[dict]) -> dict: for s in samples: for path, val in _numeric_leaves(s).items(): leaf_values.setdefault(path, []).append(val) - merged = dict(base) + merged = copy.deepcopy(base) for path, vals in leaf_values.items(): try: _set_leaf(merged, path, round(statistics.median(vals), 4)) From eeb315fcd6ba334c8306e83be9a860506694b64c Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Thu, 25 Jun 2026 17:48:35 +0200 Subject: [PATCH 50/67] fix(evaluation): strip ragas_ prefix in _ragas_score column lookup so ragas_faithfulness matches faithfulness column --- fireflyframework_agentic/evaluation/judge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index c2dfe0c2..a37842a9 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -742,7 +742,7 @@ def _sync(): dataset = EvaluationDataset(samples=[sample]) result = evaluate(dataset=dataset, metrics=[metric]) df = result.to_pandas() # type: ignore[attr-defined] - col = df.columns[df.columns.str.contains(metric_name.replace("_ragas", ""), case=False)] + col = df.columns[df.columns.str.contains(metric_name.replace("ragas_", "").replace("_ragas", ""), case=False)] if col.empty: return None val = df[col[0]].iloc[0] From ef092bede3a1bcd7116225d69b53dc1264623755 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Thu, 25 Jun 2026 17:48:47 +0200 Subject: [PATCH 51/67] fix(evaluation): use provider-appropriate embeddings in _make_ragas_embeddings instead of always falling back to Anthropic --- fireflyframework_agentic/evaluation/judge.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index 
a37842a9..0bc6cccc 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -705,9 +705,19 @@ def _make_ragas_embeddings(ctx: EvalContext): from langchain_ollama import OllamaEmbeddings # type: ignore[import] # noqa: PLC0415 return OllamaEmbeddings(model=ctx.embedder._model) - from langchain_anthropic import AnthropicEmbeddings # type: ignore[import] # noqa: PLC0415 + provider = ctx.client.provider + if provider == "anthropic": + from langchain_anthropic import AnthropicEmbeddings # type: ignore[import] # noqa: PLC0415 - return AnthropicEmbeddings() + return AnthropicEmbeddings() + if provider == "ollama": + from langchain_ollama import OllamaEmbeddings # type: ignore[import] # noqa: PLC0415 + + return OllamaEmbeddings() + raise ValueError( + f"RAGAS: no embedder configured for provider {provider!r}; " + "pass ctx.embedder=OllamaEmbedder(...) explicitly" + ) async def _ragas_score(metric_name: str, item: dict, ctx: EvalContext) -> float | None: From d25d2ceb7122de4af2351b2071cd26142c9ee4d7 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Thu, 25 Jun 2026 17:48:55 +0200 Subject: [PATCH 52/67] fix(evaluation): use n_gold as MAP denominator instead of min(n_gold, k) --- fireflyframework_agentic/evaluation/retrieval_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fireflyframework_agentic/evaluation/retrieval_metrics.py b/fireflyframework_agentic/evaluation/retrieval_metrics.py index 7c9c5cfe..c6caea05 100644 --- a/fireflyframework_agentic/evaluation/retrieval_metrics.py +++ b/fireflyframework_agentic/evaluation/retrieval_metrics.py @@ -74,7 +74,7 @@ def _ap_single(retrieved: list[dict], n_gold: int, k: int = 10) -> float: if r.get("is_gold"): hits += 1 precisions.append(hits / r["rank"]) - return sum(precisions) / min(n_gold, k) if n_gold else 0.0 + return sum(precisions) / n_gold if n_gold else 0.0 def hit_at_k(results: list[dict], k: int) -> float: From 
c690977daf7cf7adb64043d22ba1d37630c04b22 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Thu, 25 Jun 2026 17:49:51 +0200 Subject: [PATCH 53/67] refactor(examples): replace flycanon_eval_example with simpler generic rag_eval_example --- examples/flycanon_eval_example.py | 376 ------------------------------ examples/rag_eval_example.py | 97 ++++++++ 2 files changed, 97 insertions(+), 376 deletions(-) delete mode 100644 examples/flycanon_eval_example.py create mode 100644 examples/rag_eval_example.py diff --git a/examples/flycanon_eval_example.py b/examples/flycanon_eval_example.py deleted file mode 100644 index 30e66bd1..00000000 --- a/examples/flycanon_eval_example.py +++ /dev/null @@ -1,376 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""FlyCanon evaluation example — RAG retrieval benchmark with champion/challenger tracking. - -Demonstrates how to use ``fireflyframework_agentic.evaluation`` to replicate -the flycanon experiment evaluation workflow: - -1. Load a results JSONL file produced by a flycanon retrieval pipeline. -2. Compute deterministic IR metrics (Recall@k, Precision@k, MRR, nDCG, MAP). -3. Compare against a saved baseline to detect regression. -4. Print a formatted metrics table. -5. Offer to promote the new run to champion when it beats the baseline. 
- -The champion/challenger pattern mirrors the flycanon_experiments harness: -each run writes metrics to a file; ``approve`` promotes it by repointing -baseline.json. Here we replicate that flow using the framework's -individual retrieval metric functions directly. - -Usage:: - - # Score a results file (no baseline comparison) - python examples/flycanon_eval_example.py --results-file results.jsonl - - # Compare against a saved baseline - python examples/flycanon_eval_example.py \\ - --results-file results.jsonl \\ - --baseline baseline.json - - # Promote if better (write new champion to baseline.json) - python examples/flycanon_eval_example.py \\ - --results-file results.jsonl \\ - --baseline baseline.json \\ - --promote-if-better - -Exit codes: 0 = scored successfully, 1 = regression detected vs baseline. - -Results JSONL format --------------------- -Each line is a JSON object representing one query's retrieval result:: - - { - "question": "What was Apple's revenue in Q4 2023?", - "gold": ["AAPL_10K_2023", "AAPL_10Q_Q4_2023"], - "retrieved": [ - {"rank": 1, "source_id": "AAPL_10K_2023", "is_gold": true}, - {"rank": 2, "source_id": "MSFT_10K_2023", "is_gold": false}, - {"rank": 3, "source_id": "AAPL_10Q_Q4_2023", "is_gold": true} - ], - "answer": "Apple's revenue in Q4 2023 was $89.5 billion.", - "no_answer": false, - "citations": [ - {"source_id": "AAPL_10K_2023", "is_gold": true} - ], - "search_ms": 142, - "answer_ms": 2310 - } - -The ``gold`` list contains the source IDs that are considered correct answers. -Each entry in ``retrieved`` must have a 1-based ``rank``, ``source_id`` (or -``identities`` list), and ``is_gold`` bool. - -Baseline JSON format --------------------- -A flat JSON object with metric names as keys and float values:: - - { - "ndcg@10": 0.7234, - "mrr@10": 0.6891, - "recall@10": 0.8120, - "hit@10": 0.9100, - "map@10": 0.6543, - "n_queries": 200 - } - -This is the same format written by ``--promote-if-better``. 
-""" - -from __future__ import annotations - -import argparse -import json -import sys -from pathlib import Path - -from fireflyframework_agentic.evaluation import ( - citation_precision, - hit_at_k, - map_score, - mean_latency_ms, - mrr, - ndcg, - no_answer_rate, - precision_at_k, - recall_at_k, -) - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -# Metrics that form the primary quality signal for champion/challenger -# comparisons. These are listed in priority order: nDCG@10 is the primary -# ranking metric; MRR@10 measures how quickly the first gold result appears; -# Recall@10 measures overall coverage; Hit@10 measures binary success rate; -# MAP@10 measures precision across the ranked list. -PRIMARY_METRICS = ["ndcg@10", "mrr@10", "recall@10", "hit@10", "map@10"] - -# Regression threshold: a metric must drop by more than this fraction of its -# baseline value to be flagged as a regression (guards against noise). 
-REGRESSION_THRESHOLD = 0.01 - - -def _load_jsonl(path: str) -> list[dict]: - """Load a newline-delimited JSON file, one object per line.""" - lines = Path(path).read_text(encoding="utf-8").strip().splitlines() - return [json.loads(line) for line in lines if line.strip()] - - -def _load_baseline(path: str) -> dict | None: - """Load a baseline JSON file, returning None if it does not exist.""" - p = Path(path) - if not p.exists(): - return None - return json.loads(p.read_text(encoding="utf-8")) - - -def _save_baseline(path: str, metrics: dict) -> None: - """Write a flat metrics dict to the baseline JSON file.""" - Path(path).write_text(json.dumps(metrics, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") - - -def _compute_metrics(results: list[dict]) -> dict: - """Compute all IR metrics and return a flat dict.""" - return { - "n_queries": len(results), - "hit@1": hit_at_k(results, 1), - "hit@5": hit_at_k(results, 5), - "hit@10": hit_at_k(results, 10), - "recall@1": recall_at_k(results, 1), - "recall@5": recall_at_k(results, 5), - "recall@10": recall_at_k(results, 10), - "precision@1": precision_at_k(results, 1), - "precision@5": precision_at_k(results, 5), - "precision@10": precision_at_k(results, 10), - "mrr@10": mrr(results), - "map@10": map_score(results), - "ndcg@10": ndcg(results), - "no_answer_rate": no_answer_rate(results), - "citation_precision": citation_precision(results), - "mean_search_ms": mean_latency_ms(results, "search_ms"), - "mean_answer_ms": mean_latency_ms(results, "answer_ms"), - } - - -def _print_metrics_table(flat: dict, baseline: dict | None) -> None: - """Print a formatted table comparing current metrics vs baseline.""" - - col_w = 22 - num_w = 10 - header = f"{'Metric':<{col_w}} {'Current':>{num_w}}" - if baseline: - header += f" {'Baseline':>{num_w}} {'Delta':>{num_w}}" - print(header) - print("-" * (col_w + num_w + (num_w * 2 + 2 if baseline else 0))) - - for key, value in flat.items(): - if value is None: - continue - # Format 
floats as 4 decimal places; ints as plain integers. - cur_str = f"{value:.4f}" if isinstance(value, float) else str(value) - - row = f"{key:<{col_w}} {cur_str:>{num_w}}" - if baseline and key in baseline and isinstance(value, float): - base_val = baseline[key] - delta = value - base_val - delta_str = f"{delta:+.4f}" - row += f" {base_val:>{num_w}.4f} {delta_str:>{num_w}}" - print(row) - - print() - - -def _detect_regressions(flat: dict, baseline: dict) -> list[str]: - """Return the names of primary metrics that regressed vs baseline. - - A regression is flagged when the new value drops by more than - REGRESSION_THRESHOLD * baseline_value (relative threshold). This - guards against flagging noise as a regression. - """ - regressions = [] - for key in PRIMARY_METRICS: - new_val = flat.get(key) - base_val = baseline.get(key) - if new_val is None or base_val is None: - continue - if base_val > 0 and (base_val - new_val) / base_val > REGRESSION_THRESHOLD: - regressions.append(key) - return regressions - - -def _beats_baseline(flat: dict, baseline: dict) -> bool: - """Return True if the new metrics are better than or equal to the baseline. - - 'Better' means no primary metric has regressed beyond REGRESSION_THRESHOLD - AND at least one primary metric has improved. - """ - regressions = _detect_regressions(flat, baseline) - if regressions: - return False - # Check for at least one improvement. 
- for key in PRIMARY_METRICS: - new_val = flat.get(key) - base_val = baseline.get(key) - if new_val is not None and base_val is not None and new_val > base_val: - return True - return False - - -# --------------------------------------------------------------------------- -# Main evaluation flow -# --------------------------------------------------------------------------- - - -def run_evaluation(args: argparse.Namespace) -> int: - """Run retrieval metric scoring and optional champion/challenger comparison.""" - - # ------------------------------------------------------------------ - # Step 1 — Load results from the JSONL file. - # - # Each line is one query's retrieval result. The file is produced by - # a flycanon pipeline run (runner.run_queries writes results.jsonl). - # ------------------------------------------------------------------ - print(f"Loading results : {args.results_file}") - results = _load_jsonl(args.results_file) - print(f" {len(results)} query results loaded.") - - if not results: - print("ERROR: results file is empty.", file=sys.stderr) - return 1 - - # ------------------------------------------------------------------ - # Step 2 — Compute deterministic IR metrics. 
- # - # Metrics are computed at cut-offs k ∈ {1, 5, 10} and include: - # hit@k -- at least one gold doc in top-k (binary) - # recall@k -- fraction of gold docs found in top-k - # precision@k -- fraction of top-k that are gold - # mrr@10 -- mean reciprocal rank of first gold hit - # map@10 -- mean average precision - # ndcg@10 -- normalised discounted cumulative gain - # ------------------------------------------------------------------ - print("\nComputing retrieval metrics ...") - flat = _compute_metrics(results) - - print(f" nDCG@10 : {flat['ndcg@10']:.4f}") - print(f" MRR@10 : {flat['mrr@10']:.4f}") - print(f" Recall@10 : {flat['recall@10']:.4f}") - print(f" Hit@10 : {flat['hit@10']:.4f}") - print(f" MAP@10 : {flat['map@10']:.4f}") - - # ------------------------------------------------------------------ - # Step 3 — Load the baseline (champion) for regression detection. - # ------------------------------------------------------------------ - baseline = None - if args.baseline: - baseline = _load_baseline(args.baseline) - if baseline: - print(f"\nLoaded baseline : {args.baseline}") - else: - print(f"\nNo baseline found at {args.baseline} — first run, no comparison.") - - # ------------------------------------------------------------------ - # Step 4 — Print the full metrics table. - # ------------------------------------------------------------------ - print("\n" + "=" * 56) - print("Retrieval Metrics") - print("=" * 56) - _print_metrics_table(flat, baseline) - - # ------------------------------------------------------------------ - # Step 5 — Regression check. - # - # Compare against the baseline on primary metrics. Regressions block - # promotion (exit code 1) unless --promote-if-better is set and the - # run actually improved overall. 
- # ------------------------------------------------------------------ - - if baseline: - regressions = _detect_regressions(flat, baseline) - if regressions: - print(f"REGRESSION detected on: {', '.join(regressions)}") - print(f" Threshold: {REGRESSION_THRESHOLD * 100:.0f}% relative drop on any primary metric.") - else: - better = _beats_baseline(flat, baseline) - if better: - print("Challenger BEATS baseline on at least one primary metric.") - else: - print("Challenger is on-par with baseline (no regression, no improvement).") - - if regressions and not args.promote_if_better: - print("\nVerdict: HOLD — regression detected. Tune the pipeline and re-run.") - return 1 - - # ------------------------------------------------------------------ - # Step 6 — Champion promotion. - # - # When --promote-if-better is set and the metrics beat (or equal) the - # baseline, save the new metrics as the champion. Future runs will - # compare against this updated record. - # ------------------------------------------------------------------ - if args.promote_if_better and args.baseline: - if baseline is None or _beats_baseline(flat, baseline): - _save_baseline(args.baseline, flat) - print(f"\nChampion PROMOTED — metrics saved to {args.baseline}") - else: - print("\nNot promoted — challenger did not beat baseline on primary metrics.") - - print("\nVerdict: PROMOTE" if not (baseline and _detect_regressions(flat, baseline)) else "\nVerdict: HOLD") - return 0 - - -# --------------------------------------------------------------------------- -# CLI -# --------------------------------------------------------------------------- - - -def build_parser() -> argparse.ArgumentParser: - p = argparse.ArgumentParser( - prog="flycanon_eval_example", - description=( - "FlyCanon RAG retrieval benchmark — computes IR metrics from a results JSONL " - "and compares against a champion baseline." 
- ), - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - p.add_argument( - "--results-file", - required=True, - help="Path to results.jsonl produced by the flycanon pipeline.", - ) - p.add_argument( - "--baseline", - default=None, - help=("Path to baseline.json (champion store). When absent, scores are printed without comparison."), - ) - p.add_argument( - "--promote-if-better", - action="store_true", - help=( - "When set, write new metrics to baseline.json if the challenger beats the " - "champion on primary metrics. Has no effect when --baseline is omitted." - ), - ) - return p - - -def main() -> None: - parser = build_parser() - args = parser.parse_args() - sys.exit(run_evaluation(args)) - - -if __name__ == "__main__": - main() diff --git a/examples/rag_eval_example.py b/examples/rag_eval_example.py new file mode 100644 index 00000000..b5b98170 --- /dev/null +++ b/examples/rag_eval_example.py @@ -0,0 +1,97 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""RAG retrieval evaluation example. + +Compute deterministic IR metrics (Hit@k, Recall@k, Precision@k, MRR, MAP, nDCG) +from a JSONL results file produced by any retrieval pipeline. 
+ +Usage:: + + python examples/rag_eval_example.py --results-file results.jsonl + +Results JSONL format — one JSON object per line:: + + { + "question": "What was Apple's revenue in Q4 2023?", + "gold": ["AAPL_10K_2023", "AAPL_10Q_Q4_2023"], + "retrieved": [ + {"rank": 1, "source_id": "AAPL_10K_2023", "is_gold": true}, + {"rank": 2, "source_id": "MSFT_10K_2023", "is_gold": false}, + {"rank": 3, "source_id": "AAPL_10Q_Q4_2023", "is_gold": true} + ], + "answer": "Apple's revenue in Q4 2023 was $89.5 billion.", + "citations": [{"source_id": "AAPL_10K_2023", "is_gold": true}], + "search_ms": 142, + "answer_ms": 2310 + } +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path + +from fireflyframework_agentic.evaluation import ( + citation_precision, + hit_at_k, + map_score, + mean_latency_ms, + mrr, + ndcg, + no_answer_rate, + precision_at_k, + recall_at_k, +) + + +def _load_jsonl(path: str) -> list[dict]: + lines = Path(path).read_text(encoding="utf-8").strip().splitlines() + return [json.loads(line) for line in lines if line.strip()] + + +def main() -> None: + parser = argparse.ArgumentParser(description="Compute IR metrics from a retrieval results JSONL.") + parser.add_argument("--results-file", required=True, help="Path to results.jsonl") + parser.add_argument("--k", type=int, default=10, help="Cut-off rank (default: 10)") + args = parser.parse_args() + + results = _load_jsonl(args.results_file) + k = args.k + + metrics = { + f"hit@{k}": hit_at_k(results, k), + f"recall@{k}": recall_at_k(results, k), + f"precision@{k}": precision_at_k(results, k), + f"mrr@{k}": mrr(results, k), + f"map@{k}": map_score(results, k), + f"ndcg@{k}": ndcg(results, k), + "no_answer_rate": no_answer_rate(results), + "citation_precision": citation_precision(results), + "mean_search_ms": mean_latency_ms(results, "search_ms"), + "mean_answer_ms": mean_latency_ms(results, "answer_ms"), + } + + print(f"\n{'Metric':<22} {'Value':>10}") + print("-" * 
33) + for name, value in metrics.items(): + if value is not None: + val_str = f"{value:.4f}" if isinstance(value, float) else str(value) + print(f"{name:<22} {val_str:>10}") + print(f"\n{len(results)} queries scored.") + + +if __name__ == "__main__": + main() From efdcbf7c03913788eb6116c2c3a1293568daea81 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Thu, 25 Jun 2026 17:51:43 +0200 Subject: [PATCH 54/67] refactor(examples): replace rag_eval_example with llm_eval_example using LLM-as-judge metrics --- examples/llm_eval_example.py | 116 +++++++++++++++++++++++++++++++++++ examples/rag_eval_example.py | 97 ----------------------------- 2 files changed, 116 insertions(+), 97 deletions(-) create mode 100644 examples/llm_eval_example.py delete mode 100644 examples/rag_eval_example.py diff --git a/examples/llm_eval_example.py b/examples/llm_eval_example.py new file mode 100644 index 00000000..50bf472f --- /dev/null +++ b/examples/llm_eval_example.py @@ -0,0 +1,116 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""LLM-as-judge evaluation example. + +Score a set of Q&A pairs using the evaluation metrics: + - contains_answer — does the answer contain the correct information? + - addresses_question — does the answer directly address what was asked? + +Each metric runs ``--runs`` times and reports the median score (default 3). 
+ +Usage:: + + python examples/llm_eval_example.py --model anthropic:claude-haiku-4-5-20251001 + + # Or score from a JSONL file instead of the built-in sample data: + python examples/llm_eval_example.py \\ + --model anthropic:claude-haiku-4-5-20251001 \\ + --items-file items.jsonl + +Items JSONL format — one JSON object per line:: + + {"question": "...", "answer": "...", "reference": "..."} +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +from pathlib import Path + +from fireflyframework_agentic.evaluation import ( + EvalContext, + JudgeClient, + addresses_question, + contains_answer, +) + +# Sample data used when no --items-file is provided. +_SAMPLE_ITEMS = [ + { + "question": "What is the boiling point of water at sea level?", + "reference": "Water boils at 100 °C (212 °F) at standard atmospheric pressure.", + "answer": "Water boils at 100 degrees Celsius at sea level.", + }, + { + "question": "Who wrote Romeo and Juliet?", + "reference": "Romeo and Juliet was written by William Shakespeare around 1594–1596.", + "answer": "It was written by Shakespeare.", + }, + { + "question": "What is the capital of France?", + "reference": "The capital of France is Paris.", + "answer": "The weather in France is generally mild.", + }, +] + + +async def score_items(items: list[dict], ctx: EvalContext) -> list[dict]: + tasks = [(contains_answer(item, ctx), addresses_question(item, ctx)) for item in items] + pairs = await asyncio.gather(*[asyncio.gather(ca, aq) for ca, aq in tasks]) + return [ + {"question": item["question"], "contains_answer": ca, "addresses_question": aq} + for item, (ca, aq) in zip(items, pairs) + ] + + +async def main(args: argparse.Namespace) -> None: + if args.items_file: + lines = Path(args.items_file).read_text(encoding="utf-8").strip().splitlines() + items = [json.loads(line) for line in lines if line.strip()] + else: + items = _SAMPLE_ITEMS + + ctx = EvalContext(client=JudgeClient(args.model), runs=args.runs) + 
results = await score_items(items, ctx) + + print(f"\n{'Question':<45} {'contains':>8} {'addresses':>9}") + print("-" * 63) + for r in results: + q = r["question"][:43] + ".." if len(r["question"]) > 45 else r["question"] + ca = f"{r['contains_answer']:.2f}" if r["contains_answer"] is not None else " n/a" + aq = f"{r['addresses_question']:.2f}" if r["addresses_question"] is not None else " n/a" + print(f"{q:<45} {ca:>8} {aq:>9}") + + scored = [r for r in results if r["contains_answer"] is not None] + if scored: + avg_ca = sum(r["contains_answer"] for r in scored) / len(scored) + avg_aq = sum(r["addresses_question"] for r in scored) / len(scored) + print("-" * 63) + print(f"{'Average':<45} {avg_ca:>8.2f} {avg_aq:>9.2f}") + print(f"\n{len(items)} items scored ({args.runs} judge run(s) each).") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Score Q&A pairs with LLM-as-judge metrics.") + parser.add_argument( + "--model", + default="anthropic:claude-haiku-4-5-20251001", + help="Judge model spec (provider:model).", + ) + parser.add_argument("--runs", type=int, default=3, help="Judge runs per item (median is reported).") + parser.add_argument("--items-file", default=None, help="Optional JSONL file of {question, answer, reference} items.") + asyncio.run(main(parser.parse_args())) diff --git a/examples/rag_eval_example.py b/examples/rag_eval_example.py deleted file mode 100644 index b5b98170..00000000 --- a/examples/rag_eval_example.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright 2026 Firefly Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""RAG retrieval evaluation example. - -Compute deterministic IR metrics (Hit@k, Recall@k, Precision@k, MRR, MAP, nDCG) -from a JSONL results file produced by any retrieval pipeline. - -Usage:: - - python examples/rag_eval_example.py --results-file results.jsonl - -Results JSONL format — one JSON object per line:: - - { - "question": "What was Apple's revenue in Q4 2023?", - "gold": ["AAPL_10K_2023", "AAPL_10Q_Q4_2023"], - "retrieved": [ - {"rank": 1, "source_id": "AAPL_10K_2023", "is_gold": true}, - {"rank": 2, "source_id": "MSFT_10K_2023", "is_gold": false}, - {"rank": 3, "source_id": "AAPL_10Q_Q4_2023", "is_gold": true} - ], - "answer": "Apple's revenue in Q4 2023 was $89.5 billion.", - "citations": [{"source_id": "AAPL_10K_2023", "is_gold": true}], - "search_ms": 142, - "answer_ms": 2310 - } -""" - -from __future__ import annotations - -import argparse -import json -from pathlib import Path - -from fireflyframework_agentic.evaluation import ( - citation_precision, - hit_at_k, - map_score, - mean_latency_ms, - mrr, - ndcg, - no_answer_rate, - precision_at_k, - recall_at_k, -) - - -def _load_jsonl(path: str) -> list[dict]: - lines = Path(path).read_text(encoding="utf-8").strip().splitlines() - return [json.loads(line) for line in lines if line.strip()] - - -def main() -> None: - parser = argparse.ArgumentParser(description="Compute IR metrics from a retrieval results JSONL.") - parser.add_argument("--results-file", required=True, help="Path to results.jsonl") - parser.add_argument("--k", type=int, default=10, help="Cut-off rank (default: 10)") - args = 
parser.parse_args() - - results = _load_jsonl(args.results_file) - k = args.k - - metrics = { - f"hit@{k}": hit_at_k(results, k), - f"recall@{k}": recall_at_k(results, k), - f"precision@{k}": precision_at_k(results, k), - f"mrr@{k}": mrr(results, k), - f"map@{k}": map_score(results, k), - f"ndcg@{k}": ndcg(results, k), - "no_answer_rate": no_answer_rate(results), - "citation_precision": citation_precision(results), - "mean_search_ms": mean_latency_ms(results, "search_ms"), - "mean_answer_ms": mean_latency_ms(results, "answer_ms"), - } - - print(f"\n{'Metric':<22} {'Value':>10}") - print("-" * 33) - for name, value in metrics.items(): - if value is not None: - val_str = f"{value:.4f}" if isinstance(value, float) else str(value) - print(f"{name:<22} {val_str:>10}") - print(f"\n{len(results)} queries scored.") - - -if __name__ == "__main__": - main() From a1519b9e2484d204834212fe1b3de045135ceb4e Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Thu, 25 Jun 2026 17:55:06 +0200 Subject: [PATCH 55/67] fix(examples): use SI units only in sample reference --- examples/llm_eval_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llm_eval_example.py b/examples/llm_eval_example.py index 50bf472f..98b4f469 100644 --- a/examples/llm_eval_example.py +++ b/examples/llm_eval_example.py @@ -52,7 +52,7 @@ _SAMPLE_ITEMS = [ { "question": "What is the boiling point of water at sea level?", - "reference": "Water boils at 100 °C (212 °F) at standard atmospheric pressure.", + "reference": "Water boils at 100 °C at standard atmospheric pressure.", "answer": "Water boils at 100 degrees Celsius at sea level.", }, { From d3f53a700e5c63310781efc05e0229ef290afa31 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 29 Jun 2026 09:40:26 +0200 Subject: [PATCH 56/67] docs(evaluation): rewrite guide around metrics, drop deleted gate pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The evaluation package no 
longer ships gates, verdict, champion/challenger, or the flyeval CLI — those modules were removed in this branch. Rewrite the guide to document the actual public surface: LLM-as-judge metrics (judge.py) and deterministic retrieval metrics (retrieval_metrics.py). Fix the README mention to match. --- README.md | 10 +- docs/evaluation.md | 556 +++++++++++++++++---------------------------- 2 files changed, 210 insertions(+), 356 deletions(-) diff --git a/README.md b/README.md index 904237da..3277faef 100644 --- a/README.md +++ b/README.md @@ -412,10 +412,10 @@ classDiagram `EvalDataset` loads/saves test cases from JSON. `ModelComparison` runs the same prompts across multiple agents for side-by-side analysis. -- **Evaluation** — Gate-based quality gates (G1–G5), LLM-as-judge advisory scoring, - champion/challenger tracking, and deterministic retrieval metrics for assessing - agent and pipeline outputs. The `flyeval` CLI drives the full gate pipeline from - the command line. Install with `pip install "fireflyframework-agentic[evaluation]"`. +- **Evaluation** — LLM-as-judge metrics (faithfulness, relevancy, answer correctness, + RAGAS, …) and deterministic retrieval metrics (recall@k, MRR, MAP, nDCG, …) for + assessing LLM and pipeline outputs. Each metric is a plain function you call directly. + Install with `pip install "fireflyframework-agentic[evaluation]"`. See [docs/evaluation.md](docs/evaluation.md) for the full guide. 
> **Optional developer tooling.** `fireflyframework_agentic.experiments` (A/B @@ -823,7 +823,7 @@ Detailed guides for each module: - [Security](docs/security.md) — Prompt/output guards, at-rest encryption - [Experiments](docs/experiments.md) — A/B testing, variant comparison - [Lab](docs/lab.md) — Benchmarks, datasets, evaluators -- [Evaluation](docs/evaluation.md) — Gate pipeline, flyeval CLI, champion/challenger, retrieval metrics +- [Evaluation](docs/evaluation.md) — LLM-as-judge metrics, RAGAS, retrieval metrics - Studio — moved to [fireflyframework-agentic-studio](https://github.com/fireflyframework/fireflyframework-agentic-studio) --- diff --git a/docs/evaluation.md b/docs/evaluation.md index c2abe319..51fd9c44 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -2,434 +2,288 @@ Copyright 2026 Firefly Software Foundation. Licensed under the Apache License 2.0. -The Evaluation subpackage provides gate-based quality gates, LLM-as-judge advisory scoring, -champion/challenger tracking, and deterministic retrieval metrics for assessing agent outputs. +The Evaluation subpackage provides **metrics for assessing LLM and pipeline outputs**: +LLM-as-judge metrics (faithfulness, relevancy, answer correctness, …) and deterministic +information-retrieval metrics (recall@k, nDCG, MRR, …). Every metric is a plain function +you call directly and combine however your harness needs — there is no gate, verdict, or +promotion machinery to opt into. --- -## Concepts - -### Gate pipeline - -The evaluation framework runs **five gates** in sequence. Every gate always runs — a failed -gate raises a *flag*, not a veto, so the scorecard always carries the complete picture. - -| Gate | Name | Kind | Description | -|------|------|------|-------------| -| G1 | Structural & Safe | Deterministic | Schema validity, PII non-disclosure, empty-registry guard. | -| G2 | Must-finds & Negative Controls | Deterministic | Lexical/semantic recall against the must-find registry; NC precision. 
| -| G3 | Evidence (Grounding) | Deterministic | Excerpt-to-corpus anchoring; fabricated-evidence detection. | -| G4 | LLM-as-a-Judge | Advisory (non-blocking) | Semantic faithfulness, entailment, gap detection — never changes the verdict. | -| G5 | No-regression / Promotion | Human decision | Champion/challenger comparison with A/A noise band; collects sign-offs. | - -**No gate vetoes.** Failures append to the `GateResult` flags list and scoring continues. -The scorecard carries every signal regardless of which gates fired. - -### GateResult +## Installation -`GateResult` is a dataclass returned by each gate: +The evaluation subpackage needs `numpy` for the embedding path and `ragas` (plus its +LangChain providers) for the RAGAS metrics. Install the optional extra: -```python -@dataclass -class GateResult: - gate: str # "G1", "G2", …, "G5" - passed: bool - reason_code: str = "" # e.g. "SCHEMA_INVALID", "NC_HIT", "UNGROUNDED" - details: dict = field(default_factory=dict) +```bash +pip install "fireflyframework-agentic[evaluation]" ``` -`str(gate_result)` prints `[G2] PASS` or `[G2] FLAG:NC_HIT`. - -### Verdict - -`verdict(gate_results)` returns `VERDICT_PROMOTE` or `VERDICT_HOLD`: +Everything except the RAGAS metrics works without `ragas` installed; the RAGAS functions +import it lazily and only fail if you call them without the extra. -- `VERDICT_PROMOTE` — all gates passed **and** G5 (the human sign-off gate) is present. -- `VERDICT_HOLD` — any gate flagged, or G5 is missing. - -The CLI exits `0` on PROMOTE and `1` on HOLD, so it composes into CI. - -### Must-find registry - -A registry (`lean-1` schema) is a JSON file listing items the discovery output is -expected to surface (`tier` L0–L3) and negative controls (NC) it must *not* assert. 
- -```json -{ - "schema_version": "lean-1", - "corpus": "banca-cordobesa", - "items": [ - { "id": "ao-pep-4eyes", "tier": "L0", "scope": "decision", - "description": "PEP cases require a second analyst sign-off (4-eyes)", - "keywords": ["PEP", "4-eyes"], - "evidence": ["SOP-002-kyc-edd.md"] }, - { "id": "ao-nc-realtime", "tier": "NC", "scope": "finding", - "description": "KYC-Hub synchronises in real time — factually false" } - ] -} -``` +--- -Tier semantics: L0 = must-find control (a single miss flags the run), L1 = high-priority, -L2 = important, L3 = nice-to-have (not counted in the recall floor). +## Two metric families -### Advisory judge (G4) +| Family | Module | Needs an LLM? | Use it to evaluate… | +|--------|--------|---------------|---------------------| +| **LLM-as-judge** | `evaluation.judge` | Most metrics yes (a few are deterministic/embedding) | The semantic quality of a model's answers and reports — faithfulness, relevancy, correctness, hallucination. | +| **Retrieval** | `evaluation.retrieval_metrics` | No (pure functions, no network) | The ranked retrieval that feeds the LLM — recall@k, precision@k, MRR, MAP, nDCG, latency. | -G4 calls a chat LLM (or local Ollama model) for semantic checks the deterministic gates -cannot perform: faithfulness, entailment, numeric/temporal fidelity, actionability, -fabricated-entity detection, and more. It is: +Both are re-exported from `fireflyframework_agentic.evaluation`. -- **Non-blocking** — `AdvisoryReport` is carried separately and never enters `verdict()`. -- **Non-deterministic** — each metric runs `judge_runs` times (default: 3) and the - median score is reported. -- **Opt-in** — pass `--judge-model provider:model` to activate it; omit the flag to skip. +--- -### Champion/challenger pattern +## LLM-as-judge metrics -Champions are **per-corpus**. `ChampionRecord` persists the best-known run so that -promotion decisions are made against a stable, signed baseline rather than the last run. 
+Each judge metric is an **async function** with the same signature: -``` - ┌──────────────────────────────────────────┐ - │ run result JSON (challenger) │ - └──────────────┬───────────────────────────┘ - │ - ┌───────────────▼───────────────┐ - │ G1 · G2 · G3 (deterministic) │ - │ G4 (advisory, opt-in) │ - └───────────────┬───────────────┘ - │ flags + scores - ┌───────────────▼───────────────┐ - │ G5 — no-regression vs │ - │ champion baseline + A/A band │ - └───────────────┬───────────────┘ - │ - ┌───────────────▼───────────────┐ - │ Markdown scorecard │ - │ PROMOTE / HOLD │ - └───────────────────────────────┘ +```python +async def metric(item: dict, ctx: EvalContext) -> dict | float | None ``` -`invalidate_champion()` marks a baseline invalid. The `EMPTY_MUST_FIND` guard in G1 -prevents a fake-100% champion being created against an empty registry. +- `item` — a plain dict of the output under evaluation (see schema below). +- `ctx` — an `EvalContext` carrying the judge client, optional embedder, and run count. +- The return is either a small summary dict, a single float, or `None` when the metric + cannot run (e.g. an embedding metric with no embedder, or a missing field). ---- - -## Installation +### EvalContext and JudgeClient -The evaluation subpackage requires `scipy` and `numpy`. Install the optional extra: +```python +from fireflyframework_agentic.evaluation import EvalContext, JudgeClient -```bash -pip install "fireflyframework-agentic[evaluation]" +ctx = EvalContext( + client=JudgeClient("anthropic:claude-haiku-4-5-20251001"), + runs=3, # metrics that repeat use the median of this many calls + embedder=None, # optional OllamaEmbedder, required only by semantic_recovery +) ``` -The `flyeval` CLI entry-point is registered automatically by the package. Verify: - -```bash -flyeval --version -``` +`JudgeClient` is an async multi-provider chat client. The model spec is +`":"`, where provider is one of `anthropic`, `openai`, `azure`, `ollama`. 
+Temperature is pinned to `0.0` for stable verdicts, and API keys are read lazily from the +environment (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `AZURE_OPENAI_*`, `OLLAMA_HOST`) — so +constructing a client never requires a secret. ---- +### Item schema -## CLI +The judge metrics read whichever keys they need and ignore the rest, so one `item` dict can +serve many metrics. -All subcommands exit `0` on PROMOTE and `1` on HOLD. +**RAG / Q&A items** (single answer under test): -### `flyeval gate` +```python +item = { + "question": "What is the boiling point of water at sea level?", + "answer": "Water boils at 100 degrees Celsius at sea level.", + "reference": "Water boils at 100 °C at standard atmospheric pressure.", + "contexts": ["...retrieved passage...", "..."], # used by RAGAS metrics +} +``` -Run the full gate pipeline against a result JSON and print a Markdown scorecard. +**Report / discovery items** (a structured pipeline output): -```bash -flyeval gate \ - --result runs/2026-06-18/output.json \ - --registry registries/banca-cordobesa.json \ - --baseline baselines/banca-cordobesa.json \ - --judge-model anthropic:claude-3-5-haiku \ - --judge-runs 3 +```python +item = { + "findings": [{"id": ..., "title": ..., "description": ..., "severity": ..., + "evidence_refs": [{"evidence_id": ...}], ...}], + "evidence_index": [{"id": ..., "locator": "doc.md#L1", "excerpt": "..."}], + "process_graph": {"processes": [{"name": ..., "activities": [...], "decisions": [...]}]}, + "proposed_actions": [{"title": ..., "finding_id": ..., "expected_savings_fte": ...}], + "workspace": {"name": ..., "description": ...}, + "nc_items": [{"id": ..., "description": "a statement that is factually false"}], + "lexical_missed_ids": ["..."], # ids the lexical pass missed (semantic_recovery) + "champion": { ... another item ... 
}, # baseline for comparative_vs_champion +} ``` -Key flags: +### Quick start — scoring Q&A pairs -| Flag | Default | Description | -|------|---------|-------------| -| `--result` | required | Path to the run's `output.json`. | -| `--registry` | required | Must-find registry (lean-1 JSON). | -| `--baseline` | — | Champion baseline JSON for G5 regression check. | -| `--judge-model` | — | `provider:model` for G4 advisory judge. | -| `--judge-runs` | 3 | Number of independent judge calls (median aggregation). | -| `--no-judge` | — | Skip G4 entirely. | -| `--recall-floor` | 0.70 | Minimum G2 recall before flagging. | -| `--grounding-floor` | 0.90 | Minimum G3 grounding rate before flagging. | -| `--corpus` | — | Path to the evidence corpus bundle for G3 verification. | -| `--pii-list` | — | Path to a JSON array of names to scan for PII leaks (G1). | -| `--embedder` | — | `provider:model` for semantic recall (G2 embedding path). | -| `--model-id` | "unknown" | Identifier of the model under evaluation (for scorecard). | +```python +import asyncio +from fireflyframework_agentic.evaluation import ( + EvalContext, JudgeClient, contains_answer, addresses_question, +) -### `flyeval aa-band` +item = { + "question": "Who wrote Romeo and Juliet?", + "reference": "Romeo and Juliet was written by William Shakespeare around 1594–1596.", + "answer": "It was written by Shakespeare.", +} -Compute the A/A noise band from multiple repeated runs of the same model to establish -the noise floor before setting up the champion comparison. 
+async def main(): + ctx = EvalContext(client=JudgeClient("anthropic:claude-haiku-4-5-20251001"), runs=3) + contains = await contains_answer(item, ctx) # 0.0–1.0 + addresses = await addresses_question(item, ctx) # 0.0–1.0 + print(contains, addresses) -```bash -flyeval aa-band \ - --results runs/aa-run-1/output.json runs/aa-run-2/output.json runs/aa-run-3/output.json \ - --registry registries/banca-cordobesa.json +asyncio.run(main()) ``` -The command prints per-metric variance and recommended noise floors. - -### `flyeval day-zero` +See `examples/llm_eval_example.py` for a runnable version that scores a list of items +(built-in sample data or a JSONL file) and prints a table. -Promote the very first champion for a corpus (Day-Zero protocol). Requires at least -`--signoffs` sign-offs (default: 2) before PROMOTE is issued. - -```bash -flyeval day-zero \ - --result runs/2026-06-18/output.json \ - --registry registries/banca-cordobesa.json \ - --baseline baselines/banca-cordobesa.json \ - --signoffs 2 -``` +### Metric catalog -The command writes the new `ChampionRecord` into `--baseline` on success. +**Deterministic** — no LLM call, always available: -### `flyeval invalidate` +| Metric | Returns | Measures | +|--------|---------|----------| +| `source_coverage` | `{cited, total, orphaned}` | Distinct source documents cited by ≥1 finding vs. all sources; `orphaned` lists uncited stems. | +| `excerpt_fill_rate` | `{populated, total}` | Fraction of `evidence_index` entries that carry a non-empty excerpt. | -Mark the current champion invalid with a documented reason. Use this when the registry -changes in a way that makes the existing champion incommensurable. +**Embedding** — requires `ctx.embedder`: -```bash -flyeval invalidate \ - --baseline baselines/banca-cordobesa.json \ - --reason "Registry expanded from 39 to 94 items (lean-1 v2)." 
-``` +| Metric | Returns | Measures | +|--------|---------|----------| +| `semantic_recovery` | `{lexical_recall, recovered_recall, recovered, tau, scored_denominator}` or `None` | Context-recall: recovers lexically-missed items via embedding similarity above `tau` (default 0.70). Returns `None` when no embedder is set. | ---- +**LLM-as-judge** — requires `ctx.client`: -## Python API +| Metric | Returns | Measures | +|--------|---------|----------| +| `faithfulness` | `{supported, total, unsupported_ids}` | Does each finding's cited evidence entail its claim? | +| `numeric_temporal_fidelity` | `{mismatches, count}` | Numbers/dates asserted in a finding that don't match its evidence. | +| `citation_relevance` | `{precision, relevant, total}` | Context precision: fraction of cited passages actually relevant to the claim. | +| `nc_semantic_precision` | `{asserted, total, asserted_ids}` | How many negative-control falsehoods (`nc_items`) the output asserts or endorses. | +| `fabricated_entity` | `{count, entities}` | Systems/orgs/metrics named in the output but absent from the corpus. | +| `contradiction` | `{count, pairs}` | Internally contradictory finding pairs. | +| `open_gap` | `{gap}` | G-Eval open probe: the most important issue the output missed (free-text, no score). | +| `actionability` | `{score, rated}` | Average 0–1 rating of whether proposed actions are specific, quantified, and linked. | +| `severity_calibration` | `{miscalibrated, total, verdicts}` | Whether each finding's stated severity matches its evidence (under/over/calibrated). | +| `answer_relevancy` | `{score}` | Does the output address the stated workspace intention? | +| `surface_deduplication` | `{distinct, redundant, total, distinct_rate, redundant_pairs}` | Fraction of near-duplicate process-graph nodes that are genuinely distinct. | +| `comparative_vs_champion` | `{candidate, champion, more_consistent}` or `None` | Pairwise five-axis review of candidate vs. `item["champion"]`. 
`None` if no champion. | -### Running gates +**RAG Q&A** — requires `ctx.client`; repeats `ctx.runs` times and returns the median: -```python -import json -from fireflyframework_agentic.evaluation import ( - run_gates, - render_scorecard, - verdict, - load_registry, - VERDICT_PROMOTE, -) +| Metric | Returns | Measures | +|--------|---------|----------| +| `contains_answer` | `float` or `None` | Does the answer contain the correct information from the reference? | +| `addresses_question` | `float` or `None` | Does the answer directly address what the question asks? | -result = json.loads(open("runs/2026-06-18/output.json").read()) -registry = load_registry("registries/banca-cordobesa.json") +**RAGAS** — requires the `ragas` extra and `ctx.client` (plus an embedder for some): -gate_results = run_gates(result, registry) -scorecard_md = render_scorecard( - gate_results, - corpus="banca-cordobesa", - model_id="anthropic:claude-3-5-sonnet", - run_id="2026-06-18-sonnet-01", -) -print(scorecard_md) +| Metric | Returns | Measures | +|--------|---------|----------| +| `answer_correctness` | `float` or `None` | Semantic F1 of the answer against the reference. | +| `ragas_faithfulness` | `float` or `None` | Answer grounded in the retrieved `contexts`. | +| `context_recall` | `float` or `None` | Reference coverage by the retrieved `contexts`. | +| `context_precision` | `float` or `None` | Retrieved `contexts` relevant to the question. | -v = verdict(gate_results) -print("Verdict:", v) # "PROMOTE" or "HOLD" -assert v == VERDICT_PROMOTE -``` +### Running every metric at once -### Champion management +`run_judge()` runs all metrics concurrently and collects them into an `AdvisoryReport`. It +is best-effort and never raises — any metric that fails is recorded in `report.errors` +instead of propagating. 
```python -from fireflyframework_agentic.evaluation import ( - load_champion, - save_champion, - invalidate_champion, - ChampionRecord, -) - -# Load the current champion (returns None on Day Zero). -champ = load_champion("baselines/banca-cordobesa.json") -if champ is None: - print("Day Zero — no champion yet.") -else: - print(f"Champion: {champ.run_id} | {champ.primary_metric()}={champ.primary_score():.3f}") - -# Save a new champion after a successful PROMOTE. -new_champ = ChampionRecord( - corpus="banca-cordobesa", - run_id="2026-06-18-sonnet-01", - model_id="anthropic:claude-3-5-sonnet", - registry_sha256=registry.sha256(), - scores={"lexical_recall": 0.857, "grounding_pct": 0.941}, - human_sign_offs=["alice", "bob"], -) -save_champion("baselines/banca-cordobesa.json", new_champ) - -# Invalidate when the registry changes materially. -invalidate_champion( - "baselines/banca-cordobesa.json", - reason="Registry expanded from 39 to 94 items.", -) -``` - -### EvalConfig +import asyncio +from fireflyframework_agentic.evaluation import run_judge, EvalContext, JudgeClient -`EvalConfig` is a Pydantic model that captures the parameters of a single evaluation run. -Use it to build reproducible, serialisable run records. +async def main(): + ctx = EvalContext(client=JudgeClient("anthropic:claude-haiku-4-5-20251001"), runs=3) + report = await run_judge(item, ctx, pipeline_model="anthropic:claude-sonnet-4-6") + print(report.metrics) # {metric_name: result, ...} + print(report.errors) # ["metric: ExceptionType: message", ...] 
-```python -from fireflyframework_agentic.evaluation.models import EvalConfig - -cfg = EvalConfig( - model_id="anthropic:claude-3-5-sonnet", - corpus="banca-cordobesa", - run_id="2026-06-18-sonnet-01", - registry_path="registries/banca-cordobesa.json", - corpus_path="corpora/banca-cordobesa/", - baseline_path="baselines/banca-cordobesa.json", - judge_model="anthropic:claude-3-5-haiku", - judge_runs=3, -) -print(cfg.model_dump_json(indent=2)) +asyncio.run(main()) ``` -### Advisory judge (G4) - -```python -from fireflyframework_agentic.evaluation import run_judge, JudgeClient, build_embedder - -client = JudgeClient( - chat_fn=my_chat_fn, # callable(system: str, user: str) -> dict - embed_fn=build_embedder("ollama:bge-m3"), -) +`AdvisoryReport` fields: -advisory = run_judge( - result=result, - registry=registry, - client=client, - runs=3, - missed_ids=[], # IDs the deterministic G2 missed — judge tries to recover them -) -print(advisory.scores) # dict of metric -> float -print(advisory.errors) # any metrics that failed (best-effort, never raises) -``` +| Field | Type | Description | +|-------|------|-------------| +| `judge_model` | `str` | The judge model spec used. | +| `same_provider_caveat` | `bool` | `True` when the judge and the evaluated pipeline share a provider (self-grading risk). | +| `calibrated` | `bool` | Reserved; always `False` for now. | +| `runs` | `int` | Judge runs per repeated metric. | +| `metrics` | `dict` | Per-metric results, keyed by metric name. | +| `details` | `dict` | Supporting context (counts, ids). | +| `errors` | `list[str]` | Per-metric failures captured best-effort. | --- -## Retrieval Metrics +## Retrieval metrics -The `compute_retrieval_metrics()` function computes standard IR metrics over ranked -retrieval results. It is imported from `fireflyframework_agentic.lab.retrieval_metrics` -and re-exported by the evaluation package. 
+Deterministic IR metrics over ranked retrieval results — no LLM and no network, the same +design as scikit-learn or MS MARCO evaluation scripts. Each is a plain function over a list +of result rows. -Supported metrics at cut-offs k ∈ {1, 5, 10}: - -- **Hit@k** — at least one gold document in top-k. -- **Recall@k** — fraction of gold documents in top-k. -- **Precision@k** — fraction of top-k results that are gold. -- **MRR@10** — mean reciprocal rank of the first gold hit. -- **MAP@10** — mean average precision. -- **nDCG@10** — normalised discounted cumulative gain. +### Row schema ```python -from fireflyframework_agentic.evaluation import compute_retrieval_metrics, RetrieverMetrics - -# Each row is a query; each row's "retrieved" list is ranked (rank=1 is top). -rows = [ +results = [ { - "query": "KYC enhanced due diligence steps", - "gold": ["SOP-002-kyc-edd.md"], - "retrieved": [ - {"rank": 1, "source_id": "SOP-002-kyc-edd.md", "is_gold": True}, - {"rank": 2, "source_id": "SOP-001-account-opening.md", "is_gold": False}, - {"rank": 3, "source_id": "INT-002-KYC-Jaime.md", "is_gold": True}, - ], + "retrieved": [{"rank": 1, "source_id": "SOP-002.md", "is_gold": True}, + {"rank": 2, "source_id": "SOP-001.md", "is_gold": False}], + "gold": ["SOP-002.md"], # gold source identifiers + # optional: + "no_answer": False, # model refused / produced no answer + "answer": "...", # used for no_answer detection if no_answer absent + "citations": [{"is_gold": True}], + "search_ms": 42.0, + "answer_ms": 310.0, }, ] - -metrics: RetrieverMetrics = compute_retrieval_metrics(rows) -print(f"Recall@5: {metrics.recall_5:.3f}") -print(f"nDCG@10: {metrics.ndcg_10:.3f}") -print(f"MRR@10: {metrics.mrr_10:.3f}") ``` -`RetrieverMetrics` also carries optional fields when the raw rows include them: -`no_answer_rate`, `citation_precision`, `mean_search_ms`, `mean_answer_ms`. +`rank` is 1-based (rank 1 is the top hit). 
Duplicate sources are de-duplicated by +`source_id`, keeping the best-ranked chunk. ---- +### Metric catalog + +| Function | Signature | Measures | +|----------|-----------|----------| +| `hit_at_k` | `(results, k) -> float` | Fraction of queries with ≥1 gold document in top-k. | +| `recall_at_k` | `(results, k) -> float` | Mean fraction of gold documents found in top-k. | +| `precision_at_k` | `(results, k) -> float` | Mean fraction of top-k results that are gold. | +| `mrr` | `(results, k=10) -> float` | Mean reciprocal rank of the first gold hit. | +| `map_score` | `(results, k=10) -> float` | Mean average precision at k. | +| `ndcg` | `(results, k=10) -> float` | Mean normalised discounted cumulative gain at k. | +| `no_answer_rate` | `(results) -> float \| None` | Fraction of queries with no answer. `None` if no results. | +| `citation_precision` | `(results) -> float \| None` | Precision of in-answer citations vs. the gold set. `None` if no citations. | +| `mean_latency_ms` | `(results, field) -> float \| None` | Mean latency for `"search_ms"` or `"answer_ms"`. `None` if absent. 
| -## Architecture - -```mermaid -flowchart TD - R["result JSON\n(DiscoveryResult / output.json)"] - REG["Registry JSON\n(lean-1 must-find)"] - CORP["Corpus bundle\n(raw evidence documents)"] - BASE["Baseline JSON\n(champion record)"] - - R --> G1["G1 · Structural & Safe\n(schema, PII, empty-registry)"] - REG --> G1 - R --> G2["G2 · Recall & NC Precision\n(lexical + optional semantic)"] - REG --> G2 - R --> G3["G3 · Grounding\n(excerpt anchoring, fabrication)"] - CORP --> G3 - R --> G4["G4 · LLM Judge advisory\n(faithfulness, entailment, gaps)"] - REG --> G4 - G1 --> SC["Markdown Scorecard\nrender_scorecard()"] - G2 --> SC - G3 --> SC - G4 -.advisory.-> SC - BASE --> G5["G5 · No-regression\n(A/A band, sign-offs)"] - G1 --> G5 - G2 --> G5 - G3 --> G5 - G5 --> SC - SC --> V["verdict()\nPROMOTE / HOLD"] - V --> CHAMP["save_champion()\nor invalidate_champion()"] +### Example + +```python +from fireflyframework_agentic.evaluation import recall_at_k, ndcg, mrr + +print(f"Recall@5: {recall_at_k(results, 5):.3f}") +print(f"nDCG@10: {ndcg(results):.3f}") +print(f"MRR@10: {mrr(results):.3f}") ``` --- ## Reference -### Exports - All symbols below are importable from `fireflyframework_agentic.evaluation`. +### Core types + | Symbol | Kind | Description | |--------|------|-------------| -| `EvalConfig` | Pydantic model | Parameters for a single evaluation run. | -| `GateResult` | Dataclass | Result of one gate: `gate`, `passed`, `reason_code`, `details`. | -| `Verdict` | Constants class | `Verdict.PROMOTE`, `Verdict.HOLD`. | -| `VERDICT_PROMOTE` | `str` | `"PROMOTE"`. | -| `VERDICT_HOLD` | `str` | `"HOLD"`. | -| `run_gates()` | Function | Run all four deterministic gates (G1–G3, G5 shape) and return results. | -| `g2_recall_precision()` | Function | Run only G2 (recall + NC precision) and return `GateResult`. | -| `verdict()` | Function | Derive PROMOTE/HOLD from a list of `GateResult`. 
| -| `render_scorecard()` | Function | Render a Markdown scorecard from gate results and metadata. | -| `ChampionRecord` | Dataclass | Per-corpus champion metadata and scores. | -| `load_champion()` | Function | Load the current champion from `baseline.json`; returns `None` on Day Zero. | -| `save_champion()` | Function | Persist a new champion to `baseline.json`. | -| `invalidate_champion()` | Function | Mark the champion invalid with a reason string. | -| `AdvisoryReport` | Dataclass | G4 judge output: `scores`, `errors`, `raw`. | -| `run_judge()` | Function | Run the LLM-as-a-Judge advisory pass. | -| `JudgeClient` | Dataclass | Holds `chat_fn` and `embed_fn` for the judge. | -| `OllamaEmbedder` | Class | Local Ollama embedding callable (default BGE-M3). | -| `build_embedder()` | Function | Factory: `"ollama:bge-m3"` → `OllamaEmbedder`. | -| `cosine()` | Function | Cosine similarity between two numpy vectors. | -| `Registry` | Dataclass | Parsed must-find registry with real items and NC items. | -| `RegistryItem` | Dataclass | One must-find or NC item: `id`, `tier`, `scope`, `description`, …. | -| `load_registry()` | Function | Parse and validate a lean-1 registry JSON file. | -| `registry_sha256()` | Function | SHA-256 of a registry file path. | -| `load_corpus()` | Function | Load and index a corpus bundle for G3 evidence verification. | -| `corpus_sha256()` | Function | SHA-256 of a corpus directory or bundle. | -| `verify_evidence_index()` | Function | Check each `evidence_index` entry against the corpus. | -| `EMPTY` / `FABRICATED` / `SOURCE_UNKNOWN` / `VERIFIED` | `str` | Evidence verification status constants. | -| `RetrieverMetrics` | Pydantic model | IR metrics: `recall_k`, `precision_k`, `ndcg_10`, `mrr_10`, `map_10`. | -| `compute_retrieval_metrics()` | Function | Compute IR metrics from a list of ranked-retrieval result rows. | -| `anchored()` | Function | True if claim and evidence share at least one non-trivial token. 
| -| `matches()` | Function | Gate predicate: does a candidate match a registry item? | -| `source_stem()` | Function | Normalise a `locator` path to its file stem for dedup. | -| `tokens()` | Function | Tokenise text to a list of lowercase word strings. | -| `aa_band()` | Function | Compute per-metric A/A noise floor from repeated runs. | -| `aggregate_grounding()` | Function | Summarise grounding stats across a result's findings. | -| `left_skew_flag()` | Function | True when the score distribution is left-skewed (over-optimistic). | +| `EvalContext` | Pydantic model | Carries `client`, optional `embedder`, and `runs` for the judge metrics. | +| `JudgeClient` | Class | Async multi-provider (`anthropic`/`openai`/`azure`/`ollama`) JSON chat client. | +| `AdvisoryReport` | Dataclass | Aggregated `run_judge` output: `metrics`, `errors`, and run metadata. | +| `Metric` | Type alias | `Callable[[dict, EvalContext], Awaitable[dict \| float \| None]]`. | +| `parse_model` | Function | Split `"provider:model"` into `(provider, model)`. | +| `same_provider` | Function | `True` if two model specs share a known provider prefix. | + +### Judge metrics + +`source_coverage`, `excerpt_fill_rate`, `semantic_recovery`, `faithfulness`, +`numeric_temporal_fidelity`, `citation_relevance`, `nc_semantic_precision`, +`fabricated_entity`, `contradiction`, `open_gap`, `actionability`, +`severity_calibration`, `answer_relevancy`, `surface_deduplication`, +`comparative_vs_champion`, `contains_answer`, `addresses_question`, +`answer_correctness`, `ragas_faithfulness`, `context_recall`, `context_precision`, +and the orchestrator `run_judge`. + +### Retrieval metrics + +`hit_at_k`, `recall_at_k`, `precision_at_k`, `mrr`, `map_score`, `ndcg`, +`no_answer_rate`, `citation_precision`, `mean_latency_ms`. 
From a6db4ff75fc8b813fe3bec1e18ff3af9707dda2e Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 29 Jun 2026 10:56:54 +0200 Subject: [PATCH 57/67] =?UTF-8?q?refactor(evaluation):=20drop=20mean=5Flat?= =?UTF-8?q?ency=5Fms=20=E2=80=94=20telemetry,=20not=20a=20quality=20metric?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mean_latency_ms measured search_ms/answer_ms latency, which is operational telemetry rather than an evaluation of output quality. Remove the function, its export, its tests, and the search_ms/answer_ms row-schema fields it was the sole consumer of. --- docs/evaluation.md | 5 +---- fireflyframework_agentic/evaluation/__init__.py | 3 --- .../evaluation/retrieval_metrics.py | 9 --------- tests/unit/evaluation/test_retrieval_metrics.py | 14 -------------- 4 files changed, 1 insertion(+), 30 deletions(-) diff --git a/docs/evaluation.md b/docs/evaluation.md index 51fd9c44..d5afe752 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -223,8 +223,6 @@ results = [ "no_answer": False, # model refused / produced no answer "answer": "...", # used for no_answer detection if no_answer absent "citations": [{"is_gold": True}], - "search_ms": 42.0, - "answer_ms": 310.0, }, ] ``` @@ -244,7 +242,6 @@ results = [ | `ndcg` | `(results, k=10) -> float` | Mean normalised discounted cumulative gain at k. | | `no_answer_rate` | `(results) -> float \| None` | Fraction of queries with no answer. `None` if no results. | | `citation_precision` | `(results) -> float \| None` | Precision of in-answer citations vs. the gold set. `None` if no citations. | -| `mean_latency_ms` | `(results, field) -> float \| None` | Mean latency for `"search_ms"` or `"answer_ms"`. `None` if absent. | ### Example @@ -286,4 +283,4 @@ and the orchestrator `run_judge`. ### Retrieval metrics `hit_at_k`, `recall_at_k`, `precision_at_k`, `mrr`, `map_score`, `ndcg`, -`no_answer_rate`, `citation_precision`, `mean_latency_ms`. 
+`no_answer_rate`, `citation_precision`. diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index 35dd32f7..bae3c79d 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -91,9 +91,6 @@ from fireflyframework_agentic.evaluation.retrieval_metrics import ( map_score as map_score, ) -from fireflyframework_agentic.evaluation.retrieval_metrics import ( - mean_latency_ms as mean_latency_ms, -) from fireflyframework_agentic.evaluation.retrieval_metrics import ( mrr as mrr, ) diff --git a/fireflyframework_agentic/evaluation/retrieval_metrics.py b/fireflyframework_agentic/evaluation/retrieval_metrics.py index c6caea05..bc8beb93 100644 --- a/fireflyframework_agentic/evaluation/retrieval_metrics.py +++ b/fireflyframework_agentic/evaluation/retrieval_metrics.py @@ -26,8 +26,6 @@ "no_answer": bool, # model refused / produced no answer "answer": str, # used for no_answer detection when no_answer absent "citations": [{"is_gold": bool}, ...], - "search_ms": float, - "answer_ms": float, } Individual metrics:: @@ -40,7 +38,6 @@ ndcg(results, k=10) -> float no_answer_rate(results) -> float | None citation_precision(results) -> float | None - mean_latency_ms(results, field) -> float | None """ from __future__ import annotations @@ -168,9 +165,3 @@ def citation_precision(results: list[dict]) -> float | None: num += sum(1 for c in cites if c.get("is_gold")) den += len(cites) return round(num / den, 4) if den else None - - -def mean_latency_ms(results: list[dict], field: str) -> float | None: - """Mean latency in ms for the given field (``search_ms`` or ``answer_ms``). 
None if absent.""" - values = [row[field] for row in results if row.get(field) is not None] - return round(sum(values) / len(values)) if values else None diff --git a/tests/unit/evaluation/test_retrieval_metrics.py b/tests/unit/evaluation/test_retrieval_metrics.py index fa453e2d..c25595eb 100644 --- a/tests/unit/evaluation/test_retrieval_metrics.py +++ b/tests/unit/evaluation/test_retrieval_metrics.py @@ -20,7 +20,6 @@ citation_precision, hit_at_k, map_score, - mean_latency_ms, mrr, ndcg, no_answer_rate, @@ -166,16 +165,3 @@ def test_citation_precision_1_when_all_gold(): def test_citation_precision_half_when_half_gold(): rows = [{**_row(gold_rank=1), "citations": [{"is_gold": True}, {"is_gold": False}]}] assert citation_precision(rows) == 0.5 - - -# ── mean_latency_ms ─────────────────────────────────────────────────────────── - - -def test_mean_latency_none_when_field_absent(): - assert mean_latency_ms([_row(gold_rank=1)], "search_ms") is None - - -def test_mean_latency_computed_when_present(): - rows = [{**_row(gold_rank=1), "search_ms": 100.0, "answer_ms": 200.0}] - assert mean_latency_ms(rows, "search_ms") == 100 - assert mean_latency_ms(rows, "answer_ms") == 200 From 9c78ae504bf22024536291cbb208efc9969025de Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 29 Jun 2026 10:57:27 +0200 Subject: [PATCH 58/67] docs: fix stale evaluation subpackage description in package docstring The top-level docstring still described the deleted gate/champion/challenger infrastructure. Correct it to match the shipped surface: LLM-as-judge metrics, RAGAS, and retrieval metrics. 
--- fireflyframework_agentic/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fireflyframework_agentic/__init__.py b/fireflyframework_agentic/__init__.py index 1736f1f4..431db615 100644 --- a/fireflyframework_agentic/__init__.py +++ b/fireflyframework_agentic/__init__.py @@ -28,8 +28,7 @@ Optional subpackages (not imported eagerly at the top level): fireflyframework_agentic.lab -- sessions, benchmarks, datasets, evaluation orchestration fireflyframework_agentic.experiments -- experiment tracking and comparison - fireflyframework_agentic.evaluation -- gate-based quality gates, LLM-as-judge advisory, - champion/challenger tracking, retrieval metrics + fireflyframework_agentic.evaluation -- LLM-as-judge metrics, RAGAS, and retrieval metrics (requires the ``evaluation`` optional extra) """ From 0d2476bc7707be47e17537018d99ec4808de4a30 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 29 Jun 2026 11:25:59 +0200 Subject: [PATCH 59/67] feat(evaluation): build RAGAS embeddings from the framework embedder Mirror flycanon's embedding_service factory: add build_embedder(spec) resolving a ':' spec to a fireflyframework_agentic embedder (8 providers, deferred per-provider imports). Widen EvalContext.embedder to BaseEmbedder and feed it into RAGAS via LangchainEmbeddingsWrapper, so the evaluator embeds with the same provider as the pipeline. Removes the broken AnthropicEmbeddings branch. Rename _make_ragas_embeddings -> _build_embeddings to decouple the name from RAGAS for future refactoring. 
--- docs/evaluation.md | 19 ++++- .../evaluation/__init__.py | 3 + .../evaluation/embedder.py | 82 +++++++++++++++++++ fireflyframework_agentic/evaluation/judge.py | 51 +++++++----- tests/unit/evaluation/test_embedder.py | 44 ++++++++++ 5 files changed, 178 insertions(+), 21 deletions(-) create mode 100644 fireflyframework_agentic/evaluation/embedder.py create mode 100644 tests/unit/evaluation/test_embedder.py diff --git a/docs/evaluation.md b/docs/evaluation.md index d5afe752..7f6270f8 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -51,15 +51,29 @@ async def metric(item: dict, ctx: EvalContext) -> dict | float | None ### EvalContext and JudgeClient ```python -from fireflyframework_agentic.evaluation import EvalContext, JudgeClient +from fireflyframework_agentic.evaluation import EvalContext, JudgeClient, build_embedder ctx = EvalContext( client=JudgeClient("anthropic:claude-haiku-4-5-20251001"), runs=3, # metrics that repeat use the median of this many calls - embedder=None, # optional OllamaEmbedder, required only by semantic_recovery + embedder=None, # optional framework embedder; required by semantic_recovery and RAGAS ) ``` +`embedder` is any `fireflyframework_agentic` embedder. Build one from a +`"<provider>:<model>"` spec with `build_embedder` (openai, azure, cohere, google, +mistral, voyage, bedrock, ollama): + +```python +ctx = EvalContext( + client=JudgeClient("anthropic:claude-haiku-4-5-20251001"), + embedder=build_embedder("ollama:nomic-embed-text"), +) +``` + +The RAGAS metrics reuse this same framework embedder (wrapped for RAGAS), so the +evaluator embeds with the same provider as the rest of your pipeline. + `JudgeClient` is an async multi-provider chat client. The model spec is `"<provider>:<model>"`, where provider is one of `anthropic`, `openai`, `azure`, `ollama`. Temperature is pinned to `0.0` for stable verdicts, and API keys are read lazily from the @@ -264,6 +278,7 @@ All symbols below are importable from `fireflyframework_agentic.evaluation`.
| Symbol | Kind | Description | |--------|------|-------------| | `EvalContext` | Pydantic model | Carries `client`, optional `embedder`, and `runs` for the judge metrics. | +| `build_embedder` | Function | Build a framework embedder from a `"<provider>:<model>"` spec (openai/azure/cohere/google/mistral/voyage/bedrock/ollama). | | `JudgeClient` | Class | Async multi-provider (`anthropic`/`openai`/`azure`/`ollama`) JSON chat client. | | `AdvisoryReport` | Dataclass | Aggregated `run_judge` output: `metrics`, `errors`, and run metadata. | | `Metric` | Type alias | `Callable[[dict, EvalContext], Awaitable[dict \| float \| None]]`. | diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index bae3c79d..0efb057f 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -1,3 +1,6 @@ +from fireflyframework_agentic.evaluation.embedder import ( + build_embedder as build_embedder, +) from fireflyframework_agentic.evaluation.judge import ( AdvisoryReport as AdvisoryReport, ) diff --git a/fireflyframework_agentic/evaluation/embedder.py b/fireflyframework_agentic/evaluation/embedder.py new file mode 100644 index 00000000..7fe82ac1 --- /dev/null +++ b/fireflyframework_agentic/evaluation/embedder.py @@ -0,0 +1,82 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Resolve a ``<provider>:<model>`` spec to a framework embedder.
+ +Mirrors flycanon's ``embedding_service._build_embedder``: one branch per +provider shipped by ``fireflyframework_agentic.embeddings``. Per-provider +imports are deferred so a spec that never touches a given provider doesn't +require its SDK to be installed. +""" + +from __future__ import annotations + +import os + +from fireflyframework_agentic.embeddings.base import BaseEmbedder + + +def build_embedder(spec: str, *, dimensions: int | None = None, batch_size: int = 64) -> BaseEmbedder: + """Build a framework embedder from a ``":"`` spec. + + Supported providers: openai, azure, cohere, google, mistral, voyage, + bedrock, ollama. Raises ``ValueError`` on a malformed spec or unknown + provider. + """ + if ":" not in spec: + raise ValueError(f"embedder spec must be ':' (got {spec!r})") + provider, _, model = spec.partition(":") + p = provider.strip().lower() + if p == "openai": + from fireflyframework_agentic.embeddings.providers.openai import OpenAIEmbedder # noqa: PLC0415 + + return OpenAIEmbedder(model=model, dimensions=dimensions, batch_size=batch_size) + if p in ("azure", "azure-openai"): + from fireflyframework_agentic.embeddings.providers.azure import AzureEmbedder # noqa: PLC0415 + + return AzureEmbedder( + model=model, + dimensions=dimensions, + batch_size=batch_size, + azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", ""), + ) + if p == "cohere": + from fireflyframework_agentic.embeddings.providers.cohere import CohereEmbedder # noqa: PLC0415 + + return CohereEmbedder(model=model, dimensions=dimensions, batch_size=batch_size) + if p in ("google", "gemini"): + from fireflyframework_agentic.embeddings.providers.google import GoogleEmbedder # noqa: PLC0415 + + return GoogleEmbedder(model=model, dimensions=dimensions, batch_size=batch_size) + if p == "mistral": + from fireflyframework_agentic.embeddings.providers.mistral import MistralEmbedder # noqa: PLC0415 + + return MistralEmbedder(model=model, dimensions=dimensions, batch_size=batch_size) + if p == 
"voyage": + from fireflyframework_agentic.embeddings.providers.voyage import VoyageEmbedder # noqa: PLC0415 + + return VoyageEmbedder(model=model, dimensions=dimensions, batch_size=batch_size) + if p == "bedrock": + from fireflyframework_agentic.embeddings.providers.bedrock import BedrockEmbedder # noqa: PLC0415 + + return BedrockEmbedder(model=model, dimensions=dimensions, batch_size=batch_size) + if p == "ollama": + from fireflyframework_agentic.embeddings.providers.ollama import OllamaEmbedder # noqa: PLC0415 + + base_url = os.environ.get("OLLAMA_HOST", "http://localhost:11434") + return OllamaEmbedder(model=model, dimensions=dimensions, base_url=base_url, batch_size=batch_size) + raise ValueError( + f"unknown embedding provider {provider!r}; supported: " + "openai, azure, cohere, google, mistral, voyage, bedrock, ollama" + ) diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index 0bc6cccc..348583d5 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -19,7 +19,7 @@ from pydantic import BaseModel, ConfigDict -from fireflyframework_agentic.embeddings.providers.ollama import OllamaEmbedder +from fireflyframework_agentic.embeddings.base import BaseEmbedder from fireflyframework_agentic.embeddings.similarity import cosine_similarity from fireflyframework_agentic.evaluation.judge_client import JudgeClient, same_provider @@ -41,7 +41,7 @@ class EvalContext(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) client: JudgeClient - embedder: OllamaEmbedder | None = None + embedder: BaseEmbedder | None = None runs: int = 3 @@ -699,25 +699,38 @@ def _make_ragas_llm(ctx: EvalContext): raise ValueError(f"RAGAS: unsupported provider {provider!r}") -def _make_ragas_embeddings(ctx: EvalContext): - """Build LangChain embeddings for RAGAS (langchain import inline).""" - if ctx.embedder is not None: - from langchain_ollama import OllamaEmbeddings # 
type: ignore[import] # noqa: PLC0415 +def _build_embeddings(ctx: EvalContext): + """Wrap the framework embedder (``ctx.embedder``) for RAGAS. - return OllamaEmbeddings(model=ctx.embedder._model) - provider = ctx.client.provider - if provider == "anthropic": - from langchain_anthropic import AnthropicEmbeddings # type: ignore[import] # noqa: PLC0415 + RAGAS consumes a LangChain ``Embeddings`` via ``LangchainEmbeddingsWrapper``; + we feed it a thin adapter over the fireflyframework_agentic ``BaseEmbedder`` so + RAGAS uses the same embedder (and provider) as the rest of the pipeline. Build + one with :func:`fireflyframework_agentic.evaluation.build_embedder`. + """ + from langchain_core.embeddings import Embeddings # type: ignore[import] # noqa: PLC0415 + from ragas.embeddings import LangchainEmbeddingsWrapper # type: ignore[import] # noqa: PLC0415 + + embedder = ctx.embedder + if embedder is None: + raise ValueError( + "RAGAS metrics need an embedder; set " + "EvalContext.embedder=build_embedder(':')" + ) - return AnthropicEmbeddings() - if provider == "ollama": - from langchain_ollama import OllamaEmbeddings # type: ignore[import] # noqa: PLC0415 + class _FrameworkEmbeddings(Embeddings): + async def aembed_documents(self, texts: list[str]) -> list[list[float]]: + return (await embedder.embed(texts)).embeddings - return OllamaEmbeddings() - raise ValueError( - f"RAGAS: no embedder configured for provider {provider!r}; " - "pass ctx.embedder=OllamaEmbedder(...) 
explicitly" - ) + async def aembed_query(self, text: str) -> list[float]: + return await embedder.embed_one(text) + + def embed_documents(self, texts: list[str]) -> list[list[float]]: + return asyncio.run(self.aembed_documents(texts)) + + def embed_query(self, text: str) -> list[float]: + return asyncio.run(self.aembed_query(text)) + + return LangchainEmbeddingsWrapper(_FrameworkEmbeddings()) async def _ragas_score(metric_name: str, item: dict, ctx: EvalContext) -> float | None: @@ -746,7 +759,7 @@ def _sync(): return None llm = _make_ragas_llm(ctx) - embeddings = _make_ragas_embeddings(ctx) + embeddings = _build_embeddings(ctx) metric = metric_cls(llm=llm, embeddings=embeddings) sample = _make_ragas_sample(item) dataset = EvaluationDataset(samples=[sample]) diff --git a/tests/unit/evaluation/test_embedder.py b/tests/unit/evaluation/test_embedder.py new file mode 100644 index 00000000..803d2cdf --- /dev/null +++ b/tests/unit/evaluation/test_embedder.py @@ -0,0 +1,44 @@ +# Copyright 2026 Firefly Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Unit tests for evaluation.embedder.build_embedder.""" + +from __future__ import annotations + +import pytest + +from fireflyframework_agentic.embeddings.providers.ollama import OllamaEmbedder +from fireflyframework_agentic.evaluation import build_embedder + + +def test_build_embedder_ollama_returns_framework_embedder(): + embedder = build_embedder("ollama:nomic-embed-text") + assert isinstance(embedder, OllamaEmbedder) + assert embedder.model == "nomic-embed-text" + + +def test_build_embedder_honours_ollama_host(monkeypatch): + monkeypatch.setenv("OLLAMA_HOST", "http://example:1234") + embedder = build_embedder("ollama:nomic-embed-text") + assert embedder._base_url == "http://example:1234" + + +def test_build_embedder_requires_provider_prefix(): + with pytest.raises(ValueError, match=":"): + build_embedder("nomic-embed-text") + + +def test_build_embedder_rejects_unknown_provider(): + with pytest.raises(ValueError, match="unknown embedding provider"): + build_embedder("bogus:model") From 7c14351f5b38676fc5fefaf718a17b206c6c04ea Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 29 Jun 2026 11:27:45 +0200 Subject: [PATCH 60/67] fix(evaluation): use AzureChatOpenAI for the azure RAGAS LLM The azure provider was grouped with openai and built a public-OpenAI ChatOpenAI client (api.openai.com + OPENAI_API_KEY), sending the azure deployment name as an OpenAI model id. Split azure out to AzureChatOpenAI using AZURE_OPENAI_ENDPOINT/ AZURE_OPENAI_API_KEY/AZURE_OPENAI_API_VERSION, mirroring judge_client._azure, and add langchain-openai to the [evaluation] extra so the openai/azure paths import. 
--- fireflyframework_agentic/evaluation/judge.py | 12 +++++++++++- pyproject.toml | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index 348583d5..e6b6b02e 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -687,11 +687,21 @@ def _make_ragas_llm(ctx: EvalContext): api_key = os.environ.get("ANTHROPIC_API_KEY", "") return ChatAnthropic(model=model, api_key=api_key, temperature=0.0) # type: ignore[call-arg,arg-type] - if provider in ("openai", "azure"): + if provider == "openai": from langchain_openai import ChatOpenAI # type: ignore[import] # noqa: PLC0415 api_key = os.environ.get("OPENAI_API_KEY", "") return ChatOpenAI(model=model, api_key=api_key, temperature=0.0) # type: ignore[call-arg,arg-type] + if provider == "azure": + from langchain_openai import AzureChatOpenAI # type: ignore[import] # noqa: PLC0415 + + return AzureChatOpenAI( # type: ignore[call-arg,arg-type] + azure_deployment=model, + azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", ""), + api_key=os.environ.get("AZURE_OPENAI_API_KEY", ""), + api_version=os.environ.get("AZURE_OPENAI_API_VERSION", "2024-02-01"), + temperature=0.0, + ) if provider == "ollama": from langchain_ollama import ChatOllama # type: ignore[import] # noqa: PLC0415 diff --git a/pyproject.toml b/pyproject.toml index dc6a1507..7aef49a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -124,6 +124,7 @@ evaluation = [ "ragas>=0.2", "langchain-anthropic>=0.3", "langchain-ollama>=0.3", + "langchain-openai>=0.3", ] dev = [ "pytest>=8.3.0", From c02b10c17b9599be4708d7883cb40b7097c7f654 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 29 Jun 2026 11:29:40 +0200 Subject: [PATCH 61/67] refactor(evaluation): strip gate-era baggage from AdvisoryReport Drop the dead 'calibrated' field (only ever set to False, never read) and the 'details' field (never written or 
read), and rewrite the docstring to remove the 'G4 output / GateResult' gate-era framing. Keeps the live fields: judge_model, same_provider_caveat, runs, metrics, errors. --- docs/evaluation.md | 2 -- fireflyframework_agentic/evaluation/judge.py | 13 ++++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/docs/evaluation.md b/docs/evaluation.md index 7f6270f8..5aab6a42 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -211,10 +211,8 @@ asyncio.run(main()) |-------|------|-------------| | `judge_model` | `str` | The judge model spec used. | | `same_provider_caveat` | `bool` | `True` when the judge and the evaluated pipeline share a provider (self-grading risk). | -| `calibrated` | `bool` | Reserved; always `False` for now. | | `runs` | `int` | Judge runs per repeated metric. | | `metrics` | `dict` | Per-metric results, keyed by metric name. | -| `details` | `dict` | Supporting context (counts, ids). | | `errors` | `list[str]` | Per-metric failures captured best-effort. | --- diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index e6b6b02e..778b70bd 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -47,19 +47,19 @@ class EvalContext(BaseModel): @dataclass class AdvisoryReport: - """The G4 output: a plain metrics bag, never a GateResult. + """Aggregated output of :func:`run_judge`: a plain metrics bag. - metrics maps metric-name -> small dict (the per-metric summary). details - carries supporting context (counts, ids). errors lists per-metric failures - captured by run_judge's best-effort try/except so nothing propagates. + metrics maps metric-name -> the per-metric result (a small dict or float). + errors lists per-metric failures captured by run_judge's best-effort + try/except so nothing propagates. 
judge_model and runs are run metadata; + same_provider_caveat flags self-grading risk (the judge shares the evaluated + pipeline's provider). """ judge_model: str same_provider_caveat: bool - calibrated: bool # ALWAYS False for now runs: int metrics: dict = field(default_factory=dict) - details: dict = field(default_factory=dict) errors: list[str] = field(default_factory=list) @@ -869,7 +869,6 @@ async def run_judge( report = AdvisoryReport( judge_model=ctx.client.model_spec, same_provider_caveat=same_provider(pipeline_model, ctx.client.model_spec), - calibrated=False, runs=ctx.runs, ) From 4d262ceba06cccf35cfa11355dca3faff4b8fc3c Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 29 Jun 2026 11:30:38 +0200 Subject: [PATCH 62/67] docs: drop optional-subpackages block from package docstring Revert the top-level __init__.py docstring addition: it duplicated the README and docs/evaluation.md, pulled in unrelated lab/experiments, and already went stale (it described the deleted gates). This leaves the package root untouched by the PR. 
--- fireflyframework_agentic/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fireflyframework_agentic/__init__.py b/fireflyframework_agentic/__init__.py index 431db615..993b0248 100644 --- a/fireflyframework_agentic/__init__.py +++ b/fireflyframework_agentic/__init__.py @@ -24,12 +24,6 @@ config = get_config() print(config.default_model) - -Optional subpackages (not imported eagerly at the top level): - fireflyframework_agentic.lab -- sessions, benchmarks, datasets, evaluation orchestration - fireflyframework_agentic.experiments -- experiment tracking and comparison - fireflyframework_agentic.evaluation -- LLM-as-judge metrics, RAGAS, and retrieval metrics - (requires the ``evaluation`` optional extra) """ from importlib.metadata import PackageNotFoundError, version From 2dc1054953320b67542b0ff3e5efe1cf214a43eb Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 29 Jun 2026 11:51:47 +0200 Subject: [PATCH 63/67] refactor(evaluation): back JudgeClient with FireflyAgent + typed outputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the hand-rolled multi-provider httpx client with the framework's FireflyAgent (pydantic-ai, a core dep). JudgeClient.chat_json(system,user)->dict becomes judge(system,user,output_type)->validated pydantic model; each of the 13 call-shapes gets a typed output model, so the LLM's structured output is schema- checked instead of parsed via _first_json_object. Agents are built lazily and cached per (system, output_type, max_tokens); temperature pinned to 0.0; retries handled by FireflyAgent/pydantic-ai. Deletes the bespoke _anthropic/_openai/_azure/_ollama methods, _first_json_object, _env, and _coerce_float. Fixes the _gather_chat bug: failed judge calls no longer collapse to {} and get scored as verdicts — they propagate and are recorded in report.errors (new _judge_all). 
Adds tests for agent caching, failure propagation, and the previously-untested run_judge orchestrator. --- docs/evaluation.md | 14 +- fireflyframework_agentic/evaluation/judge.py | 276 ++++++++++-------- .../evaluation/judge_client.py | 260 +++-------------- tests/unit/evaluation/test_judge.py | 56 +++- 4 files changed, 254 insertions(+), 352 deletions(-) diff --git a/docs/evaluation.md b/docs/evaluation.md index 5aab6a42..514b4ad9 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -74,11 +74,13 @@ ctx = EvalContext( The RAGAS metrics reuse this same framework embedder (wrapped for RAGAS), so the evaluator embeds with the same provider as the rest of your pipeline. -`JudgeClient` is an async multi-provider chat client. The model spec is -`":"`, where provider is one of `anthropic`, `openai`, `azure`, `ollama`. -Temperature is pinned to `0.0` for stable verdicts, and API keys are read lazily from the -environment (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `AZURE_OPENAI_*`, `OLLAMA_HOST`) — so -constructing a client never requires a secret. +`JudgeClient` is an async multi-provider judge backed by the framework's `FireflyAgent` +(pydantic-ai). The model spec is `":"`, where provider is one of +`anthropic`, `openai`, `azure`, `ollama`. Each call returns a **validated, typed** Pydantic +model — the LLM's structured output is schema-checked rather than hand-parsed — and +`temperature` is pinned to `0.0` for stable verdicts. The provider reads its API key +(`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `AZURE_OPENAI_*`, `OLLAMA_HOST`) when the underlying +agent is first built, so constructing a `JudgeClient` never requires a secret. ### Item schema @@ -277,7 +279,7 @@ All symbols below are importable from `fireflyframework_agentic.evaluation`. |--------|------|-------------| | `EvalContext` | Pydantic model | Carries `client`, optional `embedder`, and `runs` for the judge metrics. 
| | `build_embedder` | Function | Build a framework embedder from a `":"` spec (openai/azure/cohere/google/mistral/voyage/bedrock/ollama). | -| `JudgeClient` | Class | Async multi-provider (`anthropic`/`openai`/`azure`/`ollama`) JSON chat client. | +| `JudgeClient` | Class | Async multi-provider (`anthropic`/`openai`/`azure`/`ollama`) judge backed by `FireflyAgent`; returns validated typed output. | | `AdvisoryReport` | Dataclass | Aggregated `run_judge` output: `metrics`, `errors`, and run metadata. | | `Metric` | Type alias | `Callable[[dict, EvalContext], Awaitable[dict \| float \| None]]`. | | `parse_model` | Function | Split `"provider:model"` into `(provider, model)`. | diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index 778b70bd..b4f5feaa 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -36,6 +36,61 @@ 'Reply with ONLY {"contains_answer": , "addresses_question": }.' 
) +# ── structured judge outputs (validated by the model via FireflyAgent) ─────────── + + +class _Verdict(BaseModel): + verdict: str = "" + reason: str = "" + + +class _Mismatch(BaseModel): + value: str = "" + source: str = "" + + +class _Mismatches(BaseModel): + mismatches: list[_Mismatch] = [] + + +class _Relevant(BaseModel): + relevant: str = "" + + +class _Asserted(BaseModel): + asserted: str = "" + + +class _Fabricated(BaseModel): + fabricated: list[str] = [] + + +class _Pairs(BaseModel): + pairs: list[list[str]] = [] + + +class _Gap(BaseModel): + gap: str = "" + + +class _Score(BaseModel): + score: float | None = None + + +class _Calibration(BaseModel): + calibration: str = "calibrated" + + +class _Comparison(BaseModel): + candidate: dict = {} + champion: dict = {} + more_consistent: str = "" + + +class _RagScore(BaseModel): + contains_answer: float | None = None + addresses_question: float | None = None + class EvalContext(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) @@ -98,24 +153,19 @@ def _workspace_intention(item: dict) -> str: return f"{ws.get('name', '')}\n{ws.get('description', '')}".strip() -def _coerce_float(value, default=None): - """Coerce a model-returned number/numeric-string to float; total (never raises).""" - try: - return float(value) - except (TypeError, ValueError): - return default - - def _source_stem(locator: str) -> str: """Return the part before the first '#', or the full string if no '#'.""" idx = locator.find("#") return locator[:idx] if idx != -1 else locator -async def _gather_chat(chat_fn, prompts: list[tuple[str, str]]) -> list[dict]: - """Run a list of (system, user) prompts concurrently, returning ordered results.""" - results = await asyncio.gather(*[chat_fn(s, u) for s, u in prompts], return_exceptions=True) - return [r if isinstance(r, dict) else {} for r in results] +async def _judge_all[T: BaseModel](ctx: EvalContext, system: str, users: list[str], output_type: type[T]) -> list[T]: + """Judge a 
list of user prompts concurrently against one system prompt. + + Failures propagate (no swallowing into a fake verdict): a failed call raises, + so run_judge records it in report.errors instead of scoring it as a result. + """ + return list(await asyncio.gather(*[ctx.client.judge(system, u, output_type) for u in users])) # ── [D] DETERMINISTIC — no LLM, always available ──────────────────────────────── @@ -221,7 +271,7 @@ async def semantic_recovery(item: dict, ctx: EvalContext, tau: float = 0.70) -> } -# ── [J] JUDGE — needs chat_fn(system, user) -> dict ────────────────────────────── +# ── [J] JUDGE — needs ctx.client.judge(system, user, output_type) ──────────────── async def faithfulness(item: dict, ctx: EvalContext) -> dict: @@ -232,18 +282,15 @@ async def faithfulness(item: dict, ctx: EvalContext) -> dict: ev_idx = _evidence_index(item) findings = item.get("findings", []) cited = [(f, _cited_excerpts(f, ev_idx)) for f in findings] - prompts = [ - ( - SYSTEM, - "Does the cited evidence span ENTAIL the claim made in this finding?\n" - 'Reply with ONLY {"verdict": "SUPPORTED" or "NOT_SUPPORTED", "reason": ""}.\n\n' - f"FINDING: {f.get('description', '')}\n" - f"CITED EVIDENCE: {' || '.join(excerpts)}", - ) + users = [ + "Does the cited evidence span ENTAIL the claim made in this finding?\n" + 'Reply with ONLY {"verdict": "SUPPORTED" or "NOT_SUPPORTED", "reason": ""}.\n\n' + f"FINDING: {f.get('description', '')}\n" + f"CITED EVIDENCE: {' || '.join(excerpts)}" for f, excerpts in cited if excerpts ] - answers = iter(await _gather_chat(ctx.client.chat_json, prompts)) + answers = iter(await _judge_all(ctx, SYSTEM, users, _Verdict)) supported = 0 unsupported_ids: list[str] = [] for f, excerpts in cited: @@ -251,8 +298,7 @@ async def faithfulness(item: dict, ctx: EvalContext) -> dict: if not excerpts: unsupported_ids.append(fid) continue - verdict = str(next(answers).get("verdict", "")).upper() - if verdict == "SUPPORTED": + if str(next(answers).verdict).upper() == 
"SUPPORTED": supported += 1 else: unsupported_ids.append(fid) @@ -266,27 +312,24 @@ async def numeric_temporal_fidelity(item: dict, ctx: EvalContext) -> dict: """ ev_idx = _evidence_index(item) scored = [(f, excerpts) for f in item.get("findings", []) if (excerpts := _cited_excerpts(f, ev_idx))] - prompts = [ - ( - SYSTEM, - "List every specific number or date asserted in the FINDING that does " - "NOT match the CITED EVIDENCE.\n" - 'Reply with ONLY {"mismatches": [{"value": "", "source": ""}]}. ' - "Empty list if all match.\n\n" - f"FINDING: {f.get('description', '')}\n" - f"CITED EVIDENCE: {' || '.join(excerpts)}", - ) + users = [ + "List every specific number or date asserted in the FINDING that does " + "NOT match the CITED EVIDENCE.\n" + 'Reply with ONLY {"mismatches": [{"value": "", "source": ""}]}. ' + "Empty list if all match.\n\n" + f"FINDING: {f.get('description', '')}\n" + f"CITED EVIDENCE: {' || '.join(excerpts)}" for f, excerpts in scored ] - answers = await _gather_chat(ctx.client.chat_json, prompts) + answers = await _judge_all(ctx, SYSTEM, users, _Mismatches) mismatches: list[dict] = [] for (f, _excerpts), answer in zip(scored, answers, strict=False): - for m in answer.get("mismatches", []) or []: + for m in answer.mismatches: mismatches.append( { "finding_id": f.get("id", "?"), - "value": m.get("value", ""), - "source": m.get("source", ""), + "value": m.value, + "source": m.source, } ) return {"mismatches": mismatches, "count": len(mismatches)} @@ -298,7 +341,7 @@ async def citation_relevance(item: dict, ctx: EvalContext) -> dict: Returns {precision, relevant, total}. 
""" ev_idx = _evidence_index(item) - prompts: list[tuple[str, str]] = [] + users: list[str] = [] for f in item.get("findings", []): desc = f.get("description", "") for ref in f.get("evidence_refs", []): @@ -308,18 +351,15 @@ async def citation_relevance(item: dict, ctx: EvalContext) -> dict: excerpt = ev.get("excerpt") or "" if not excerpt: continue - prompts.append( - ( - SYSTEM, - "Is this cited passage actually relevant to / used by this claim?\n" - 'Reply with ONLY {"relevant": "yes" or "no"}.\n\n' - f"CLAIM: {desc}\n" - f"CITED PASSAGE: {excerpt}", - ) + users.append( + "Is this cited passage actually relevant to / used by this claim?\n" + 'Reply with ONLY {"relevant": "yes" or "no"}.\n\n' + f"CLAIM: {desc}\n" + f"CITED PASSAGE: {excerpt}" ) - answers = await _gather_chat(ctx.client.chat_json, prompts) - total = len(prompts) - relevant = sum(1 for a in answers if str(a.get("relevant", "")).lower() == "yes") + answers = await _judge_all(ctx, SYSTEM, users, _Relevant) + total = len(users) + relevant = sum(1 for a in answers if str(a.relevant).lower() == "yes") if not total: return {"precision": None, "relevant": relevant, "total": total} return {"precision": round(relevant / total, 4), "relevant": relevant, "total": total} @@ -333,21 +373,18 @@ async def nc_semantic_precision(item: dict, ctx: EvalContext) -> dict: """ output_text = _output_text(item) nc_items: list[dict] = item.get("nc_items", []) - prompts = [ - ( - SYSTEM, - "Does the OUTPUT assert or endorse the following FALSE statement?\n" - 'Reply with ONLY {"asserted": "yes" or "no"}.\n\n' - f"FALSE STATEMENT: {nc.get('description', '')}\n" - f"OUTPUT:\n{output_text}", - ) + users = [ + "Does the OUTPUT assert or endorse the following FALSE statement?\n" + 'Reply with ONLY {"asserted": "yes" or "no"}.\n\n' + f"FALSE STATEMENT: {nc.get('description', '')}\n" + f"OUTPUT:\n{output_text}" for nc in nc_items ] - answers = await _gather_chat(ctx.client.chat_json, prompts) + answers = await _judge_all(ctx, 
SYSTEM, users, _Asserted) asserted_ids = [ nc.get("id", "?") for nc, a in zip(nc_items, answers, strict=False) - if str(a.get("asserted", "")).lower() == "yes" + if str(a.asserted).lower() == "yes" ] return {"asserted": len(asserted_ids), "total": len(nc_items), "asserted_ids": asserted_ids} @@ -366,8 +403,8 @@ async def fabricated_entity(item: dict, ctx: EvalContext) -> dict: f"OUTPUT:\n{output_text}\n\n" f"CORPUS EVIDENCE:\n{corpus}" ) - answer = await ctx.client.chat_json(SYSTEM, user) - entities = answer.get("fabricated", []) or [] + answer = await ctx.client.judge(SYSTEM, user, _Fabricated) + entities = answer.fabricated return {"count": len(entities), "entities": list(entities)} @@ -383,8 +420,8 @@ async def contradiction(item: dict, ctx: EvalContext) -> dict: "Are any two of these FINDINGS mutually contradictory? List each contradicting pair.\n" 'Reply with ONLY {"pairs": [["", ""], ...]}. Empty list if none.\n\n' + "\n".join(lines) ) - answer = await ctx.client.chat_json(SYSTEM, user) - pairs = answer.get("pairs", []) or [] + answer = await ctx.client.judge(SYSTEM, user, _Pairs) + pairs = answer.pairs return {"count": len(pairs), "pairs": [list(p) for p in pairs]} @@ -403,8 +440,8 @@ async def open_gap(item: dict, ctx: EvalContext) -> dict: f"{pg_summary}\n" f"OUTPUT:\n{_output_text(item)}" ) - answer = await ctx.client.chat_json(SYSTEM, user) - return {"gap": str(answer.get("gap", ""))} + answer = await ctx.client.judge(SYSTEM, user, _Gap) + return {"gap": str(answer.gap)} async def actionability(item: dict, ctx: EvalContext) -> dict: @@ -414,29 +451,21 @@ async def actionability(item: dict, ctx: EvalContext) -> dict: """ actions = item.get("proposed_actions", []) or [] finding_ids = {f.get("id") for f in item.get("findings", [])} - prompts = [ - ( - SYSTEM, - "Rate whether this proposed action is SPECIFIC, QUANTIFIED, and LINKED to a " - "finding.\n" - 'Reply with ONLY {"score": }.\n\n' - f"TITLE: {a.get('title', '')}\n" - f"DESCRIPTION: 
{a.get('description', '')}\n" - f"OWNER: {a.get('owner_persona', '')} HORIZON: {a.get('horizon', '')} " - f"LEVER: {a.get('lever', '')} EFFORT: {a.get('effort', '')}\n" - f"EXPECTED_SAVINGS_FTE: {a.get('expected_savings_fte', '')} " - f"EXPECTED_SAVINGS_USD: {a.get('expected_savings_usd', '')}\n" - f"LINKED_TO_FINDING: {a.get('finding_id') in finding_ids}", - ) + users = [ + "Rate whether this proposed action is SPECIFIC, QUANTIFIED, and LINKED to a " + "finding.\n" + 'Reply with ONLY {"score": }.\n\n' + f"TITLE: {a.get('title', '')}\n" + f"DESCRIPTION: {a.get('description', '')}\n" + f"OWNER: {a.get('owner_persona', '')} HORIZON: {a.get('horizon', '')} " + f"LEVER: {a.get('lever', '')} EFFORT: {a.get('effort', '')}\n" + f"EXPECTED_SAVINGS_FTE: {a.get('expected_savings_fte', '')} " + f"EXPECTED_SAVINGS_USD: {a.get('expected_savings_usd', '')}\n" + f"LINKED_TO_FINDING: {a.get('finding_id') in finding_ids}" for a in actions ] - answers = await _gather_chat(ctx.client.chat_json, prompts) - scores: list[float] = [] - for a in answers: - value = _coerce_float(a.get("score")) - if value is None: - continue - scores.append(value) + answers = await _judge_all(ctx, SYSTEM, users, _Score) + scores = [a.score for a in answers if a.score is not None] score = round(sum(scores) / len(scores), 4) if scores else None return {"score": score, "rated": len(scores)} @@ -448,22 +477,19 @@ async def severity_calibration(item: dict, ctx: EvalContext) -> dict: """ ev_idx = _evidence_index(item) findings = item.get("findings", []) - prompts = [ - ( - SYSTEM, - "Does the STATED SEVERITY match what the CITED EVIDENCE supports?\n" - 'Reply with ONLY {"calibration": "under" or "over" or "calibrated"}.\n\n' - f"STATED SEVERITY: {f.get('severity', '')} SCORE: {f.get('score', '')}\n" - f"FINDING: {f.get('description', '')}\n" - f"CITED EVIDENCE: {' || '.join(_cited_excerpts(f, ev_idx))}", - ) + users = [ + "Does the STATED SEVERITY match what the CITED EVIDENCE supports?\n" + 'Reply with ONLY 
{"calibration": "under" or "over" or "calibrated"}.\n\n' + f"STATED SEVERITY: {f.get('severity', '')} SCORE: {f.get('score', '')}\n" + f"FINDING: {f.get('description', '')}\n" + f"CITED EVIDENCE: {' || '.join(_cited_excerpts(f, ev_idx))}" for f in findings ] - answers = await _gather_chat(ctx.client.chat_json, prompts) + answers = await _judge_all(ctx, SYSTEM, users, _Calibration) verdicts: dict[str, str] = {} miscalibrated = 0 for f, a in zip(findings, answers, strict=False): - verdict = str(a.get("calibration", "calibrated")).lower() + verdict = str(a.calibration).lower() verdicts[f.get("id", "?")] = verdict if verdict in ("under", "over"): miscalibrated += 1 @@ -481,8 +507,8 @@ async def answer_relevancy(item: dict, ctx: EvalContext) -> dict: f"WORKSPACE INTENTION: {_workspace_intention(item)}\n" f"OUTPUT:\n{_output_text(item)}" ) - answer = await ctx.client.chat_json(SYSTEM, user) - return {"score": _coerce_float(answer.get("score"))} + answer = await ctx.client.judge(SYSTEM, user, _Score) + return {"score": answer.score} async def surface_deduplication(item: dict, ctx: EvalContext) -> dict: @@ -537,28 +563,25 @@ def _toks(node: dict) -> frozenset[str]: if not candidates: return {"distinct": 0, "redundant": 0, "total": 0, "distinct_rate": None, "redundant_pairs": []} - prompts = [] + users = [] for surface, a, b, parent_proc in candidates: ctx_line = f"\nPARENT PROCESS: {parent_proc}\n" if parent_proc else "" - prompts.append( - ( - SYSTEM, - f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a " - f"duplicate / sub-case / restatement of the other?\n" - f"{ctx_line}" - 'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n' - f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n" - f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}", - ) + users.append( + f"Are these two {surface} nodes genuinely DISTINCT process concepts, or is one a " + f"duplicate / sub-case / 
restatement of the other?\n" + f"{ctx_line}" + 'Reply with ONLY {"verdict": "DISTINCT" or "DUPLICATE", "reason": ""}.\n\n' + f"{surface.upper()} A: {a.get('name', '')} — {a.get('description', '')}\n" + f"{surface.upper()} B: {b.get('name', '')} — {b.get('description', '')}" ) - answers = await _gather_chat(ctx.client.chat_json, prompts) + answers = await _judge_all(ctx, SYSTEM, users, _Verdict) distinct = 0 redundant = 0 redundant_pairs: list[dict] = [] for (surface, a, b, _parent), answer in zip(candidates, answers, strict=False): - verdict = str(answer.get("verdict", "")).upper() + verdict = str(answer.verdict).upper() if verdict == "DISTINCT": distinct += 1 else: @@ -568,7 +591,7 @@ def _toks(node: dict) -> frozenset[str]: "surface": surface, "a": a.get("name", ""), "b": b.get("name", ""), - "reason": str(answer.get("reason", "")), + "reason": str(answer.reason), } ) @@ -602,27 +625,26 @@ async def comparative_vs_champion(item: dict, ctx: EvalContext) -> dict | None: f"CANDIDATE:\n{_output_text(item)}\n\n" f"CHAMPION:\n{_output_text(champion)}" ) - out = await ctx.client.chat_json(SYSTEM, user) + out = await ctx.client.judge(SYSTEM, user, _Comparison) return { - "candidate": out.get("candidate", {}), - "champion": out.get("champion", {}), - "more_consistent": out.get("more_consistent", ""), + "candidate": out.candidate, + "champion": out.champion, + "more_consistent": out.more_consistent, } # ── flycanon custom metrics ─────────────────────────────────────────────────────── -async def _rag_score_once(item: dict, ctx: EvalContext) -> dict | None: - """Single RAG scoring call: returns {"contains_answer": float, "addresses_question": float}.""" +async def _rag_score_once(item: dict, ctx: EvalContext) -> _RagScore | None: + """Single RAG scoring call returning a _RagScore (or None if item lacks Q/A).""" question = item.get("question", "") reference = item.get("reference", "") answer = item.get("answer", "") if not question or not answer: return None user = 
f"QUESTION: {question}\nREFERENCE: {reference}\nANSWER: {answer}\n\n{RUBRIC}" - result = await ctx.client.chat_json(SYSTEM_RAG, user) - return result + return await ctx.client.judge(SYSTEM_RAG, user, _RagScore) async def contains_answer(item: dict, ctx: EvalContext) -> float | None: @@ -636,9 +658,8 @@ async def contains_answer(item: dict, ctx: EvalContext) -> float | None: result = await _rag_score_once(item, ctx) if result is None: return None - val = _coerce_float(result.get("contains_answer")) - if val is not None: - scores.append(val) + if result.contains_answer is not None: + scores.append(result.contains_answer) if not scores: return None return round(statistics.median(scores), 4) @@ -655,9 +676,8 @@ async def addresses_question(item: dict, ctx: EvalContext) -> float | None: result = await _rag_score_once(item, ctx) if result is None: return None - val = _coerce_float(result.get("addresses_question")) - if val is not None: - scores.append(val) + if result.addresses_question is not None: + scores.append(result.addresses_question) if not scores: return None return round(statistics.median(scores), 4) diff --git a/fireflyframework_agentic/evaluation/judge_client.py b/fireflyframework_agentic/evaluation/judge_client.py index 7f050d16..cba97a24 100644 --- a/fireflyframework_agentic/evaluation/judge_client.py +++ b/fireflyframework_agentic/evaluation/judge_client.py @@ -1,29 +1,24 @@ """Async LLM scoring client for judge metrics. -Thin httpx-based wrapper over Anthropic / OpenAI / Azure OpenAI / Ollama. -Reads API keys lazily (per-call) from env so importing never requires secrets. -Provider/model spec: ":", e.g. "anthropic:claude-sonnet-4-6". +Thin wrapper over the framework :class:`FireflyAgent` that returns validated, +typed structured output. The model spec is ``":"`` (e.g. +``"anthropic:claude-sonnet-4-6"``); provider resolution, retries, and JSON +schema enforcement are handled by FireflyAgent / pydantic-ai. 
API keys are read +by the provider when the agent is first built (on the first :meth:`judge` call), +so constructing a JudgeClient never requires a secret. """ from __future__ import annotations -import asyncio -import json -import os -import re +from typing import TypeVar -import httpx +from pydantic import BaseModel -_RETRY_STATUS = (429, 500, 502, 503, 504) -_MAX_RETRY_AFTER = 30.0 +from fireflyframework_agentic.agents import FireflyAgent +T = TypeVar("T", bound=BaseModel) -def _env(name: str, default: str | None = None) -> str | None: - value = os.environ.get(name) - if value is None: - return default - value = value.strip() - return value if value else default +_AGENT_NAME = "evaluation-judge" def parse_model(spec: str) -> tuple[str, str]: @@ -44,66 +39,15 @@ def same_provider(pipeline_model: str, judge_model: str) -> bool: return p == j -def _first_json_object(text: str) -> dict: - """Extract the first balanced JSON object from text (handles prose/code-fence wrapping).""" - if not text: - raise ValueError("empty model response") - - # Fast path: a clean JSON object with no surrounding prose. A non-dict - # clean parse (e.g. a top-level array) is intentionally ignored so the brace - # scanner can still find an embedded object rather than returning arr[0]. 
- try: - parsed = json.loads(text.strip()) - except (json.JSONDecodeError, ValueError): - parsed = None - if isinstance(parsed, dict): - return parsed - - start = text.find("{") - while start != -1: - depth = 0 - in_string = False - escape = False - for i in range(start, len(text)): - ch = text[i] - if in_string: - if escape: - escape = False - elif ch == "\\": - escape = True - elif ch == '"': - in_string = False - continue - if ch == '"': - in_string = True - elif ch == "{": - depth += 1 - elif ch == "}": - depth -= 1 - if depth == 0: - candidate = text[start : i + 1] - try: - return json.loads(candidate) - except json.JSONDecodeError: - break # try the next '{' - start = text.find("{", start + 1) - - # Greedy fallback: first '{' .. last '}' across newlines. - match = re.search(r"\{.*\}", text, re.DOTALL) - if match: - return json.loads(match.group(0)) - raise ValueError("no JSON object found in model response") - - class JudgeClient: - """Async multi-provider chat client returning parsed JSON dicts. - - Dispatch is by the provider prefix of the model spec. temperature is pinned - to 0.0 for deterministic verdicts. Transient HTTP errors (429/5xx) and network - errors are retried up to max_retries with backoff. - - The API key / endpoint env vars are read lazily inside chat_json, so - constructing a JudgeClient never requires a secret. + """Async multi-provider judge backed by :class:`FireflyAgent`. + + Each ``judge`` call returns a validated instance of the requested pydantic + ``output_type`` — schema enforcement replaces hand-rolled JSON parsing. + ``temperature`` is pinned to 0.0 for deterministic verdicts. Agents are built + lazily and cached per ``(system, output_type, max_tokens)``; transient + rate-limit / 5xx errors and output-validation failures are retried by + FireflyAgent / pydantic-ai (``max_retries``). 
""" def __init__(self, model: str, timeout: int = 120, max_retries: int = 3) -> None: @@ -111,144 +55,30 @@ def __init__(self, model: str, timeout: int = 120, max_retries: int = 3) -> None self.provider, self.model = parse_model(model) self.timeout = timeout self.max_retries = max_retries - - async def chat_json(self, system: str, user: str, max_tokens: int = 1024) -> dict: - """Send (system, user) to the provider and parse the first JSON object. - - Raises on exhausted retries / unknown provider / unparseable output. + self._agents: dict[tuple[str, type, int], FireflyAgent] = {} + + def _agent(self, system: str, output_type: type[T], max_tokens: int) -> FireflyAgent: + key = (system, output_type, max_tokens) + agent = self._agents.get(key) + if agent is None: + agent = FireflyAgent( + name=_AGENT_NAME, + model=self.model_spec, + instructions=system, + output_type=output_type, + model_settings={"temperature": 0.0, "max_tokens": max_tokens}, + retries=self.max_retries, + auto_register=False, + ) + self._agents[key] = agent + return agent + + async def judge(self, system: str, user: str, output_type: type[T], max_tokens: int = 1024) -> T: + """Send (system, user) to the model and return a validated ``output_type``. + + Raises on exhausted retries / unknown provider / output that cannot be + coerced to ``output_type`` — callers must not treat a failure as a verdict. 
""" - last_exc: Exception | None = None - for attempt in range(self.max_retries): - try: - if self.provider == "anthropic": - return await self._anthropic(system, user, max_tokens) - if self.provider == "openai": - return await self._openai(system, user, max_tokens) - if self.provider == "azure": - return await self._azure(system, user, max_tokens) - if self.provider == "ollama": - return await self._ollama(system, user, max_tokens) - raise ValueError( - f"unknown judge provider {self.provider!r} in {self.model_spec!r}; " - "use anthropic:/openai:/azure:/ollama:" - ) - except httpx.HTTPStatusError as exc: - last_exc = exc - if exc.response.status_code not in _RETRY_STATUS or attempt == self.max_retries - 1: - raise - retry_after_header = exc.response.headers.get("retry-after") - if retry_after_header is not None: - try: - delay = min(float(retry_after_header), _MAX_RETRY_AFTER) - except (TypeError, ValueError): - delay = 2.0**attempt - else: - delay = 2.0**attempt - await asyncio.sleep(delay) - except httpx.RequestError as exc: - last_exc = exc - if attempt == self.max_retries - 1: - raise - await asyncio.sleep(2.0) - if last_exc is not None: - raise last_exc - raise RuntimeError("chat_json exhausted retries without a response") - - async def _anthropic(self, system: str, user: str, max_tokens: int) -> dict: - api_key = _env("ANTHROPIC_API_KEY") - if not api_key: - raise RuntimeError("ANTHROPIC_API_KEY not set") - body = { - "model": self.model, - "max_tokens": max_tokens, - "temperature": 0.0, - "system": system, - "messages": [{"role": "user", "content": user}], - } - headers = { - "x-api-key": api_key, - "anthropic-version": "2023-06-01", - "content-type": "application/json", - } - async with httpx.AsyncClient(timeout=self.timeout) as client: - resp = await client.post("https://api.anthropic.com/v1/messages", json=body, headers=headers) - resp.raise_for_status() - data = resp.json() - text = next((b.get("text") for b in data.get("content", []) if b.get("type") 
== "text"), None) - if not text: - raise RuntimeError(f"judge returned no text: {data}") - return _first_json_object(text) - - async def _openai(self, system: str, user: str, max_tokens: int) -> dict: - api_key = _env("OPENAI_API_KEY") - if not api_key: - raise RuntimeError("OPENAI_API_KEY not set") - body = { - "model": self.model, - "max_tokens": max_tokens, - "temperature": 0.0, - "messages": [ - {"role": "system", "content": system}, - {"role": "user", "content": user}, - ], - } - headers = {"Authorization": f"Bearer {api_key}", "content-type": "application/json"} - async with httpx.AsyncClient(timeout=self.timeout) as client: - resp = await client.post("https://api.openai.com/v1/chat/completions", json=body, headers=headers) - resp.raise_for_status() - data = resp.json() - choices = data.get("choices") or [] - if choices: - text = (choices[0].get("message") or {}).get("content") - if text: - return _first_json_object(text) - raise RuntimeError(f"judge returned no text: {data}") - - async def _azure(self, system: str, user: str, max_tokens: int) -> dict: - endpoint = _env("AZURE_OPENAI_ENDPOINT") - api_key = _env("AZURE_OPENAI_API_KEY") - if not endpoint: - raise RuntimeError("AZURE_OPENAI_ENDPOINT not set") - if not api_key: - raise RuntimeError("AZURE_OPENAI_API_KEY not set") - api_version = _env("AZURE_OPENAI_API_VERSION") or "2024-02-01" - url = f"{endpoint.rstrip('/')}/openai/deployments/{self.model}/chat/completions?api-version={api_version}" - body = { - "max_tokens": max_tokens, - "temperature": 0.0, - "messages": [ - {"role": "system", "content": system}, - {"role": "user", "content": user}, - ], - } - headers = {"api-key": api_key, "content-type": "application/json"} - async with httpx.AsyncClient(timeout=self.timeout) as client: - resp = await client.post(url, json=body, headers=headers) - resp.raise_for_status() - data = resp.json() - choices = data.get("choices") or [] - if choices: - text = (choices[0].get("message") or {}).get("content") - if 
text: - return _first_json_object(text) - raise RuntimeError(f"judge returned no text: {data}") - - async def _ollama(self, system: str, user: str, max_tokens: int) -> dict: # noqa: ARG002 - host = _env("OLLAMA_HOST") or "http://localhost:11434" - body = { - "model": self.model, - "stream": False, - "options": {"temperature": 0.0}, - "messages": [ - {"role": "system", "content": system}, - {"role": "user", "content": user}, - ], - } - async with httpx.AsyncClient(timeout=self.timeout) as client: - resp = await client.post(f"{host.rstrip('/')}/api/chat", json=body) - resp.raise_for_status() - data = resp.json() - text = (data.get("message") or {}).get("content") - if not text: - raise RuntimeError(f"judge returned no text: {data}") - return _first_json_object(text) + agent = self._agent(system, output_type, max_tokens) + result = await agent.run(user, timeout=self.timeout) + return result.output diff --git a/tests/unit/evaluation/test_judge.py b/tests/unit/evaluation/test_judge.py index 7f27c125..c4b51d9b 100644 --- a/tests/unit/evaluation/test_judge.py +++ b/tests/unit/evaluation/test_judge.py @@ -2,12 +2,15 @@ import pytest +from fireflyframework_agentic.agents import FireflyAgent from fireflyframework_agentic.evaluation.judge import ( EvalContext, + _Verdict, addresses_question, contains_answer, excerpt_fill_rate, faithfulness, + run_judge, source_coverage, ) from fireflyframework_agentic.evaluation.judge_client import JudgeClient @@ -20,10 +23,10 @@ def make_ctx(responses: list[dict]) -> EvalContext: client.model = "claude-sonnet-4-6" call_iter = iter(responses) - async def mock_chat_json(system, user, max_tokens=1024): - return next(call_iter) + async def mock_judge(system, user, output_type, max_tokens=1024): + return output_type(**next(call_iter)) - client.chat_json = mock_chat_json + client.judge = mock_judge return EvalContext(client=client, runs=1) @@ -246,3 +249,50 @@ async def test_excerpt_fill_rate_empty(): result = await excerpt_fill_rate(item, ctx) 
assert result["populated"] == 0 assert result["total"] == 0 + + +# ── JudgeClient (FireflyAgent-backed) ───────────────────────────────────────────── + + +def test_judge_client_builds_and_caches_agent(monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "dummy") + client = JudgeClient("anthropic:claude-sonnet-4-6") + a1 = client._agent("sys", _Verdict, 1024) + a2 = client._agent("sys", _Verdict, 1024) + assert isinstance(a1, FireflyAgent) + assert a1 is a2 # cached per (system, output_type, max_tokens) + + +@pytest.mark.asyncio +async def test_faithfulness_propagates_judge_failure(): + # A failed judge call must NOT be silently scored as a verdict — it propagates. + client = MagicMock(spec=JudgeClient) + + async def boom(system, user, output_type, max_tokens=1024): + raise RuntimeError("API down") + + client.judge = boom + ctx = EvalContext(client=client, runs=1) + item = { + "findings": [{"id": "F1", "description": "x", "evidence_refs": [{"evidence_id": "E1"}]}], + "evidence_index": [{"id": "E1", "excerpt": "y"}], + } + with pytest.raises(RuntimeError): + await faithfulness(item, ctx) + + +@pytest.mark.asyncio +async def test_run_judge_aggregates_and_captures_errors(): + client = MagicMock(spec=JudgeClient) + client.model_spec = "anthropic:claude-sonnet-4-6" + + async def mock_judge(system, user, output_type, max_tokens=1024): + return output_type() + + client.judge = mock_judge + ctx = EvalContext(client=client, runs=1) + report = await run_judge({"question": "Q", "reference": "R", "answer": "A"}, ctx, pipeline_model="anthropic:claude-sonnet-4-6") + assert report.judge_model == "anthropic:claude-sonnet-4-6" + assert report.same_provider_caveat is True + assert "source_coverage" in report.metrics # deterministic metric always runs + assert isinstance(report.errors, list) # best-effort: never raises From dd86d74b88b90e5cb41f839fe49e4fb79524a088 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 29 Jun 2026 11:57:56 +0200 Subject: [PATCH 64/67] 
refactor(evaluation): make AdvisoryReport a pydantic model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Align the run_judge output DTO with the framework convention — *Result/*Report types (EvalReport, EvalResult, BenchmarkResult, PipelineResult, ...) and the module's own EvalContext are all pydantic BaseModel, leaving AdvisoryReport the lone dataclass. Switching gains free model_dump_json() for logging/persistence at no cost (internal output, mutated in place). --- docs/evaluation.md | 2 +- fireflyframework_agentic/evaluation/judge.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/evaluation.md b/docs/evaluation.md index 514b4ad9..bf07c851 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -280,7 +280,7 @@ All symbols below are importable from `fireflyframework_agentic.evaluation`. | `EvalContext` | Pydantic model | Carries `client`, optional `embedder`, and `runs` for the judge metrics. | | `build_embedder` | Function | Build a framework embedder from a `":"` spec (openai/azure/cohere/google/mistral/voyage/bedrock/ollama). | | `JudgeClient` | Class | Async multi-provider (`anthropic`/`openai`/`azure`/`ollama`) judge backed by `FireflyAgent`; returns validated typed output. | -| `AdvisoryReport` | Dataclass | Aggregated `run_judge` output: `metrics`, `errors`, and run metadata. | +| `AdvisoryReport` | Pydantic model | Aggregated `run_judge` output: `metrics`, `errors`, and run metadata. | | `Metric` | Type alias | `Callable[[dict, EvalContext], Awaitable[dict \| float \| None]]`. | | `parse_model` | Function | Split `"provider:model"` into `(provider, model)`. | | `same_provider` | Function | `True` if two model specs share a known provider prefix. 
| diff --git a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index b4f5feaa..f9bd5e69 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -15,7 +15,6 @@ import os import statistics from collections.abc import Awaitable, Callable -from dataclasses import dataclass, field from pydantic import BaseModel, ConfigDict @@ -100,8 +99,7 @@ class EvalContext(BaseModel): runs: int = 3 -@dataclass -class AdvisoryReport: +class AdvisoryReport(BaseModel): """Aggregated output of :func:`run_judge`: a plain metrics bag. metrics maps metric-name -> the per-metric result (a small dict or float). @@ -114,8 +112,8 @@ class AdvisoryReport: judge_model: str same_provider_caveat: bool runs: int - metrics: dict = field(default_factory=dict) - errors: list[str] = field(default_factory=list) + metrics: dict = {} + errors: list[str] = [] # ── shared accessors ─────────────────────────────────────────────────────────── From 30e5fa69a9c51ab9edf1d4249205589f54691c6e Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 29 Jun 2026 12:09:02 +0200 Subject: [PATCH 65/67] refactor(evaluation): merge judge_client into judge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the FireflyAgent refactor the client shrank to ~90 lines and is used only by judge.py. Fold JudgeClient + parse_model + same_provider into judge.py and drop the separate file — no standalone transport to justify it anymore. Public imports are unchanged (still re-exported from the package). 
--- .../evaluation/__init__.py | 18 ++-- fireflyframework_agentic/evaluation/judge.py | 71 +++++++++++++++- .../evaluation/judge_client.py | 84 ------------------- tests/unit/evaluation/test_judge.py | 2 +- 4 files changed, 80 insertions(+), 95 deletions(-) delete mode 100644 fireflyframework_agentic/evaluation/judge_client.py diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index 0efb057f..b3a42e21 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -7,6 +7,9 @@ from fireflyframework_agentic.evaluation.judge import ( EvalContext as EvalContext, ) +from fireflyframework_agentic.evaluation.judge import ( + JudgeClient as JudgeClient, +) from fireflyframework_agentic.evaluation.judge import ( Metric as Metric, ) @@ -58,12 +61,18 @@ from fireflyframework_agentic.evaluation.judge import ( open_gap as open_gap, ) +from fireflyframework_agentic.evaluation.judge import ( + parse_model as parse_model, +) from fireflyframework_agentic.evaluation.judge import ( ragas_faithfulness as ragas_faithfulness, ) from fireflyframework_agentic.evaluation.judge import ( run_judge as run_judge, ) +from fireflyframework_agentic.evaluation.judge import ( + same_provider as same_provider, +) from fireflyframework_agentic.evaluation.judge import ( semantic_recovery as semantic_recovery, ) @@ -76,15 +85,6 @@ from fireflyframework_agentic.evaluation.judge import ( surface_deduplication as surface_deduplication, ) -from fireflyframework_agentic.evaluation.judge_client import ( - JudgeClient as JudgeClient, -) -from fireflyframework_agentic.evaluation.judge_client import ( - parse_model as parse_model, -) -from fireflyframework_agentic.evaluation.judge_client import ( - same_provider as same_provider, -) from fireflyframework_agentic.evaluation.retrieval_metrics import ( citation_precision as citation_precision, ) diff --git 
a/fireflyframework_agentic/evaluation/judge.py b/fireflyframework_agentic/evaluation/judge.py index f9bd5e69..1be8548b 100644 --- a/fireflyframework_agentic/evaluation/judge.py +++ b/fireflyframework_agentic/evaluation/judge.py @@ -18,9 +18,78 @@ from pydantic import BaseModel, ConfigDict +from fireflyframework_agentic.agents import FireflyAgent from fireflyframework_agentic.embeddings.base import BaseEmbedder from fireflyframework_agentic.embeddings.similarity import cosine_similarity -from fireflyframework_agentic.evaluation.judge_client import JudgeClient, same_provider + +# ── judge client ───────────────────────────────────────────────────────────────── + +_AGENT_NAME = "evaluation-judge" + + +def parse_model(spec: str) -> tuple[str, str]: + """Split "provider:model" -> (provider, model). Bare spec -> ("unknown", spec).""" + spec = (spec or "").strip() + if ":" not in spec: + return "unknown", spec + provider, model = spec.split(":", 1) + return provider.strip().lower(), model.strip() + + +def same_provider(pipeline_model: str, judge_model: str) -> bool: + """True iff both specs share the same known provider prefix.""" + p, _ = parse_model(pipeline_model) + j, _ = parse_model(judge_model) + if p == "unknown" or j == "unknown": + return False + return p == j + + +class JudgeClient: + """Async multi-provider judge backed by :class:`FireflyAgent`. + + Each ``judge`` call returns a validated instance of the requested pydantic + ``output_type`` — schema enforcement replaces hand-rolled JSON parsing. + ``temperature`` is pinned to 0.0 for deterministic verdicts. Agents are built + lazily and cached per ``(system, output_type, max_tokens)``; transient + rate-limit / 5xx errors and output-validation failures are retried by + FireflyAgent / pydantic-ai (``max_retries``). The provider reads its API key + when the agent is first built, so constructing a client never needs a secret. 
+ """ + + def __init__(self, model: str, timeout: int = 120, max_retries: int = 3) -> None: + self.model_spec = model + self.provider, self.model = parse_model(model) + self.timeout = timeout + self.max_retries = max_retries + self._agents: dict[tuple[str, type, int], FireflyAgent] = {} + + def _agent[T: BaseModel](self, system: str, output_type: type[T], max_tokens: int) -> FireflyAgent: + key = (system, output_type, max_tokens) + agent = self._agents.get(key) + if agent is None: + agent = FireflyAgent( + name=_AGENT_NAME, + model=self.model_spec, + instructions=system, + output_type=output_type, + model_settings={"temperature": 0.0, "max_tokens": max_tokens}, + retries=self.max_retries, + auto_register=False, + ) + self._agents[key] = agent + return agent + + async def judge[T: BaseModel](self, system: str, user: str, output_type: type[T], max_tokens: int = 1024) -> T: + """Send (system, user) to the model and return a validated ``output_type``. + + Raises on exhausted retries / unknown provider / output that cannot be + coerced to ``output_type`` — callers must not treat a failure as a verdict. + """ + agent = self._agent(system, output_type, max_tokens) + result = await agent.run(user, timeout=self.timeout) + return result.output + Metric = Callable[["dict", "EvalContext"], Awaitable["dict | float | None"]] diff --git a/fireflyframework_agentic/evaluation/judge_client.py b/fireflyframework_agentic/evaluation/judge_client.py deleted file mode 100644 index cba97a24..00000000 --- a/fireflyframework_agentic/evaluation/judge_client.py +++ /dev/null @@ -1,84 +0,0 @@ -"""Async LLM scoring client for judge metrics. - -Thin wrapper over the framework :class:`FireflyAgent` that returns validated, -typed structured output. The model spec is ``":"`` (e.g. -``"anthropic:claude-sonnet-4-6"``); provider resolution, retries, and JSON -schema enforcement are handled by FireflyAgent / pydantic-ai. 
API keys are read -by the provider when the agent is first built (on the first :meth:`judge` call), -so constructing a JudgeClient never requires a secret. -""" - -from __future__ import annotations - -from typing import TypeVar - -from pydantic import BaseModel - -from fireflyframework_agentic.agents import FireflyAgent - -T = TypeVar("T", bound=BaseModel) - -_AGENT_NAME = "evaluation-judge" - - -def parse_model(spec: str) -> tuple[str, str]: - """Split "provider:model" -> (provider, model). Bare spec -> ("unknown", spec).""" - spec = (spec or "").strip() - if ":" not in spec: - return "unknown", spec - provider, model = spec.split(":", 1) - return provider.strip().lower(), model.strip() - - -def same_provider(pipeline_model: str, judge_model: str) -> bool: - """True iff both specs share the same known provider prefix.""" - p, _ = parse_model(pipeline_model) - j, _ = parse_model(judge_model) - if p == "unknown" or j == "unknown": - return False - return p == j - - -class JudgeClient: - """Async multi-provider judge backed by :class:`FireflyAgent`. - - Each ``judge`` call returns a validated instance of the requested pydantic - ``output_type`` — schema enforcement replaces hand-rolled JSON parsing. - ``temperature`` is pinned to 0.0 for deterministic verdicts. Agents are built - lazily and cached per ``(system, output_type, max_tokens)``; transient - rate-limit / 5xx errors and output-validation failures are retried by - FireflyAgent / pydantic-ai (``max_retries``). 
- """ - - def __init__(self, model: str, timeout: int = 120, max_retries: int = 3) -> None: - self.model_spec = model - self.provider, self.model = parse_model(model) - self.timeout = timeout - self.max_retries = max_retries - self._agents: dict[tuple[str, type, int], FireflyAgent] = {} - - def _agent(self, system: str, output_type: type[T], max_tokens: int) -> FireflyAgent: - key = (system, output_type, max_tokens) - agent = self._agents.get(key) - if agent is None: - agent = FireflyAgent( - name=_AGENT_NAME, - model=self.model_spec, - instructions=system, - output_type=output_type, - model_settings={"temperature": 0.0, "max_tokens": max_tokens}, - retries=self.max_retries, - auto_register=False, - ) - self._agents[key] = agent - return agent - - async def judge(self, system: str, user: str, output_type: type[T], max_tokens: int = 1024) -> T: - """Send (system, user) to the model and return a validated ``output_type``. - - Raises on exhausted retries / unknown provider / output that cannot be - coerced to ``output_type`` — callers must not treat a failure as a verdict. 
- """ - agent = self._agent(system, output_type, max_tokens) - result = await agent.run(user, timeout=self.timeout) - return result.output diff --git a/tests/unit/evaluation/test_judge.py b/tests/unit/evaluation/test_judge.py index c4b51d9b..8f82c015 100644 --- a/tests/unit/evaluation/test_judge.py +++ b/tests/unit/evaluation/test_judge.py @@ -5,6 +5,7 @@ from fireflyframework_agentic.agents import FireflyAgent from fireflyframework_agentic.evaluation.judge import ( EvalContext, + JudgeClient, _Verdict, addresses_question, contains_answer, @@ -13,7 +14,6 @@ run_judge, source_coverage, ) -from fireflyframework_agentic.evaluation.judge_client import JudgeClient def make_ctx(responses: list[dict]) -> EvalContext: From 648b20e50963468e2def351042cdc4567e51b35c Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 29 Jun 2026 12:12:52 +0200 Subject: [PATCH 66/67] style(evaluation): simplify __init__ to grouped imports + __all__ Replace the 35 one-symbol 'from X import (Y as Y)' re-export blocks with three grouped imports and an explicit __all__, matching the agents/__init__ convention. __all__ marks the public re-exports so ruff doesn't flag them as unused. 
--- .../evaluation/__init__.py | 194 ++++++++---------- 1 file changed, 85 insertions(+), 109 deletions(-) diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py index b3a42e21..735a23f3 100644 --- a/fireflyframework_agentic/evaluation/__init__.py +++ b/fireflyframework_agentic/evaluation/__init__.py @@ -1,111 +1,87 @@ -from fireflyframework_agentic.evaluation.embedder import ( - build_embedder as build_embedder, -) -from fireflyframework_agentic.evaluation.judge import ( - AdvisoryReport as AdvisoryReport, -) -from fireflyframework_agentic.evaluation.judge import ( - EvalContext as EvalContext, -) -from fireflyframework_agentic.evaluation.judge import ( - JudgeClient as JudgeClient, -) -from fireflyframework_agentic.evaluation.judge import ( - Metric as Metric, -) -from fireflyframework_agentic.evaluation.judge import ( - actionability as actionability, -) -from fireflyframework_agentic.evaluation.judge import ( - addresses_question as addresses_question, -) -from fireflyframework_agentic.evaluation.judge import ( - answer_correctness as answer_correctness, -) -from fireflyframework_agentic.evaluation.judge import ( - answer_relevancy as answer_relevancy, -) -from fireflyframework_agentic.evaluation.judge import ( - citation_relevance as citation_relevance, -) -from fireflyframework_agentic.evaluation.judge import ( - comparative_vs_champion as comparative_vs_champion, -) -from fireflyframework_agentic.evaluation.judge import ( - contains_answer as contains_answer, -) -from fireflyframework_agentic.evaluation.judge import ( - context_precision as context_precision, -) -from fireflyframework_agentic.evaluation.judge import ( - context_recall as context_recall, -) -from fireflyframework_agentic.evaluation.judge import ( - contradiction as contradiction, -) -from fireflyframework_agentic.evaluation.judge import ( - excerpt_fill_rate as excerpt_fill_rate, -) -from fireflyframework_agentic.evaluation.judge import 
( - fabricated_entity as fabricated_entity, -) -from fireflyframework_agentic.evaluation.judge import ( - faithfulness as faithfulness, -) -from fireflyframework_agentic.evaluation.judge import ( - nc_semantic_precision as nc_semantic_precision, -) -from fireflyframework_agentic.evaluation.judge import ( - numeric_temporal_fidelity as numeric_temporal_fidelity, -) -from fireflyframework_agentic.evaluation.judge import ( - open_gap as open_gap, -) -from fireflyframework_agentic.evaluation.judge import ( - parse_model as parse_model, -) -from fireflyframework_agentic.evaluation.judge import ( - ragas_faithfulness as ragas_faithfulness, -) -from fireflyframework_agentic.evaluation.judge import ( - run_judge as run_judge, -) -from fireflyframework_agentic.evaluation.judge import ( - same_provider as same_provider, -) -from fireflyframework_agentic.evaluation.judge import ( - semantic_recovery as semantic_recovery, -) -from fireflyframework_agentic.evaluation.judge import ( - severity_calibration as severity_calibration, -) -from fireflyframework_agentic.evaluation.judge import ( - source_coverage as source_coverage, -) -from fireflyframework_agentic.evaluation.judge import ( - surface_deduplication as surface_deduplication, +"""Evaluation metrics for LLM and pipeline outputs. + +LLM-as-judge metrics (``judge``), the spec-driven embedder factory (``embedder``), +and deterministic retrieval metrics (``retrieval_metrics``). 
+""" + +from fireflyframework_agentic.evaluation.embedder import build_embedder +from fireflyframework_agentic.evaluation.judge import ( + AdvisoryReport, + EvalContext, + JudgeClient, + Metric, + actionability, + addresses_question, + answer_correctness, + answer_relevancy, + citation_relevance, + comparative_vs_champion, + contains_answer, + context_precision, + context_recall, + contradiction, + excerpt_fill_rate, + fabricated_entity, + faithfulness, + nc_semantic_precision, + numeric_temporal_fidelity, + open_gap, + parse_model, + ragas_faithfulness, + run_judge, + same_provider, + semantic_recovery, + severity_calibration, + source_coverage, + surface_deduplication, ) from fireflyframework_agentic.evaluation.retrieval_metrics import ( - citation_precision as citation_precision, -) -from fireflyframework_agentic.evaluation.retrieval_metrics import ( - hit_at_k as hit_at_k, -) -from fireflyframework_agentic.evaluation.retrieval_metrics import ( - map_score as map_score, -) -from fireflyframework_agentic.evaluation.retrieval_metrics import ( - mrr as mrr, -) -from fireflyframework_agentic.evaluation.retrieval_metrics import ( - ndcg as ndcg, -) -from fireflyframework_agentic.evaluation.retrieval_metrics import ( - no_answer_rate as no_answer_rate, -) -from fireflyframework_agentic.evaluation.retrieval_metrics import ( - precision_at_k as precision_at_k, -) -from fireflyframework_agentic.evaluation.retrieval_metrics import ( - recall_at_k as recall_at_k, -) + citation_precision, + hit_at_k, + map_score, + mrr, + ndcg, + no_answer_rate, + precision_at_k, + recall_at_k, +) + +__all__ = [ + "AdvisoryReport", + "EvalContext", + "JudgeClient", + "Metric", + "actionability", + "addresses_question", + "answer_correctness", + "answer_relevancy", + "build_embedder", + "citation_precision", + "citation_relevance", + "comparative_vs_champion", + "contains_answer", + "context_precision", + "context_recall", + "contradiction", + "excerpt_fill_rate", + "fabricated_entity", + 
"faithfulness", + "hit_at_k", + "map_score", + "mrr", + "nc_semantic_precision", + "ndcg", + "no_answer_rate", + "numeric_temporal_fidelity", + "open_gap", + "parse_model", + "precision_at_k", + "ragas_faithfulness", + "recall_at_k", + "run_judge", + "same_provider", + "semantic_recovery", + "severity_calibration", + "source_coverage", + "surface_deduplication", +] From d8a48d51356ddc791bf9f46b1c8905844cd3eaba Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 29 Jun 2026 16:03:51 +0200 Subject: [PATCH 67/67] docs(evaluation): use claude-haiku-4-5 alias in example and guide Switch the example default and doc snippets from the pinned claude-haiku-4-5-20251001 to the floating claude-haiku-4-5 alias. --- docs/evaluation.md | 8 ++++---- examples/llm_eval_example.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/evaluation.md b/docs/evaluation.md index bf07c851..2992f79c 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -54,7 +54,7 @@ async def metric(item: dict, ctx: EvalContext) -> dict | float | None from fireflyframework_agentic.evaluation import EvalContext, JudgeClient, build_embedder ctx = EvalContext( - client=JudgeClient("anthropic:claude-haiku-4-5-20251001"), + client=JudgeClient("anthropic:claude-haiku-4-5"), runs=3, # metrics that repeat use the median of this many calls embedder=None, # optional framework embedder; required by semantic_recovery and RAGAS ) @@ -66,7 +66,7 @@ mistral, voyage, bedrock, ollama): ```python ctx = EvalContext( - client=JudgeClient("anthropic:claude-haiku-4-5-20251001"), + client=JudgeClient("anthropic:claude-haiku-4-5"), embedder=build_embedder("ollama:nomic-embed-text"), ) ``` @@ -129,7 +129,7 @@ item = { } async def main(): - ctx = EvalContext(client=JudgeClient("anthropic:claude-haiku-4-5-20251001"), runs=3) + ctx = EvalContext(client=JudgeClient("anthropic:claude-haiku-4-5"), runs=3) contains = await contains_answer(item, ctx) # 0.0–1.0 addresses = await addresses_question(item, 
ctx) # 0.0–1.0 print(contains, addresses) @@ -199,7 +199,7 @@ import asyncio from fireflyframework_agentic.evaluation import run_judge, EvalContext, JudgeClient async def main(): - ctx = EvalContext(client=JudgeClient("anthropic:claude-haiku-4-5-20251001"), runs=3) + ctx = EvalContext(client=JudgeClient("anthropic:claude-haiku-4-5"), runs=3) report = await run_judge(item, ctx, pipeline_model="anthropic:claude-sonnet-4-6") print(report.metrics) # {metric_name: result, ...} print(report.errors) # ["metric: ExceptionType: message", ...] diff --git a/examples/llm_eval_example.py b/examples/llm_eval_example.py index 98b4f469..1c4eb376 100644 --- a/examples/llm_eval_example.py +++ b/examples/llm_eval_example.py @@ -22,11 +22,11 @@ Usage:: - python examples/llm_eval_example.py --model anthropic:claude-haiku-4-5-20251001 + python examples/llm_eval_example.py --model anthropic:claude-haiku-4-5 # Or score from a JSONL file instead of the built-in sample data: python examples/llm_eval_example.py \\ - --model anthropic:claude-haiku-4-5-20251001 \\ + --model anthropic:claude-haiku-4-5 \\ --items-file items.jsonl Items JSONL format — one JSON object per line:: @@ -108,7 +108,7 @@ async def main(args: argparse.Namespace) -> None: parser = argparse.ArgumentParser(description="Score Q&A pairs with LLM-as-judge metrics.") parser.add_argument( "--model", - default="anthropic:claude-haiku-4-5-20251001", + default="anthropic:claude-haiku-4-5", help="Judge model spec (provider:model).", ) parser.add_argument("--runs", type=int, default=3, help="Judge runs per item (median is reported).")