From aa973b4e3de4f5c44d95032765e23149ff8eb480 Mon Sep 17 00:00:00 2001 From: A Vertex SDK engineer Date: Sat, 4 Apr 2026 23:41:52 -0700 Subject: [PATCH] feat: GenAI Client(evals) - add rich HTML visualization for loss pattern analysis PiperOrigin-RevId: 894799725 --- tests/unit/vertexai/genai/test_evals.py | 608 +++++++++++++++++++++++- vertexai/_genai/_evals_utils.py | 238 ++++++++++ vertexai/_genai/_evals_visualization.py | 308 ++++++++++++ vertexai/_genai/_transformers.py | 22 +- vertexai/_genai/evals.py | 6 + vertexai/_genai/types/common.py | 13 +- 6 files changed, 1170 insertions(+), 25 deletions(-) diff --git a/tests/unit/vertexai/genai/test_evals.py b/tests/unit/vertexai/genai/test_evals.py index f6b7a799c8..5f42c0eb84 100644 --- a/tests/unit/vertexai/genai/test_evals.py +++ b/tests/unit/vertexai/genai/test_evals.py @@ -520,7 +520,82 @@ def test_response_structure(self): assert result.clusters[0].item_count == 3 assert result.clusters[1].cluster_id == "cluster-2" - def test_response_show_with_results(self, capsys): + def test_get_loss_analysis_html(self): + """Tests that _get_loss_analysis_html generates valid HTML with data.""" + from vertexai._genai import _evals_visualization + import json + + data = { + "results": [ + { + "config": { + "metric": "test_metric", + "candidate": "test-candidate", + }, + "clusters": [ + { + "cluster_id": "c1", + "taxonomy_entry": { + "l1_category": "Tool Calling", + "l2_category": "Missing Invocation", + "description": "Agent failed to call the tool.", + }, + "item_count": 5, + "examples": [ + { + "evaluation_result": { + "request": { + "prompt": { + "agent_data": { + "turns": [ + { + "turn_index": 0, + "events": [ + { + "author": "user", + "content": { + "parts": [ + { + "text": "Find flights to Paris" + } + ], + }, + } + ], + } + ], + }, + }, + }, + }, + "failed_rubrics": [ + { + "rubric_id": "tool_use", + "classification_rationale": "Did not invoke find_flights.", + } + ], + } + ], + }, + ], + } + ] + } + html = _evals_visualization._get_loss_analysis_html(json.dumps(data)) + assert "Loss Pattern Analysis" in html + assert "test_metric" not in html # data is Base64-encoded in the HTML + assert "" in html + assert "extractScenarioPreview" in html + assert "example-scenario" in html + assert "DOMPurify" in html # uses DOMPurify for sanitization + assert "example-section-label" in html # labels for scenario/rubrics + assert "Analysis Summary" in html # summary heading + + def test_display_loss_clusters_response_no_ipython(self): + """Tests graceful fallback when not in IPython.""" + from vertexai._genai import _evals_visualization + from unittest import mock + response = common_types.GenerateLossClustersResponse( results=[ common_types.LossAnalysisResult( @@ -541,12 +616,17 @@ def test_response_show_with_results(self, capsys): ) ], ) - response.show() - captured = capsys.readouterr() - assert "test_metric" in captured.out - assert "c1" in captured.out + with mock.patch.object( + _evals_visualization, "_is_ipython_env", return_value=False + ): + # Should not raise, just log a warning + response.show() + + def test_display_loss_analysis_result_no_ipython(self): + """Tests graceful fallback for individual result when not in IPython.""" + from vertexai._genai import _evals_visualization + from unittest import mock - def test_loss_analysis_result_show(self, capsys): result = common_types.LossAnalysisResult( config=common_types.LossAnalysisConfig( metric="test_metric", @@ -563,10 +643,518 @@ def test_loss_analysis_result_show(self, capsys): ), ], ) - 
result.show() - captured = capsys.readouterr() - assert "test_metric" in captured.out - assert "c1" in captured.out + with mock.patch.object( + _evals_visualization, "_is_ipython_env", return_value=False + ): + result.show() + + def test_enrich_scenario_from_agent_data_in_eval_cases(self): + """Tests scenario extraction from agent_data in eval_cases.""" + from vertexai._genai import _evals_utils + + # API response: evaluation_result has NO request (real API behavior) + api_response = common_types.GenerateLossClustersResponse( + results=[ + common_types.LossAnalysisResult( + config=common_types.LossAnalysisConfig( + metric="multi_turn_task_success_v1", + candidate="travel-agent", + ), + clusters=[ + common_types.LossCluster( + cluster_id="c1", + taxonomy_entry=common_types.LossTaxonomyEntry( + l1_category="Tool Calling", + l2_category="Missing Invocation", + ), + item_count=1, + examples=[ + common_types.LossExample( + evaluation_result={ + "candidateResults": [ + { + "candidate": "travel-agent", + "metric": "multi_turn_task_success_v1", + } + ] + }, + failed_rubrics=[ + common_types.FailedRubric( + rubric_id="tool_use", + classification_rationale="Did not call tool.", + ) + ], + ) + ], + ) + ], + ) + ], + ) + # Original eval_result with agent_data in eval_cases + eval_result = common_types.EvaluationResult( + eval_case_results=[ + common_types.EvalCaseResult( + eval_case_index=0, + response_candidate_results=[ + common_types.ResponseCandidateResult( + response_index=0, + metric_results={ + "multi_turn_task_success_v1": common_types.EvalCaseMetricResult( + score=0.0, + ), + }, + ) + ], + ) + ], + evaluation_dataset=[ + common_types.EvaluationDataset( + eval_cases=[ + common_types.EvalCase( + agent_data=vertexai_genai_types.evals.AgentData( + turns=[ + vertexai_genai_types.evals.ConversationTurn( + turn_index=0, + events=[ + vertexai_genai_types.evals.AgentEvent( + author="user", + content={ + "parts": [ + { + "text": "Book a flight to Paris." + } + ] + }, + ), + ], + ) + ], + ) + ) + ] + ) + ], + metadata=common_types.EvaluationRunMetadata( + candidate_names=["travel-agent"] + ), + ) + + _evals_utils._enrich_loss_response_with_rubric_descriptions( + api_response, eval_result + ) + example = api_response.results[0].clusters[0].examples[0] + assert "scenario_preview" in example.evaluation_result + assert ( + example.evaluation_result["scenario_preview"] == "Book a flight to Paris." 
+ ) + + def test_enrich_scenario_from_user_scenario_starting_prompt(self): + """Tests scenario extraction from user_scenario.starting_prompt.""" + from vertexai._genai import _evals_utils + + api_response = common_types.GenerateLossClustersResponse( + results=[ + common_types.LossAnalysisResult( + config=common_types.LossAnalysisConfig( + metric="multi_turn_task_success_v1", + candidate="travel-agent", + ), + clusters=[ + common_types.LossCluster( + cluster_id="c1", + taxonomy_entry=common_types.LossTaxonomyEntry( + l1_category="Tool Calling", + l2_category="Missing Invocation", + ), + item_count=1, + examples=[ + common_types.LossExample( + evaluation_result={ + "candidateResults": [ + {"candidate": "travel-agent"} + ] + }, + failed_rubrics=[ + common_types.FailedRubric(rubric_id="t1") + ], + ) + ], + ) + ], + ) + ], + ) + # eval_result with user_scenario (from generate_conversation_scenarios) + eval_result = common_types.EvaluationResult( + eval_case_results=[ + common_types.EvalCaseResult( + eval_case_index=0, + response_candidate_results=[ + common_types.ResponseCandidateResult( + response_index=0, + metric_results={ + "multi_turn_task_success_v1": common_types.EvalCaseMetricResult( + score=0.0, + ), + }, + ) + ], + ) + ], + evaluation_dataset=[ + common_types.EvaluationDataset( + eval_cases=[ + common_types.EvalCase( + user_scenario=vertexai_genai_types.evals.UserScenario( + starting_prompt="I want to book a hotel in Tokyo.", + conversation_plan="User asks to book a hotel.", + ) + ) + ] + ) + ], + metadata=common_types.EvaluationRunMetadata( + candidate_names=["travel-agent"] + ), + ) + + _evals_utils._enrich_loss_response_with_rubric_descriptions( + api_response, eval_result + ) + example = api_response.results[0].clusters[0].examples[0] + assert "scenario_preview" in example.evaluation_result + assert ( + example.evaluation_result["scenario_preview"] + == "I want to book a hotel in Tokyo." 
+ ) + + def test_enrich_scenario_from_dataframe_agent_data(self): + """Tests scenario extraction from DataFrame agent_data column.""" + import pandas as pd + from vertexai._genai import _evals_utils + + api_response = common_types.GenerateLossClustersResponse( + results=[ + common_types.LossAnalysisResult( + config=common_types.LossAnalysisConfig( + metric="multi_turn_task_success_v1", + candidate="travel-agent", + ), + clusters=[ + common_types.LossCluster( + cluster_id="c1", + taxonomy_entry=common_types.LossTaxonomyEntry( + l1_category="Tool Calling", + l2_category="Missing Invocation", + ), + item_count=1, + examples=[ + common_types.LossExample( + evaluation_result={ + "candidateResults": [ + {"candidate": "travel-agent"} + ] + }, + failed_rubrics=[ + common_types.FailedRubric(rubric_id="t1") + ], + ) + ], + ) + ], + ) + ], + ) + # eval_result with agent_data in DataFrame (run_inference output) + agent_data_obj = vertexai_genai_types.evals.AgentData( + turns=[ + vertexai_genai_types.evals.ConversationTurn( + turn_index=0, + events=[ + vertexai_genai_types.evals.AgentEvent( + author="user", + content={"parts": [{"text": "Find flights to London"}]}, + ), + ], + ) + ], + ) + df = pd.DataFrame({"agent_data": [agent_data_obj]}) + eval_result = common_types.EvaluationResult( + eval_case_results=[ + common_types.EvalCaseResult( + eval_case_index=0, + response_candidate_results=[ + common_types.ResponseCandidateResult( + response_index=0, + metric_results={ + "multi_turn_task_success_v1": common_types.EvalCaseMetricResult( + score=0.0, + ), + }, + ) + ], + ) + ], + evaluation_dataset=[common_types.EvaluationDataset(eval_dataset_df=df)], + metadata=common_types.EvaluationRunMetadata( + candidate_names=["travel-agent"] + ), + ) + + _evals_utils._enrich_loss_response_with_rubric_descriptions( + api_response, eval_result + ) + example = api_response.results[0].clusters[0].examples[0] + assert "scenario_preview" in example.evaluation_result + assert example.evaluation_result["scenario_preview"] == "Find flights to London" + + def test_enrich_scenario_e2e_simulation(self): + """Simulates the full e2e flow: generate_scenarios -> run_inference -> evaluate -> loss_clusters.""" + import pandas as pd + from vertexai._genai import _evals_utils + + # Step 1: Simulate generate_conversation_scenarios output + # This creates eval_cases with user_scenario but no agent_data + scenario_dataset = common_types.EvaluationDataset( + eval_cases=[ + common_types.EvalCase( + user_scenario=vertexai_genai_types.evals.UserScenario( + starting_prompt="I need to book a flight from NYC to Paris for next Friday.", + conversation_plan="User books a flight.", + ) + ) + ], + eval_dataset_df=pd.DataFrame( + { + "starting_prompt": [ + "I need to book a flight from NYC to Paris for next Friday." + ], + "conversation_plan": ["User books a flight."], + } + ), + ) + + # Step 2: Simulate run_inference output + # run_inference extracts eval_dataset_df from the input, runs inference, + # then returns a NEW EvaluationDataset with only eval_dataset_df (no eval_cases) + agent_data_obj = vertexai_genai_types.evals.AgentData( + agents={ + "travel_agent": vertexai_genai_types.evals.AgentConfig( + agent_id="travel_agent", + ) + }, + turns=[ + vertexai_genai_types.evals.ConversationTurn( + turn_index=0, + events=[ + vertexai_genai_types.evals.AgentEvent( + author="user", + content=genai_types.Content( + parts=[ + genai_types.Part( + text="I need to book a flight from NYC to Paris for next Friday." 
+ ) + ], + role="user", + ), + ), + vertexai_genai_types.evals.AgentEvent( + author="travel_agent", + content=genai_types.Content( + parts=[ + genai_types.Part( + text="I'll help you book that flight." + ) + ], + role="model", + ), + ), + ], + ), + ], + ) + inference_df = pd.concat( + [ + scenario_dataset.eval_dataset_df.reset_index(drop=True), + pd.DataFrame({"agent_data": [agent_data_obj]}).reset_index(drop=True), + ], + axis=1, + ) + inference_dataset = common_types.EvaluationDataset( + eval_dataset_df=inference_df, + candidate_name="travel_agent", + ) + + # Step 3: Simulate evaluate() output + # evaluate() stores the dataset (from step 2) in evaluation_dataset + eval_result = common_types.EvaluationResult( + eval_case_results=[ + common_types.EvalCaseResult( + eval_case_index=0, + response_candidate_results=[ + common_types.ResponseCandidateResult( + response_index=0, + metric_results={ + "multi_turn_task_success_v1": common_types.EvalCaseMetricResult( + score=0.0, + explanation="Failed", + ), + }, + ) + ], + ) + ], + evaluation_dataset=[inference_dataset], # Note: no eval_cases! + metadata=common_types.EvaluationRunMetadata( + candidate_names=["travel_agent"] + ), + ) + + # Step 4: Simulate API response (no request in evaluationResult) + api_response = common_types.GenerateLossClustersResponse( + results=[ + common_types.LossAnalysisResult( + config=common_types.LossAnalysisConfig( + metric="multi_turn_task_success_v1", + candidate="travel_agent", + ), + clusters=[ + common_types.LossCluster( + cluster_id="c1", + taxonomy_entry=common_types.LossTaxonomyEntry( + l1_category="Tool Calling", + l2_category="Missing Invocation", + description="Agent failed to invoke the tool.", + ), + item_count=1, + examples=[ + common_types.LossExample( + evaluation_result={ + "candidateResults": [ + { + "candidate": "travel_agent", + "metric": "multi_turn_task_success_v1", + } + ] + }, + failed_rubrics=[ + common_types.FailedRubric( + rubric_id="tool_use", + classification_rationale="Did not call find_flights.", + ) + ], + ) + ], + ) + ], + ) + ], + ) + + # Verify intermediate steps + scenario_list = _evals_utils._build_scenario_preview_list(eval_result) + assert len(scenario_list) == 1, f"Expected 1 scenario, got {len(scenario_list)}" + assert scenario_list[0] is not None, ( + f"Scenario is None. eval_dataset type: {type(eval_result.evaluation_dataset)}, " + f"eval_cases: {eval_result.evaluation_dataset[0].eval_cases if eval_result.evaluation_dataset else 'N/A'}, " + f"df columns: {list(eval_result.evaluation_dataset[0].eval_dataset_df.columns) if eval_result.evaluation_dataset and eval_result.evaluation_dataset[0].eval_dataset_df is not None else 'N/A'}" + ) + + # Step 5: Enrich and verify + _evals_utils._enrich_loss_response_with_rubric_descriptions( + api_response, eval_result + ) + example = api_response.results[0].clusters[0].examples[0] + assert ( + "scenario_preview" in example.evaluation_result + ), f"scenario_preview not found. 
evaluation_result keys: {list(example.evaluation_result.keys())}" + assert ( + "I need to book a flight" in example.evaluation_result["scenario_preview"] + ) + + # Verify the full serialization pipeline (model_dump -> JSON -> parse) + import json + + result_dump = api_response.model_dump(mode="json", exclude_none=True) + json_str = json.dumps(result_dump) + parsed = json.loads(json_str) + ex_parsed = parsed["results"][0]["clusters"][0]["examples"][0] + assert "scenario_preview" in ex_parsed.get( + "evaluation_result", {} + ), f"scenario_preview missing after serialization. Keys: {list(ex_parsed.get('evaluation_result', {}).keys())}" + assert ( + "I need to book a flight" + in ex_parsed["evaluation_result"]["scenario_preview"] + ) + + def test_enrich_scenario_from_dataframe_starting_prompt(self): + """Tests scenario extraction from DataFrame starting_prompt column.""" + import pandas as pd + from vertexai._genai import _evals_utils + + api_response = common_types.GenerateLossClustersResponse( + results=[ + common_types.LossAnalysisResult( + config=common_types.LossAnalysisConfig( + metric="m1", + candidate="c1", + ), + clusters=[ + common_types.LossCluster( + cluster_id="c1", + taxonomy_entry=common_types.LossTaxonomyEntry( + l1_category="Cat", + l2_category="SubCat", + ), + item_count=1, + examples=[ + common_types.LossExample( + evaluation_result={"candidateResults": []}, + failed_rubrics=[ + common_types.FailedRubric(rubric_id="r1") + ], + ) + ], + ) + ], + ) + ], + ) + # DataFrame with starting_prompt but no agent_data + df = pd.DataFrame( + { + "starting_prompt": ["Cancel my reservation please"], + "conversation_plan": ["User wants to cancel."], + } + ) + eval_result = common_types.EvaluationResult( + eval_case_results=[ + common_types.EvalCaseResult( + eval_case_index=0, + response_candidate_results=[ + common_types.ResponseCandidateResult( + response_index=0, + metric_results={ + "m1": common_types.EvalCaseMetricResult(score=0.0) + }, + ) + ], + ) + ], + evaluation_dataset=[common_types.EvaluationDataset(eval_dataset_df=df)], + ) + + _evals_utils._enrich_loss_response_with_rubric_descriptions( + api_response, eval_result + ) + example = api_response.results[0].clusters[0].examples[0] + assert "scenario_preview" in example.evaluation_result + assert ( + example.evaluation_result["scenario_preview"] + == "Cancel my reservation please" + ) def _make_eval_result( diff --git a/vertexai/_genai/_evals_utils.py b/vertexai/_genai/_evals_utils.py index b5cd092179..d96fb0e71f 100644 --- a/vertexai/_genai/_evals_utils.py +++ b/vertexai/_genai/_evals_utils.py @@ -591,6 +591,244 @@ def _resolve_loss_analysis_config( return resolved_config +def _build_rubric_description_map( + eval_result: types.EvaluationResult, +) -> dict[str, str]: + """Builds a rubric_id -> description map from the EvaluationResult.""" + rubric_map: dict[str, str] = {} + for case_result in eval_result.eval_case_results or []: + for resp_cand in case_result.response_candidate_results or []: + for metric_res in (resp_cand.metric_results or {}).values(): + for verdict in metric_res.rubric_verdicts or []: + rubric = verdict.evaluated_rubric + if rubric and rubric.rubric_id and rubric.content: + if ( + rubric.content.property + and rubric.content.property.description + ): + rubric_map[rubric.rubric_id] = ( + rubric.content.property.description + ) + return rubric_map + + +def _extract_scenario_preview_from_dict( + eval_result_dict: dict[str, Any], +) -> Optional[str]: + """Extracts the first user message from an evaluation_result 
dict. + + Handles both snake_case (SDK-side) and camelCase (API echo-back) keys. + """ + request = eval_result_dict.get("request") + if not request: + return None + prompt = request.get("prompt") + if not prompt: + return None + # Try agent_data (snake_case or camelCase) + agent_data = prompt.get("agent_data") or prompt.get("agentData") + if agent_data and isinstance(agent_data, dict): + turns = agent_data.get("turns", []) + for turn in turns: + events = turn.get("events", []) + for event in events: + author = event.get("author", "") + content = event.get("content") + if author.lower() == "user" and content and isinstance(content, dict): + parts = content.get("parts", []) + for part in parts: + text = str(part.get("text", "")).strip() + if text: + if len(text) > 150: + return text[:150] + "..." + return text + # Try simple prompt path + parts = prompt.get("parts", []) + for part in parts: + text = str(part.get("text", "")).strip() + if text: + if len(text) > 150: + return text[:150] + "..." + return text + return None + + +def _extract_scenario_from_agent_data(agent_data: Any) -> Optional[str]: + """Extracts the first user message from an AgentData object or dict.""" + if agent_data is None: + return None + if hasattr(agent_data, "model_dump"): + agent_data = agent_data.model_dump() + if isinstance(agent_data, str): + try: + agent_data = json.loads(agent_data) + except (json.JSONDecodeError, ValueError): + return None + if not isinstance(agent_data, dict): + return None + turns = agent_data.get("turns", []) + if not isinstance(turns, list): + return None + for turn in turns: + if not isinstance(turn, dict): + continue + events = turn.get("events", []) + if not isinstance(events, list): + continue + for event in events: + if not isinstance(event, dict): + continue + author = event.get("author", "") + if not isinstance(author, str) or author.lower() != "user": + continue + content = event.get("content") + if not content or not isinstance(content, dict): + continue + parts = content.get("parts", []) + if not isinstance(parts, list): + continue + for part in parts: + if not isinstance(part, dict): + continue + text = str(part.get("text", "")).strip() + if text: + if len(text) > 150: + return text[:150] + "..." + return text + return None + + +def _truncate_scenario(text: str, max_len: int = 150) -> str: + """Truncates a scenario preview to max_len characters.""" + text = text.strip() + if len(text) > max_len: + return text[:max_len] + "..." + return text + + +def _build_scenario_preview_list( + eval_result: types.EvaluationResult, +) -> list[Optional[str]]: + """Builds an ordered list of scenario previews from the EvaluationResult. + + Returns one scenario preview per eval_case_result, in the same order as + eval_case_results. This extracts the first user message from the original + SDK EvaluationResult (via eval_cases or DataFrame), rather than relying + on the API echo-back which may not preserve the request data. + + Extraction priority per eval case: + 1. eval_case.agent_data → first user message in turns + 2. eval_case.user_scenario.starting_prompt + 3. eval_case.prompt → text content + 4. DataFrame agent_data column → first user message + 5. 
DataFrame starting_prompt column + """ + eval_dataset = eval_result.evaluation_dataset + eval_cases: list[Any] = [] + if isinstance(eval_dataset, list) and eval_dataset: + eval_cases = getv(eval_dataset[0], ["eval_cases"]) or [] + + eval_case_results = eval_result.eval_case_results or [] + scenarios: list[Optional[str]] = [] + + for case_result in eval_case_results: + case_idx = case_result.eval_case_index or 0 + scenario: Optional[str] = None + + eval_case = None + if 0 <= case_idx < len(eval_cases): + eval_case = eval_cases[case_idx] + + if eval_case: + # 1. Try agent_data (populated after run_inference) + agent_data = getv(eval_case, ["agent_data"]) + if agent_data: + scenario = _extract_scenario_from_agent_data(agent_data) + + # 2. Try user_scenario.starting_prompt (from + # generate_conversation_scenarios) + if scenario is None: + user_scenario = getv(eval_case, ["user_scenario"]) + if user_scenario: + starting_prompt = getv(user_scenario, ["starting_prompt"]) + if starting_prompt and isinstance(starting_prompt, str): + scenario = _truncate_scenario(starting_prompt) + + # 3. Try prompt text + if scenario is None: + prompt = getv(eval_case, ["prompt"]) + if prompt: + from . import _evals_data_converters + + text = _evals_data_converters._get_content_text(prompt) + if text: + scenario = _truncate_scenario(str(text)) + + # 4. Fallback: extract agent_data from DataFrame + if scenario is None and eval_dataset: + df_agent_data = _transformers._extract_agent_data_from_df( + eval_dataset, case_idx + ) + if df_agent_data is not None: + scenario = _extract_scenario_from_agent_data(df_agent_data) + + # 5. Fallback: extract starting_prompt from DataFrame + if scenario is None and eval_dataset: + ds = eval_dataset[0] if isinstance(eval_dataset, list) else eval_dataset + df = getv(ds, ["eval_dataset_df"]) + if df is not None and hasattr(df, "iloc"): + if 0 <= case_idx < len(df): + row = df.iloc[case_idx] + sp = row.get("starting_prompt") + if sp and isinstance(sp, str) and sp.strip(): + scenario = _truncate_scenario(sp) + + scenarios.append(scenario) + + return scenarios + + +def _enrich_loss_response_with_rubric_descriptions( + response: types.GenerateLossClustersResponse, + eval_result: types.EvaluationResult, +) -> None: + """Enriches loss response with rubric descriptions and scenario previews. + + Rubric descriptions and scenario previews are extracted from the original + SDK EvaluationResult object, because the API echo-back in + LossExample.evaluation_result may not preserve all request data (e.g., + agent_data turns with user messages). 
+ """ + rubric_map = _build_rubric_description_map(eval_result) + scenario_list = _build_scenario_preview_list(eval_result) + logger.debug( + "Enriching loss response: %d scenarios extracted, %d rubric" " descriptions", + sum(1 for s in scenario_list if s), + len(rubric_map), + ) + for result in response.results or []: + for cluster in result.clusters or []: + for example in cluster.examples or []: + if example.evaluation_result is None: + example.evaluation_result = {} + if rubric_map: + example.evaluation_result["rubric_descriptions"] = rubric_map + # Try extracting scenario from the API echo-back first + if "scenario_preview" not in example.evaluation_result: + scenario = _extract_scenario_preview_from_dict( + example.evaluation_result + ) + if scenario: + example.evaluation_result["scenario_preview"] = scenario + # Fallback: match against scenarios from original eval_result + if "scenario_preview" not in example.evaluation_result: + if scenario_list: + for s in scenario_list: + if s: + example.evaluation_result["scenario_preview"] = s + break + + def _poll_operation( api_client: BaseApiClient, operation: types.GenerateLossClustersOperation, diff --git a/vertexai/_genai/_evals_visualization.py b/vertexai/_genai/_evals_visualization.py index d9319f7406..0fed8fbb51 100644 --- a/vertexai/_genai/_evals_visualization.py +++ b/vertexai/_genai/_evals_visualization.py @@ -1491,6 +1491,314 @@ def display_evaluation_dataset(eval_dataset_obj: types.EvaluationDataset) -> Non display.display(display.HTML(html_content)) +def _get_loss_analysis_html(loss_analysis_json: str) -> str: + """Returns self-contained HTML for loss pattern analysis visualization.""" + payload_b64 = _encode_to_base64(loss_analysis_json) + return textwrap.dedent( + f""" + + + + + Loss Pattern Analysis + + + + +
+      <!-- inline stylesheet and rendering script (Base64 payload decode,
+           DOMPurify sanitization, extractScenarioPreview, Analysis Summary
+           and cluster/example views) not recoverable from this excerpt -->
+ + + +""" + ) + + +def display_loss_clusters_response( + response_obj: "types.GenerateLossClustersResponse", +) -> None: + """Displays a GenerateLossClustersResponse in an IPython environment.""" + if not _is_ipython_env(): + logger.warning("Skipping display: not in an IPython environment.") + return + else: + from IPython import display + + try: + result_dump = response_obj.model_dump(mode="json", exclude_none=True) + except Exception as e: + logger.error( + "Failed to serialize GenerateLossClustersResponse: %s", + e, + exc_info=True, + ) + raise + + html_content = _get_loss_analysis_html( + json.dumps(result_dump, ensure_ascii=False, default=_pydantic_serializer) + ) + display.display(display.HTML(html_content)) + + +def display_loss_analysis_result( + result_obj: "types.LossAnalysisResult", +) -> None: + """Displays a single LossAnalysisResult in an IPython environment.""" + if not _is_ipython_env(): + logger.warning("Skipping display: not in an IPython environment.") + return + else: + from IPython import display + + try: + # Wrap in a response-like structure for the shared HTML generator + wrapped = {"results": [result_obj.model_dump(mode="json", exclude_none=True)]} + except Exception as e: + logger.error( + "Failed to serialize LossAnalysisResult: %s", + e, + exc_info=True, + ) + raise + + html_content = _get_loss_analysis_html( + json.dumps(wrapped, ensure_ascii=False, default=_pydantic_serializer) + ) + display.display(display.HTML(html_content)) + + def _get_status_html(status: str, error_message: Optional[str] = None) -> str: """Returns a simple HTML string for displaying a status and optional error.""" error_html = "" diff --git a/vertexai/_genai/_transformers.py b/vertexai/_genai/_transformers.py index 4d694ab2d0..d02f388ec6 100644 --- a/vertexai/_genai/_transformers.py +++ b/vertexai/_genai/_transformers.py @@ -479,6 +479,11 @@ def t_inline_results( eval_rubric = getv(verdict, ["evaluated_rubric"]) if eval_rubric: + rubric_dict: dict[str, Any] = {} + rubric_id = getv(eval_rubric, ["rubric_id"]) + if rubric_id: + rubric_dict["rubric_id"] = str(rubric_id) + rubric_content = getv(eval_rubric, ["content"]) if rubric_content: text = getv(rubric_content, ["text"]) @@ -493,17 +498,16 @@ def t_inline_results( content_dict["property"] = { "description": str(desc) } - verdict_dict["evaluated_rubric"] = { - "content": content_dict - } + rubric_dict["content"] = content_dict + verdict_dict["evaluated_rubric"] = rubric_dict - score = getv(verdict, ["score"]) - if score is not None: - verdict_dict["score"] = float(score) + verdict_bool = getv(verdict, ["verdict"]) + if verdict_bool is not None: + verdict_dict["verdict"] = bool(verdict_bool) - explanation = getv(verdict, ["explanation"]) - if explanation: - verdict_dict["explanation"] = str(explanation) + reasoning = getv(verdict, ["reasoning"]) + if reasoning: + verdict_dict["reasoning"] = str(reasoning) if verdict_dict: api_rubric_verdicts.append(verdict_dict) diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index fe98d8c667..479bf14dd3 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -2759,6 +2759,9 @@ def generate_loss_clusters( raise RuntimeError( "Loss analysis operation completed but returned no response." 
) + _evals_utils._enrich_loss_response_with_rubric_descriptions( + completed.response, eval_result + ) return completed.response @_common.experimental_warning( @@ -4337,6 +4340,9 @@ async def generate_loss_clusters( raise RuntimeError( "Loss analysis operation completed but returned no response." ) + _evals_utils._enrich_loss_response_with_rubric_descriptions( + completed.response, eval_result + ) return completed.response @_common.experimental_warning( diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py index 97ecc9c3eb..bd70043256 100644 --- a/vertexai/_genai/types/common.py +++ b/vertexai/_genai/types/common.py @@ -4981,10 +4981,10 @@ class LossAnalysisResult(_common.BaseModel): ) def show(self) -> None: - """Shows the loss analysis result as a formatted pandas DataFrame.""" - from .. import _evals_utils + """Shows the loss analysis result with rich HTML visualization.""" + from .. import _evals_visualization - _evals_utils._display_loss_analysis_result(self) + _evals_visualization.display_loss_analysis_result(self) class LossAnalysisResultDict(TypedDict, total=False): @@ -5015,9 +5015,10 @@ class GenerateLossClustersResponse(_common.BaseModel): ) def show(self) -> None: - """Shows all loss analysis results.""" - for result in self.results or []: - result.show() + """Shows the loss pattern analysis report with rich HTML visualization.""" + from .. import _evals_visualization + + _evals_visualization.display_loss_clusters_response(self) class GenerateLossClustersResponseDict(TypedDict, total=False):
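
Usage sketch (not part of the patch): how the new visualization entry points are
reached end to end. The vertexai.Client constructor arguments and the eval_result
keyword are assumptions for illustration; the full generate_loss_clusters signature
is not shown in this diff, only that the completed operation's response is enriched
with the EvaluationResult before being returned.

    import vertexai

    # Assumed constructor; adjust project/location for your environment.
    client = vertexai.Client(project="my-project", location="us-central1")

    # An EvaluationResult produced by a prior client.evals.evaluate(...) call;
    # how it is produced is out of scope for this sketch.
    eval_result = ...  # types.EvaluationResult

    # Assumption: the EvaluationResult is passed in so the SDK can enrich the
    # response via _enrich_loss_response_with_rubric_descriptions (see the
    # evals.py hunk above).
    response = client.evals.generate_loss_clusters(eval_result=eval_result)

    # In an IPython/notebook environment this renders the rich HTML report;
    # outside IPython, display_loss_clusters_response logs a warning and returns.
    response.show()

    # A single LossAnalysisResult can also be rendered on its own.
    response.results[0].show()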
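The two scenario-extraction paths the enrichment relies on can be exercised
directly. A small sketch against the private helpers defined in the
_evals_utils.py hunk above (internal APIs, shown for illustration only):

    import json
    from vertexai._genai import _evals_utils

    # Path 1: the API echo-back, with camelCase keys as noted in the
    # _extract_scenario_preview_from_dict docstring.
    echo_back = {
        "request": {
            "prompt": {
                "agentData": {
                    "turns": [{
                        "events": [{
                            "author": "user",
                            "content": {"parts": [{"text": "Find flights to Paris"}]},
                        }],
                    }],
                },
            },
        },
    }
    assert (
        _evals_utils._extract_scenario_preview_from_dict(echo_back)
        == "Find flights to Paris"
    )

    # Previews longer than 150 characters are truncated with a trailing "...".
    long_prompt = {"request": {"prompt": {"parts": [{"text": "x" * 200}]}}}
    assert (
        _evals_utils._extract_scenario_preview_from_dict(long_prompt)
        == "x" * 150 + "..."
    )

    # Path 2: AgentData from the original EvaluationResult. A JSON-serialized
    # payload (e.g. a DataFrame cell persisted as a string) is accepted too,
    # and author matching is case-insensitive.
    serialized = json.dumps(
        {
            "turns": [{
                "events": [{
                    "author": "USER",
                    "content": {"parts": [{"text": "Cancel my booking"}]},
                }],
            }],
        }
    )
    assert (
        _evals_utils._extract_scenario_from_agent_data(serialized)
        == "Cancel my booking"
    )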