self-evolving-codegen/evolution/visualize.py at main · tathadn/self-evolving-codegen · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from __future__ import annotations

from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np

from evolution.models import GenerationMetrics

# Default root for chart output: the "experiments" folder that sits in the
# parent of the directory containing this module file.
_EXPERIMENTS_DIR = Path(__file__).parent.parent.joinpath("experiments")


def plot_evolution(
    history: list[GenerationMetrics],
    experiment_name: str,
    save_path: Path | None = None,
) -> Path:
    """Generate and save a multi-panel evolution performance chart.

    Creates four subplots:
    1. Overall score across generations
    2. Per-dimension breakdown (bug detection, false failure, etc.)
    3. Score delta from the first entry in ``history``
    4. Strength / weakness observation counts

    The figure is always closed before returning or raising, so a failure
    while drawing or saving does not leave a figure registered with pyplot.

    Args:
        history: Ordered list of GenerationMetrics from all logged generations.
        experiment_name: Used as the chart title and default save directory.
        save_path: Where to save the PNG (a ``Path`` or path-like string).
            Defaults to ``experiments/{experiment_name}/evolution_chart.png``.
            Missing parent directories are created.

    Returns:
        Path where the chart was saved.
    """
    if save_path is None:
        save_path = _EXPERIMENTS_DIR / experiment_name / "evolution_chart.png"
    else:
        # Tolerate plain strings so ``save_path.parent`` below cannot fail.
        save_path = Path(save_path)

    gens = [m.generation for m in history]
    overall = [m.overall_score for m in history]
    bug_detection = [m.bug_detection_rate for m in history]
    false_failure = [m.false_failure_rate for m in history]
    redundancy = [m.redundancy_rate for m in history]
    # Rescaled to 0-1 so they share an axis with the rate metrics; the formula
    # presumes these two are reported on a 1-10 scale (TODO confirm in models).
    coverage_q = [(m.coverage_quality - 1) / 9 for m in history]
    edge_case = [(m.edge_case_coverage - 1) / 9 for m in history]

    fig, axes = plt.subplots(2, 2, figsize=(13, 8))
    try:
        fig.suptitle(
            f"Evolution Performance: {experiment_name}", fontsize=14, fontweight="bold"
        )

        # ── Panel 1: Overall score ───────────────────────────────────────────
        ax1 = axes[0, 0]
        ax1.plot(gens, overall, "b-o", linewidth=2, markersize=6, label="Overall score")
        ax1.fill_between(gens, overall, alpha=0.15, color="blue")
        ax1.set_title("Overall Score", fontweight="bold")
        ax1.set_xlabel("Generation")
        ax1.set_ylabel("Score (0–1)")
        ax1.set_ylim(0, 1)
        ax1.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1))
        # Generations are discrete; avoid fractional ticks such as "0.5".
        ax1.xaxis.set_major_locator(mticker.MaxNLocator(integer=True))
        ax1.grid(True, alpha=0.3)
        if len(gens) > 1:
            # Mark the first generation that reached the maximum score.
            best_idx = overall.index(max(overall))
            best_gen = gens[best_idx]
            ax1.axvline(
                x=best_gen,
                color="green",
                linestyle="--",
                alpha=0.5,
                label=f"Best: gen {best_gen}",
            )
        ax1.legend(fontsize=8)

        # ── Panel 2: Per-dimension breakdown ────────────────────────────────
        ax2 = axes[0, 1]
        # Bars are positioned by index (not generation number) so the five
        # grouped bars stay evenly spaced; tick labels carry the real numbers.
        x = np.arange(len(gens))
        width = 0.15
        ax2.bar(x - 2 * width, bug_detection, width, label="Bug detection", color="steelblue")
        # "Lower is better" rates are inverted so that taller always means better.
        ax2.bar(
            x - width,
            [1 - v for v in false_failure],
            width,
            label="1 − False failure",
            color="seagreen",
        )
        ax2.bar(x, coverage_q, width, label="Coverage quality", color="darkorange")
        ax2.bar(x + width, edge_case, width, label="Edge case", color="mediumpurple")
        ax2.bar(
            x + 2 * width,
            [1 - v for v in redundancy],
            width,
            label="1 − Redundancy",
            color="crimson",
        )
        ax2.set_title("Per-Dimension Breakdown", fontweight="bold")
        ax2.set_xlabel("Generation")
        ax2.set_ylabel("Score (0–1)")
        ax2.set_xticks(x)
        ax2.set_xticklabels([str(g) for g in gens])
        ax2.set_ylim(0, 1)
        ax2.legend(fontsize=7, loc="lower right")
        ax2.grid(True, alpha=0.3, axis="y")

        # ── Panel 3: Score delta from generation 0 ──────────────────────────
        ax3 = axes[1, 0]
        if overall:
            # The first history entry is the baseline every bar is compared to.
            baseline = overall[0]
            deltas = [s - baseline for s in overall]
            colors = ["seagreen" if d >= 0 else "crimson" for d in deltas]
            ax3.bar(gens, deltas, color=colors, alpha=0.8)
            ax3.axhline(y=0, color="black", linewidth=0.8)
        ax3.set_title("Score Delta from Generation 0", fontweight="bold")
        ax3.set_xlabel("Generation")
        ax3.set_ylabel("Δ Score")
        ax3.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1, decimals=1))
        ax3.xaxis.set_major_locator(mticker.MaxNLocator(integer=True))
        ax3.grid(True, alpha=0.3, axis="y")

        # ── Panel 4: Strength / weakness observation counts ─────────────────
        ax4 = axes[1, 1]
        strength_counts = [len(m.strengths) for m in history]
        weakness_counts = [len(m.weaknesses) for m in history]
        ax4.plot(gens, strength_counts, "g-s", label="Strengths", linewidth=1.5, markersize=5)
        ax4.plot(gens, weakness_counts, "r-^", label="Weaknesses", linewidth=1.5, markersize=5)
        ax4.set_title("Strength / Weakness Observations", fontweight="bold")
        ax4.set_xlabel("Generation")
        ax4.set_ylabel("Count")
        # Both axes hold whole numbers (generation index, observation count).
        ax4.xaxis.set_major_locator(mticker.MaxNLocator(integer=True))
        ax4.yaxis.set_major_locator(mticker.MaxNLocator(integer=True))
        ax4.legend(fontsize=8)
        ax4.grid(True, alpha=0.3)

        # Lay out this figure explicitly rather than pyplot's "current" one.
        fig.tight_layout()

        save_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=150, bbox_inches="tight")
    finally:
        # Release the figure even if drawing or saving raised; pyplot keeps
        # every open figure alive in a global registry until it is closed.
        plt.close(fig)
    return save_path