diff --git a/.gitignore b/.gitignore index 71142fb..6d743b0 100644 --- a/.gitignore +++ b/.gitignore @@ -158,3 +158,21 @@ demos/**/*.nc # Cmake generated files CMakeUserPresets.json + +# Model artifacts (large binaries and generated plots — not tracked by default) +models/*.zip +models/*.pkl +models/*.png +models/*_meta.json +models/*_vecnorm.pkl + +# Pre-trained hybrid_cv01 models — explicitly tracked +!models/ppo_hcv01_*.zip +!models/ppo_hcv01_*_vecnorm.pkl +!models/ppo_hcv01_*_meta.json + +# Q-agent test artifacts +tests/test_working_directory/*.json + +# Local dev docs (not for the repo) +for_sig.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f2d90a..abebab6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,49 @@ The changelog format is based on [Keep a Changelog](https://keepachangelog.com/e ## [Unreleased] ### Added +* Five new `RewardConfig` fields for a principled derivatives-based reward design: + `angle` (-theta^2), `angular_velocity` (-theta_dot^2), `crane_velocity` (-x_dot^2), + `crane_acceleration` (-x_ddot^2), `angular_acceleration` (-theta_ddot^2). All default to 0.0 + for full backward compatibility with existing configs using `energy`. + Angular velocity uses pure theta_dot = `(cm_v[0] - origin_v[0]) / wire.length`, + excluding crane translation. Angular acceleration is computed via one-step + finite difference of theta_dot; zero on the first step after each episode reset. +* `AntiPendulumEnv` continuous observation `obs[3]` changed from absolute load + x-velocity (`wire.cm_v[0]`) to pure angular velocity theta_dot (rad/s), making the + observation independent of crane translation velocity. +* `experiments/derivatives_baseline.yaml`: starting config for the derivatives reward. +* `experiments/hybrid_cv01.yaml`: validated hybrid config (energy + crane_velocity + position + return). Seeds 2718, 3141, 31415 achieve 6/6 OOD generalisation at start_speed=7.0. 
+* `start_speed` field in `TrainingConfig` (default 1.0); wired through `train_ppo.py` + (`--start-speed`) and `play_ppo.py` (`--start-speed`). With `randomize_start=True` acts + as the upper bound of the per-episode speed sampling range `+-[min_speed, start_speed]`. +* `randomize_start` field in `TrainingConfig`; wired through both scripts. + `play_ppo.py` pre-parses `--model-path` to auto-load `randomize_start` from the model + sidecar; `--randomize-start` / `--no-randomize-start` override it. +* `RewardConfig`, `TrainingConfig`, and `ExperimentConfig` frozen dataclasses in new module + `src/crane_controller/experiment_config.py`. Replace the opaque `reward_fac` tuple with + named fields, eliminating the silent index-swap bug class. +* YAML experiment config support in `train_ppo.py` via `--config PATH`. + Missing YAML keys fall back to `RewardConfig`/`TrainingConfig` defaults. +* `--reward-fac ENERGY POSITIONAL TIME POSITION ACCELERATION` CLI override on `train_ppo.py`; + takes precedence over `--config`. +* JSON sidecar (`*_meta.json`) written alongside every saved model by `train_ppo.py` and read + automatically by `play_ppo.py` — reward weights follow the model without manual flags. +* `terminal_penalty` field in `RewardConfig`: one-time reward added on episode truncation + (OOB crash). Defaults to 0.0 (disabled). Used in `hybrid_cv01.yaml` as -5.0. +* `seed`, `ent_coef`, `learning_rate`, `clip_range`, `n_steps` parameters on + `ProximalPolicyOptimizationAgent.__init__` and corresponding CLI flags in `train_ppo.py`. * `gamma` parameter on `ProximalPolicyOptimizationAgent` (default 0.99) and `--gamma` CLI flag in `train_ppo.py` to configure the PPO discount factor without editing source code. +* `continuous_actions: bool` parameter on `AntiPendulumEnv` (default `False`). When `True`, the + action space is `Box([-1], [1])` and the action value is scaled by `acc` to produce crane + acceleration, enabling PPO to produce any acceleration in `[-acc, +acc]`. 
When `False` (default), + the action space remains `Discrete(3)` for full Q-agent backward compatibility. + `TrainingConfig.continuous_actions` (default `True`) and `--continuous-actions` / + `--no-continuous-actions` CLI flags in both `train_ppo.py` and `play_ppo.py` control this for + PPO workflows; Q-agent workflows pass `continuous_actions=False` explicitly. + `ppo_agent.do_one_episode()` updated to pass actions without casting to `int`, so both action + space types work correctly during inference. ### Fixed * `ProximalPolicyOptimizationAgent.load()` now applies a `TimeLimit` wrapper (max 3000 steps), @@ -36,6 +77,12 @@ The changelog format is based on [Keep a Changelog](https://keepachangelog.com/e vs training step as a PNG alongside the model after each training run. ### Changed +* `AntiPendulumEnv` parameter `size` renamed to `rail_limit`; `TrainingConfig.size` renamed to + `rail_limit`; `--size` CLI flag renamed to `--rail-limit`. Semantics unchanged: half-span of + the crane rail in metres (crane spans +-rail_limit). +* `show_plot()` rewritten with 6 individual subplots (load angle, load speed + damping curve, + crane position + origin line, crane speed, rewards, x-acceleration), replacing the previous + `twinx()`-based layout that caused overlapping scales and colliding legends. * Moved `logging.basicConfig` to the top of `main()` in `train_ppo.py` and `play_ppo.py` so logging is configured before any application logic runs. * Refactored `ProximalPolicyOptimizationAgent` API to separate training and inference concerns: diff --git a/README.rst b/README.rst index b9faa84..7363c1c 100644 --- a/README.rst +++ b/README.rst @@ -19,8 +19,31 @@ Environments (``crane-controller`` library). The agent controls horizontal crane acceleration and must either start or stop the pendulum motion. - - **Observation**: crane x-position, crane x-velocity, load polar angle, load x-velocity - - **Actions**: Discrete(3) — accelerate left / coast / accelerate right + .. 
code-block:: text + + -rail_limit 0 +rail_limit + | | | + ──────┼────────────────────────┼────────────────────────┼──── rail + ┌─────┴─────┐ + ← ẍ ────────────── │ crane │ ────────────── ẍ → + └─────┬─────┘ + │ obs[0] = x crane position + │ obs[1] = ẋ crane velocity + │ reward: −|x|, −ẋ² + L │ + │╲ θ + │ ╲ + │ ╲ + │ ● load + obs[2] = θ polar angle from vertical + obs[3] = θ̇ angular velocity (pure) + reward: KE + PE + + episode truncated (terminal_penalty) when |x| > rail_limit + + - **Observation**: crane x-position, crane x-velocity, load polar angle, pure angular velocity θ̇ (rad/s) + - **Actions**: ``Discrete(3)`` by default (Q-agent compatible) — accelerate left / coast / accelerate right; + ``Box([-1, 1])`` when ``continuous_actions=True`` (PPO default) — continuous acceleration command - **Modes**: *start* (build pendulum energy) or *stop* (dampen swing) ``ControlledCraneEnv`` @@ -93,16 +116,48 @@ Tests are suitable for CI/CD — no plot windows are produced. Training ^^^^^^^^ +Experiment configs +"""""""""""""""""" + +PPO training is driven by YAML experiment config files in ``experiments/``. +Each file encodes both reward weights and training hyperparameters: + +.. code-block:: yaml + + # experiments/hybrid_cv01.yaml + reward: + energy: 1.0 + crane_velocity: 0.1 + position: 0.02 + terminal_penalty: -5.0 + training: + steps: 3000000 + n_envs: 32 + gamma: 0.99 + n_steps: 4096 + rail_limit: 2.0 + randomize_start: true + start_speed: 1.0 + +Pass a config with ``--config PATH``; any key not present falls back to the dataclass defaults. +A JSON sidecar (``*_meta.json``) is written alongside every saved model so ``play_ppo.py`` +can reconstruct the environment automatically — no ``--config`` needed at playback time. + **PPO:** .. 
code-block:: shell - uv run python scripts/train_ppo.py + uv run python scripts/train_ppo.py --config experiments/hybrid_cv01.yaml \ + --save-path models/my_model.zip --seed 42 Key options: +- ``--config PATH`` — load a YAML experiment config (reward weights + training hyperparams) - ``--steps N`` — total training timesteps (default: 100 000) - ``--n-envs N`` — number of parallel environments (default: 4) +- ``--seed N`` — RNG seed for reproducibility +- ``--continuous-actions`` / ``--no-continuous-actions`` — use ``Box([-1,1])`` or ``Discrete(3)`` action space (default: continuous) +- ``--randomize-start`` / ``--start-speed SPEED`` — randomise initial crane speed up to ±SPEED each episode - ``--save-path PATH`` — where to write the trained model (default: ``models/ppo_AntiPendulumEnv.zip``) - ``--resume-from PATH`` — continue training from a saved checkpoint; preserves VecNormalize statistics and learning rate schedule - ``--dry-run`` — run 1 000 steps with a live reward-tracking plot and no model saved @@ -128,23 +183,54 @@ Playing Run a trained agent visually. Both scripts accept ``--render-mode`` with the following options: -- ``plot`` — 4-panel figure per episode (load angle, crane position/speed, rewards) +- ``plot`` — 6-panel figure per episode (load angle, load speed, crane position, crane speed, rewards, acceleration) - ``play-back`` — animated crane trajectory after each episode - ``reward-tracking`` — live reward line plot updating every step +Pre-trained models +"""""""""""""""""" + +Four pre-trained PPO models are included in ``models/`` (trained with ``experiments/hybrid_cv01.yaml``, +3M steps, 32 parallel envs): two action-space variants (Discrete and Box/continuous) across two +random seeds (42 and 5775). All generalise well beyond the training range across the full ±10 m/s +speed sweep (see ``docs/source/reward_comparison.md`` for detailed analysis). 
+ ++------------------------------------------+----------+------+ +| Model | Actions | Seed | ++==========================================+==========+======+ +| ``hybrid_cv01_disc_s42.zip`` | Discrete | 42 | ++------------------------------------------+----------+------+ +| ``hybrid_cv01_disc_s5775.zip`` | Discrete | 5775 | ++------------------------------------------+----------+------+ +| ``hybrid_cv01_s42.zip`` | Box | 42 | ++------------------------------------------+----------+------+ +| ``hybrid_cv01_s5775.zip`` | Box | 5775 | ++------------------------------------------+----------+------+ + +Each model bundle requires three files: ``.zip`` (policy), ``_vecnorm.pkl`` (observation +normalisation statistics), ``_meta.json`` (reward config + flags). The ``play_ppo.py`` +script locates the sidecar files automatically from ``--model-path``. + **PPO** (default render-mode: ``play-back``): .. code-block:: shell - uv run python scripts/play_ppo.py --model-path models/ppo_AntiPendulumEnv.zip - uv run python scripts/play_ppo.py --model-path models/ppo_AntiPendulumEnv.zip --render-mode plot --episodes 3 + uv run python scripts/play_ppo.py --model-path models/hybrid_cv01_disc_s42.zip --episodes 3 --render-mode plot + uv run python scripts/play_ppo.py --model-path models/hybrid_cv01_s42.zip --episodes 3 --render-mode plot + +OOD evaluation (randomised start speed, 7× training range): + +.. code-block:: shell + + uv run python scripts/play_ppo.py --model-path models/hybrid_cv01_disc_s42.zip \ + --episodes 6 --render-mode plot --randomize-start --start-speed 7.0 **Q-learning** (default render-mode: ``plot``): .. 
code-block:: shell - uv run python scripts/play_q.py --model-path models/q_AntiPendulumEnv.json - uv run python scripts/play_q.py --model-path tests/anti-pendulum.json --render-mode play-back --episodes 3 + uv run python scripts/play_q.py --model-path models/q_trained.json + uv run python scripts/play_q.py --model-path models/q_trained.json --render-mode play-back --episodes 3 Analysing ^^^^^^^^^ diff --git a/assets/ood_eval_continuous_1.png b/assets/ood_eval_continuous_1.png new file mode 100644 index 0000000..c8067a7 Binary files /dev/null and b/assets/ood_eval_continuous_1.png differ diff --git a/assets/ood_eval_continuous_2.png b/assets/ood_eval_continuous_2.png new file mode 100644 index 0000000..00421ff Binary files /dev/null and b/assets/ood_eval_continuous_2.png differ diff --git a/assets/ood_eval_discrete_1.png b/assets/ood_eval_discrete_1.png new file mode 100644 index 0000000..a5a5367 Binary files /dev/null and b/assets/ood_eval_discrete_1.png differ diff --git a/assets/ood_eval_discrete_2.png b/assets/ood_eval_discrete_2.png new file mode 100644 index 0000000..0dcc710 Binary files /dev/null and b/assets/ood_eval_discrete_2.png differ diff --git a/docs/source/_static/episode_cont_s42_v5p0.png b/docs/source/_static/episode_cont_s42_v5p0.png new file mode 100644 index 0000000..8abc1aa Binary files /dev/null and b/docs/source/_static/episode_cont_s42_v5p0.png differ diff --git a/docs/source/_static/episode_cont_s42_v9p0.png b/docs/source/_static/episode_cont_s42_v9p0.png new file mode 100644 index 0000000..d83eda3 Binary files /dev/null and b/docs/source/_static/episode_cont_s42_v9p0.png differ diff --git a/docs/source/_static/episode_disc_s42_v1p0.png b/docs/source/_static/episode_disc_s42_v1p0.png new file mode 100644 index 0000000..17c0c95 Binary files /dev/null and b/docs/source/_static/episode_disc_s42_v1p0.png differ diff --git a/docs/source/_static/episode_disc_s42_v5p0.png b/docs/source/_static/episode_disc_s42_v5p0.png new file mode 100644 index 
0000000..e78b8b3 Binary files /dev/null and b/docs/source/_static/episode_disc_s42_v5p0.png differ diff --git a/docs/source/_static/episode_disc_s42_v9p0.png b/docs/source/_static/episode_disc_s42_v9p0.png new file mode 100644 index 0000000..41dd408 Binary files /dev/null and b/docs/source/_static/episode_disc_s42_v9p0.png differ diff --git a/docs/source/_static/fig1_training_s5775.png b/docs/source/_static/fig1_training_s5775.png new file mode 100644 index 0000000..bdbe630 Binary files /dev/null and b/docs/source/_static/fig1_training_s5775.png differ diff --git a/docs/source/_static/fig2_sweep_s5775.png b/docs/source/_static/fig2_sweep_s5775.png new file mode 100644 index 0000000..b5a2ee6 Binary files /dev/null and b/docs/source/_static/fig2_sweep_s5775.png differ diff --git a/docs/source/_static/fig_sweep_s42.png b/docs/source/_static/fig_sweep_s42.png new file mode 100644 index 0000000..97d8ec6 Binary files /dev/null and b/docs/source/_static/fig_sweep_s42.png differ diff --git a/docs/source/_static/fig_training_s42.png b/docs/source/_static/fig_training_s42.png new file mode 100644 index 0000000..14610eb Binary files /dev/null and b/docs/source/_static/fig_training_s42.png differ diff --git a/docs/source/_static/hybrid_cv01_disc_s42_detail.png b/docs/source/_static/hybrid_cv01_disc_s42_detail.png new file mode 100644 index 0000000..8c1100d Binary files /dev/null and b/docs/source/_static/hybrid_cv01_disc_s42_detail.png differ diff --git a/docs/source/_static/hybrid_cv01_disc_s5775_detail.png b/docs/source/_static/hybrid_cv01_disc_s5775_detail.png new file mode 100644 index 0000000..6237818 Binary files /dev/null and b/docs/source/_static/hybrid_cv01_disc_s5775_detail.png differ diff --git a/docs/source/_static/hybrid_cv01_s42_detail.png b/docs/source/_static/hybrid_cv01_s42_detail.png new file mode 100644 index 0000000..7c3bf05 Binary files /dev/null and b/docs/source/_static/hybrid_cv01_s42_detail.png differ diff --git 
a/docs/source/_static/hybrid_cv01_s5775_detail.png b/docs/source/_static/hybrid_cv01_s5775_detail.png new file mode 100644 index 0000000..297e189 Binary files /dev/null and b/docs/source/_static/hybrid_cv01_s5775_detail.png differ diff --git a/docs/source/index.rst b/docs/source/index.rst index 4b9f634..354236d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -8,6 +8,7 @@ crane-controller Documentation :caption: Contents: README + reward_comparison assurance api CHANGELOG diff --git a/docs/source/reward_comparison.md b/docs/source/reward_comparison.md new file mode 100644 index 0000000..63edad8 --- /dev/null +++ b/docs/source/reward_comparison.md @@ -0,0 +1,444 @@ +# `hybrid_cv01` crane controller — seed sensitivity and action space comparison + +**Scope**: seeds 5775 and 42 · 3 000 000 training steps each · PPO (Stable-Baselines3) · continuous and discrete action spaces + +The `hybrid_cv01` reward was trained on the anti-pendulum crane environment with identical +PPO hyperparameters across two seeds (5775 and 42) and two action spaces (continuous and +discrete). This document covers training dynamics, seed sensitivity, and the effect of +discretising the action space on final position precision. + +--- + +## 1 Reward design + +The agent receives a dense signal at every step composed of several explicit penalty terms: +position offset from centre, crane velocity, crane acceleration, and total mechanical energy. +Each term has a hand-tuned weight. The agent is told *what to achieve* (stop here, stop +gently, use little energy) directly. + +$$r_t = \underbrace{-\!\left(g\,z_\mathrm{load} + \tfrac{1}{2}v_\mathrm{load}^2\right)}_{\text{pendulum energy}} - 0.1\,|x| - 0.1\,\dot{x}^2 + \delta_\mathrm{crash}\cdot(-5)$$ + +The *pendulum energy* term is the negative total mechanical energy of the load: most positive +(least negative) when the pendulum hangs stationary, decreasing as it swings. 
The $-|x|$ +and $-\dot{x}^2$ terms penalise crane offset and crane speed directly. The $-5$ terminal +penalty fires once on rail collision. + +*Notation: $x$ = crane position (m), $\dot{x}$ = crane velocity (m/s), +$z_\mathrm{load}$ = vertical coordinate of the pendulum bob with $g = 9.81$ m/s², +$v_\mathrm{load}$ = load speed (m/s).* + +--- + +## 2 Simulation environment + +### 2.1 Physical parameters + +| Parameter | Symbol | Value | +|---|---|---| +| Timestep | dt | 1.0 s / step | +| Max crane acceleration | $a_\text{max}$ | 0.1 m/s² | +| Rail limit | $x_\text{limit}$ | ±2.0 m | + +*dt = 1.0 s/step means one simulated second per control step. +Settle step and simulated seconds are numerically equal.* + +### 2.2 Episode and training parameters + +| Parameter | Value | +|---|---| +| max\_episode\_steps | 1 000 | +| Total training steps | 3 000 000 | +| randomize\_start | True (training) / False (evaluation) | +| Seeds | 5775 and 42 | + +### 2.3 PPO hyperparameters (Stable-Baselines3) + +| Parameter | Value | +|---|---| +| gamma | 0.99 | +| learning\_rate | 3 × 10⁻⁴ | +| n\_steps | 4 096 | +| n\_envs | 32 | +| clip\_range | 0.2 | +| ent\_coef | 0.0 | + +--- + +## 3 Metric glossary + +### 3.1 PPO algorithm metrics + +These quantities are produced by the PPO optimiser at each update step. They describe how +the *learning process itself* is behaving, not the crane directly. + +`explained_variance` +: The value function (critic network) continuously predicts the total future reward from the + current state. Explained variance measures how accurate those predictions are. + Think of it like the innovation in a Kalman filter: a value of **1.0** means perfect + predictions; **0.0** means the critic is no better than always guessing the average; + **negative** means the critic is actively worse than guessing — it is confident but wrong. + Healthy range once the agent is past the crash-avoidance phase: **0.5 – 0.9**. 
+ +`value_loss` +: The mean-squared error between the critic's predictions and the actual observed returns. + Lower is better. Healthy behaviour: starts high, falls monotonically, converges near zero. + *Trap*: a very low value_loss coexisting with negative explained_variance signals that + the critic has collapsed to predicting a near-constant for all states (zero variance in + predictions → low MSE, but zero predictive power). + +`approx_kl` +: KL divergence between the old policy and the updated policy after one optimisation step. + Measures how much the agent's decision-making changed this update. Healthy: small and + stable (0.01 – 0.05). Large values indicate an aggressive or destabilising update. + +`clip_fraction` +: PPO limits how large each policy update can be via a clipping ratio. This metric is the + fraction of training samples that hit that limiter — analogous to a saturation nonlinearity + in a controller. Healthy: 0.10 – 0.25. Consistently above 0.35 suggests the optimiser + is fighting against the clip and update quality may degrade. + +`entropy_loss` +: Stable-Baselines3 logs $-\text{entropy}$ of the policy distribution. More negative = + more random / exploratory. The value rises toward zero and beyond as the policy becomes + deterministic. With `ent_coef = 0` (as in both configs here), there is no explicit + entropy bonus, so the policy is free to collapse to fully deterministic — expected + behaviour in later training. + +`policy_gradient_loss` +: The PPO surrogate objective value. Negative means the gradient is pointing in a + direction that increases expected reward. Should remain stable and negative. + +### 3.2 Training performance metrics + +`ep_len_mean` +: Average episode length (simulation steps). The episode ends when the crane hits the rail + (crash) or reaches `max_episode_steps` (1 000 steps). Healthy: grows from ~20 (constant + crashing) to 1 000. 
+ +`rew_per_step` +: Total episode reward divided by episode length — a normalised reward signal that removes + the confounding effect of episode length changing over training. Healthy: increases + (becomes less negative) over time. + +`rail_hit_pct` +: Percentage of episodes that ended by the crane hitting the rail. Healthy: starts at 100% + (random policy crashes immediately), falls to 0% and stays there. + +### 3.3 Physical crane metrics + +These appear in the log only once some episodes survive without crashing. They measure what +the crane actually does at episode end, independent of the reward formulation. + +`mean_x_pos_abs` +: Mean absolute crane position from centre (metres). Falls toward 0 as training progresses. + +`mean_x_vel_abs` +: Mean absolute crane velocity (m/s). Falls toward 0 as the crane learns to stop cleanly. + +`mean_energy` +: Residual mechanical energy in the crane–pendulum system at episode end. Falls toward 0 + as motion ceases. + +`mean_theta_dot_abs` +: Mean absolute pendulum angular velocity (rad/s). Falls toward 0 as the pendulum damps + out. + +`mean_theta_dev` +: Mean angular deviation of the pendulum from the upright position. Falls toward 0 as the + pendulum stays balanced. + +`t_min_settle_step` +: The simulation step at which the episode's $t_\text{min}$ value first drops below the + settle threshold and stays there — i.e. the moment the crane is judged to be + *effectively at rest*. The simulation timestep is **dt = 1.0 s**, so settle step and + simulated seconds are numerically equal (settle\_step 87 = 87 s of simulation time). + Lower is faster convergence. + +--- + +## 4 Training dynamics — seed 5775 + +![Training curves for hybrid_cv01_s5775 — continuous and discrete](_static/fig1_training_s5775.png) + +*Figure 1. Twelve training metrics over 3 M steps for seed 5775 — continuous (blue, solid) +and discrete (orange, dashed). 
Key panels: rail\_hit%↓ (top-left), expl\_var↑ (row 2 col 1), +value\_loss↓ (row 2 col 2), |x|↓ (bottom row).* + +### 4.1 Summary + +| Metric | cont | disc | +|---|---|---| +| rail\_hit → 0% (permanent) | 850 k steps | **800 k steps** | +| ep\_len hits maximum (1 000) | 1.2 M steps | **850 k steps** | +| Value instabilities | 1 minor (1.35 M: EV → 0.019) | **none** | +| Explained variance at 3 M | 0.928 | **0.979** | +| Mean \|x\| at 3 M (training log) | 0.022 m | **≈ 0 m** | + +The discrete variant eliminates crashes 50 k steps earlier and never produces an instability +event. Its final EV (0.979) is the higher of the two s5775 variants. + +### 4.2 Phase 1 — Survival (0 – 700 k steps) + +The agent starts with a random policy. The crane hits the rail within roughly 20 steps +every episode. Episode length grows slowly as the agent learns to delay the inevitable +crash. Physical metrics are not meaningful here because all episodes end in crashes. + +### 4.3 Phase 2 — Crash elimination + +`hybrid_cv01` eliminates crashes decisively: rail_hit_pct falls from 100% to 0% by +**850 k steps** (continuous) / **800 k steps** (discrete) and never returns. The explicit +position-penalty terms give the agent a direct incentive to stay away from the rail at +every step. + +### 4.4 Phase 3 — Value function event (continuous only) + +When episode length first reaches `max_episode_steps` (1 000), the statistics of the reward +signal seen by the critic change suddenly: instead of a mix of short crash-terminated +episodes and longer ones, all episodes are now exactly the same length with small, steady +negative rewards. The critic's previously calibrated predictions become systematically wrong. + +This triggers a sharp downward spike in `expl_var`: explained_variance collapses toward zero +or below (negative means the critic is *confidently wrong*) and value_loss paradoxically drops +(the critic has collapsed to predicting a near-constant value for all states — low MSE, but +zero useful information content).
The single event at 1.35 M (EV → 0.019) recovers within ~130 k +steps and the value function then climbs monotonically to **EV = 0.928** by 3 M. +The discrete variant produces no such event. + +### 4.5 Phase 4 — Convergence (1.5 M → 3 M) + +Explained variance climbs monotonically in both variants. Policy becomes increasingly +deterministic (entropy_loss → 0). The discrete variant converges to EV = 0.979 vs +EV = 0.928 for continuous — a more precisely calibrated critic with less residual +uncertainty in the value estimates. + +**Why does EV matter?** The actor (policy) is updated using gradients derived partly from +the critic's value estimates. Higher EV means each policy update is based on a less biased +signal, producing tighter convergence. + +--- + +## 5 Speed sweep — seed 5775 + +`hybrid_cv01_s5775` was evaluated at 100 initial crane speeds uniformly spaced from −10.0 +to +10.0 m/s (step 0.2 m/s) with the pendulum initially at rest upright. No randomised +start. Each episode runs for the full 1 000-step budget. + +![Speed sweep comparison — hybrid_cv01_s5775, continuous vs discrete](_static/fig2_sweep_s5775.png) + +*Figure 2. Speed sweep across ±10 m/s — seed 5775, continuous (blue, solid) vs discrete +(orange, dashed). Top row: final crane position (cm), final crane velocity (m/s), settle +step vs |speed|. Bottom row: final pendulum angle (rad), final pendulum angular velocity +(rad/s), final crane acceleration (m/s²).* + +### 5.1 Summary + +| Metric | cont | disc | +|---|---|---| +| Crash-free episodes | 100/100 | 100/100 | +| Non-converging | 0 | **0** | +| Mean \|x\_pos\| | 0.64 cm | **≈ 0 cm** (machine ε) | +| Settle-step range | 28–143 steps | **6–87 steps** | +| Mean settle step | 95.6 | **49.2** | + +### 5.2 Robustness across the speed range + +Both variants are **100% crash-free** across the full ±10 m/s range. 
This is a consequence +of `randomize_start = true` during training: the agent was exposed to randomised initial +speeds (up to ±1 m/s, `start_speed: 1.0`) and the learned policy generalises cleanly beyond that range. + +Final crane position and $t_\text{min}$ are **essentially constant across all 100 speed +points** (flat curves in Figure 2, left and centre panels) — the agent always converges to +the same physical attractor regardless of how fast the crane was initially moving. + +### 5.3 Effect of discretisation on precision + +With a continuous action space the agent can select any acceleration in $[-a_\text{max}, ++a_\text{max}]$, including sub-optimal intermediate values that balance the position penalty +against other terms. With `Discrete(3)` there is no intermediate acceleration: every step is +either full brake, coast, or full drive. The agent must commit to a bang-bang sequence +that lands on $x = 0$ exactly, which the dense position-penalty term directly rewards. + +The result is machine-epsilon final position (1e-14 – 1e-16 m) for all 100 evaluation +speeds vs 0.64 cm for the continuous variant. + +### 5.4 Settle step and speed + +Settle step rises roughly linearly with |initial speed|: higher initial momentum requires +more braking time. With dt = 1.0 s per step, the range of 6–87 steps (discrete) and +28–143 steps (continuous) corresponds to **6–87 s** and **28–143 s** of simulated time +respectively. At speed = 10 m/s the theoretical bang-bang minimum is $v/a = 10/0.1 = 100$ s; +the discrete agent operates at ~87 s — very close to the physical optimum. + +### 5.5 Detailed sweep metrics + +![hybrid_cv01_s5775 continuous — detailed sweep](_static/hybrid_cv01_s5775_detail.png) + +*Figure 3. `hybrid_cv01_s5775` continuous — nine sweep metrics across ±10 m/s. +Top row: crash rate, reward per step, energy fraction. Middle row: settle step, final +crane position, final crane velocity.
Bottom row: final crane acceleration, pendulum +angle, pendulum angular velocity.* + +![hybrid_cv01_disc_s5775 — detailed sweep](_static/hybrid_cv01_disc_s5775_detail.png) + +*Figure 4. `hybrid_cv01_disc_s5775` (discrete) — nine sweep metrics across ±10 m/s. +Note the x\_pos\_m panel: all values at machine-epsilon level (1e-14 – 1e-16 m).* + +--- + +## 6 Seed 42 + +Seed 42 covers both continuous and discrete action spaces, following the same structure as +§4–§5 for seed 5775. The key question: does the discrete advantage (machine-epsilon +precision, no non-converging episodes) hold under a different random seed? + +### 6.1 Training dynamics + +![Training curves for hybrid_cv01_s42 — continuous and discrete](_static/fig_training_s42.png) + +*Figure 5. Twelve training metrics over 3 M steps for seed 42 — continuous (blue, solid) +and discrete (orange, dashed).* + +| Metric | cont | disc | +|---|---|---| +| rail\_hit → 0% (permanent) | **900 k steps** | 1 000 k steps | +| ep\_len hits maximum (1 000) | 950 k steps | 1 200 k steps | +| Value instabilities | none | none | +| Explained variance at 3 M | 0.886 | **0.931** | +| Mean \|x\| at 3 M (training log) | 0.032 m | **≈ 0 m** | + +For seed 42 the continuous variant eliminates crashes earlier (900 k vs 1 000 k), in +contrast to seed 5775 where discrete was faster. Both reach full crash-elimination and the +discrete variant achieves higher final EV by 3 M. + +### 6.2 Speed sweep + +![Speed sweep — hybrid_cv01_s42, continuous vs discrete](_static/fig_sweep_s42.png) + +*Figure 6. Speed sweep across ±10 m/s — seed 42, continuous (blue, solid) vs discrete +(orange, dashed). Top row: final crane position (cm), final crane velocity (m/s), settle +step vs |speed|. 
Bottom row: final pendulum angle (rad), final pendulum angular velocity +(rad/s), final crane acceleration (m/s²).* + +| Metric | cont | disc | +|---|---|---| +| Crash-free episodes | 100/100 | 100/100 | +| Non-converging | 4 (−9.0, −3.4, +3.4, +9.0 m/s) | **0** | +| Mean \|x\_pos\| | 1.43 cm | **≈ 0 cm** (machine ε) | +| Settle-step range | 29–1 000 steps | **6–81 steps** | +| Mean settle step | 109.4 | **47.8** | + +The discrete variant eliminates all non-converging episodes present in the continuous policy +and achieves machine-epsilon position for all 100 speeds — consistent with disc s5775. +Settle-step behaviour is nearly identical across seeds (6–87 for s5775 disc, 6–81 for s42 +disc), indicating that the bang-bang solution is well-determined by the physics rather than +the specific training trajectory. + +### 6.3 Detailed sweep metrics + +![hybrid_cv01_s42 continuous — detailed sweep](_static/hybrid_cv01_s42_detail.png) + +*Figure 7. `hybrid_cv01_s42` continuous — nine sweep metrics across ±10 m/s. +The four non-converging episodes appear as outliers in the settle-step and x\_vel panels.* + +![hybrid_cv01_disc_s42 — detailed sweep](_static/hybrid_cv01_disc_s42_detail.png) + +*Figure 8. `hybrid_cv01_disc_s42` (discrete) — nine sweep metrics across ±10 m/s. +All 100 episodes converge; x\_pos at machine-epsilon level throughout.* + +### 6.4 Episode trajectories + +The plots below show time-series for individual episodes: crane position, velocity, +pendulum angle, pendulum angular velocity, crane acceleration, and reward — one panel +per quantity, over the full episode. They complement the sweep figures by showing *how* +the agent moves, not just the final values. + +**A — Discrete, 1.0 m/s — fast bang-bang settle** + +![Episode: disc s42, start speed +1.0 m/s](_static/episode_disc_s42_v1p0.png) + +*Figure 9. `hybrid_cv01_disc_s42`, start speed +1.0 m/s, 250 steps. +Settles at step 28.
Three-phase bang-bang: full brake → coast → done.* + +**B — Continuous, 5.0 m/s — smooth mid-range convergence** + +![Episode: cont s42, start speed +5.0 m/s](_static/episode_cont_s42_v5p0.png) + +*Figure 10. `hybrid_cv01_s42` (continuous), start speed +5.0 m/s, 200 steps. +Settles at step 87, final position −1.4 cm. Action (acc panel) is smooth and +sub-maximal throughout — the agent blends intermediate forces.* + +**C — Discrete, 5.0 m/s — bang-bang contrast at same speed** + +![Episode: disc s42, start speed +5.0 m/s](_static/episode_disc_s42_v5p0.png) + +*Figure 11. `hybrid_cv01_disc_s42`, start speed +5.0 m/s, 200 steps. +Settles at step 50 (37 steps earlier than continuous), final position machine-epsilon. +Action is always ±0.1 or 0 — no intermediate values.* + +**D — Continuous, 9.0 m/s — non-converging failure** + +![Episode: cont s42, start speed +9.0 m/s](_static/episode_cont_s42_v9p0.png) + +*Figure 12. `hybrid_cv01_s42` (continuous), start speed +9.0 m/s, 400 steps. +Does not settle within budget: `t_min_settle_step = 400` (budget exhausted). +Final position +1.1 cm; crane continues oscillating at episode end.* + +**E — Discrete, 9.0 m/s — converges where continuous fails** + +![Episode: disc s42, start speed +9.0 m/s](_static/episode_disc_s42_v9p0.png) + +*Figure 13. `hybrid_cv01_disc_s42`, start speed +9.0 m/s, 200 steps. 
+Settles at step 76, final position machine-epsilon — clean convergence at the +same speed where the continuous policy fails to settle.* + +--- + +## 7 Conclusion + +### 7.1 Quantitative summary + +| Metric | s5775 · cont | s5775 · disc | s42 · cont | s42 · disc | +|---|---|---|---|---| +| Time to crash-free | 850 k steps | **800 k steps** | 900 k steps | 1 000 k steps | +| EV at 3 M | 0.928 | **0.979** | 0.886 | 0.931 | +| Final position | 0.64 cm | **≈ 0 cm** | 1.43 cm | **≈ 0 cm** | +| Settle range | 28–143 steps | **6–87 steps** | 29–1 000 steps† | **6–81 steps** | +| Non-converging | 0 | **0** | 4\* | **0** | + +\* 4 episodes exceed budget (speeds −9.0, −3.4, +3.4, +9.0 m/s). +† settle range includes the 4 non-converging episodes. + +### 7.2 Training consistency + +Both seeds eliminate crashes by 850–900 k steps (continuous) — early and reproducibly. The +dense, multi-term reward provides unambiguous per-step feedback on position, velocity, and +energy simultaneously, giving the value function a well-constrained learning target. A +single minor value function event at 1.35 M steps (EV → 0.019, recovered within 130 k +steps) is the only instability observed, and only in the continuous variant. + +### 7.3 Seed robustness + +Performance degrades modestly across seeds (0.64 → 1.43 cm position, both 100% crash-free +for continuous). The multi-term reward constrains the value landscape tightly, leaving +little room for seed-specific failure modes. + +### 7.4 Effect of discretisation + +Switching to `Discrete(3)` (bang-bang actions) consistently improves final precision and +EV across both seeds, confirmed by four independent training runs. + +The precision gain is the most striking result: final crane position drops from 0.64–1.43 cm +(continuous) to machine-epsilon level (1e-14 – 1e-16 m, discrete) for every evaluated +speed in both seeds. 
With a continuous action space the agent can settle at sub-optimal +intermediate forces; with `Discrete(3)` it must commit to a bang-bang sequence that lands +precisely at $x = 0$, which the dense position-penalty term directly rewards. + +Mean settle step roughly halves: 95.6 → 49.2 steps (s5775) and 109.4 → 47.8 steps (s42). +The converged settle ranges are nearly identical across seeds (6–87 and 6–81 steps), indicating +the bang-bang solution is determined by the physics rather than by the specific training +trajectory. The discrete variant also eliminates the non-converging episodes that appear in +continuous s42 (4 speeds fail to settle within the 1 000-step budget). + +--- + +*Analysis covers `hybrid_cv01` across seeds 5775 and 42, continuous and discrete action spaces.* diff --git a/experiments/baseline.yaml b/experiments/baseline.yaml new file mode 100644 index 0000000..712a2e7 --- /dev/null +++ b/experiments/baseline.yaml @@ -0,0 +1,32 @@ +# Baseline PPO config matching the original 3-factor reward defaults. +# energy=1.0, positional=0.0015 (braking-distance penalty), time=0.0 (disabled). +# All new derivative reward terms are zero for backward compatibility. 
+reward: + energy: 1.0 + positional: 0.0015 + time: 0.0 + position: 0.0 + acceleration: 0.0 + terminal_penalty: 0.0 + angle: 0.0 + angular_velocity: 0.0 + crane_velocity: 0.0 + crane_acceleration: 0.0 + angular_acceleration: 0.0 + t_min_crane: 0.0 +training: + steps: 100000 + n_envs: 4 + gamma: 0.99 + save_path: models/ppo_baseline.zip + seed: null + ent_coef: 0.0 + learning_rate: 0.0003 + clip_range: 0.2 + n_steps: 2048 + randomize_start: false + rail_limit: 10.0 + start_speed: 1.0 + continuous_actions: true + reward_limit: 50.0 + max_episode_steps: 1000 diff --git a/experiments/hybrid_cv01.yaml b/experiments/hybrid_cv01.yaml new file mode 100644 index 0000000..5487554 --- /dev/null +++ b/experiments/hybrid_cv01.yaml @@ -0,0 +1,35 @@ +# Validated hybrid reward: energy + crane_velocity penalty + position return. +# energy=1.0 (KE+PE physics signal), crane_velocity=0.1 (-x_dot^2 penalty to +# damp trolley oscillation), position=0.1 (-|x| to encourage return to origin). +# terminal_penalty=-5.0 provides a one-shot crash signal propagated ~100 steps +# back by gamma=0.99. Trained with randomize_start=True on speeds +-[0.1, 1.0]. +# Seeds 2718, 3141, 31415 achieve 6/6 OOD generalisation at start_speed=7.0. 
+reward: + energy: 1.0 + positional: 0.0 + time: 0.0 + position: 0.1 + acceleration: 0.0 + terminal_penalty: -5.0 + angle: 0.0 + angular_velocity: 0.0 + crane_velocity: 0.1 + crane_acceleration: 0.0 + angular_acceleration: 0.0 + t_min_crane: 0.0 +training: + steps: 3000000 + n_envs: 32 + gamma: 0.99 + save_path: models/hybrid_cv01.zip + seed: null + ent_coef: 0.0 + learning_rate: 0.0003 + clip_range: 0.2 + n_steps: 4096 + randomize_start: true + rail_limit: 2.0 + start_speed: 1.0 + continuous_actions: true + reward_limit: 50.0 + max_episode_steps: 1000 diff --git a/experiments/hybrid_t_min.yaml b/experiments/hybrid_t_min.yaml new file mode 100644 index 0000000..948c0e9 --- /dev/null +++ b/experiments/hybrid_t_min.yaml @@ -0,0 +1,32 @@ +# hybrid_cv01 reward terms + t_min_crane to push policy past the ~0.76s plateau. +# t_min_crane: 0.01 gives ~-7.6/episode at hybrid's plateau (t_min≈0.76, 1000 steps) +# vs position's ~-2.0/episode — noticeable without dominating early learning. +reward: + energy: 1.0 + positional: 0.0 + time: 0.0 + position: 0.1 + acceleration: 0.0 + terminal_penalty: -5.0 + angle: 0.0 + angular_velocity: 0.0 + crane_velocity: 0.1 + crane_acceleration: 0.0 + angular_acceleration: 0.0 + t_min_crane: 0.01 +training: + steps: 3000000 + n_envs: 32 + gamma: 0.99 + save_path: models/hybrid_t_min.zip + seed: null + ent_coef: 0.0 + learning_rate: 0.0003 + clip_range: 0.2 + n_steps: 4096 + randomize_start: true + rail_limit: 2.0 + start_speed: 1.0 + continuous_actions: true + reward_limit: 50.0 + max_episode_steps: 1000 diff --git a/experiments/sig_t_min.yaml b/experiments/sig_t_min.yaml new file mode 100644 index 0000000..57384d9 --- /dev/null +++ b/experiments/sig_t_min.yaml @@ -0,0 +1,34 @@ +# Experimental config: t_min_crane reward term (Sig's suggestion). +# Replaces position + crane_velocity with a single physics-grounded signal: +# the optimal bang-bang time for the crane to return to x=0 at rest. 
+# Weight 0.05 gives a max contribution of ~0.45 at rail_limit=2 (~0.09 per 0.01 of +# weight), which stays below the energy term (~1.0 scale). +reward: + energy: 1.0 + positional: 0.0 + time: 0.0 + position: 0.0 + acceleration: 0.0 + terminal_penalty: -5.0 + angle: 0.0 + angular_velocity: 0.0 + crane_velocity: 0.0 + crane_acceleration: 0.0 + angular_acceleration: 0.0 + t_min_crane: 0.05 +training: + steps: 3000000 + n_envs: 32 + gamma: 0.99 + save_path: models/sig_t_min.zip + seed: null + ent_coef: 0.0 + learning_rate: 0.0003 + clip_range: 0.2 + n_steps: 4096 + randomize_start: true + rail_limit: 2.0 + start_speed: 1.0 + continuous_actions: true + reward_limit: 50.0 + max_episode_steps: 1500 diff --git a/models/hybrid_cv01_disc_s42_log.csv b/models/hybrid_cv01_disc_s42_log.csv new file mode 100644 index 0000000..239a884 --- /dev/null +++ b/models/hybrid_cv01_disc_s42_log.csv @@ -0,0 +1,61 @@ +t,ep_len_mean,rew_per_step,approx_kl,explained_variance,value_loss,entropy_loss,clip_fraction,policy_gradient_loss,rail_hit_pct,mean_t_min,mean_x_pos_abs,mean_x_vel_abs,mean_energy,mean_theta_dot_abs,mean_theta_dev +50016.0,20.4,-0.5105555338235295,nan,nan,nan,nan,nan,nan,100.0,nan,nan,nan,nan,nan,nan +100032.0,18.67,-0.5242523631494376,nan,nan,nan,nan,nan,nan,100.0,nan,nan,nan,nan,nan,nan +150048.0,17.71,-0.5241668910220215,0.033554304391145706,-0.2681431770324707,0.2044517067086417,-1.068036717385985,0.48807449340820314,-0.03902413431305831,100.0,nan,nan,nan,nan,nan,nan +200064.0,16.02,-0.5658758888888888,0.033554304391145706,-0.2681431770324707,0.2044517067086417,-1.068036717385985,0.48807449340820314,-0.03902413431305831,100.0,nan,nan,nan,nan,nan,nan +250080.0,14.31,-0.6006180125786164,0.033554304391145706,-0.2681431770324707,0.2044517067086417,-1.068036717385985,0.48807449340820314,-0.03902413431305831,100.0,nan,nan,nan,nan,nan,nan 
+300096.0,16.55,-0.5229616604229608,0.026069462299346924,0.39024239778518677,0.1318199211051251,-1.0237781328876736,0.33490524291992185,-0.0376950311819769,100.0,nan,nan,nan,nan,nan,nan +350112.0,17.85,-0.515253794397759,0.026069462299346924,0.39024239778518677,0.1318199211051251,-1.0237781328876736,0.33490524291992185,-0.0376950311819769,100.0,nan,nan,nan,nan,nan,nan +400128.0,30.54,-0.3260843592010478,0.02368207648396492,0.48463332653045654,0.15182137419451464,-0.9913665066647809,0.36537017822265627,-0.044230324086140624,100.0,nan,nan,nan,nan,nan,nan +450144.0,33.71,-0.2848624402254524,0.02368207648396492,0.48463332653045654,0.15182137419451464,-0.9913665066647809,0.36537017822265627,-0.044230324086140624,100.0,nan,nan,nan,nan,nan,nan +500160.0,21.62,-0.42640937095282144,0.02368207648396492,0.48463332653045654,0.15182137419451464,-0.9913665066647809,0.36537017822265627,-0.044230324086140624,100.0,nan,nan,nan,nan,nan,nan +550176.0,54.67,-0.21083030290835925,0.014933966100215912,0.4393414258956909,0.4689795028672961,-0.958125732949702,0.32913055419921877,-0.042873450910406063,100.0,nan,nan,nan,nan,nan,nan +600192.0,50.7,-0.22666301499013805,0.014933966100215912,0.4393414258956909,0.4689795028672961,-0.958125732949702,0.32913055419921877,-0.042873450910406063,100.0,nan,nan,nan,nan,nan,nan +650208.0,43.88,-0.2325827796262534,0.014933966100215912,0.4393414258956909,0.4689795028672961,-0.958125732949702,0.32913055419921877,-0.042873450910406063,100.0,nan,nan,nan,nan,nan,nan +700224.0,150.69,-0.11528773395713052,0.012642787769436836,0.3881152868270874,0.8889686459748191,-0.9154438308469253,0.2649085998535156,-0.0374862883147216,100.0,nan,nan,nan,nan,nan,nan +750240.0,189.77,-0.10210809284923854,0.012642787769436836,0.3881152868270874,0.8889686459748191,-0.9154438308469253,0.2649085998535156,-0.0374862883147216,99.35691318327974,2.414213562373119,0.1499999999999999,2.7755575615628914e-17,0.0014907232713495286,0.005346838949178003,0.007683506382722527 
+800256.0,153.73,-0.10495480751967737,0.01374843716621399,0.3233485221862793,1.0028781875458663,-0.8578718372766161,0.21318588256835938,-0.028068786258461386,99.6124031007752,2.4142135623733547,1.8318679906315083e-14,0.10000000000000003,0.004333535329098114,0.000690289661758417,0.005762149399616945 +850272.0,542.74,-0.05928048196189704,0.01374843716621399,0.3233485221862793,1.0028781875458663,-0.8578718372766161,0.21318588256835938,-0.028068786258461386,73.19587628865979,2.8022664023228945,0.23076923076923347,0.05384615384615386,0.0083451236257705,0.013263837069256938,0.00836803608341868 +900288.0,513.53,-0.05899510334352425,0.01374843716621399,0.3233485221862793,1.0028781875458663,-0.8578718372766161,0.21318588256835938,-0.028068786258461386,76.13636363636364,4.025129457187161,0.3571428571428626,0.05238095238095238,0.007343433126974982,0.012039645649286187,0.009986570360941341 +950304.0,697.66,-0.049012007854828996,0.016287298873066902,0.25908762216567993,0.4705975066999599,-0.7762779462296748,0.17516555786132812,-0.018184773669003108,35.294117647058826,3.219044501484203,0.2409090909090947,0.05000000000000002,0.006323780916201888,0.009594075802920097,0.009241410809007998 +1000320.0,890.65,-0.04046462701397855,0.016287298873066902,0.25908762216567993,0.4705975066999599,-0.7762779462296748,0.17516555786132812,-0.018184773669003108,0.0,3.1950328024693393,0.21860465116278896,0.058139534883720964,0.007232437956200422,0.01297305697734919,0.008239604152854879 +1050336.0,978.63,-0.035493818000674415,0.019605591893196106,0.09912896156311035,0.03455129158760428,-0.7002643317973707,0.20930099487304688,-0.022214559429073688,7.017543859649122,2.782369172369947,0.20566037735849085,0.03773584905660379,0.006203210373521285,0.011217819727894849,0.008367781274049732 
+1100352.0,971.46,-0.03215195438824038,0.019605591893196106,0.09912896156311035,0.03455129158760428,-0.7002643317973707,0.20930099487304688,-0.022214559429073688,2.127659574468085,2.717172844112933,0.19782608695652235,0.03478260869565219,0.005075626562958133,0.010382732916421672,0.007164298516865571 +1150368.0,990.18,-0.027990475681189276,0.019605591893196106,0.09912896156311035,0.03455129158760428,-0.7002643317973707,0.20930099487304688,-0.022214559429073688,0.0,2.6368297733474897,0.18400000000000072,0.03200000000000001,0.0028826272275210905,0.008372706076280195,0.007375474329347424 +1200384.0,1000.0,-0.026378024949999992,0.020384063944220543,0.14645177125930786,0.005603884250473356,-0.6429918361885939,0.2043212890625,-0.023433840820644036,0.0,2.414567931666805,0.15961538461538477,0.032692307692307694,0.002974563335819041,0.00770542721515627,0.007101684977559227 +1250400.0,1000.0,-0.023809068019999994,0.020384063944220543,0.14645177125930786,0.005603884250473356,-0.6429918361885939,0.2043212890625,-0.023433840820644036,0.0,2.1304294597300935,0.13260869565217384,0.019565217391304346,0.0030975722678427083,0.007444877500255148,0.005137017625309999 +1300416.0,1000.0,-0.020760008160000003,0.020384063944220543,0.14645177125930786,0.005603884250473356,-0.6429918361885939,0.2043212890625,-0.023433840820644036,0.0,2.091125116889792,0.1300000000000004,0.023333333333333334,0.003320633633635518,0.008134032344793839,0.0064614514271483054 +1350432.0,1000.0,-0.020005630620000003,0.015999868512153625,0.2268970012664795,0.0034789866793460077,-0.5800913424376631,0.18323135375976562,-0.02126378985793167,0.0,1.8535176476662745,0.11666666666666667,0.023809523809523808,0.0028129436607668783,0.007919713974374302,0.005942598695070776 
+1400448.0,1000.0,-0.01816549462,0.015999868512153625,0.2268970012664795,0.0034789866793460077,-0.5800913424376631,0.18323135375976562,-0.02126378985793167,0.0,1.9976657496869943,0.10357142857142856,0.025,0.00292929902356943,0.008061866734983377,0.00632055025994499 +1450464.0,1000.0,-0.017387194170000002,0.012264963239431381,0.28028225898742676,0.002953011705929498,-0.5246789926022757,0.1715667724609375,-0.02063093408992813,0.0,2.300587478147985,0.13999999999999999,0.026666666666666665,0.0026359201425626114,0.00783737869432628,0.005784330445257948 +1500480.0,1000.0,-0.01599304801,0.012264963239431381,0.28028225898742676,0.002953011705929498,-0.5246789926022757,0.1715667724609375,-0.02063093408992813,0.0,1.365413556837453,0.0862745098039216,0.011764705882352941,0.002111461337415043,0.006127736129284633,0.004755724100261328 +1550496.0,1000.0,-0.01426567767,0.012264963239431381,0.28028225898742676,0.002953011705929498,-0.5246789926022757,0.1715667724609375,-0.02063093408992813,0.0,1.5885375279273137,0.08775510204081637,0.014285714285714285,0.002287063266180396,0.006630838123430339,0.006530433303682433 +1600512.0,1000.0,-0.013807208779999998,0.019821252673864365,0.33004438877105713,0.0024798079426375354,-0.42355793149326926,0.151751708984375,-0.019435017672185494,0.0,1.2310550576226862,0.060416666666666695,0.010416666666666666,0.0018016674519060005,0.005458288625800975,0.004777518723722267 +1650528.0,1000.0,-0.01090351956,0.019821252673864365,0.33004438877105713,0.0024798079426375354,-0.42355793149326926,0.151751708984375,-0.019435017672185494,0.0,1.0872872211977889,0.052631578947368446,0.012280701754385965,0.001953451383198494,0.0060944219677480345,0.0035582693190224645 +1700544.0,1000.0,-0.008902147400000001,0.019821252673864365,0.33004438877105713,0.0024798079426375354,-0.42355793149326926,0.151751708984375,-0.019435017672185494,0.0,1.187136479389355,0.05714285714285716,0.016666666666666666,0.0015898721740539063,0.006203069364726379,0.004868370287055092 
+1750560.0,1000.0,-0.006953873780000002,0.02212357521057129,0.33627647161483765,0.0019282856252829106,-0.2800353410246316,0.10193252563476562,-0.014820773995643322,0.0,0.3478775405637245,0.015517241379310352,0.006896551724137932,0.0005868792122298343,0.002965519372786086,0.0022690835056163603 +1800576.0,1000.0,-0.00441644805,0.02212357521057129,0.33627647161483765,0.0019282856252829106,-0.2800353410246316,0.10193252563476562,-0.014820773995643322,0.0,0.4347007154209897,0.0186046511627907,0.004651162790697674,0.0005922644517785141,0.0026219592651767895,0.0021033514312686825 +1850592.0,1000.0,-0.00313582518,0.004543771967291832,0.5235401391983032,0.0013361499851107794,-0.09415871094515751,0.0302947998046875,-0.005561586053738665,0.0,0.07272728120872962,0.0036363636363636437,0.0,0.00015899852066298442,0.0011255874141837183,0.0013912465917897277 +1900608.0,1000.0,-0.0026143243499999994,0.004543771967291832,0.5235401391983032,0.0013361499851107794,-0.09415871094515751,0.0302947998046875,-0.005561586053738665,0.0,0.09391944387772724,0.0021276595744680903,0.002127659574468085,0.00022554983469086657,0.0011787445949855322,0.0007680745976539386 +1950624.0,1000.0,-0.00178721588,0.004543771967291832,0.5235401391983032,0.0013361499851107794,-0.09415871094515751,0.0302947998046875,-0.005561586053738665,0.0,0.09008599582884003,0.0020408163265306163,0.0020408163265306124,7.763655787176514e-05,0.000696648684373019,0.0005883968415822134 +2000640.0,1000.0,-0.0017833081900000003,0.0011808275012299418,0.6151938438415527,0.0008568925245972636,-0.03738750446457288,0.010208892822265624,-0.0025103175318406555,0.0,0.037735860372833385,0.0018867924528301987,0.0,0.0001429777188900495,0.0004887644833149518,0.0004511514787317175 
+2050656.0,1000.0,-0.0016348133100000004,0.0011808275012299418,0.6151938438415527,0.0008568925245972636,-0.03738750446457288,0.010208892822265624,-0.0025103175318406555,0.0,5.1831125691528214e-09,4.3175339846533866e-18,0.0,4.6844255212817e-06,0.00010628313119707057,0.00019362631300309612 +2100672.0,1000.0,-0.0014209135899999998,0.0008211996173486114,0.689454197883606,0.0005961046291809602,-0.02262274484678528,0.006377410888671875,-0.0017469712890658684,0.0,0.09082483626572166,0.0033333333333333396,0.0016666666666666668,0.0003132526526883233,0.0010934333984114221,0.0005849854742490778 +2150688.0,1000.0,-0.0014721842199999996,0.0008211996173486114,0.689454197883606,0.0005961046291809602,-0.02262274484678528,0.006377410888671875,-0.0017469712890658684,0.0,6.199071511279121e-09,5.163828021512356e-18,0.0,3.570349981586555e-06,7.950060847375712e-05,0.00011805135018493159 +2200704.0,1000.0,-0.00139343286,0.0008211996173486114,0.689454197883606,0.0005961046291809602,-0.02262274484678528,0.006377410888671875,-0.0017469712890658684,0.0,6.664001874625055e-09,5.551115123125783e-18,0.0,1.571534008770973e-05,0.00018595122538324425,0.0003728315195349794 +2250720.0,1000.0,-0.0013109658899999998,0.0010077476035803556,0.7472338676452637,0.00046536089823197014,-0.016669508023380786,0.00546722412109375,-0.0017007665545413885,0.0,0.05248291004898218,5.430438707405657e-18,0.002173913043478261,3.728296797095126e-05,0.00041807950799265783,0.0001775949181514445 +2300736.0,1000.0,-0.0013249675000000002,0.0010077476035803556,0.7472338676452637,0.00046536089823197014,-0.016669508023380786,0.00546722412109375,-0.0017007665545413885,0.0,6.664001874625055e-09,5.551115123125783e-18,0.0,2.9801831740049043e-06,8.283807081630186e-05,0.00011160485667284448 
+2350752.0,1000.0,-0.0012476264300000002,0.0010077476035803556,0.7472338676452637,0.00046536089823197014,-0.016669508023380786,0.00546722412109375,-0.0017007665545413885,0.0,6.533335171201034e-09,5.442269728554689e-18,0.0,7.71593960675507e-06,7.238238955245502e-05,4.7396672096080516e-05 +2400768.0,1000.0,-0.00127612288,0.0007149702287279069,0.823580265045166,0.0003018626460027041,-0.00900472547146478,0.003609466552734375,-0.0012863654332363693,0.0,6.5191322686549456e-09,5.430438707405657e-18,0.0,4.460726188949706e-06,5.538195178373655e-05,7.904308021349131e-05 +2450784.0,1000.0,-0.00113769717,0.0007149702287279069,0.823580265045166,0.0003018626460027041,-0.00900472547146478,0.003609466552734375,-0.0012863654332363693,0.0,2.8237296078919727e-09,2.3521674250532977e-18,0.0,3.6461989521846698e-09,1.8464535395782704e-06,4.994992688258498e-06 +2500800.0,1000.0,-0.00096220701,0.000718112918548286,0.8251644968986511,0.00018915658374498296,-0.005925362584120819,0.00265350341796875,-0.0006987140186744511,0.0,3.173334226011931e-09,2.64338815386942e-18,0.0,3.3949709190818447e-06,6.90786318792971e-05,1.3892869987171727e-05 +2550816.0,1000.0,-0.00096667258,0.000718112918548286,0.8251644968986511,0.00018915658374498296,-0.005925362584120819,0.00265350341796875,-0.0006987140186744511,0.0,0.04311096063594931,3.469446951953614e-18,0.0017857142857142859,2.7925098116216762e-05,0.00027883596979514383,6.474360181813703e-05 +2600832.0,1000.0,-0.0009266339800000002,0.000718112918548286,0.8251644968986511,0.00018915658374498296,-0.005925362584120819,0.00265350341796875,-0.0006987140186744511,0.0,4.543637641789811e-09,3.784851220313033e-18,0.0,8.278686837130764e-12,2.3149584402605422e-07,1.888397546213515e-07 +2650848.0,1000.0,-0.00089634034,0.0003722534456755966,0.8743447661399841,0.00013119237621695491,-0.0037503693166353715,0.001851654052734375,-0.0006094957999575712,0.0,6.1703721061343105e-09,5.13992141030165e-18,0.0,1.2440586283064217e-05,6.80451079812406e-05,7.62172719119249e-05 
+2700864.0,1000.0,-0.0010359020299999999,0.0003722534456755966,0.8743447661399841,0.00013119237621695491,-0.0037503693166353715,0.001851654052734375,-0.0006094957999575712,0.0,9.216172805332524e-09,7.677074106450551e-18,0.0,2.520802072200139e-12,1.877744097083169e-07,2.0674044231643838e-07 +2750880.0,1000.0,-0.0010998116899999999,0.0003722534456755966,0.8743447661399841,0.00013119237621695491,-0.0037503693166353715,0.001851654052734375,-0.0006094957999575712,0.0,4.080001147729626e-09,3.398641912117826e-18,0.0,2.8553363079660455e-12,1.9233338672297828e-07,1.4016501821332702e-07 +2800896.0,1000.0,-0.00116435146,0.0005043481942266226,0.9278879165649414,9.89920950401138e-05,-0.0038611036153376687,0.00232696533203125,-0.000403893684330825,0.0,1.0058870754151026e-08,8.3790416952842e-18,0.0,2.3904065562663303e-12,1.8438488548362868e-07,1.6896425784712515e-07 +2850912.0,1000.0,-0.0010800735300000002,0.0005043481942266226,0.9278879165649414,9.89920950401138e-05,-0.0038611036153376687,0.00232696533203125,-0.000403893684330825,0.0,2.897392119402198e-09,2.413528314402514e-18,0.0,2.1923382247912788e-12,1.7933272495139422e-07,1.9399791507523428e-07 +2900928.0,1000.0,-0.0009440282300000001,0.00029863620875403285,0.9308943152427673,7.32951451053676e-05,-0.0032281664260869645,0.00157318115234375,-0.0002561717339162328,0.0,2.258983686313578e-09,1.8817339400426382e-18,0.0,4.39032700626905e-12,1.888084343860547e-07,2.9352846014436854e-07 +2950944.0,1000.0,-0.0009240552200000001,0.00029863620875403285,0.9308943152427673,7.32951451053676e-05,-0.0032281664260869645,0.00157318115234375,-0.0002561717339162328,0.0,5.424187572369232e-09,4.518349518823312e-18,0.0,2.2055870362302984e-12,1.8655140825956325e-07,1.6361592560385394e-07 
+3000960.0,1000.0,-0.00100350651,0.00029863620875403285,0.9308943152427673,7.32951451053676e-05,-0.0032281664260869645,0.00157318115234375,-0.0002561717339162328,0.0,7.269820226863697e-09,6.0557619525008536e-18,0.0,2.3125244638064693e-12,1.8784515581561647e-07,1.5869956396292134e-07 diff --git a/models/hybrid_cv01_disc_s42_play_results.csv b/models/hybrid_cv01_disc_s42_play_results.csv new file mode 100644 index 0000000..5d15d45 --- /dev/null +++ b/models/hybrid_cv01_disc_s42_play_results.csv @@ -0,0 +1,101 @@ +start_speed,ep_steps,ep_reward,terminated,truncated,no_crash,t_min_start,t_min_min,t_min_final,t_min_mean_last100,t_min_settle_step,x_pos_final,x_vel_final,theta_final,theta_dot_final,energy_final,acc_final +-10.0,1000,-263.8599287001415,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,81,8.326672684688674e-17,0.0,3.141592653589793,4.33053459595212e-07,9.376764943369095e-12,0.0 +-9.8,1000,-247.6077370735904,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,80,-2.7755575615628914e-17,0.0,3.1415925343805036,4.458551977278417e-07,9.93934286704664e-12,0.0 +-9.6,1000,-236.623691804693,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,73,-2.7755575615628914e-17,0.0,3.1415923593154385,6.875309889534762e-08,2.363494303856725e-13,0.0 +-9.4,1000,-225.3698117525316,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,70,-2.7755575615628914e-17,0.0,3.1415923157571024,-1.8175191489940127e-07,1.65168792847996e-12,0.0 +-9.2,1000,-214.8314136350382,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,76,-2.7755575615628914e-17,0.0,3.1415918381000068,1.2417600105969035e-07,7.709839619588108e-13,0.0 
+-9.0,1000,-201.00212467008006,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,76,8.326672684688674e-17,0.0,3.141592312161298,-3.556463266677085e-07,6.324215483611722e-12,0.0 +-8.8,1000,-186.8339353482046,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,75,2.7755575615628914e-17,0.0,3.141592364261578,1.9710216753822997e-07,1.9424632224134236e-12,0.0 +-8.6,1000,-176.67503488857307,False,True,True,0.0,0.0,0.0,0.0,67,0.0,0.0,3.1415925410884933,2.1671836160813136e-07,2.348342412905639e-12,0.0 +-8.4,1000,-167.14586103702527,False,True,True,0.0,0.0,0.0,0.0,67,0.0,0.0,3.141592182845154,-4.675484259879939e-09,1.093007653219253e-15,0.0 +-8.2,1000,-157.6601773260604,False,True,True,0.0,0.0,0.0,0.0,60,0.0,0.0,3.141592392500357,-2.685379268298118e-07,3.6056309073026675e-12,0.0 +-8.0,1000,-148.67452242142645,False,True,True,0.0,0.0,0.0,0.0,77,0.0,0.0,3.1415925570192313,-4.450137865081124e-07,9.901863509114392e-12,0.0 +-7.8,1000,-139.87746359231352,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,75,-2.7755575615628914e-17,0.0,3.1415921866339045,2.310696285888099e-07,2.6696586628085277e-12,0.0 +-7.6,1000,-131.1145861888769,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,64,-2.7755575615628914e-17,0.0,3.1415925978347534,4.0820183523789217e-07,8.331436914579163e-12,0.0 +-7.4,1000,-123.13570432338118,False,True,True,0.0,2.6023759268521414e-07,1.0520895190692225e-06,1.0255204118271738e-06,64,-2.7672308888782027e-14,-2.7755575615628914e-17,3.1415918767260753,-4.857945397778376e-08,1.1799816745246404e-13,0.0 +-7.2,1000,-114.9362282369884,False,True,True,0.0,2.5156055442632117e-07,1.0520895190692225e-06,1.0255204118271738e-06,60,-2.7672308888782027e-14,-2.7755575615628914e-17,3.141592305399681,-3.878223046510863e-07,7.520306999351642e-12,0.0 
+-7.0,1000,-103.59693405121104,False,True,True,0.0,1.6989937826421605e-07,1.0356039022535481e-06,1.008596516592447e-06,60,2.681188604469753e-14,2.7755575615628914e-17,3.1415924161043627,-2.2604941187119294e-07,2.55491683030287e-12,0.0 +-6.8,1000,-96.30324052826306,False,True,True,0.0,1.6989937826421605e-07,1.0356039022535481e-06,1.008596516592447e-06,60,2.681188604469753e-14,2.7755575615628914e-17,3.141592195030919,3.8924161700264523e-07,7.575451820449733e-12,0.0 +-6.6,1000,-85.52275894674503,False,True,True,0.0,3.3320009373125275e-08,6.664001874625055e-08,6.664001874625056e-08,58,1.1102230246251565e-16,0.0,3.1415925461359917,-2.533736142238673e-07,3.2099094192432556e-12,0.0 +-6.4,1000,-78.17610973442827,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,62,2.7755575615628914e-17,0.0,3.141592495890726,3.339964727596319e-07,5.577682190793777e-12,0.0 +-6.2,1000,-71.98159175461227,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,63,2.7755575615628914e-17,0.0,3.1415922864643715,2.1840396268222038e-08,2.3850145457648355e-14,0.0 +-6.0,1000,-66.17786697101644,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,60,2.7755575615628914e-17,0.0,3.1415922795736155,-2.4857588759394917e-07,3.0894985946559824e-12,0.0 +-5.8,1000,-60.3132548556531,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,44,2.7755575615628914e-17,0.0,3.1415925558763447,-6.034874589508041e-07,1.820985565554492e-11,0.0 +-5.6,1000,-54.98682195668338,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,44,2.7755575615628914e-17,0.0,3.1415922250680457,-5.37156664324108e-08,1.442686410139012e-13,0.0 +-5.4,1000,-50.02093675567057,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,48,2.7755575615628914e-17,0.0,3.141592595877844,4.35583115798302e-07,9.486632538427849e-12,0.0 
+-5.2,1000,-45.23719922406917,False,True,True,0.0,0.0,0.0,0.0,42,0.0,0.0,3.1415923193915942,3.976928032578098e-08,7.907978288152753e-14,0.0 +-5.0,1000,-40.88837632314286,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,50,-2.7755575615628914e-17,0.0,3.1415923833072097,-7.013854697228173e-08,2.4597078856914857e-13,0.0 +-4.8,1000,-36.43344214761157,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,41,-2.7755575615628914e-17,0.0,3.141592452010621,-2.7885730333266913e-07,3.888069781098413e-12,0.0 +-4.6,1000,-32.64491342570548,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,57,-2.7755575615628914e-17,0.0,3.1415923154286314,-1.281007660121411e-07,8.204903126448661e-13,0.0 +-4.4,1000,-28.90327723409404,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,35,-2.7755575615628914e-17,0.0,3.1415926088863095,-1.5249081755185682e-07,1.1626724718816843e-12,0.0 +-4.2,1000,-25.82555065174364,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,52,2.7755575615628914e-17,0.0,3.1415926141650266,-2.725869464322383e-07,3.7151821682625965e-12,0.0 +-4.0,1000,-22.447011794458618,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,38,2.7755575615628914e-17,0.0,3.1415924114745817,2.3494938060819792e-07,2.7600605724087923e-12,0.0 +-3.8,1000,-19.613687095801417,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,43,2.7755575615628914e-17,0.0,3.1415924876574848,9.536406292352989e-08,4.5471522486414843e-13,0.0 +-3.6,1000,-17.048115162149085,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,52,8.326672684688674e-17,0.0,3.1415926170895516,-3.194353403010168e-07,5.101946831661321e-12,0.0 
+-3.4,1000,-14.002514011540525,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,32,2.7755575615628914e-17,0.0,3.141592344592738,-1.111620317466556e-07,6.178498651022235e-13,0.0 +-3.2,1000,-11.917554527037627,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,44,2.7755575615628914e-17,0.0,3.1415924015882912,4.764556427471083e-09,1.1350498975278002e-15,0.0 +-3.0,1000,-10.060889262285713,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,38,2.7755575615628914e-17,0.0,3.141592442329372,-1.0306204529548227e-07,5.31089259024402e-13,0.0 +-2.8,1000,-8.307338110998021,False,True,True,0.0,0.0,0.0,0.0,38,0.0,0.0,3.1415924850023993,-1.5846471442157646e-07,1.2555532858355891e-12,0.0 +-2.6,1000,-6.94631020860515,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,35,2.7755575615628914e-17,0.0,3.1415924087387297,-8.288893640196295e-08,3.4352878889243296e-13,0.0 +-2.4,1000,-5.688697584374456,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,38,2.7755575615628914e-17,0.0,3.141592409192578,3.523658242252528e-08,6.208083704097088e-14,0.0 +-2.2,1000,-4.388114433686466,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,22,-2.7755575615628914e-17,0.0,3.1415922725160845,-1.2893768427899746e-07,8.312463213615215e-13,0.0 +-2.0,1000,-3.331282956191044,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,19,-2.7755575615628914e-17,0.0,3.141592368513794,-2.2081925099581205e-07,2.4380570805175723e-12,0.0 +-1.8,1000,-2.5140216861671716,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,20,-2.7755575615628914e-17,0.0,3.1415924723093434,1.1647373519908047e-08,6.783065495612757e-15,0.0 
+-1.6,1000,-2.0161793840623363,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,25,-2.7755575615628914e-17,0.0,3.1415923623492756,-1.1762769589207763e-07,6.918137420439549e-13,0.0 +-1.4,1000,-1.3956410175241682,False,True,True,0.0,0.0,0.0,0.0,20,0.0,0.0,3.1415925747402595,1.7140849245403974e-07,1.46904356426833e-12,0.0 +-1.2,1000,-0.9957910514372736,False,True,True,0.0,0.0,0.0,0.0,17,0.0,0.0,3.141592566701839,1.923586886289907e-07,1.8500932545532497e-12,0.0 +-1.0,1000,-0.7712151479606123,False,True,True,0.0,0.0,0.0,0.0,28,0.0,0.0,3.141592409192578,4.801083673263294e-08,1.1525202218837684e-13,0.0 +-0.8,1000,-0.4832509260527756,False,True,True,0.0,0.0,0.0,0.0,19,0.0,0.0,3.1415924951882768,-2.469556175395125e-07,3.049353851716099e-12,0.0 +-0.6,1000,-0.2717892762488744,False,True,True,0.0,0.0,0.0,0.0,6,0.0,0.0,3.141592611442945,-2.999193535243144e-07,4.497580930922133e-12,0.0 +-0.4,1000,-0.16640265400940085,False,True,True,0.0,0.0,0.0,0.0,10,0.0,0.0,3.141592517833754,5.90618976425867e-08,1.7441538765716943e-13,0.0 +-0.2,1000,-0.08525909107377136,False,True,True,0.0,0.0,0.0,0.0,6,0.0,0.0,3.141592509888199,-5.408643939226397e-08,1.462671463066522e-13,0.0 +0.2,1000,-0.08525909107377136,False,True,True,0.0,0.0,0.0,0.0,6,0.0,0.0,3.141592509888199,-5.408643939226397e-08,1.462671463066522e-13,0.0 +0.4,1000,-0.16640265400940085,False,True,True,0.0,0.0,0.0,0.0,10,0.0,0.0,3.141592517833754,5.90618976425867e-08,1.7441538765716943e-13,0.0 +0.6,1000,-0.2717892762488744,False,True,True,0.0,0.0,0.0,0.0,6,0.0,0.0,3.141592611442945,-2.999193535243144e-07,4.497580930922133e-12,0.0 +0.8,1000,-0.4832509260527756,False,True,True,0.0,0.0,0.0,0.0,19,0.0,0.0,3.1415924951882768,-2.469556175395125e-07,3.049353851716099e-12,0.0 +1.0,1000,-0.7712151479606123,False,True,True,0.0,0.0,0.0,0.0,28,0.0,0.0,3.141592409192578,4.801083673263294e-08,1.1525202218837684e-13,0.0 
+1.2,1000,-0.9957910514372736,False,True,True,0.0,0.0,0.0,0.0,17,0.0,0.0,3.141592566701839,1.923586886289907e-07,1.8500932545532497e-12,0.0 +1.4,1000,-1.3956410175241682,False,True,True,0.0,0.0,0.0,0.0,20,0.0,0.0,3.1415925747402595,1.7140849245403974e-07,1.46904356426833e-12,0.0 +1.6,1000,-2.0161793840623363,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,25,-2.7755575615628914e-17,0.0,3.1415923623492756,-1.1762769589207763e-07,6.918137420439549e-13,0.0 +1.8,1000,-2.5140216861671716,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,20,-2.7755575615628914e-17,0.0,3.1415924723093434,1.1647373519908047e-08,6.783065495612757e-15,0.0 +2.0,1000,-3.331282956191044,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,19,-2.7755575615628914e-17,0.0,3.141592368513794,-2.2081925099581205e-07,2.4380570805175723e-12,0.0 +2.2,1000,-4.388114433686466,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,22,-2.7755575615628914e-17,0.0,3.1415922725160845,-1.2893768427899746e-07,8.312463213615215e-13,0.0 +2.4,1000,-5.688697584374456,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,38,2.7755575615628914e-17,0.0,3.141592409192578,3.523658242252528e-08,6.208083704097088e-14,0.0 +2.6,1000,-6.94631020860515,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,35,2.7755575615628914e-17,0.0,3.1415924087387297,-8.288893640196295e-08,3.4352878889243296e-13,0.0 +2.8,1000,-8.307338110998021,False,True,True,0.0,0.0,0.0,0.0,38,0.0,0.0,3.1415924850023993,-1.5846471442157646e-07,1.2555532858355891e-12,0.0 +3.0,1000,-10.060889262285713,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,38,2.7755575615628914e-17,0.0,3.141592442329372,-1.0306204529548227e-07,5.31089259024402e-13,0.0 
+3.2,1000,-11.917554527037627,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,44,2.7755575615628914e-17,0.0,3.1415924015882912,4.764556427471083e-09,1.1350498975278002e-15,0.0 +3.4,1000,-14.002514011540525,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,32,2.7755575615628914e-17,0.0,3.141592344592738,-1.111620317466556e-07,6.178498651022235e-13,0.0 +3.6,1000,-17.048115162149085,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,52,8.326672684688674e-17,0.0,3.1415926170895516,-3.194353403010168e-07,5.101946831661321e-12,0.0 +3.8,1000,-19.613687095801417,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,43,2.7755575615628914e-17,0.0,3.1415924876574848,9.536406292352989e-08,4.5471522486414843e-13,0.0 +4.0,1000,-22.447011794458618,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,38,2.7755575615628914e-17,0.0,3.1415924114745817,2.3494938060819792e-07,2.7600605724087923e-12,0.0 +4.2,1000,-25.82555065174364,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,52,2.7755575615628914e-17,0.0,3.1415926141650266,-2.725869464322383e-07,3.7151821682625965e-12,0.0 +4.4,1000,-28.90327723409404,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,35,-2.7755575615628914e-17,0.0,3.1415926088863095,-1.5249081755185682e-07,1.1626724718816843e-12,0.0 +4.6,1000,-32.64491342570548,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,57,-2.7755575615628914e-17,0.0,3.1415923154286314,-1.281007660121411e-07,8.204903126448661e-13,0.0 +4.8,1000,-36.43344214761157,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,41,-2.7755575615628914e-17,0.0,3.141592452010621,-2.7885730333266913e-07,3.888069781098413e-12,0.0 
+5.0,1000,-40.88837632314286,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,50,-2.7755575615628914e-17,0.0,3.1415923833072097,-7.013854697228173e-08,2.4597078856914857e-13,0.0 +5.2,1000,-45.23719922406917,False,True,True,0.0,0.0,0.0,0.0,42,0.0,0.0,3.1415923193915942,3.976928032578098e-08,7.907978288152753e-14,0.0 +5.4,1000,-50.02093675567057,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,48,2.7755575615628914e-17,0.0,3.141592595877844,4.35583115798302e-07,9.486632538427849e-12,0.0 +5.6,1000,-54.98682195668338,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,44,2.7755575615628914e-17,0.0,3.1415922250680457,-5.37156664324108e-08,1.442686410139012e-13,0.0 +5.8,1000,-60.3132548556531,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,44,2.7755575615628914e-17,0.0,3.1415925558763447,-6.034874589508041e-07,1.820985565554492e-11,0.0 +6.0,1000,-66.17786697101644,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,60,2.7755575615628914e-17,0.0,3.1415922795736155,-2.4857588759394917e-07,3.0894985946559824e-12,0.0 +6.2,1000,-71.98159175461227,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,63,2.7755575615628914e-17,0.0,3.1415922864643715,2.1840396268222038e-08,2.3850145457648355e-14,0.0 +6.4,1000,-78.17610973442827,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,62,2.7755575615628914e-17,0.0,3.141592495890726,3.339964727596319e-07,5.577682190793777e-12,0.0 +6.6,1000,-85.52275894674503,False,True,True,0.0,3.3320009373125275e-08,6.664001874625055e-08,6.664001874625056e-08,58,1.1102230246251565e-16,0.0,3.1415925461359917,-2.533736142238673e-07,3.2099094192432556e-12,0.0 
+6.8,1000,-96.30324052826306,False,True,True,0.0,1.6989937826421605e-07,1.0356039022535481e-06,1.008596516592447e-06,60,2.681188604469753e-14,2.7755575615628914e-17,3.141592195030919,3.8924161700264523e-07,7.575451820449733e-12,0.0 +7.0,1000,-103.59693405121104,False,True,True,0.0,1.6989937826421605e-07,1.0356039022535481e-06,1.008596516592447e-06,60,2.681188604469753e-14,2.7755575615628914e-17,3.1415924161043627,-2.2604941187119294e-07,2.55491683030287e-12,0.0 +7.2,1000,-114.9362282369884,False,True,True,0.0,2.5156055442632117e-07,1.0520895190692225e-06,1.0255204118271738e-06,60,-2.7672308888782027e-14,-2.7755575615628914e-17,3.141592305399681,-3.878223046510863e-07,7.520306999351642e-12,0.0 +7.4,1000,-123.13570432338118,False,True,True,0.0,2.6023759268521414e-07,1.0520895190692225e-06,1.0255204118271738e-06,64,-2.7672308888782027e-14,-2.7755575615628914e-17,3.1415918767260753,-4.857945397778376e-08,1.1799816745246404e-13,0.0 +7.6,1000,-131.1145861888769,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,64,-2.7755575615628914e-17,0.0,3.1415925978347534,4.0820183523789217e-07,8.331436914579163e-12,0.0 +7.8,1000,-139.87746359231352,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,75,-2.7755575615628914e-17,0.0,3.1415921866339045,2.310696285888099e-07,2.6696586628085277e-12,0.0 +8.0,1000,-148.67452242142645,False,True,True,0.0,0.0,0.0,0.0,77,0.0,0.0,3.1415925570192313,-4.450137865081124e-07,9.901863509114392e-12,0.0 +8.2,1000,-157.6601773260604,False,True,True,0.0,0.0,0.0,0.0,60,0.0,0.0,3.141592392500357,-2.685379268298118e-07,3.6056309073026675e-12,0.0 +8.4,1000,-167.14586103702527,False,True,True,0.0,0.0,0.0,0.0,67,0.0,0.0,3.141592182845154,-4.675484259879939e-09,1.093007653219253e-15,0.0 +8.6,1000,-176.67503488857307,False,True,True,0.0,0.0,0.0,0.0,67,0.0,0.0,3.1415925410884933,2.1671836160813136e-07,2.348342412905639e-12,0.0 
+8.8,1000,-186.8339353482046,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,75,2.7755575615628914e-17,0.0,3.141592364261578,1.9710216753822997e-07,1.9424632224134236e-12,0.0 +9.0,1000,-201.00212467008006,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,76,8.326672684688674e-17,0.0,3.141592312161298,-3.556463266677085e-07,6.324215483611722e-12,0.0 +9.2,1000,-214.8314136350382,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,76,-2.7755575615628914e-17,0.0,3.1415918381000068,1.2417600105969035e-07,7.709839619588108e-13,0.0 +9.4,1000,-225.3698117525316,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,70,-2.7755575615628914e-17,0.0,3.1415923157571024,-1.8175191489940127e-07,1.65168792847996e-12,0.0 +9.6,1000,-236.623691804693,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,73,-2.7755575615628914e-17,0.0,3.1415923593154385,6.875309889534762e-08,2.363494303856725e-13,0.0 +9.8,1000,-247.6077370735904,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,80,-2.7755575615628914e-17,0.0,3.1415925343805036,4.458551977278417e-07,9.93934286704664e-12,0.0 +10.0,1000,-263.8599287001415,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,81,8.326672684688674e-17,0.0,3.141592653589793,4.33053459595212e-07,9.376764943369095e-12,0.0 diff --git a/models/hybrid_cv01_disc_s5775_log.csv b/models/hybrid_cv01_disc_s5775_log.csv new file mode 100644 index 0000000..496498a --- /dev/null +++ b/models/hybrid_cv01_disc_s5775_log.csv @@ -0,0 +1,61 @@ +t,ep_len_mean,rew_per_step,approx_kl,explained_variance,value_loss,entropy_loss,clip_fraction,policy_gradient_loss,rail_hit_pct,mean_t_min,mean_x_pos_abs,mean_x_vel_abs,mean_energy,mean_theta_dot_abs,mean_theta_dev 
+50016.0,18.47,-0.5164860812127775,nan,nan,nan,nan,nan,nan,100.0,nan,nan,nan,nan,nan,nan +100032.0,19.42,-0.5208478609680741,nan,nan,nan,nan,nan,nan,100.0,nan,nan,nan,nan,nan,nan +150048.0,32.68,-0.3209917347001224,0.023738441988825798,0.16160213947296143,0.11848635091891993,-1.0793914558191318,0.3620849609375,-0.02988441463764957,100.0,nan,nan,nan,nan,nan,nan +200064.0,29.12,-0.35124760679945055,0.023738441988825798,0.16160213947296143,0.11848635091891993,-1.0793914558191318,0.3620849609375,-0.02988441463764957,100.0,nan,nan,nan,nan,nan,nan +250080.0,27.25,-0.3958385677064221,0.023738441988825798,0.16160213947296143,0.11848635091891993,-1.0793914558191318,0.3620849609375,-0.02988441463764957,100.0,nan,nan,nan,nan,nan,nan +300096.0,55.2,-0.22189591920289856,0.02237531915307045,0.5035263299942017,0.1923814533620316,-1.023446420149412,0.3641815185546875,-0.04216754654511305,100.0,nan,nan,nan,nan,nan,nan +350112.0,58.21,-0.20996829548187598,0.02237531915307045,0.5035263299942017,0.1923814533620316,-1.023446420149412,0.3641815185546875,-0.04216754654511305,100.0,nan,nan,nan,nan,nan,nan +400128.0,75.78,-0.1777599158089206,0.015065750107169151,0.44713079929351807,0.4328656866280653,-0.9596294373419368,0.29744949340820315,-0.03844324619346935,100.0,nan,nan,nan,nan,nan,nan +450144.0,140.87,-0.1238829911265706,0.015065750107169151,0.44713079929351807,0.4328656866280653,-0.9596294373419368,0.29744949340820315,-0.03844324619346935,99.04761904761905,1.4907125932385952,0.1666666666666733,2.7755575615628914e-17,0.008670481772551693,0.012840545357428611,0.00452802729874809 +500160.0,146.05,-0.12392866251283806,0.015065750107169151,0.44713079929351807,0.4328656866280653,-0.9596294373419368,0.29744949340820315,-0.03844324619346935,100.0,nan,nan,nan,nan,nan,nan 
+550176.0,301.59,-0.08747619834875162,0.018616359680891037,0.37519359588623047,0.5838327410070633,-0.8729762786038918,0.255859375,-0.031876627635330126,95.36082474226804,4.728542096092001,0.5666666666666667,0.033333333333333354,0.015766650984737052,0.016415650069135708,0.0073432026034093096 +600192.0,610.11,-0.06346287141662979,0.018616359680891037,0.37519359588623047,0.5838327410070633,-0.8729762786038918,0.255859375,-0.031876627635330126,64.38356164383562,3.4110208395137644,0.3153846153846211,0.05000000000000001,0.005834419248442863,0.009750803518568309,0.009722910700441716 +650208.0,490.45,-0.0649916728310735,0.018616359680891037,0.37519359588623047,0.5838327410070633,-0.8729762786038918,0.255859375,-0.031876627635330126,77.67857142857143,3.5421853048862677,0.3440000000000025,0.04000000000000001,0.007386230358354387,0.01002042708268473,0.011942566299757918 +700224.0,696.2,-0.050718706736569946,0.02787187695503235,0.27253812551498413,0.3213071395739462,-0.7796651521144667,0.23056564331054688,-0.02318568377923782,17.02127659574468,2.813415246040654,0.22564102564102465,0.030769230769230778,0.004101302519351899,0.009272282458762085,0.007717000969977736 +750240.0,975.2,-0.0365592386997539,0.02787187695503235,0.27253812551498413,0.3213071395739462,-0.7796651521144667,0.23056564331054688,-0.02318568377923782,1.7543859649122806,2.883334765180094,0.23214285714285632,0.04642857142857145,0.005984204295087547,0.01212335747515099,0.006638668636589977 +800256.0,997.95,-0.032674067207775936,0.03915299475193024,-0.02077627182006836,0.012306372171440216,-0.6732232532667695,0.23027191162109376,-0.029105521852523,0.0,2.7162943783482523,0.20222222222222347,0.04666666666666669,0.005422123318132467,0.01160208760450827,0.006972840684496933 
+850272.0,1000.0,-0.027276230360000004,0.03915299475193024,-0.02077627182006836,0.012306372171440216,-0.6732232532667695,0.23027191162109376,-0.029105521852523,0.0,2.052195192767916,0.13018867924528343,0.024528301886792454,0.0025393901492552955,0.006744829968898374,0.006338034743704137 +900288.0,1000.0,-0.02209810995,0.03915299475193024,-0.02077627182006836,0.012306372171440216,-0.6732232532667695,0.23027191162109376,-0.029105521852523,0.0,2.5378078995513853,0.1638297872340425,0.05106382978723406,0.0027162281963888143,0.00938819475087192,0.005218789779940309 +950304.0,1000.0,-0.020161132100000003,0.02388637885451317,0.17505329847335815,0.002786001518640546,-0.5848989197591437,0.1781951904296875,-0.022498882361577443,0.0,1.8448062826506997,0.09999999999999902,0.025490196078431372,0.0033167737397342733,0.008511701598648013,0.004852176091888853 +1000320.0,1000.0,-0.017781325940000002,0.02388637885451317,0.17505329847335815,0.002786001518640546,-0.5848989197591437,0.1781951904296875,-0.022498882361577443,0.0,2.1311931804234465,0.11224489795918366,0.03673469387755103,0.0028763564152153977,0.008577155240476232,0.004624385511282446 +1050336.0,1000.0,-0.01591922581,0.01924017444252968,0.2585676312446594,0.001610120572989615,-0.499537658522604,0.1507904052734375,-0.01832941553702767,0.0,1.770084687620738,0.09000000000000002,0.028000000000000004,0.0031931306262494236,0.00790967745501862,0.00421411422338636 +1100352.0,1000.0,-0.014390704930000001,0.01924017444252968,0.2585676312446594,0.001610120572989615,-0.499537658522604,0.1507904052734375,-0.01832941553702767,0.0,1.7156306716668641,0.07592592592592597,0.029629629629629634,0.0022232079215487835,0.008486662411553597,0.004646760231274988 +1150368.0,1000.0,-0.012891692739999999,0.01924017444252968,0.2585676312446594,0.001610120572989615,-0.499537658522604,0.1507904052734375,-0.01832941553702767,0.0,1.640543483722282,0.06888888888888955,0.02222222222222222,0.0019073827305267623,0.0067854703438407805,0.004583394339656286 
+1200384.0,1000.0,-0.01185883109,0.03328366205096245,0.37704187631607056,0.0010814237088752066,-0.391586305416422,0.1420928955078125,-0.016412486219090285,0.0,1.3095501744436104,0.05636363636363639,0.019999999999999997,0.0020898134411196493,0.006464605483212984,0.0044402464347186475 +1250400.0,1000.0,-0.01035696864,0.03328366205096245,0.37704187631607056,0.0010814237088752066,-0.391586305416422,0.1420928955078125,-0.016412486219090285,0.0,1.17643510763419,0.04888888888888892,0.017777777777777778,0.0017789028696737162,0.006560453334505577,0.004000733778836974 +1300416.0,1000.0,-0.00907007091,0.03328366205096245,0.37704187631607056,0.0010814237088752066,-0.391586305416422,0.1420928955078125,-0.016412486219090285,0.0,1.0385018941753243,0.04528301886792455,0.01320754716981132,0.0011719080934510348,0.005187898521531051,0.0036571174158591953 +1350432.0,1000.0,-0.00807399494,0.02287968620657921,0.3780140280723572,0.0009499632345566055,-0.28530496663734084,0.11873855590820312,-0.015324374535691732,0.0,0.6318497692855165,0.025531914893617034,0.014893617021276595,0.0010422591705828452,0.004620307254797262,0.0026536232495921884 +1400448.0,1000.0,-0.005566165089999999,0.02287968620657921,0.3780140280723572,0.0009499632345566055,-0.28530496663734084,0.11873855590820312,-0.015324374535691732,0.0,0.41454157920011275,0.01568627450980393,0.00784313725490196,0.00041805897856900877,0.0023437640849933096,0.0021977234735222934 +1450464.0,1000.0,-0.004376987319999999,0.006883770227432251,0.4654079079627991,0.0008868903505984349,-0.13835863925305603,0.04235916137695313,-0.006809767247921173,0.0,0.5196529168936082,0.024074074074074085,0.007407407407407408,0.001008746510908701,0.0036767539028265054,0.0018538891127177523 
+1500480.0,1000.0,-0.0035960796799999997,0.006883770227432251,0.4654079079627991,0.0008868903505984349,-0.13835863925305603,0.04235916137695313,-0.006809767247921173,0.0,0.1787205319344614,0.004545454545454548,0.004545454545454546,0.00022045958839699053,0.0013702983780753085,0.0008788292639804286 +1550496.0,1000.0,-0.00235174342,0.006883770227432251,0.4654079079627991,0.0008868903505984349,-0.13835863925305603,0.04235916137695313,-0.006809767247921173,0.0,0.11058989704919352,0.003448275862068973,0.001724137931034483,0.0005993941594518653,0.0015068250072551863,0.00103858342316817 +1600512.0,1000.0,-0.00212690331,0.003966468386352062,0.6826978921890259,0.000598131096125698,-0.05299410087582146,0.013033294677734375,-0.0022057539085210466,0.0,2.7613401515740163e-08,6.276544940352447e-16,6.30808536718839e-19,1.6996845763093804e-05,0.00023545928928795948,0.0002457315159111487 +1650528.0,1000.0,-0.00168055121,0.003966468386352062,0.6826978921890259,0.000598131096125698,-0.05299410087582146,0.013033294677734375,-0.0022057539085210466,0.0,0.06387944461746765,0.001851851851851856,0.001851851851851852,6.469258012543712e-05,0.0003710918520681102,0.00011912318154194279 +1700544.0,1000.0,-0.0016868369999999998,0.003966468386352062,0.6826978921890259,0.000598131096125698,-0.05299410087582146,0.013033294677734375,-0.0022057539085210466,0.0,0.04347826594000144,0.0021739130434782657,0.0,7.402042810352667e-05,0.0002449394405600115,0.00012087025391529215 +1750560.0,1000.0,-0.00196965733,0.0017765987431630492,0.7451580762863159,0.0004606888382699736,-0.045917467905883316,0.01319732666015625,-0.00260445245876042,0.0,0.10479788654567714,0.0038461538461538533,0.0019230769230769232,1.6973420200406813e-05,0.0003427081104426649,0.00025485172370022833 
+1800576.0,1000.0,-0.0017873054799999997,0.0017765987431630492,0.7451580762863159,0.0004606888382699736,-0.045917467905883316,0.01319732666015625,-0.00260445245876042,0.0,8.14400089321202e-09,7.517135062566164e-18,0.0,3.6154366495466124e-06,5.5161261429923854e-05,0.00017667231364705086 +1850592.0,1000.0,-0.0017142213899999997,0.0007220996776595712,0.7913888692855835,0.00044181304983030855,-0.038861704630869555,0.007444000244140625,-0.00241237116819093,0.0,8.663202437012572e-09,7.216449660063518e-18,0.0,3.6955059578737364e-05,0.00015253073903685928,0.00015130952454827095 +1900608.0,1000.0,-0.0019000904899999999,0.0007220996776595712,0.7913888692855835,0.00044181304983030855,-0.038861704630869555,0.007444000244140625,-0.00241237116819093,0.0,0.07407408656048098,0.0037037037037037164,0.0,0.00019967084437329228,0.0003312861760346282,0.00017773692191868374 +1950624.0,1000.0,-0.00171813307,0.0007220996776595712,0.7913888692855835,0.00044181304983030855,-0.038861704630869555,0.007444000244140625,-0.00241237116819093,0.0,0.07665533502185048,0.0022222222222222287,0.0022222222222222222,7.616705605391328e-05,0.00041325251433432455,0.00010782571279200286 +2000640.0,1000.0,-0.0015574690099999997,0.000964311184361577,0.860156774520874,0.0003305897440782246,-0.028766337589337353,0.007176971435546875,-0.002311596508118896,0.0,0.0363636448450933,0.001818181818181825,0.0,1.2659793300785207e-05,9.289811592695382e-05,9.4306890837577e-05 +2050656.0,1000.0,-0.0015864925599999998,0.000964311184361577,0.860156774520874,0.0003305897440782246,-0.028766337589337353,0.007176971435546875,-0.002311596508118896,0.0,5.0704362089538465e-09,4.2236745502044e-18,0.0,9.71681781494617e-09,2.869042924239662e-06,1.3209885456581986e-05 
+2100672.0,1000.0,-0.0015059682000000002,0.0018593529239296913,0.8906139135360718,0.0002297365492049952,-0.016179356939629484,0.00578460693359375,-0.002048992766947322,0.0,5.766924699194759e-09,4.803849625781927e-18,0.0,2.8340537373943037e-08,5.154189583419286e-06,5.109655032053107e-06 +2150688.0,1000.0,-0.0013525365500000003,0.0018593529239296913,0.8906139135360718,0.0002297365492049952,-0.016179356939629484,0.00578460693359375,-0.002048992766947322,0.0,8.330002343281319e-09,6.938893903907228e-18,0.0,2.8226413650524457e-08,4.2463029889601505e-06,1.7361336514717014e-06 +2200704.0,1000.0,-0.0011205176799999998,0.0018593529239296913,0.8906139135360718,0.0002297365492049952,-0.016179356939629484,0.00578460693359375,-0.002048992766947322,0.0,5.331201499700044e-09,4.440892098500626e-18,0.0,2.040843920915837e-07,9.197044259730577e-06,2.859783671871341e-06 +2250720.0,1000.0,-0.0010696669300000001,0.0007375007262453437,0.9123029112815857,0.00012378766457079581,-0.00678328723820556,0.00286865234375,-0.0009284447107546611,0.0,6.1703721061343105e-09,5.13992141030165e-18,0.0,3.087332958998079e-10,5.84580065597916e-07,4.995501998728002e-07 +2300736.0,1000.0,-0.0011720584300000001,0.0007375007262453437,0.9123029112815857,0.00012378766457079581,-0.00678328723820556,0.00286865234375,-0.0009284447107546611,0.0,6.8154564626847155e-09,5.677276830469551e-18,0.0,4.043011082317254e-12,2.478522523682503e-07,1.6018754418882383e-07 +2350752.0,1000.0,-0.0011981594899999998,0.0007375007262453437,0.9123029112815857,0.00012378766457079581,-0.00678328723820556,0.00286865234375,-0.0009284447107546611,0.0,5.170346282036681e-09,4.306899664494142e-18,0.0,2.593745785422763e-12,1.9278702142964996e-07,2.04778685326764e-07 
+2400768.0,1000.0,-0.0010133495000000001,0.0007231628405861557,0.9340035915374756,7.912878196284223e-05,-0.004181789937952729,0.001935577392578125,-0.0007137649957769554,0.0,3.786364701491509e-09,3.1540426835941946e-18,0.0,1.5996336308791187e-06,2.7210819360611395e-05,1.7651641757780595e-05 +2450784.0,1000.0,-0.0008643826399999998,0.0007231628405861557,0.9340035915374756,7.912878196284223e-05,-0.004181789937952729,0.001935577392578125,-0.0007137649957769554,0.0,3.0851860530671553e-09,2.569960705150825e-18,0.0,2.544879031463584e-12,1.881342047311474e-07,1.7482311393596856e-07 +2500800.0,1000.0,-0.00087803251,0.00029180062119849026,0.9209325313568115,5.017210141056577e-05,-0.002349705684543224,0.001113128662109375,-0.00020664185842349524,0.0,3.6217401492527474e-09,3.0169103930031427e-18,0.0,1.2120326362958974e-10,4.0554322310925027e-07,5.091177679632423e-07 +2550816.0,1000.0,-0.0009482528899999998,0.00029180062119849026,0.9209325313568115,5.017210141056577e-05,-0.002349705684543224,0.001113128662109375,-0.00020664185842349524,0.0,4.48538587715148e-09,3.736327486719277e-18,0.0,3.365917137180003e-06,3.614831847121907e-05,2.71921524823522e-06 +2600832.0,1000.0,-0.0009755043699999999,0.00029180062119849026,0.9209325313568115,5.017210141056577e-05,-0.002349705684543224,0.001113128662109375,-0.00020664185842349524,0.0,4.48538587715148e-09,3.736327486719277e-18,0.0,1.752594037104947e-12,1.6696225514512327e-07,1.7229298240243415e-07 +2650848.0,1000.0,-0.0011298026,0.00027062674053013325,0.9644361734390259,3.755899111464594e-05,-0.002178665793544887,0.0013092041015625,-0.0002709305666900974,0.0,9.416524388057143e-09,7.84396702180817e-18,0.0,2.5114493823947756e-12,1.94550159788363e-07,1.7725517396866733e-07 
+2700864.0,1000.0,-0.0010840192,0.00027062674053013325,0.9644361734390259,3.755899111464594e-05,-0.002178665793544887,0.0013092041015625,-0.0002709305666900974,0.0,3.7022232636805866e-09,3.0839528461809902e-18,0.0,1.4368284917715984e-12,1.467174010038082e-07,1.7545946413484087e-07 +2750880.0,1000.0,-0.0009173338999999999,0.00027062674053013325,0.9644361734390259,3.755899111464594e-05,-0.002178665793544887,0.0013092041015625,-0.0002709305666900974,0.0,6.664001874625055e-09,5.551115123125783e-18,0.0,1.9251117858874953e-12,1.7252620909185446e-07,1.5155985565821577e-07 +2800896.0,1000.0,-0.0009629556000000002,0.00040336561505682766,0.972762942314148,2.92528252869996e-05,-0.0018657009343957752,0.001380157470703125,-0.0004034915850418486,0.0,4.1650011716406594e-09,3.469446951953614e-18,0.0,1.7430360771881853e-12,1.518477707231133e-07,1.6943412676059415e-07 +2850912.0,1000.0,-0.00095319835,0.00040336561505682766,0.972762942314148,2.92528252869996e-05,-0.0018657009343957752,0.001380157470703125,-0.0004034915850418486,0.0,5.0704362089538465e-09,4.2236745502044e-18,0.0,1.5369438662502213e-12,1.572062648359645e-07,1.3245528148693755e-07 +2900928.0,1000.0,-0.00100041446,0.00039656879380345345,0.9786955714225769,2.3004513134183834e-05,-0.0025202318951417003,0.0013824462890625,4.974618643167616e-05,0.0,4.573334619840724e-09,3.8095888099882826e-18,0.0,1.9807226588357077e-09,1.0553957170612618e-06,1.6978405776726772e-07 +2950944.0,1000.0,-0.0009233834999999999,0.00039656879380345345,0.9786955714225769,2.3004513134183834e-05,-0.0025202318951417003,0.0013824462890625,4.974618643167616e-05,0.0,2.7766674477604396e-09,2.3129646346357427e-18,0.0,1.6936000337394168e-12,1.6401597663839692e-07,1.7632007355087667e-07 
+3000960.0,1000.0,-0.00088806341,0.00039656879380345345,0.9786955714225769,2.3004513134183834e-05,-0.0025202318951417003,0.0013824462890625,4.974618643167616e-05,0.0,3.998401124775033e-09,3.3306690738754695e-18,0.0,2.031863194746852e-12,1.721019811707389e-07,1.591327583660984e-07 diff --git a/models/hybrid_cv01_disc_s5775_play_results.csv b/models/hybrid_cv01_disc_s5775_play_results.csv new file mode 100644 index 0000000..4e298e7 --- /dev/null +++ b/models/hybrid_cv01_disc_s5775_play_results.csv @@ -0,0 +1,101 @@ +start_speed,ep_steps,ep_reward,terminated,truncated,no_crash,t_min_start,t_min_min,t_min_final,t_min_mean_last100,t_min_settle_step,x_pos_final,x_vel_final,theta_final,theta_dot_final,energy_final,acc_final +-10.0,1000,-263.85574747223137,False,True,True,0.0,3.3320009373125275e-08,1.0215721431767476e-06,9.94179941267175e-07,83,-2.609024107869118e-14,-2.7755575615628914e-17,3.141592566701839,7.834538537786623e-07,3.0689997049814436e-11,0.0 +-9.8,1000,-251.32044353309178,False,True,True,0.0,3.3320009373125275e-08,8.815645848363065e-08,8.815645848363065e-08,73,1.942890293094024e-16,0.0,3.1415923392494536,-2.3710087947245643e-07,2.810841352330616e-12,0.0 +-9.6,1000,-237.05024466461745,False,True,True,0.0,3.3320009373125275e-08,1.024285486980737e-06,9.969685667629053e-07,87,2.6229018956769323e-14,2.7755575615628914e-17,3.1415926088863095,-5.493220913437721e-07,1.508773800176231e-11,0.0 +-9.4,1000,-223.22849300951955,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,73,2.7755575615628914e-17,0.0,3.1415924930993757,3.340926831011341e-08,5.5808960450857416e-14,0.0 +-9.2,1000,-212.58641824070068,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,74,2.7755575615628914e-17,0.0,3.1415924055850444,5.478534768193192e-07,1.5007171603150812e-11,0.0 
+-9.0,1000,-201.65029472306782,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,79,2.7755575615628914e-17,0.0,3.1415921737353827,-1.5322400584321946e-07,1.1738797983321473e-12,0.0 +-8.8,1000,-190.39831140920288,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,72,8.326672684688674e-17,0.0,3.141592137183393,-1.6304000117440177e-07,1.3291020991474465e-12,0.0 +-8.6,1000,-180.86572185870347,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,79,8.326672684688674e-17,0.0,3.1415921250710017,-6.809210495124395e-07,2.3182673783456103e-11,0.0 +-8.4,1000,-171.2841624434823,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,84,8.326672684688674e-17,0.0,3.141592653589793,5.93034567230262e-07,1.7584499896499206e-11,0.0 +-8.2,1000,-162.08163184009518,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,80,8.326672684688674e-17,0.0,3.1415921165274674,-5.141642216634824e-07,1.3218242341940733e-11,0.0 +-8.0,1000,-153.08527074396162,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,67,8.326672684688674e-17,0.0,3.141592446576246,6.00000635464729e-07,1.8000038127903932e-11,0.0 +-7.8,1000,-140.59910941356415,False,True,True,0.0,0.0,0.0,0.0,64,0.0,0.0,3.1415923154286314,-7.060908819493922e-07,2.492821667860353e-11,0.0 +-7.6,1000,-131.17615667051842,False,True,True,0.0,3.3320009373125275e-08,1.0329202908444396e-06,1.005840162182343e-06,67,2.6673108166619386e-14,2.7755575615628914e-17,3.1415923174042732,6.130749961545081e-07,1.8793047545662666e-11,0.0 +-7.4,1000,-121.46942689353187,False,True,True,0.0,0.0,0.0,0.0,84,0.0,0.0,3.1415924514606086,3.9203362795270335e-07,7.684518272287931e-12,0.0 +-7.2,1000,-113.34901678042955,False,True,True,0.0,0.0,0.0,0.0,61,0.0,0.0,3.1415925536297653,6.022195986100948e-07,1.813342224750519e-11,0.0 
+-7.0,1000,-105.76746141165656,False,True,True,0.0,0.0,0.0,0.0,72,0.0,0.0,3.1415920620305844,-2.008723512602058e-07,2.017485075040175e-12,0.0 +-6.8,1000,-98.29787516907747,False,True,True,0.0,0.0,0.0,0.0,63,0.0,0.0,3.1415924627618206,-3.444958271323113e-07,5.933868745578765e-12,0.0 +-6.6,1000,-91.27002999954796,False,True,True,0.0,0.0,0.0,0.0,59,0.0,0.0,3.1415925998628924,-4.995864015610795e-07,1.2479328631237408e-11,0.0 +-6.4,1000,-83.18865498266696,False,True,True,0.0,0.0,0.0,0.0,58,0.0,0.0,3.1415924251593177,4.5655352420720567e-07,1.0422056023300976e-11,0.0 +-6.2,1000,-76.6486806865716,False,True,True,0.0,0.0,0.0,0.0,50,0.0,0.0,3.141592483038208,4.7414972482229875e-08,1.124089807745308e-13,0.0 +-6.0,1000,-68.22081046832346,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,60,-2.7755575615628914e-17,0.0,3.1415922427931284,-1.13279552228928e-07,6.416128476593214e-13,0.0 +-5.8,1000,-62.36552392226412,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,47,-2.7755575615628914e-17,0.0,3.141592384542323,-3.0481525522661e-07,4.64561699094317e-12,0.0 +-5.6,1000,-56.77744998498565,False,True,True,0.0,0.0,0.0,0.0,52,0.0,0.0,3.1415925617328666,3.375050842143215e-07,5.695484093525812e-12,0.0 +-5.4,1000,-53.367171992796266,False,True,True,0.0,1.9992005651630742e-07,1.0462695173143489e-06,1.019547345997927e-06,50,-2.736699755701011e-14,-2.7755575615628914e-17,3.1415920690158914,-5.929421352270015e-08,1.757901878802353e-13,0.0 +-5.2,1000,-46.18676538185041,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,63,-2.7755575615628914e-17,0.0,3.1415921826093687,3.299598689751041e-07,5.4436757567033926e-12,0.0 +-5.0,1000,-41.174267152694654,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,57,-2.7755575615628914e-17,0.0,3.141592214068683,4.513563614172761e-07,1.0186128249592137e-11,0.0 
+-4.8,1000,-36.67601840316016,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,41,2.7755575615628914e-17,0.0,3.1415924559035506,7.664370818161209e-08,2.937129001914056e-13,0.0 +-4.6,1000,-32.95500004888273,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,55,2.7755575615628914e-17,0.0,3.141592611442945,-3.663799587389293e-07,6.711713708276976e-12,0.0 +-4.4,1000,-28.90327723409404,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,35,-2.7755575615628914e-17,0.0,3.1415926088863095,-1.5249081755185682e-07,1.1626724718816843e-12,0.0 +-4.2,1000,-25.798323309287273,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,39,-2.7755575615628914e-17,0.0,3.1415925068303756,-5.052527980906672e-07,1.2764019498922428e-11,0.0 +-4.0,1000,-21.718605371804756,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,39,8.326672684688674e-17,0.0,3.1415925921507317,4.7413091039604856e-07,1.1240006009649291e-11,0.0 +-3.8,1000,-18.82903181001733,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,35,8.326672684688674e-17,0.0,3.141592537208004,1.1729613756464843e-07,6.879191943792465e-13,0.0 +-3.6,1000,-16.403903392774144,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,30,8.326672684688674e-17,0.0,3.1415925271492475,-4.2531059553830974e-07,9.044455133857585e-12,0.0 +-3.4,1000,-14.272509452615306,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,44,8.326672684688674e-17,0.0,3.1415923570604294,-5.316679372616335e-08,1.4133539775602013e-13,0.0 +-3.2,1000,-12.061991849363068,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,29,8.326672684688674e-17,0.0,3.141592653589793,1.5964535081448501e-07,1.2743319018339993e-12,0.0 
+-3.0,1000,-10.352451854072918,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,40,8.326672684688674e-17,0.0,3.1415922501561613,7.947865265170605e-08,3.1584281136652704e-13,0.0 +-2.8,1000,-8.770299727093134,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,43,8.326672684688674e-17,0.0,3.1415925271492475,-3.08942747563108e-07,4.772281063592114e-12,0.0 +-2.6,1000,-7.3963598139491005,False,True,True,0.0,1.1050996937933735e-07,1.0430812838468917e-06,1.0162744992826952e-06,31,-2.7200464103316335e-14,-2.7755575615628914e-17,3.141592344233648,-2.603119662799894e-07,3.388115989499968e-12,0.0 +-2.4,1000,-5.558346283479853,False,True,True,0.0,0.0,0.0,0.0,23,0.0,0.0,3.1415924217821263,1.643214647641864e-07,1.3500771891123877e-12,0.0 +-2.2,1000,-4.49328895227065,False,True,True,0.0,0.0,0.0,0.0,22,0.0,0.0,3.1415923015942115,-9.435368375411147e-08,4.4513088189854403e-13,0.0 +-2.0,1000,-3.5773737635742893,False,True,True,0.0,0.0,0.0,0.0,23,0.0,0.0,3.1415924276024945,2.270125850026493e-07,2.576735687479254e-12,0.0 +-1.8,1000,-2.8105005401157594,False,True,True,0.0,0.0,0.0,0.0,19,0.0,0.0,3.1415923041265867,-3.639228231397982e-08,6.62199106010204e-14,0.0 +-1.6,1000,-2.0009676079785805,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,31,-2.7755575615628914e-17,0.0,3.141592468671246,-1.8615727735939835e-07,1.7327265956931982e-12,0.0 +-1.4,1000,-1.366833202258477,False,True,True,0.0,0.0,0.0,0.0,19,0.0,0.0,3.1415926064681843,-4.492444918927752e-08,1.0091030674799889e-13,0.0 +-1.2,1000,-1.1745557026588718,False,True,True,0.0,0.0,0.0,0.0,28,0.0,0.0,3.1415924823884853,1.7098692149421286e-07,1.4618263661034055e-12,0.0 +-1.0,1000,-0.7031506838428273,False,True,True,0.0,0.0,0.0,0.0,22,0.0,0.0,3.1415925228326342,-9.720656708985042e-08,4.724558342696795e-13,0.0 
+-0.8,1000,-0.4817488273497912,False,True,True,0.0,0.0,0.0,0.0,28,0.0,0.0,3.141592413315789,5.229995441251419e-08,1.3676426157755311e-13,0.0 +-0.6,1000,-0.2717892762488744,False,True,True,0.0,0.0,0.0,0.0,6,0.0,0.0,3.141592611442945,-2.999193535243144e-07,4.497580930922133e-12,0.0 +-0.4,1000,-0.22322414823933717,False,True,True,0.0,0.0,0.0,0.0,13,0.0,0.0,3.1415925083512395,1.0606225282229542e-07,5.624600736870256e-13,0.0 +-0.2,1000,-0.08525909107377136,False,True,True,0.0,0.0,0.0,0.0,6,0.0,0.0,3.141592509888199,-5.408643939226397e-08,1.462671463066522e-13,0.0 +0.2,1000,-0.08525909107377136,False,True,True,0.0,0.0,0.0,0.0,6,0.0,0.0,3.141592509888199,-5.408643939226397e-08,1.462671463066522e-13,0.0 +0.4,1000,-0.22322414823933717,False,True,True,0.0,0.0,0.0,0.0,13,0.0,0.0,3.1415925083512395,1.0606225282229542e-07,5.624600736870256e-13,0.0 +0.6,1000,-0.2717892762488744,False,True,True,0.0,0.0,0.0,0.0,6,0.0,0.0,3.141592611442945,-2.999193535243144e-07,4.497580930922133e-12,0.0 +0.8,1000,-0.4817488273497912,False,True,True,0.0,0.0,0.0,0.0,28,0.0,0.0,3.141592413315789,5.229995441251419e-08,1.3676426157755311e-13,0.0 +1.0,1000,-0.7031506838428273,False,True,True,0.0,0.0,0.0,0.0,22,0.0,0.0,3.1415925228326342,-9.720656708985042e-08,4.724558342696795e-13,0.0 +1.2,1000,-1.1745557026588718,False,True,True,0.0,0.0,0.0,0.0,28,0.0,0.0,3.1415924823884853,1.7098692149421286e-07,1.4618263661034055e-12,0.0 +1.4,1000,-1.366833202258477,False,True,True,0.0,0.0,0.0,0.0,19,0.0,0.0,3.1415926064681843,-4.492444918927752e-08,1.0091030674799889e-13,0.0 +1.6,1000,-2.0009676079785805,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,31,-2.7755575615628914e-17,0.0,3.141592468671246,-1.8615727735939835e-07,1.7327265956931982e-12,0.0 +1.8,1000,-2.8105005401157594,False,True,True,0.0,0.0,0.0,0.0,19,0.0,0.0,3.1415923041265867,-3.639228231397982e-08,6.62199106010204e-14,0.0 
+2.0,1000,-3.5773737635742893,False,True,True,0.0,0.0,0.0,0.0,23,0.0,0.0,3.1415924276024945,2.270125850026493e-07,2.576735687479254e-12,0.0 +2.2,1000,-4.49328895227065,False,True,True,0.0,0.0,0.0,0.0,22,0.0,0.0,3.1415923015942115,-9.435368375411147e-08,4.4513088189854403e-13,0.0 +2.4,1000,-5.558346283479853,False,True,True,0.0,0.0,0.0,0.0,23,0.0,0.0,3.1415924217821263,1.643214647641864e-07,1.3500771891123877e-12,0.0 +2.6,1000,-7.3963598139491005,False,True,True,0.0,1.1050996937933735e-07,1.0430812838468917e-06,1.0162744992826952e-06,31,-2.7200464103316335e-14,-2.7755575615628914e-17,3.141592344233648,-2.603119662799894e-07,3.388115989499968e-12,0.0 +2.8,1000,-8.770299727093134,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,43,8.326672684688674e-17,0.0,3.1415925271492475,-3.08942747563108e-07,4.772281063592114e-12,0.0 +3.0,1000,-10.352451854072918,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,40,8.326672684688674e-17,0.0,3.1415922501561613,7.947865265170605e-08,3.1584281136652704e-13,0.0 +3.2,1000,-12.061991849363068,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,29,8.326672684688674e-17,0.0,3.141592653589793,1.5964535081448501e-07,1.2743319018339993e-12,0.0 +3.4,1000,-14.272509452615306,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,44,8.326672684688674e-17,0.0,3.1415923570604294,-5.316679372616335e-08,1.4133539775602013e-13,0.0 +3.6,1000,-16.403903392774144,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,30,8.326672684688674e-17,0.0,3.1415925271492475,-4.2531059553830974e-07,9.044455133857585e-12,0.0 +3.8,1000,-18.82903181001733,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,35,8.326672684688674e-17,0.0,3.141592537208004,1.1729613756464843e-07,6.879191943792465e-13,0.0 
+4.0,1000,-21.718605371804756,False,True,True,0.0,5.7711949142924205e-08,5.7711949142924205e-08,5.771194914292422e-08,39,8.326672684688674e-17,0.0,3.1415925921507317,4.7413091039604856e-07,1.1240006009649291e-11,0.0 +4.2,1000,-25.798323309287273,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,39,-2.7755575615628914e-17,0.0,3.1415925068303756,-5.052527980906672e-07,1.2764019498922428e-11,0.0 +4.4,1000,-28.90327723409404,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,35,-2.7755575615628914e-17,0.0,3.1415926088863095,-1.5249081755185682e-07,1.1626724718816843e-12,0.0 +4.6,1000,-32.95500004888273,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,55,2.7755575615628914e-17,0.0,3.141592611442945,-3.663799587389293e-07,6.711713708276976e-12,0.0 +4.8,1000,-36.67601840316016,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,41,2.7755575615628914e-17,0.0,3.1415924559035506,7.664370818161209e-08,2.937129001914056e-13,0.0 +5.0,1000,-41.174267152694654,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,57,-2.7755575615628914e-17,0.0,3.141592214068683,4.513563614172761e-07,1.0186128249592137e-11,0.0 +5.2,1000,-46.18676538185041,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,63,-2.7755575615628914e-17,0.0,3.1415921826093687,3.299598689751041e-07,5.4436757567033926e-12,0.0 +5.4,1000,-53.367171992796266,False,True,True,0.0,1.9992005651630742e-07,1.0462695173143489e-06,1.019547345997927e-06,50,-2.736699755701011e-14,-2.7755575615628914e-17,3.1415920690158914,-5.929421352270015e-08,1.757901878802353e-13,0.0 +5.6,1000,-56.77744998498565,False,True,True,0.0,0.0,0.0,0.0,52,0.0,0.0,3.1415925617328666,3.375050842143215e-07,5.695484093525812e-12,0.0 
+5.8,1000,-62.36552392226412,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,47,-2.7755575615628914e-17,0.0,3.141592384542323,-3.0481525522661e-07,4.64561699094317e-12,0.0 +6.0,1000,-68.22081046832346,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,60,-2.7755575615628914e-17,0.0,3.1415922427931284,-1.13279552228928e-07,6.416128476593214e-13,0.0 +6.2,1000,-76.6486806865716,False,True,True,0.0,0.0,0.0,0.0,50,0.0,0.0,3.141592483038208,4.7414972482229875e-08,1.124089807745308e-13,0.0 +6.4,1000,-83.18865498266696,False,True,True,0.0,0.0,0.0,0.0,58,0.0,0.0,3.1415924251593177,4.5655352420720567e-07,1.0422056023300976e-11,0.0 +6.6,1000,-91.27002999954796,False,True,True,0.0,0.0,0.0,0.0,59,0.0,0.0,3.1415925998628924,-4.995864015610795e-07,1.2479328631237408e-11,0.0 +6.8,1000,-98.29787516907747,False,True,True,0.0,0.0,0.0,0.0,63,0.0,0.0,3.1415924627618206,-3.444958271323113e-07,5.933868745578765e-12,0.0 +7.0,1000,-105.76746141165656,False,True,True,0.0,0.0,0.0,0.0,72,0.0,0.0,3.1415920620305844,-2.008723512602058e-07,2.017485075040175e-12,0.0 +7.2,1000,-113.34901678042955,False,True,True,0.0,0.0,0.0,0.0,61,0.0,0.0,3.1415925536297653,6.022195986100948e-07,1.813342224750519e-11,0.0 +7.4,1000,-121.46942689353187,False,True,True,0.0,0.0,0.0,0.0,84,0.0,0.0,3.1415924514606086,3.9203362795270335e-07,7.684518272287931e-12,0.0 +7.6,1000,-131.17615667051842,False,True,True,0.0,3.3320009373125275e-08,1.0329202908444396e-06,1.005840162182343e-06,67,2.6673108166619386e-14,2.7755575615628914e-17,3.1415923174042732,6.130749961545081e-07,1.8793047545662666e-11,0.0 +7.8,1000,-140.59910941356415,False,True,True,0.0,0.0,0.0,0.0,64,0.0,0.0,3.1415923154286314,-7.060908819493922e-07,2.492821667860353e-11,0.0 +8.0,1000,-153.08527074396162,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,67,8.326672684688674e-17,0.0,3.141592446576246,6.00000635464729e-07,1.8000038127903932e-11,0.0 
+8.2,1000,-162.08163184009518,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,80,8.326672684688674e-17,0.0,3.1415921165274674,-5.141642216634824e-07,1.3218242341940733e-11,0.0 +8.4,1000,-171.2841624434823,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,84,8.326672684688674e-17,0.0,3.141592653589793,5.93034567230262e-07,1.7584499896499206e-11,0.0 +8.6,1000,-180.86572185870347,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,79,8.326672684688674e-17,0.0,3.1415921250710017,-6.809210495124395e-07,2.3182673783456103e-11,0.0 +8.8,1000,-190.39831140920288,False,True,True,0.0,3.3320009373125275e-08,5.7711949142924205e-08,5.771194914292422e-08,72,8.326672684688674e-17,0.0,3.141592137183393,-1.6304000117440177e-07,1.3291020991474465e-12,0.0 +9.0,1000,-201.65029472306782,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,79,2.7755575615628914e-17,0.0,3.1415921737353827,-1.5322400584321946e-07,1.1738797983321473e-12,0.0 +9.2,1000,-212.58641824070068,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,74,2.7755575615628914e-17,0.0,3.1415924055850444,5.478534768193192e-07,1.5007171603150812e-11,0.0 +9.4,1000,-223.22849300951955,False,True,True,0.0,3.3320009373125275e-08,3.3320009373125275e-08,3.332000937312528e-08,73,2.7755575615628914e-17,0.0,3.1415924930993757,3.340926831011341e-08,5.5808960450857416e-14,0.0 +9.6,1000,-237.05024466461745,False,True,True,0.0,3.3320009373125275e-08,1.024285486980737e-06,9.969685667629053e-07,87,2.6229018956769323e-14,2.7755575615628914e-17,3.1415926088863095,-5.493220913437721e-07,1.508773800176231e-11,0.0 +9.8,1000,-251.32044353309178,False,True,True,0.0,3.3320009373125275e-08,8.815645848363065e-08,8.815645848363065e-08,73,1.942890293094024e-16,0.0,3.1415923392494536,-2.3710087947245643e-07,2.810841352330616e-12,0.0 
+10.0,1000,-263.85574747223137,False,True,True,0.0,3.3320009373125275e-08,1.0215721431767476e-06,9.94179941267175e-07,83,-2.609024107869118e-14,-2.7755575615628914e-17,3.141592566701839,7.834538537786623e-07,3.0689997049814436e-11,0.0 diff --git a/models/hybrid_cv01_s42_log.csv b/models/hybrid_cv01_s42_log.csv new file mode 100644 index 0000000..6d1e8e5 --- /dev/null +++ b/models/hybrid_cv01_s42_log.csv @@ -0,0 +1,61 @@ +t,ep_len_mean,rew_per_step,approx_kl,explained_variance,value_loss,entropy_loss,clip_fraction,policy_gradient_loss,rail_hit_pct,mean_t_min,mean_x_pos_abs,mean_x_vel_abs,mean_energy,mean_theta_dot_abs,mean_theta_dev +50016.0,18.45,-0.4897711653116531,nan,nan,nan,nan,nan,nan,100.0,nan,nan,nan,nan,nan,nan +100032.0,21.34,-0.46923887675726333,nan,nan,nan,nan,nan,nan,100.0,nan,nan,nan,nan,nan,nan +150048.0,25.71,-0.4377599766627771,0.012264333665370941,-0.41165149211883545,0.12059976152668242,-1.3833025918807835,0.14634323120117188,-0.014121020150992081,100.0,nan,nan,nan,nan,nan,nan +200064.0,30.15,-0.3685546398009951,0.012264333665370941,-0.41165149211883545,0.12059976152668242,-1.3833025918807835,0.14634323120117188,-0.014121020150992081,100.0,nan,nan,nan,nan,nan,nan +250080.0,24.97,-0.4483150024028834,0.012264333665370941,-0.41165149211883545,0.12059976152668242,-1.3833025918807835,0.14634323120117188,-0.014121020150992081,100.0,nan,nan,nan,nan,nan,nan +300096.0,44.94,-0.2844366059190031,0.021328475326299667,0.4712539315223694,0.16336263767807396,-1.318331574345939,0.23967437744140624,-0.03147981339132002,100.0,nan,nan,nan,nan,nan,nan +350112.0,34.11,-0.3315862574025212,0.021328475326299667,0.4712539315223694,0.16336263767807396,-1.318331574345939,0.23967437744140624,-0.03147981339132002,100.0,nan,nan,nan,nan,nan,nan +400128.0,51.39,-0.2633293273010313,0.020732475444674492,0.4429510831832886,0.2925788702756108,-1.2536049714544788,0.23795700073242188,-0.03152457195174065,100.0,nan,nan,nan,nan,nan,nan 
+450144.0,111.11,-0.16994157906579063,0.020732475444674492,0.4429510831832886,0.2925788702756108,-1.2536049714544788,0.23795700073242188,-0.03152457195174065,100.0,nan,nan,nan,nan,nan,nan +500160.0,85.71,-0.19350481752420956,0.020732475444674492,0.4429510831832886,0.2925788702756108,-1.2536049714544788,0.23795700073242188,-0.03152457195174065,100.0,nan,nan,nan,nan,nan,nan +550176.0,163.76,-0.1411309890693698,0.019123263657093048,0.40122294425964355,0.4057596905273385,-1.1852155870990828,0.21425018310546876,-0.027678548255966006,100.0,nan,nan,nan,nan,nan,nan +600192.0,464.68,-0.10197347426185764,0.019123263657093048,0.40122294425964355,0.4057596905273385,-1.1852155870990828,0.21425018310546876,-0.027678548255966006,80.35714285714286,5.073900623868071,0.6556835199410033,0.07880786347245293,0.010984951660703608,0.016364161090280997,0.011589971977765819 +650208.0,383.27,-0.10759392955357844,0.019123263657093048,0.40122294425964355,0.4057596905273385,-1.1852155870990828,0.21425018310546876,-0.027678548255966006,89.39393939393939,5.040410037931248,0.6278806696745499,0.07021585447780247,0.012107892106935815,0.013148691519824257,0.010167875645450186 +700224.0,601.23,-0.0863803548891439,0.01942143216729164,0.3379531502723694,0.23029101495731083,-1.1101535215624607,0.21035537719726563,-0.02199085417792048,28.846153846153847,5.109270666066513,0.6131834609836592,0.06254650738146914,0.008616754446662158,0.012337819950098496,0.010579784793756661 +750240.0,931.83,-0.0707959010763766,0.01942143216729164,0.3379531502723694,0.23029101495731083,-1.1101535215624607,0.21035537719726563,-0.02199085417792048,7.407407407407407,4.1237279715599815,0.4798549780715096,0.06825957301817832,0.007421330118097899,0.013428628752143951,0.011096223403427379 
+800256.0,964.99,-0.06598312604275693,0.02125580608844757,0.22011643648147583,0.0384666190308053,-1.031843730265973,0.23701934814453124,-0.024634395249063346,7.8431372549019605,3.9647309188425,0.4276921464759399,0.05527389979118755,0.00674917463163323,0.011090831347798934,0.010103116671312036 +850272.0,961.74,-0.05504258680100651,0.02125580608844757,0.22011643648147583,0.0384666190308053,-1.031843730265973,0.23701934814453124,-0.024634395249063346,3.9215686274509802,3.390889548229741,0.3008227837500058,0.05216908954007897,0.006261252888601281,0.011576895881014234,0.0071811950730374914 +900288.0,980.71,-0.0436611149269407,0.02125580608844757,0.22011643648147583,0.0384666190308053,-1.031843730265973,0.23701934814453124,-0.024634395249063346,0.0,3.5651859814170495,0.2882921459472867,0.0600260749714924,0.0055447821393010185,0.010797454722015685,0.008025124420728702 +950304.0,1000.0,-0.03965438012,0.02295587584376335,0.30977189540863037,0.00909779150013037,-0.9541074007313,0.2484283447265625,-0.028428241827060673,0.0,3.085883468145044,0.22499827096114494,0.050176790340955645,0.0042478436373317024,0.010171857525558901,0.007465652894467285 +1000320.0,1000.0,-0.031986893429999996,0.02295587584376335,0.30977189540863037,0.00909779150013037,-0.9541074007313,0.2484283447265625,-0.028428241827060673,0.0,2.5408725226307327,0.1795409343149363,0.04523199480213227,0.002950190731448338,0.00756393970583168,0.006917145251349641 +1050336.0,1000.0,-0.027066319259999996,0.022098083049058914,0.3267143964767456,0.0038306405375237774,-0.8688422658713535,0.21312942504882812,-0.02508080979870897,0.0,2.9031190586909656,0.19641955740948916,0.049036405030546695,0.004469409455537309,0.009822866976151569,0.007727625943529641 
+1100352.0,1000.0,-0.02343461793,0.022098083049058914,0.3267143964767456,0.0038306405375237774,-0.8688422658713535,0.21312942504882812,-0.02508080979870897,0.0,2.2174722743857003,0.10790226858876358,0.04530371948711428,0.0030126759968088766,0.008063436259009491,0.005475349150863233 +1150368.0,1000.0,-0.019130971559999998,0.022098083049058914,0.3267143964767456,0.0038306405375237774,-0.8688422658713535,0.21312942504882812,-0.02508080979870897,0.0,2.3842427691022055,0.12490246679322538,0.044678162611671714,0.0035420529699726542,0.008453275976479362,0.005040169634539941 +1200384.0,1000.0,-0.01785942435,0.019183045253157616,0.3576838970184326,0.0016739373757175713,-0.7716128682834096,0.15691604614257812,-0.017543718150426456,0.0,2.031872860945568,0.09543964238472372,0.04156211465505846,0.0028699799143318285,0.007908933782041473,0.004973611155820351 +1250400.0,1000.0,-0.016707143520000002,0.019183045253157616,0.3576838970184326,0.0016739373757175713,-0.7716128682834096,0.15691604614257812,-0.017543718150426456,0.0,1.9614075135910782,0.08990189656425032,0.042373652407443355,0.001977801635386013,0.008180431752996807,0.0038188987642710196 +1300416.0,1000.0,-0.015550079409999999,0.019183045253157616,0.3576838970184326,0.0016739373757175713,-0.7716128682834096,0.15691604614257812,-0.017543718150426456,0.0,2.284399265932987,0.10949409460557519,0.046310360002331456,0.0020778258939635457,0.007556927719578691,0.0044165575882415984 +1350432.0,1000.0,-0.014918390240000001,0.019417008385062218,0.43330085277557373,0.001055096302738434,-0.6905607186839916,0.16458587646484374,-0.01599897345367367,0.0,1.8741946176132307,0.08248996126565268,0.03593958552068502,0.0014902218863673295,0.005987685126300385,0.003871102983261819 
+1400448.0,1000.0,-0.013476521609999997,0.019417008385062218,0.43330085277557373,0.001055096302738434,-0.6905607186839916,0.16458587646484374,-0.01599897345367367,0.0,1.5999336245918763,0.06492382598945973,0.03643266864089005,0.002005496886464436,0.007080251799011887,0.004150804473040821 +1450464.0,1000.0,-0.012995177270000002,0.017241712659597397,0.38981765508651733,0.0007762854596471414,-0.6152971964678727,0.16675186157226562,-0.015454020399329239,0.0,1.9131545646518775,0.08871510605453296,0.03852888748246745,0.0018068652818139864,0.006599160498574478,0.004404095035571982 +1500480.0,1000.0,-0.01249471908,0.017241712659597397,0.38981765508651733,0.0007762854596471414,-0.6152971964678727,0.16675186157226562,-0.015454020399329239,0.0,1.830000111065836,0.07482095626031403,0.030072748555255192,0.0015894577140468141,0.006477554706601335,0.003978641904636979 +1550496.0,1000.0,-0.0117418388,0.017241712659597397,0.38981765508651733,0.0007762854596471414,-0.6152971964678727,0.16675186157226562,-0.015454020399329239,0.0,1.9006881978759684,0.08281966685592305,0.039186611377597616,0.0020693492005333545,0.008223893762930702,0.004256776475459588 +1600512.0,1000.0,-0.01151136651,0.017015673220157623,0.4750363826751709,0.0006058138856381845,-0.5474783921265043,0.1722259521484375,-0.016044295765914283,0.0,1.810758861163724,0.0724693341711496,0.038757383087649956,0.0017497386685748116,0.0075405263981323835,0.0036891576746471964 +1650528.0,1000.0,-0.010797942040000001,0.017015673220157623,0.4750363826751709,0.0006058138856381845,-0.5474783921265043,0.1722259521484375,-0.016044295765914283,0.0,1.7181713094221343,0.06470161696803292,0.04154741702973843,0.001786877171529338,0.008370364300003255,0.0034621486617374053 
+1700544.0,1000.0,-0.01034402877,0.017015673220157623,0.4750363826751709,0.0006058138856381845,-0.5474783921265043,0.1722259521484375,-0.016044295765914283,0.0,1.6367107585099496,0.06116192779456639,0.03817872910341247,0.0015183198061243044,0.00669313596825809,0.0036133202039161636 +1750560.0,1000.0,-0.01003534025,0.01647292822599411,0.5719795227050781,0.00045051349429527934,-0.47915743988269244,0.16797103881835937,-0.015162974443239819,0.0,1.7708631845723126,0.060674616042530646,0.037604784401494784,0.0013834471141857608,0.0069969057568664865,0.0036803822658851688 +1800576.0,1000.0,-0.009416374980000001,0.01647292822599411,0.5719795227050781,0.00045051349429527934,-0.47915743988269244,0.16797103881835937,-0.015162974443239819,0.0,1.8515390436936199,0.07174416983170616,0.03906736268077554,0.0010031212931882172,0.006434754877076554,0.0034376714743989674 +1850592.0,1000.0,-0.00918603328,0.016165394335985184,0.5994337797164917,0.0003434841986273085,-0.4119152837025467,0.16284637451171874,-0.014296052434951889,0.0,1.5067774501070366,0.04907797530257115,0.03412248886774715,0.0012963779575706968,0.00716818122795649,0.003639962328362126 +1900608.0,1000.0,-0.008647024369999999,0.016165394335985184,0.5994337797164917,0.0003434841986273085,-0.4119152837025467,0.16284637451171874,-0.014296052434951889,0.0,1.6633108144176172,0.053095546719608935,0.03552744392212479,0.0018269575209769414,0.007911514408049113,0.0028642994947153076 +1950624.0,1000.0,-0.00823767218,0.016165394335985184,0.5994337797164917,0.0003434841986273085,-0.4119152837025467,0.16284637451171874,-0.014296052434951889,0.0,1.5474843587083331,0.051767460658432525,0.03556789403276829,0.0013515299990683677,0.006927793185080473,0.0027350932929404714 
+2000640.0,1000.0,-0.00839268505,0.015473511070013046,0.6257394552230835,0.0002577692670223897,-0.34621427980600855,0.15934524536132813,-0.01419348849770472,0.0,1.5429147409963775,0.05401923261244529,0.032625551778804884,0.0014248730849458398,0.006862174205585684,0.0032857320921677417 +2050656.0,1000.0,-0.008128950270000001,0.015473511070013046,0.6257394552230835,0.0002577692670223897,-0.34621427980600855,0.15934524536132813,-0.01419348849770472,0.0,1.593039617256465,0.0478957613669328,0.036751366468146446,0.0012517381849091117,0.007384382349878344,0.002786069728920424 +2100672.0,1000.0,-0.00789556898,0.015155300498008728,0.6970324516296387,0.0001966494882848835,-0.28544679007027296,0.1595550537109375,-0.014389435121654515,0.0,1.5079672243605393,0.0435231108455978,0.04191159723820735,0.0013763404454929695,0.007978263459125353,0.0030971260444051588 +2150688.0,1000.0,-0.007570490589999999,0.015155300498008728,0.6970324516296387,0.0001966494882848835,-0.28544679007027296,0.1595550537109375,-0.014389435121654515,0.0,1.3196084693333678,0.03251449773241625,0.03457030088397648,0.001614853816981046,0.0078451722645716,0.002787303567981035 +2200704.0,1000.0,-0.0071508801799999985,0.015155300498008728,0.6970324516296387,0.0001966494882848835,-0.28544679007027296,0.1595550537109375,-0.014389435121654515,0.0,1.5694112934980726,0.045993785127349016,0.03841884697915292,0.0012611538255556706,0.00784436611524726,0.0030136113694541797 +2250720.0,1000.0,-0.0070856945499999996,0.013699253089725971,0.768794059753418,0.0001502362882940922,-0.22409108184510843,0.13931808471679688,-0.012147467576255622,0.0,1.3836863780592228,0.039893089202588064,0.03600417494935053,0.0012454619527173564,0.007275513701710095,0.0033448807365097878 
+2300736.0,1000.0,-0.006862869270000001,0.013699253089725971,0.768794059753418,0.0001502362882940922,-0.22409108184510843,0.13931808471679688,-0.012147467576255622,0.0,1.3353414826472798,0.034739414433469326,0.03605094004381935,0.001642624290682802,0.008174545257608044,0.0027472310157537314 +2350752.0,1000.0,-0.00662826922,0.013699253089725971,0.768794059753418,0.0001502362882940922,-0.22409108184510843,0.13931808471679688,-0.012147467576255622,0.0,1.3424878521625436,0.03229470628806588,0.037061931514272506,0.001731614174661992,0.008223925683079036,0.0021294171846283983 +2400768.0,1000.0,-0.006472159529999999,0.013629868626594543,0.8025278449058533,0.00011333517746461296,-0.1519794294727035,0.1366424560546875,-0.011575764475958294,0.0,1.418028417619428,0.033240909428327296,0.04054193733353168,0.0013342914421525264,0.008176569925390505,0.0022047280814310975 +2450784.0,1000.0,-0.006276457409999998,0.013629868626594543,0.8025278449058533,0.00011333517746461296,-0.1519794294727035,0.1366424560546875,-0.011575764475958294,0.0,1.5276548629652746,0.03889647840894733,0.0368113869042136,0.0011066858361329895,0.007264753792837644,0.0023711488536960344 +2500800.0,1000.0,-0.006160007049999999,0.013536822982132435,0.8547919392585754,9.121082321597029e-05,-0.08311829594313166,0.13227310180664062,-0.010653127914577,0.0,1.3797507479175908,0.03267009524302697,0.041097583701368415,0.0009446479531555277,0.007556269261343948,0.001936442568352197 +2550816.0,1000.0,-0.006041214979999998,0.013536822982132435,0.8547919392585754,9.121082321597029e-05,-0.08311829594313166,0.13227310180664062,-0.010653127914577,0.0,1.4431419989972192,0.030103105962352605,0.039137997027790666,0.0009812478258334734,0.0074268724481800505,0.0022675061770905244 
+2600832.0,1000.0,-0.005834858079999999,0.013536822982132435,0.8547919392585754,9.121082321597029e-05,-0.08311829594313166,0.13227310180664062,-0.010653127914577,0.0,1.3457807044370884,0.033627800007040014,0.03531916656248344,0.0014954914295345761,0.0077402985815940364,0.0019294011774453291 +2650848.0,1000.0,-0.00581739114,0.012169823050498962,0.8615037202835083,6.809911696645799e-05,-0.024754707340616734,0.12374725341796874,-0.009304166592414731,0.0,1.422918795712752,0.03501549348709251,0.032646579505400924,0.0009017638947422421,0.006554598063102546,0.001765166275369999 +2700864.0,1000.0,-0.005818421089999999,0.012169823050498962,0.8615037202835083,6.809911696645799e-05,-0.024754707340616734,0.12374725341796874,-0.009304166592414731,0.0,1.3300672908125313,0.029071050455488762,0.03441718502161917,0.0015716382206701451,0.008245081466117537,0.002108447619358332 +2750880.0,1000.0,-0.005803946540000001,0.012169823050498962,0.8615037202835083,6.809911696645799e-05,-0.024754707340616734,0.12374725341796874,-0.009304166592414731,0.0,1.2483094191253423,0.027799731656480055,0.03705842346646528,0.001199572092695186,0.007868752123765135,0.0018240021955834886 +2800896.0,1000.0,-0.005766379189999999,0.012385446578264236,0.9048743844032288,5.5109371876760346e-05,0.016511529410490767,0.13313446044921876,-0.010222132940714345,0.0,1.3697234518310033,0.028630714773152127,0.040098662831473605,0.0014266758649439768,0.008591343359022346,0.0019398032572436614 +2850912.0,1000.0,-0.00556432414,0.012385446578264236,0.9048743844032288,5.5109371876760346e-05,0.016511529410490767,0.13313446044921876,-0.010222132940714345,0.0,1.5446985562857365,0.03670531438662125,0.040427990386262505,0.0015808726595403497,0.008703550915727006,0.0018304472908311719 
+2900928.0,1000.0,-0.00560529429,0.011459192261099815,0.885941743850708,4.610644294507082e-05,0.05934766010032035,0.12739410400390624,-0.009988951914704104,0.0,1.3961192113885315,0.030434203935242304,0.037714013753325816,0.0013170289803157466,0.007896180327067528,0.0017844510286382776 +2950944.0,1000.0,-0.00557614474,0.011459192261099815,0.885941743850708,4.610644294507082e-05,0.05934766010032035,0.12739410400390624,-0.009988951914704104,0.0,1.391480442237002,0.03337770091579413,0.03659926489972527,0.0009375277536918043,0.007152710502256251,0.0015052407076502372 +3000960.0,1000.0,-0.0053939904,0.011459192261099815,0.885941743850708,4.610644294507082e-05,0.05934766010032035,0.12739410400390624,-0.009988951914704104,0.0,1.3914872686461734,0.03194591118497559,0.03450239003800294,0.001375277601970016,0.0077417455983553535,0.0017406107247717924 diff --git a/models/hybrid_cv01_s42_play_results.csv b/models/hybrid_cv01_s42_play_results.csv new file mode 100644 index 0000000..655b2bd --- /dev/null +++ b/models/hybrid_cv01_s42_play_results.csv @@ -0,0 +1,101 @@ +start_speed,ep_steps,ep_reward,terminated,truncated,no_crash,t_min_start,t_min_min,t_min_final,t_min_mean_last100,t_min_settle_step,x_pos_final,x_vel_final,theta_final,theta_dot_final,energy_final,acc_final +-10.0,1000,-263.7437488389931,False,True,True,0.0,0.22246334752626565,0.7634171530877322,0.7634170693628076,90,-0.014570142887529284,-2.2351741737731194e-09,3.141592653589793,-1.2516669196055647e-09,1.0880844141057164e-16,-1.30385160446167e-09 +-9.8,1000,-252.04514549548708,False,True,True,0.0,0.1687816346640765,0.7634170994089053,0.7634170864353002,111,-0.014570140838562583,-2.2351741168328946e-09,3.141592653589793,1.4239016181080525e-10,3.290815337081383e-19,-1.30385160446167e-09 
+-9.6,1000,-240.45424708207318,False,True,True,0.0,0.1637385040521618,0.7634170347426733,0.7634170630946467,104,-0.014570139721073645,1.3038515650711111e-09,3.141592653589793,6.185096673426094e-10,2.8042172858238456e-17,4.6566128730773926e-09 +-9.4,1000,-229.61783702644482,False,True,True,0.0,0.17949047128080606,0.7634171312600655,0.7634170545286207,113,-0.014570141769953509,-2.9802322480980583e-09,3.141592653589793,5.61180691291641e-10,3.462592607241318e-18,-1.30385160446167e-09 +-9.2,1000,-218.43588390169342,False,True,True,0.0,0.12444746906986427,0.7634172853659801,0.7634170529782643,114,-0.014570146799123274,-5.215406457767262e-09,3.141592653589793,3.288376856344386e-10,1.8567215416691478e-18,-1.30385160446167e-09 +-9.0,1000,-209.90991174604665,False,True,True,0.0,0.19426420033516487,1.386512354019927,1.7145479248419033,1000,0.007568750530476481,0.049552988633513456,3.1415227833192896,-0.010387006178882047,0.0014751722180794864,0.1 +-8.8,1000,-195.33321541577217,False,True,True,0.0,0.21269170489875885,0.7634170312011523,0.7634170771246602,98,-0.014570140652372018,4.097819314561063e-09,3.141592653589793,-6.46339262685078e-11,5.956357274995134e-18,4.6566128730773926e-09 +-8.6,1000,-185.36205048974617,False,True,True,0.0,0.13712154224244535,0.7634168748656479,0.7634170482859147,77,-0.014570133760629411,1.6763805739203415e-09,3.141592653589793,-2.964428674367837e-10,8.295339540616624e-19,1.6763806343078614e-09 +-8.4,1000,-175.68528108850907,False,True,True,0.0,0.19714917622799172,0.7634171835979793,0.7634170455724315,99,-0.014570142701254955,-5.7741999482549055e-09,3.141592653589793,4.699173711484827e-10,5.778407046456006e-19,-4.284083843231202e-09 +-8.2,1000,-165.88194353956368,False,True,True,0.0,0.043541556187100934,0.7634170946387897,0.7634170302902089,92,-0.014570143073840872,4.097819283805923e-09,3.141592653589793,1.2609555542306602e-11,8.920728283457892e-18,1.0617077350616455e-08 
+-8.0,1000,-155.70212299075342,False,True,True,0.0,0.15057139488485374,0.7634170569098565,0.7634170616022448,94,-0.014570138789745066,-3.3527613006784208e-09,3.141592653589793,-4.1423953529559176e-10,2.8088686635382656e-17,-1.30385160446167e-09 +-7.8,1000,-145.2178545564669,False,True,True,0.0,0.07285309856365246,0.7634170635720755,0.763417079660027,93,-0.014570140466022797,3.725291028715303e-10,3.141592653589793,2.2924368938876665e-10,3.5510218824210415e-18,-1.30385160446167e-09 +-7.6,1000,-135.07959206766608,False,True,True,0.0,0.18765214662357285,0.7634171868008116,0.7634170397641574,107,-0.014570143818892572,-3.166496792161494e-09,3.141592653589793,-2.97854740468381e-10,1.8880784090849133e-17,-1.30385160446167e-09 +-7.4,1000,-125.31711680480392,False,True,True,0.0,0.1664224483827286,0.7634170059877311,0.7634170434621508,87,-0.014570137485893031,-1.6763806650846006e-09,3.141592653589793,3.919377554080013e-10,2.5155175233506392e-18,-1.0244548320770264e-08 +-7.2,1000,-115.83120179489983,False,True,True,0.0,0.1442485167439822,0.7634168146293592,0.7634170402360348,80,-0.014570132456741703,4.28408382047686e-09,3.141592653589793,9.785266290111048e-11,1.3847534370772098e-17,4.6566128730773926e-09 +-7.0,1000,-107.22348204959184,False,True,True,0.0,0.22358062194411019,0.7634171409418026,0.7634170581676986,86,-0.014570143632587712,9.313225778948621e-10,3.141592653589793,2.2669845602241907e-10,5.114584275479524e-18,1.6763806343078614e-09 +-6.8,1000,-99.2810746840453,False,True,True,0.0,0.2838240569758799,0.7634169127472015,0.7634170424849152,100,-0.014570135064402892,1.303851624566333e-09,3.141592653589793,1.2808247052379731e-09,9.957566452439502e-17,-7.264316082000733e-09 +-6.6,1000,-91.5150049483819,False,True,True,0.0,0.24515920787695178,0.7634168527732808,0.7634170424382991,86,-0.014570133201740082,2.421438732773416e-09,3.141592653589793,4.670247601161795e-10,2.514600753498276e-17,1.6763806343078614e-09 
+-6.4,1000,-84.00842713610909,False,True,True,0.0,0.09985952016033046,0.7634173212002855,0.7634170631722137,92,-0.01457014717156667,-7.82310957550326e-09,3.141592653589793,-5.702994927120751e-10,9.14777515021636e-17,-4.284083843231202e-09 +-6.2,1000,-76.6925310141817,False,True,True,0.0,0.4286885132005104,0.7634168237575643,0.7634170536574096,80,-0.014570131525395859,9.313225754455675e-10,3.141592653589793,3.26303682141536e-10,8.796325371536091e-18,-1.0244548320770264e-08 +-6.0,1000,-70.09595658997532,False,True,True,0.0,0.17725753791094007,0.7634168764655925,0.7634170656444632,80,-0.014570134319391807,2.9802322106325023e-09,3.141592653589793,-4.993851872305113e-10,2.0273320715663867e-18,4.6566128730773926e-09 +-5.8,1000,-64.05290166223465,False,True,True,0.0,0.10790402474610772,0.763417005987038,0.7634170500056007,80,-0.014570137485866582,-1.6763806397481118e-09,3.141592653589793,1.1536700770974096e-10,1.3661316639260866e-19,-1.30385160446167e-09 +-5.6,1000,-58.40869548775641,False,True,True,0.0,0.16232429760268133,0.7634171133404535,0.7634170450427702,66,-0.01457014158363795,-1.67638059028145e-09,3.141592653589793,9.92350213184438e-10,3.400750686381027e-17,-1.30385160446167e-09 +-5.4,1000,-53.28407389663978,False,True,True,0.0,0.2110197169895717,0.7634169828501517,0.7634170717855242,81,-0.014570139162266819,5.029141877633799e-09,3.141592653589793,-1.2332636864496346e-09,2.6670519514368604e-17,1.0617077350616455e-08 +-5.2,1000,-48.012380553374776,False,True,True,0.0,0.11822377785358172,0.763416824495349,0.7634170739782873,67,-0.014570135250691516,1.0617077346073674e-08,3.141592653589793,-1.6751646546567771e-09,1.8816469639754386e-17,1.3597309589385988e-08 +-5.0,1000,-43.067016035403995,False,True,True,0.0,0.19462615623614987,0.7634170744103722,0.763417056606188,80,-0.01457014251500113,4.656612874786274e-09,3.141592653589793,-1.5619559883058216e-09,6.009310355280606e-17,1.0617077350616455e-08 
+-4.8,1000,-38.50702340912238,False,True,True,0.0,0.24121400950326363,0.7634171240709017,0.7634170506138928,54,-0.014570141211141932,-3.725290287581552e-09,3.141592653589793,-1.0235934546871678e-10,1.1275948401033654e-17,-4.284083843231202e-09 +-4.6,1000,-34.626301009737425,False,True,True,0.0,0.05984436550970863,0.7634171433271644,0.7634170491543686,81,-0.014570142514960069,-2.23517413541544e-09,3.141592653589793,-6.292617999537348e-10,3.6361619353943927e-17,1.6763806343078614e-09 +-4.4,1000,-30.925277129824597,False,True,True,0.0,0.14761927625712246,0.7634170482264094,0.7634170502572508,68,-0.014570140093561862,9.313225771801712e-10,3.141592653589793,7.359589601635883e-10,3.436961237153898e-17,1.6763806343078614e-09 +-4.2,1000,-27.334635730068634,False,True,True,0.0,0.21140118644883701,0.7634169539098193,0.7634170635424009,67,-0.014570135995726204,-3.725290098808228e-10,3.141592653589793,9.973812589986221e-10,4.6092323197643355e-17,-1.0244548320770264e-08 +-4.0,1000,-23.92938178825022,False,True,True,0.0,0.25219562132040435,0.7634168157832283,0.7634170322788403,60,-0.014570132642983319,4.656612874179808e-09,3.141592653589793,4.623408721811921e-10,4.3059400416904275e-17,4.6566128730773926e-09 +-3.8,1000,-20.8245578648867,False,True,True,0.0,0.08735476488357127,0.7634170181634349,0.7634170456766861,60,-0.014570140652402423,5.401670887950384e-09,3.141592653589793,-6.893902657260923e-10,1.1133778260573138e-18,1.0617077350616455e-08 +-3.6,1000,-18.053413036405352,False,True,True,0.0,0.22513127579719674,0.7634171417263087,0.7634170652962787,61,-0.014570141956162682,-3.539025735476742e-09,3.141592653589793,-5.521618190662698e-10,4.104763417980828e-17,-1.30385160446167e-09 +-3.4,1000,-19.175845343737294,False,True,True,0.0,0.35847894847393075,1.3559736347345457,1.7120983005977612,1000,0.006053073704273618,0.0497457250952721,3.1415898357727254,-0.01036437986683371,0.001452501167443764,0.1 
+-3.2,1000,-13.402636619320885,False,True,True,0.0,0.18063813919936367,0.7634172198042019,0.7634170565962843,54,-0.014570144936462776,-3.539025804454626e-09,3.141592653589793,-6.982921696735893e-10,5.535568961516277e-17,-1.30385160446167e-09 +-3.0,1000,-11.377479840304627,False,True,True,0.0,0.08852552370741044,0.7634168775415506,0.7634170414167923,54,-0.014570135995732892,7.264316095073499e-09,3.141592653589793,-1.5177689013248889e-09,3.13107354729219e-17,1.0617077350616455e-08 +-2.8,1000,-9.478702574300701,False,True,True,0.0,0.08814025350001184,0.7634172321332601,0.7634170559171409,69,-0.014570145122678452,-4.28408381417328e-09,3.141592653589793,1.7723631105245715e-09,9.031071570495998e-17,-7.264316082000733e-09 +-2.6,1000,-7.743821984064498,False,True,True,0.0,0.15335320576721206,0.7634168807432482,0.7634170483976294,49,-0.01457013711332662,9.872019296995179e-09,3.141592653589793,-1.2171463645794065e-09,2.643722155413918e-18,1.6577541828155518e-08 +-2.4,1000,-6.531204090536471,False,True,True,0.0,0.229102845270618,0.7634172302714146,0.7634170604513182,62,-0.014570145122708968,-4.097819330227194e-09,3.141592653589793,1.3655528237699483e-09,4.5674899778433676e-17,-7.264316082000733e-09 +-2.2,1000,-5.501197815549303,False,True,True,0.0,0.18802532142021092,0.7634170733334662,0.7634170423909329,42,-0.014570140838623385,3.7252902961161097e-10,3.141592653589793,9.43698569372045e-10,4.811328955933871e-17,-1.30385160446167e-09 +-2.0,1000,-4.564890187904272,False,True,True,0.0,0.11813494100757414,0.7634170177170916,0.7634170365719615,48,-0.014570140279871394,4.470348315935407e-09,3.141592653589793,-1.1049998297634823e-09,2.164589694383432e-17,7.636845111846925e-09 +-1.8,1000,-3.7697721838119502,False,True,True,0.0,0.1944231991224142,0.7634167891822286,0.7634170564360635,55,-0.014570133760562416,1.024454832989542e-08,3.141592653589793,-1.4555453169348302e-09,9.291950269355682e-18,1.3597309589385988e-08 
+-1.6,1000,-3.2206288096764624,False,True,True,0.0,0.12285176938336577,0.7634172444641988,0.7634170725298377,48,-0.014570145308965872,-5.029141897432442e-09,3.141592653589793,6.556238665088991e-10,1.1660122679254886e-18,-7.264316082000733e-09 +-1.4,1000,-2.7079446179979785,False,True,True,0.0,0.23740864240149934,0.763417101719276,0.7634170579598287,55,-0.014570141211146416,-1.4901161138199398e-09,3.141592653589793,-5.687327510331118e-10,2.575784848903174e-17,1.6763806343078614e-09 +-1.2,1000,-2.261348227712544,False,True,True,0.0,0.033051915466785695,0.7634172583947898,0.7634170477171023,41,-0.014570146054004852,-4.470348333388172e-09,3.141592653589793,2.0418341188940458e-09,1.271692380693176e-16,-7.264316082000733e-09 +-1.0,1000,-1.9956626504442376,False,True,True,0.0,0.08078501734935,0.7634167493292905,0.7634170683660048,35,-0.014570130035281873,4.4703483576051456e-09,3.141592653589793,-4.5720263253014083e-10,5.169204607414431e-21,4.6566128730773926e-09 +-0.8,1000,-1.7839641695510435,False,True,True,0.0,0.14685393085712253,0.7634171629247173,0.7634170668022203,42,-0.014570141769941312,-6.1467289882799046e-09,3.141592653589793,3.270378585788841e-10,4.1366958200076e-18,-1.0244548320770264e-08 +-0.6,1000,-1.6837355763841517,False,True,True,0.0,0.13257263475099165,0.7634171916794491,0.7634170363525022,43,-0.01457014400511437,-3.166496748197271e-09,3.141592653589793,1.7021429800868704e-09,9.597958493816242e-17,-7.264316082000733e-09 +-0.4,1000,-1.6092826715430306,False,True,True,0.0,0.05780649185180663,0.7634171916797367,0.7634170552043545,29,-0.014570144005125338,-3.1664967592114523e-09,3.141592653589793,-2.0989249369582415e-10,1.3862332819417286e-17,-1.30385160446167e-09 +-0.2,1000,-1.496435616836673,False,True,True,0.0,0.19513455841077915,0.7634173079016019,0.7634170591546008,35,-0.014570147730426176,-5.029141919286249e-09,3.141592653589793,-7.571411401264381e-10,7.93869719869391e-17,-1.30385160446167e-09 
+0.2,1000,-1.496435616836673,False,True,True,0.0,0.19513455841077915,0.7634173079016019,0.7634170591546008,35,-0.014570147730426176,-5.029141919286249e-09,3.141592653589793,-7.571411401264381e-10,7.93869719869391e-17,-1.30385160446167e-09 +0.4,1000,-1.6092826715430306,False,True,True,0.0,0.05780649185180663,0.7634171916797367,0.7634170552043545,29,-0.014570144005125338,-3.1664967592114523e-09,3.141592653589793,-2.0989249369582415e-10,1.3862332819417286e-17,-1.30385160446167e-09 +0.6,1000,-1.6837355763841517,False,True,True,0.0,0.13257263475099165,0.7634171916794491,0.7634170363525022,43,-0.01457014400511437,-3.166496748197271e-09,3.141592653589793,1.7021429800868704e-09,9.597958493816242e-17,-7.264316082000733e-09 +0.8,1000,-1.7839641695510435,False,True,True,0.0,0.14685393085712253,0.7634171629247173,0.7634170668022203,42,-0.014570141769941312,-6.1467289882799046e-09,3.141592653589793,3.270378585788841e-10,4.1366958200076e-18,-1.0244548320770264e-08 +1.0,1000,-1.9956626504442376,False,True,True,0.0,0.08078501734935,0.7634167493292905,0.7634170683660048,35,-0.014570130035281873,4.4703483576051456e-09,3.141592653589793,-4.5720263253014083e-10,5.169204607414431e-21,4.6566128730773926e-09 +1.2,1000,-2.261348227712544,False,True,True,0.0,0.033051915466785695,0.7634172583947898,0.7634170477171023,41,-0.014570146054004852,-4.470348333388172e-09,3.141592653589793,2.0418341188940458e-09,1.271692380693176e-16,-7.264316082000733e-09 +1.4,1000,-2.7079446179979785,False,True,True,0.0,0.23740864240149934,0.763417101719276,0.7634170579598287,55,-0.014570141211146416,-1.4901161138199398e-09,3.141592653589793,-5.687327510331118e-10,2.575784848903174e-17,1.6763806343078614e-09 +1.6,1000,-3.2206288096764624,False,True,True,0.0,0.12285176938336577,0.7634172444641988,0.7634170725298377,48,-0.014570145308965872,-5.029141897432442e-09,3.141592653589793,6.556238665088991e-10,1.1660122679254886e-18,-7.264316082000733e-09 
+1.8,1000,-3.7697721838119502,False,True,True,0.0,0.1944231991224142,0.7634167891822286,0.7634170564360635,55,-0.014570133760562416,1.024454832989542e-08,3.141592653589793,-1.4555453169348302e-09,9.291950269355682e-18,1.3597309589385988e-08 +2.0,1000,-4.564890187904272,False,True,True,0.0,0.11813494100757414,0.7634170177170916,0.7634170365719615,48,-0.014570140279871394,4.470348315935407e-09,3.141592653589793,-1.1049998297634823e-09,2.164589694383432e-17,7.636845111846925e-09 +2.2,1000,-5.501197815549303,False,True,True,0.0,0.18802532142021092,0.7634170733334662,0.7634170423909329,42,-0.014570140838623385,3.7252902961161097e-10,3.141592653589793,9.43698569372045e-10,4.811328955933871e-17,-1.30385160446167e-09 +2.4,1000,-6.531204090536471,False,True,True,0.0,0.229102845270618,0.7634172302714146,0.7634170604513182,62,-0.014570145122708968,-4.097819330227194e-09,3.141592653589793,1.3655528237699483e-09,4.5674899778433676e-17,-7.264316082000733e-09 +2.6,1000,-7.743821984064498,False,True,True,0.0,0.15335320576721206,0.7634168807432482,0.7634170483976294,49,-0.01457013711332662,9.872019296995179e-09,3.141592653589793,-1.2171463645794065e-09,2.643722155413918e-18,1.6577541828155518e-08 +2.8,1000,-9.478702574300701,False,True,True,0.0,0.08814025350001184,0.7634172321332601,0.7634170559171409,69,-0.014570145122678452,-4.28408381417328e-09,3.141592653589793,1.7723631105245715e-09,9.031071570495998e-17,-7.264316082000733e-09 +3.0,1000,-11.377479840304627,False,True,True,0.0,0.08852552370741044,0.7634168775415506,0.7634170414167923,54,-0.014570135995732892,7.264316095073499e-09,3.141592653589793,-1.5177689013248889e-09,3.13107354729219e-17,1.0617077350616455e-08 +3.2,1000,-13.402636619320885,False,True,True,0.0,0.18063813919936367,0.7634172198042019,0.7634170565962843,54,-0.014570144936462776,-3.539025804454626e-09,3.141592653589793,-6.982921696735893e-10,5.535568961516277e-17,-1.30385160446167e-09 
+3.4,1000,-19.175845343737294,False,True,True,0.0,0.35847894847393075,1.3559736347345457,1.7120983005977612,1000,0.006053073704273618,0.0497457250952721,3.1415898357727254,-0.01036437986683371,0.001452501167443764,0.1 +3.6,1000,-18.053413036405352,False,True,True,0.0,0.22513127579719674,0.7634171417263087,0.7634170652962787,61,-0.014570141956162682,-3.539025735476742e-09,3.141592653589793,-5.521618190662698e-10,4.104763417980828e-17,-1.30385160446167e-09 +3.8,1000,-20.8245578648867,False,True,True,0.0,0.08735476488357127,0.7634170181634349,0.7634170456766861,60,-0.014570140652402423,5.401670887950384e-09,3.141592653589793,-6.893902657260923e-10,1.1133778260573138e-18,1.0617077350616455e-08 +4.0,1000,-23.92938178825022,False,True,True,0.0,0.25219562132040435,0.7634168157832283,0.7634170322788403,60,-0.014570132642983319,4.656612874179808e-09,3.141592653589793,4.623408721811921e-10,4.3059400416904275e-17,4.6566128730773926e-09 +4.2,1000,-27.334635730068634,False,True,True,0.0,0.21140118644883701,0.7634169539098193,0.7634170635424009,67,-0.014570135995726204,-3.725290098808228e-10,3.141592653589793,9.973812589986221e-10,4.6092323197643355e-17,-1.0244548320770264e-08 +4.4,1000,-30.925277129824597,False,True,True,0.0,0.14761927625712246,0.7634170482264094,0.7634170502572508,68,-0.014570140093561862,9.313225771801712e-10,3.141592653589793,7.359589601635883e-10,3.436961237153898e-17,1.6763806343078614e-09 +4.6,1000,-34.626301009737425,False,True,True,0.0,0.05984436550970863,0.7634171433271644,0.7634170491543686,81,-0.014570142514960069,-2.23517413541544e-09,3.141592653589793,-6.292617999537348e-10,3.6361619353943927e-17,1.6763806343078614e-09 +4.8,1000,-38.50702340912238,False,True,True,0.0,0.24121400950326363,0.7634171240709017,0.7634170506138928,54,-0.014570141211141932,-3.725290287581552e-09,3.141592653589793,-1.0235934546871678e-10,1.1275948401033654e-17,-4.284083843231202e-09 
+5.0,1000,-43.067016035403995,False,True,True,0.0,0.19462615623614987,0.7634170744103722,0.763417056606188,80,-0.01457014251500113,4.656612874786274e-09,3.141592653589793,-1.5619559883058216e-09,6.009310355280606e-17,1.0617077350616455e-08 +5.2,1000,-48.012380553374776,False,True,True,0.0,0.11822377785358172,0.763416824495349,0.7634170739782873,67,-0.014570135250691516,1.0617077346073674e-08,3.141592653589793,-1.6751646546567771e-09,1.8816469639754386e-17,1.3597309589385988e-08 +5.4,1000,-53.28407389663978,False,True,True,0.0,0.2110197169895717,0.7634169828501517,0.7634170717855242,81,-0.014570139162266819,5.029141877633799e-09,3.141592653589793,-1.2332636864496346e-09,2.6670519514368604e-17,1.0617077350616455e-08 +5.6,1000,-58.40869548775641,False,True,True,0.0,0.16232429760268133,0.7634171133404535,0.7634170450427702,66,-0.01457014158363795,-1.67638059028145e-09,3.141592653589793,9.92350213184438e-10,3.400750686381027e-17,-1.30385160446167e-09 +5.8,1000,-64.05290166223465,False,True,True,0.0,0.10790402474610772,0.763417005987038,0.7634170500056007,80,-0.014570137485866582,-1.6763806397481118e-09,3.141592653589793,1.1536700770974096e-10,1.3661316639260866e-19,-1.30385160446167e-09 +6.0,1000,-70.09595658997532,False,True,True,0.0,0.17725753791094007,0.7634168764655925,0.7634170656444632,80,-0.014570134319391807,2.9802322106325023e-09,3.141592653589793,-4.993851872305113e-10,2.0273320715663867e-18,4.6566128730773926e-09 +6.2,1000,-76.6925310141817,False,True,True,0.0,0.4286885132005104,0.7634168237575643,0.7634170536574096,80,-0.014570131525395859,9.313225754455675e-10,3.141592653589793,3.26303682141536e-10,8.796325371536091e-18,-1.0244548320770264e-08 +6.4,1000,-84.00842713610909,False,True,True,0.0,0.09985952016033046,0.7634173212002855,0.7634170631722137,92,-0.01457014717156667,-7.82310957550326e-09,3.141592653589793,-5.702994927120751e-10,9.14777515021636e-17,-4.284083843231202e-09 
+6.6,1000,-91.5150049483819,False,True,True,0.0,0.24515920787695178,0.7634168527732808,0.7634170424382991,86,-0.014570133201740082,2.421438732773416e-09,3.141592653589793,4.670247601161795e-10,2.514600753498276e-17,1.6763806343078614e-09 +6.8,1000,-99.2810746840453,False,True,True,0.0,0.2838240569758799,0.7634169127472015,0.7634170424849152,100,-0.014570135064402892,1.303851624566333e-09,3.141592653589793,1.2808247052379731e-09,9.957566452439502e-17,-7.264316082000733e-09 +7.0,1000,-107.22348204959184,False,True,True,0.0,0.22358062194411019,0.7634171409418026,0.7634170581676986,86,-0.014570143632587712,9.313225778948621e-10,3.141592653589793,2.2669845602241907e-10,5.114584275479524e-18,1.6763806343078614e-09 +7.2,1000,-115.83120179489983,False,True,True,0.0,0.1442485167439822,0.7634168146293592,0.7634170402360348,80,-0.014570132456741703,4.28408382047686e-09,3.141592653589793,9.785266290111048e-11,1.3847534370772098e-17,4.6566128730773926e-09 +7.4,1000,-125.31711680480392,False,True,True,0.0,0.1664224483827286,0.7634170059877311,0.7634170434621508,87,-0.014570137485893031,-1.6763806650846006e-09,3.141592653589793,3.919377554080013e-10,2.5155175233506392e-18,-1.0244548320770264e-08 +7.6,1000,-135.07959206766608,False,True,True,0.0,0.18765214662357285,0.7634171868008116,0.7634170397641574,107,-0.014570143818892572,-3.166496792161494e-09,3.141592653589793,-2.97854740468381e-10,1.8880784090849133e-17,-1.30385160446167e-09 +7.8,1000,-145.2178545564669,False,True,True,0.0,0.07285309856365246,0.7634170635720755,0.763417079660027,93,-0.014570140466022797,3.725291028715303e-10,3.141592653589793,2.2924368938876665e-10,3.5510218824210415e-18,-1.30385160446167e-09 +8.0,1000,-155.70212299075342,False,True,True,0.0,0.15057139488485374,0.7634170569098565,0.7634170616022448,94,-0.014570138789745066,-3.3527613006784208e-09,3.141592653589793,-4.1423953529559176e-10,2.8088686635382656e-17,-1.30385160446167e-09 
+8.2,1000,-165.88194353956368,False,True,True,0.0,0.043541556187100934,0.7634170946387897,0.7634170302902089,92,-0.014570143073840872,4.097819283805923e-09,3.141592653589793,1.2609555542306602e-11,8.920728283457892e-18,1.0617077350616455e-08 +8.4,1000,-175.68528108850907,False,True,True,0.0,0.19714917622799172,0.7634171835979793,0.7634170455724315,99,-0.014570142701254955,-5.7741999482549055e-09,3.141592653589793,4.699173711484827e-10,5.778407046456006e-19,-4.284083843231202e-09 +8.6,1000,-185.36205048974617,False,True,True,0.0,0.13712154224244535,0.7634168748656479,0.7634170482859147,77,-0.014570133760629411,1.6763805739203415e-09,3.141592653589793,-2.964428674367837e-10,8.295339540616624e-19,1.6763806343078614e-09 +8.8,1000,-195.33321541577217,False,True,True,0.0,0.21269170489875885,0.7634170312011523,0.7634170771246602,98,-0.014570140652372018,4.097819314561063e-09,3.141592653589793,-6.46339262685078e-11,5.956357274995134e-18,4.6566128730773926e-09 +9.0,1000,-209.90991174604665,False,True,True,0.0,0.19426420033516487,1.386512354019927,1.7145479248419033,1000,0.007568750530476481,0.049552988633513456,3.1415227833192896,-0.010387006178882047,0.0014751722180794864,0.1 +9.2,1000,-218.43588390169342,False,True,True,0.0,0.12444746906986427,0.7634172853659801,0.7634170529782643,114,-0.014570146799123274,-5.215406457767262e-09,3.141592653589793,3.288376856344386e-10,1.8567215416691478e-18,-1.30385160446167e-09 +9.4,1000,-229.61783702644482,False,True,True,0.0,0.17949047128080606,0.7634171312600655,0.7634170545286207,113,-0.014570141769953509,-2.9802322480980583e-09,3.141592653589793,5.61180691291641e-10,3.462592607241318e-18,-1.30385160446167e-09 +9.6,1000,-240.45424708207318,False,True,True,0.0,0.1637385040521618,0.7634170347426733,0.7634170630946467,104,-0.014570139721073645,1.3038515650711111e-09,3.141592653589793,6.185096673426094e-10,2.8042172858238456e-17,4.6566128730773926e-09 
+9.8,1000,-252.04514549548708,False,True,True,0.0,0.1687816346640765,0.7634170994089053,0.7634170864353002,111,-0.014570140838562583,-2.2351741168328946e-09,3.141592653589793,1.4239016181080525e-10,3.290815337081383e-19,-1.30385160446167e-09 +10.0,1000,-263.7437488389931,False,True,True,0.0,0.22246334752626565,0.7634171530877322,0.7634170693628076,90,-0.014570142887529284,-2.2351741737731194e-09,3.141592653589793,-1.2516669196055647e-09,1.0880844141057164e-16,-1.30385160446167e-09 diff --git a/models/hybrid_cv01_s5775_log.csv b/models/hybrid_cv01_s5775_log.csv new file mode 100644 index 0000000..741c83e --- /dev/null +++ b/models/hybrid_cv01_s5775_log.csv @@ -0,0 +1,61 @@ +t,ep_len_mean,rew_per_step,approx_kl,explained_variance,value_loss,entropy_loss,clip_fraction,policy_gradient_loss,rail_hit_pct,mean_t_min,mean_x_pos_abs,mean_x_vel_abs,mean_energy,mean_theta_dot_abs,mean_theta_dev +50016.0,22.53,-0.46634678783843764,nan,nan,nan,nan,nan,nan,100.0,nan,nan,nan,nan,nan,nan +100032.0,20.72,-0.484760430019305,nan,nan,nan,nan,nan,nan,100.0,nan,nan,nan,nan,nan,nan +150048.0,24.58,-0.42493947477624094,0.010541556403040886,-0.38331782817840576,0.14092134966886077,-1.4169601713656448,0.1435455322265625,-0.014994687308234233,100.0,nan,nan,nan,nan,nan,nan +200064.0,22.53,-0.4310668703950288,0.010541556403040886,-0.38331782817840576,0.14092134966886077,-1.4169601713656448,0.1435455322265625,-0.014994687308234233,100.0,nan,nan,nan,nan,nan,nan +250080.0,19.87,-0.4917359310518369,0.010541556403040886,-0.38331782817840576,0.14092134966886077,-1.4169601713656448,0.1435455322265625,-0.014994687308234233,100.0,nan,nan,nan,nan,nan,nan +300096.0,36.18,-0.31845708678828083,0.02202294021844864,0.474281370639801,0.14500066997916292,-1.3416458107705693,0.24495391845703124,-0.031626339453669064,100.0,nan,nan,nan,nan,nan,nan 
+350112.0,35.06,-0.3173993476896748,0.02202294021844864,0.474281370639801,0.14500066997916292,-1.3416458107705693,0.24495391845703124,-0.031626339453669064,100.0,nan,nan,nan,nan,nan,nan +400128.0,56.03,-0.23837393128681067,0.021311495453119278,0.45957911014556885,0.3106199108813598,-1.2764616660715546,0.23931427001953126,-0.03226027002870069,100.0,nan,nan,nan,nan,nan,nan +450144.0,103.57,-0.1700475533455634,0.021311495453119278,0.45957911014556885,0.3106199108813598,-1.2764616660715546,0.23931427001953126,-0.03226027002870069,100.0,nan,nan,nan,nan,nan,nan +500160.0,71.12,-0.2036031133295838,0.021311495453119278,0.45957911014556885,0.3106199108813598,-1.2764616660715546,0.23931427001953126,-0.03226027002870069,100.0,nan,nan,nan,nan,nan,nan +550176.0,168.17,-0.12625538740560147,0.01910100132226944,0.4285324215888977,0.49545763749047184,-1.2111072726140264,0.21199569702148438,-0.028160874424099803,100.0,nan,nan,nan,nan,nan,nan +600192.0,419.39,-0.09025685924795539,0.01910100132226944,0.4285324215888977,0.49545763749047184,-1.2111072726140264,0.21199569702148438,-0.028160874424099803,82.75862068965517,4.965653219835884,0.6446276013413593,0.0987629003101029,0.004922355801633268,0.012294729726875986,0.013232667197115421 +650208.0,363.45,-0.09294433250791033,0.01910100132226944,0.4285324215888977,0.49545763749047184,-1.2111072726140264,0.21199569702148438,-0.028160874424099803,85.5072463768116,4.9982985843936465,0.6467877357639327,0.0848788554174826,0.011811007078833287,0.014638398331958282,0.01228759108309796 +700224.0,551.67,-0.07244767101709357,0.019526444375514984,0.3648099899291992,0.31120517323670355,-1.1381306111987215,0.20385513305664063,-0.022195850586001598,33.333333333333336,3.2518415771639915,0.292873214747907,0.06022928322070379,0.007555595493217405,0.012844134036986912,0.00941990391345572 
+750240.0,848.5,-0.05575566571596935,0.019526444375514984,0.3648099899291992,0.31120517323670355,-1.1381306111987215,0.20385513305664063,-0.022195850586001598,20.967741935483872,3.305166776145542,0.28474422051686143,0.057821642956221296,0.005655258261684793,0.00918833254245378,0.010160671600963641 +800256.0,846.35,-0.05223496818101257,0.021799694746732712,0.35452961921691895,0.08105577911192086,-1.0501224066130816,0.21047134399414064,-0.021904225244446705,18.181818181818183,3.336212680838151,0.3089800376258608,0.05427280267390113,0.0074482300940024605,0.012298461184700876,0.010211263412951817 +850272.0,963.89,-0.04168992387098113,0.021799694746732712,0.35452961921691895,0.08105577911192086,-1.0501224066130816,0.21047134399414064,-0.021904225244446705,0.0,3.0020394582899437,0.22321100628247767,0.05595510859137656,0.004716538744346741,0.010276801511187067,0.006816571817619454 +900288.0,990.14,-0.034495633496273256,0.021799694746732712,0.35452961921691895,0.08105577911192086,-1.0501224066130816,0.21047134399414064,-0.021904225244446705,2.1739130434782608,3.031464988036606,0.24684007645688605,0.05962293540644979,0.005626130903489474,0.010531340411503247,0.007658194977610558 +950304.0,989.31,-0.031162195186544155,0.023127662017941475,0.21978706121444702,0.013655464278974705,-0.9635076844046125,0.21238021850585936,-0.023078742444806722,1.8181818181818181,3.165311071236364,0.23025875485402245,0.05498702882129391,0.004349302218033651,0.010271271579924836,0.00758569859201062 +1000320.0,969.55,-0.026494327141457382,0.023127662017941475,0.21978706121444702,0.013655464278974705,-0.9635076844046125,0.21238021850585936,-0.023078742444806722,6.382978723404255,2.1796417838610647,0.1205321261722928,0.04687811031683602,0.003244650803549799,0.009209728359758985,0.006241817129916904 
+1050336.0,970.32,-0.022839868507296565,0.02209949493408203,0.5374059677124023,0.007962424280194114,-0.8630484546010848,0.1770538330078125,-0.019873938534084347,1.7857142857142858,2.409384281885942,0.12068710189186375,0.05231919897423891,0.0030948810530421443,0.009734172435460545,0.005243558187579766 +1100352.0,960.46,-0.020956373352352,0.02209949493408203,0.5374059677124023,0.007962424280194114,-0.8630484546010848,0.1770538330078125,-0.019873938534084347,6.0,2.3286564534324343,0.12012673539822369,0.040695041295220225,0.0027110692385867575,0.0066748000967064465,0.005338496298709126 +1150368.0,960.5,-0.01847191775117126,0.02209949493408203,0.5374059677124023,0.007962424280194114,-0.8630484546010848,0.1770538330078125,-0.019873938534084347,1.9607843137254901,2.128218112886297,0.1067682459447542,0.04265482378937306,0.0023518638212911714,0.008184134045453378,0.00497323507621199 +1200384.0,990.12,-0.017322241324283928,0.021059542894363403,0.5936254262924194,0.01440607655013082,-0.7598123987525469,0.16010284423828125,-0.015211807108414632,0.0,2.030776249849429,0.09872520177244114,0.04654268906969161,0.002382926135100147,0.008989170178779624,0.005003052578011097 +1250400.0,1000.0,-0.01542704874,0.021059542894363403,0.5936254262924194,0.01440607655013082,-0.7598123987525469,0.16010284423828125,-0.015211807108414632,0.0,2.1436372910976798,0.09879750015317712,0.04348271692689096,0.002591425047989371,0.008897550956709417,0.003855253988283527 +1300416.0,1000.0,-0.01360629836,0.021059542894363403,0.5936254262924194,0.01440607655013082,-0.7598123987525469,0.16010284423828125,-0.015211807108414632,0.0,1.979366995284129,0.0853356778979252,0.0461872591215863,0.0018475101796131485,0.008326542484857296,0.005253439865963851 
+1350432.0,1000.0,-0.013114750999999997,0.01743980497121811,0.01877915859222412,0.002189063057882379,-0.6666099329275312,0.14016342163085938,-0.012573641861547458,0.0,1.7382894611393256,0.06777263272198922,0.040525315777225374,0.000993964631713662,0.006376051015661434,0.005284887947236374 +1400448.0,1000.0,-0.011652855580000001,0.01743980497121811,0.01877915859222412,0.002189063057882379,-0.6666099329275312,0.14016342163085938,-0.012573641861547458,0.0,1.9794750400094263,0.07800800732354364,0.038462860903522966,0.0017597770104034501,0.00712741973016833,0.0031054469093795335 +1450464.0,1000.0,-0.010977692170000001,0.01615862362086773,0.38573670387268066,0.0008691658534289104,-0.582296199168195,0.12805252075195311,-0.010622871459844418,0.0,1.8178732990197026,0.06069372446407642,0.04788019022671506,0.001846501158540244,0.008384690158791314,0.004104302579286623 +1500480.0,1000.0,-0.010524550760000003,0.01615862362086773,0.38573670387268066,0.0008691658534289104,-0.582296199168195,0.12805252075195311,-0.010622871459844418,0.0,1.9090002932044468,0.06597300268029486,0.04237024690418575,0.0024963255101145476,0.008756307185239906,0.0031144541538016867 +1550496.0,1000.0,-0.009829232129999997,0.01615862362086773,0.38573670387268066,0.0008691658534289104,-0.582296199168195,0.12805252075195311,-0.010622871459844418,0.0,1.4677037130046668,0.04912735834175549,0.03697474315413271,0.0009636867417948,0.006026026348703385,0.0039189285795583435 +1600512.0,1000.0,-0.009527990069999999,0.0141903106123209,0.5555599331855774,0.0005966875327061416,-0.4969831779540982,0.12238693237304688,-0.010149621653124542,0.0,1.7344870670420731,0.0533778025350564,0.04117357098278873,0.0018038105515041666,0.008596681757244895,0.004089210340900851 
+1650528.0,1000.0,-0.00893908012,0.0141903106123209,0.5555599331855774,0.0005966875327061416,-0.4969831779540982,0.12238693237304688,-0.010149621653124542,0.0,1.5944554896575167,0.04716181595896411,0.04126257605342702,0.0018532711180503448,0.008381539823578293,0.0034728318543028876 +1700544.0,1000.0,-0.00849318875,0.0141903106123209,0.5555599331855774,0.0005966875327061416,-0.4969831779540982,0.12238693237304688,-0.010149621653124542,0.0,1.5331446364317738,0.04702957565068658,0.037514261053875085,0.00212939327857015,0.007871622291304274,0.002672963635144132 +1750560.0,1000.0,-0.008159863540000001,0.014287134632468224,0.6558700799942017,0.0004432933696737251,-0.41135670975927496,0.11719131469726562,-0.009975081573577427,0.0,1.4979784939474405,0.040631136112760834,0.03797812089408665,0.001438036514602193,0.007429178362551698,0.002852432509330742 +1800576.0,1000.0,-0.00759444257,0.014287134632468224,0.6558700799942017,0.0004432933696737251,-0.41135670975927496,0.11719131469726562,-0.009975081573577427,0.0,1.4269930837706999,0.04085018368522191,0.036791176341516414,0.0018281819071964856,0.00783758237083504,0.0038319340681930033 +1850592.0,1000.0,-0.00748877712,0.013138293288648129,0.6827524900436401,0.0003385018973103726,-0.3246342498518061,0.11322021484375,-0.008766790947737491,0.0,1.6833868448897042,0.04804470017129792,0.0377799808403308,0.0019330471555365646,0.008353578593685369,0.003096913592373389 +1900608.0,1000.0,-0.007237390350000001,0.013138293288648129,0.6827524900436401,0.0003385018973103726,-0.3246342498518061,0.11322021484375,-0.008766790947737491,0.0,1.546465395600206,0.04197267697965712,0.03963301779821196,0.0016705090434980666,0.008087228589514658,0.0025005355365007565 
+1950624.0,1000.0,-0.0068578300399999995,0.013138293288648129,0.6827524900436401,0.0003385018973103726,-0.3246342498518061,0.11322021484375,-0.008766790947737491,0.0,1.4204839272918304,0.03258341772762231,0.035202045678161084,0.0014946415888835126,0.007847466977189294,0.0025937134958291443 +2000640.0,1000.0,-0.0068402266699999995,0.011470991186797619,0.7626498341560364,0.00024790592935541866,-0.24954425873584113,0.10132064819335937,-0.007294789635119514,0.0,1.4113470881586923,0.041149628313295876,0.030460381151728874,0.0014676779836321637,0.007040880392567568,0.0030067768786853394 +2050656.0,1000.0,-0.006625306890000001,0.011470991186797619,0.7626498341560364,0.00024790592935541866,-0.24954425873584113,0.10132064819335937,-0.007294789635119514,0.0,1.509172695372454,0.03533370639198464,0.04110908087446456,0.0012997171022503496,0.008145811617880732,0.0019427694682432773 +2100672.0,1000.0,-0.0063395902,0.010341539978981018,0.8139201998710632,0.00019180159112810457,-0.18829700005007907,0.10136337280273437,-0.007254113179322985,0.0,1.4930977680878201,0.03797275130722394,0.03846660060300068,0.000996667999513323,0.007111064585775013,0.002342601972598251 +2150688.0,1000.0,-0.0062362345800000005,0.010341539978981018,0.8139201998710632,0.00019180159112810457,-0.18829700005007907,0.10136337280273437,-0.007254113179322985,0.0,1.4618676744890513,0.031754608340558904,0.039088426523117566,0.0012370570788048174,0.007911907956589524,0.0019165028056243353 +2200704.0,1000.0,-0.00615758561,0.010341539978981018,0.8139201998710632,0.00019180159112810457,-0.18829700005007907,0.10136337280273437,-0.007254113179322985,0.0,1.3261937700566966,0.029681261748244706,0.03961811735845087,0.0017840531297145027,0.009069409858856639,0.0025050801030632123 
+2250720.0,1000.0,-0.0061633504700000005,0.011541677638888359,0.8263936638832092,0.0001421325750064817,-0.12600554142263717,0.11242904663085937,-0.007944659565720259,0.0,1.4571341646813252,0.03595054092955815,0.03604649789235059,0.001833836946562782,0.008739466753678532,0.0019514128931270344 +2300736.0,1000.0,-0.006140843599999999,0.011541677638888359,0.8263936638832092,0.0001421325750064817,-0.12600554142263717,0.11242904663085937,-0.007944659565720259,0.0,1.432098239904577,0.031528944754058756,0.03721995568131188,0.0013281068128959178,0.008067413168103525,0.0023674925665827886 +2350752.0,1000.0,-0.006064011799999999,0.011541677638888359,0.8263936638832092,0.0001421325750064817,-0.12600554142263717,0.11242904663085937,-0.007944659565720259,0.0,1.456004882343902,0.03140963146276209,0.04297537569887937,0.001366095327412596,0.008571661071573568,0.001977659481712868 +2400768.0,1000.0,-0.005948548309999999,0.01070053968578577,0.8822217583656311,0.00011441200287212183,-0.07185450882534497,0.11621322631835937,-0.008358922694134208,0.0,1.3508797017025211,0.029166304642108492,0.03933823603562059,0.0015331451925415336,0.00859167370112169,0.002206250121861852 +2450784.0,1000.0,-0.00574221754,0.01070053968578577,0.8822217583656311,0.00011441200287212183,-0.07185450882534497,0.11621322631835937,-0.008358922694134208,0.0,1.4136141027330622,0.03185992610386348,0.03736358360980044,0.0013462438243709244,0.008093202300777378,0.0020030799282766895 +2500800.0,1000.0,-0.005612918610000001,0.010731259360909462,0.874031662940979,8.923432455540948e-05,-0.019075807166518643,0.11375808715820312,-0.007748086326273551,0.0,1.3713669580078354,0.025053956434124947,0.041419678446319344,0.0013631483192302098,0.008515190937255217,0.0020613718098284333 
+2550816.0,1000.0,-0.005425881100000001,0.010731259360909462,0.874031662940979,8.923432455540948e-05,-0.019075807166518643,0.11375808715820312,-0.007748086326273551,0.0,1.3598589587095997,0.02483435189096954,0.03886425209993667,0.000980950422889617,0.007539542960888872,0.0020705742968614724 +2600832.0,1000.0,-0.005336709190000001,0.010731259360909462,0.874031662940979,8.923432455540948e-05,-0.019075807166518643,0.11375808715820312,-0.007748086326273551,0.0,1.2554142823871313,0.024163872143254297,0.03452539837556994,0.0008799773949699242,0.006873162973895642,0.0013961965561831787 +2650848.0,1000.0,-0.00532259803,0.012080168351531029,0.8794018030166626,6.645590811413805e-05,0.041207450261572375,0.11869888305664063,-0.008049558976370007,0.0,1.2911658378237467,0.026962051301863132,0.03347940753552726,0.0010417520179588039,0.007056224955957887,0.0016315397462914984 +2700864.0,1000.0,-0.005316921200000001,0.012080168351531029,0.8794018030166626,6.645590811413805e-05,0.041207450261572375,0.11869888305664063,-0.008049558976370007,0.0,1.3687470125846863,0.02771420778261338,0.04153390633266257,0.0010654578577620713,0.00813718919645452,0.001826018379872676 +2750880.0,1000.0,-0.005221960739999999,0.012080168351531029,0.8794018030166626,6.645590811413805e-05,0.041207450261572375,0.11869888305664063,-0.008049558976370007,0.0,1.282512569730736,0.025157022085970086,0.03988135972060263,0.0012857369077847952,0.008406045815844811,0.001791657356579055 +2800896.0,1000.0,-0.005262618629999999,0.010907622054219246,0.909285843372345,5.694806509595196e-05,0.08551667798310518,0.12401046752929687,-0.008621546246740764,0.0,1.5141487025179234,0.03268820147550318,0.04378282107634868,0.0012755357988368258,0.00878704292730342,0.001710799195875595 
+2850912.0,1000.0,-0.0052979464399999995,0.010907622054219246,0.909285843372345,5.694806509595196e-05,0.08551667798310518,0.12401046752929687,-0.008621546246740764,0.0,1.551470451660599,0.033599883391844235,0.045056646900712115,0.001420095273698301,0.009385915314156126,0.0015522742119981976 +2900928.0,1000.0,-0.005213292720000001,0.011565236374735832,0.9277364611625671,5.158331767778357e-05,0.1314061277837027,0.12809906005859376,-0.009006264986072664,0.0,1.315463017205957,0.028104677178068888,0.03745863927114339,0.001135523205287351,0.007778675000814196,0.001595930178098115 +2950944.0,1000.0,-0.005226689820000001,0.011565236374735832,0.9277364611625671,5.158331767778357e-05,0.1314061277837027,0.12809906005859376,-0.009006264986072664,0.0,1.316407728052698,0.024151420153743407,0.03586149570816713,0.0010433253880346655,0.007668785615657837,0.0019540632424873967 +3000960.0,1000.0,-0.00521451529,0.011565236374735832,0.9277364611625671,5.158331767778357e-05,0.1314061277837027,0.12809906005859376,-0.009006264986072664,0.0,1.3264829799534519,0.022310619685295564,0.04182806695483372,0.0011117640900659338,0.008298517435853373,0.0017000227166848751 diff --git a/models/hybrid_cv01_s5775_play_results.csv b/models/hybrid_cv01_s5775_play_results.csv new file mode 100644 index 0000000..fc11ab0 --- /dev/null +++ b/models/hybrid_cv01_s5775_play_results.csv @@ -0,0 +1,101 @@ +start_speed,ep_steps,ep_reward,terminated,truncated,no_crash,t_min_start,t_min_min,t_min_final,t_min_mean_last100,t_min_settle_step,x_pos_final,x_vel_final,theta_final,theta_dot_final,energy_final,acc_final +-10.0,1000,-1214.1989385654504,False,True,True,0.0,0.04261976519235258,0.5061802115939015,0.5061803870643528,131,0.00640546157948229,-5.587935533630298e-09,3.141592653589793,3.210452971250527e-09,3.5156488342539246e-16,-9.313225746154785e-09 
+-9.8,1000,-1169.6079346180388,False,True,True,0.0,0.023986591398797514,0.506180389137896,0.5061803752997022,138,0.00640546418727756,1.8626451633181626e-09,3.141592653589793,-5.033479555348362e-10,5.027095371504792e-18,5.587935447692871e-09 +-9.6,1000,-1117.8599837209374,False,True,True,0.0,0.06681217622164985,0.5061805918490341,0.5061803766331983,143,0.006405468657714095,4.470348463883693e-09,3.141592653589793,-7.403530084534683e-10,4.301777210110929e-18,8.568167686462403e-09 +-9.4,1000,-1053.3879951091856,False,True,True,0.0,0.03368456169837573,0.506180215504989,0.5061803727921932,129,0.006405461207051013,-3.7252902794403334e-09,3.141592653589793,1.612396614695048e-10,2.2321598191328095e-18,-9.313225746154785e-09 +-9.2,1000,-1013.8990323753987,False,True,True,0.0,0.06026182433276756,0.50618042584755,0.5061803682369228,135,0.0064054653049297975,1.1175871696542328e-09,3.141592653589793,-7.060397858955776e-10,1.7658499445299496e-17,2.60770320892334e-09 +-9.0,1000,-952.4195371716991,False,True,True,0.0,0.045944495046144875,0.5061807947343459,0.5061803891459686,129,0.006405472755426201,8.568167682915117e-09,3.141592653589793,-4.443804738668278e-10,8.505184848974724e-18,5.587935447692871e-09 +-8.8,1000,-893.2373594964072,False,True,True,0.0,0.08189205408495372,0.5061803224460402,0.5061803685247364,129,0.006405463442206582,-1.8626451484044878e-09,3.141592653589793,1.7534516859956996e-09,1.2280378147232073e-16,-9.313225746154785e-09 +-8.6,1000,-861.8386224082088,False,True,True,0.0,0.05495099179465652,0.5061804843598388,0.506180391488973,129,0.006405467540086311,-1.862645084138988e-09,3.141592653589793,2.0343584668509722e-09,1.7077256395984432e-16,-9.313225746154785e-09 +-8.4,1000,-814.3632218211999,False,True,True,0.0,0.06722846627235377,0.5061803921336189,0.5061803860411976,115,0.006405465677348031,-3.725290333934114e-09,3.141592653589793,7.419382557779237e-10,6.8231586788053955e-18,-9.313225746154785e-09 
+-8.2,1000,-774.4814573421951,False,True,True,0.0,0.12184587751149162,0.5061802522121797,0.5061803778004519,122,0.006405462324640618,-4.470348336479698e-09,3.141592653589793,1.3425407898179491e-09,4.0096545878433485e-17,-9.313225746154785e-09 +-8.0,1000,-737.422631283283,False,True,True,0.0,0.04514092952013071,0.5061801970619234,0.5061803847143103,132,0.006405460834558634,-4.097819272791472e-09,3.141592653589793,-8.82740294727816e-10,8.353068472364606e-17,-9.313225746154785e-09 +-7.8,1000,-700.5303313667413,False,True,True,0.0,0.021117677642621478,0.5061801452674374,0.5061803599746718,128,0.006405460089391802,-6.3329935641237185e-09,3.141592653589793,1.5538497935923311e-09,4.237065536653665e-17,-9.313225746154785e-09 +-7.6,1000,-658.1861949040662,False,True,True,0.0,0.04155633545063904,0.5061802522140944,0.5061803842714554,129,0.006405462324689067,-4.470348285257519e-09,3.141592653589793,1.3437297780631557e-09,4.020309162750561e-17,-9.313225746154785e-09 +-7.4,1000,-625.2701111472566,False,True,True,0.0,0.04247146099805829,0.506180444470912,0.5061803751465646,122,0.00640546530485159,2.9802322375888527e-09,3.141592653589793,7.848532022612027e-10,5.863106769738082e-17,5.587935447692871e-09 +-7.2,1000,-595.156697741887,False,True,True,0.0,0.13462854883115485,0.5061802559366653,0.506180385293762,139,0.006405462324620275,-4.09781932853519e-09,3.141592653589793,2.266814002281346e-09,1.7242840534194403e-16,-9.313225746154785e-09 +-7.0,1000,-559.2399000853422,False,True,True,0.0,0.053022294243671164,0.5061805512285581,0.5061803864910405,122,0.006405467912499886,3.352761210485511e-09,3.141592653589793,-2.323880697953047e-09,1.9772740817083726e-16,2.60770320892334e-09 +-6.8,1000,-522.3350449250493,False,True,True,0.0,0.08441661720114901,0.5061805586801605,0.5061803697011813,121,0.0064054679125257145,4.0978192979717416e-09,3.141592653589793,-8.576617763848341e-10,1.0029817847510777e-17,8.568167686462403e-09 
+-6.6,1000,-483.87970813467706,False,True,True,0.0,0.07383941484913693,0.50618042638891,0.5061803953087638,116,0.006405464187229628,5.587935410941486e-09,3.141592653589793,-1.3226284201823373e-09,2.9172186123122375e-17,1.4528632164001466e-08 +-6.4,1000,-451.1744511316095,False,True,True,0.0,0.03946676455801096,0.5061804077636681,0.506180384566695,115,0.006405464187260318,3.7252902953284414e-09,3.141592653589793,-9.8409755972754e-10,1.870080335827975e-17,8.568167686462403e-09 +-6.2,1000,-421.097059566619,False,True,True,0.0,0.03667968151633019,0.5061803965883934,0.5061803864862302,110,0.0064054641872754395,2.6077032202635384e-09,3.141592653589793,-1.5056039508283748e-10,6.07311399582496e-19,8.568167686462403e-09 +-6.0,1000,-393.369948767081,False,True,True,0.0,0.06817679957354064,0.5061805698588016,0.506180377219704,110,0.006405467912595753,5.215406461025484e-09,3.141592653589793,-3.6860620485368843e-10,1.1694471657639307e-18,2.60770320892334e-09 +-5.8,1000,-364.04858864795693,False,True,True,0.0,0.075887033381594,0.5061804040424062,0.5061803750048568,119,0.006405464187362259,3.3527613698168167e-09,3.141592653589793,-2.197439893554594e-09,1.7338269281592935e-16,8.568167686462403e-09 +-5.6,1000,-335.3431022191106,False,True,True,0.0,0.03924605280465396,0.5061802084180146,0.5061803733807063,101,0.006405460461986253,-1.4901161073024818e-09,3.141592653589793,-8.77249528658963e-10,5.266059630683777e-17,-3.7252902984619143e-10 +-5.4,1000,-302.3761139695096,False,True,True,0.0,0.09200249177380355,0.5061803405262472,0.5061803986969151,104,0.006405464559782868,-4.470348369443842e-09,3.141592653589793,1.3306512827697275e-09,3.9038901168305746e-17,-9.313225746154785e-09 +-5.2,1000,-277.0282485405479,False,True,True,0.0,0.12415254108719766,0.5061807358601147,0.5061804015053124,97,0.0064054712653762655,8.568167748854945e-09,3.141592653589793,-7.023939881671004e-11,3.0935198428979494e-17,2.60770320892334e-09 
+-5.0,1000,-253.34538357380555,False,True,True,0.0,0.06500004520135769,0.5061804629153279,0.5061803699929293,104,0.006405465677378239,3.3527612659925043e-09,3.141592653589793,-1.506490294211211e-09,6.858713132456484e-17,8.568167686462403e-09 +-4.8,1000,-228.16694756564556,False,True,True,0.0,0.10772977678864754,0.5061802821957172,0.5061803648395834,102,0.006405461952093397,4.235771473250809e-18,3.141592653589793,4.0712267663529695e-10,8.287443709190559e-18,2.60770320892334e-09 +-4.6,1000,-202.05313329684245,False,True,True,0.0,0.10713086387740417,0.5061802708366975,0.50618035910339,104,0.006405462324591758,-2.6077032361381567e-09,3.141592653589793,2.0589139206519515e-09,1.6166601977185795e-16,-9.313225746154785e-09 +-4.4,1000,-178.48973453658598,False,True,True,0.0,0.05589090341028235,0.5061801527187161,0.5061803805822367,96,0.006405460089409508,-5.587935484049264e-09,3.141592653589793,1.0810558522793573e-09,1.3637895702031216e-17,-9.313225746154785e-09 +-4.2,1000,-156.82530876191555,False,True,True,0.0,0.094823299127044,0.5061803265366773,0.5061803701470707,90,0.006405462697185687,1.4901161567212993e-09,3.141592653589793,-9.110644450829761e-10,2.903622574360237e-17,5.587935447692871e-09 +-4.0,1000,-137.28292256383494,False,True,True,0.0,0.07477891439308829,0.5061803636047144,0.5061803654341341,90,0.006405463069640598,3.7252902618034283e-09,3.141592653589793,-1.5246074871569554e-09,6.636423900967039e-17,1.1548399925231934e-08 +-3.8,1000,-120.51162534589803,False,True,True,0.0,0.10144800574149239,0.5061802078691471,0.5061803875009785,90,0.0064054615794958275,-5.9604645461169265e-09,3.141592653589793,9.86698064104386e-10,7.63043399882613e-18,-9.313225746154785e-09 +-3.6,1000,-104.84526668660968,False,True,True,0.0,0.05115880021514763,0.5061805510505252,0.5061803882465056,75,0.0064054682851279405,1.862645193836967e-09,3.141592653589793,-7.844756821413602e-10,1.7892829764610526e-17,2.60770320892334e-09 
+-3.4,1000,-89.33936616287062,False,True,True,0.0,0.04261085449329155,0.506180249034118,0.5061804326953788,78,0.006405461207089442,-3.7252896890006725e-10,3.141592653589793,9.293583954595985e-10,3.9792611031488666e-17,2.60770320892334e-09 +-3.2,1000,-73.90755510742989,False,True,True,0.0,0.0696559213101867,0.506180322265233,0.5061803914877907,69,0.006405463814764233,-3.352761238591493e-09,3.141592653589793,3.335966087869482e-09,4.502070130000057e-16,-6.3329935073852546e-09 +-3.0,1000,-60.81019570375669,False,True,True,0.0,0.06521689504107042,0.506180377780078,0.5061803978110034,80,0.006405464559806439,-7.450580461975857e-10,3.141592653589793,-9.072129977031429e-10,4.818859034476776e-17,2.60770320892334e-09 +-2.8,1000,-49.78016118315286,False,True,True,0.0,0.07443161307301864,0.5061802190497497,0.5061803885376264,86,0.006405461579615613,-4.842877331686013e-09,3.141592653589793,9.99121394774603e-10,1.3252684955568747e-17,-9.313225746154785e-09 +-2.6,1000,-40.610178918525335,False,True,True,0.0,0.06050944090379019,0.5061800789406613,0.5061803826392856,64,0.006405458599293588,-7.078051603828042e-09,3.141592653589793,2.4298792242992854e-09,1.4827695407726082e-16,-9.313225746154785e-09 +-2.4,1000,-32.902917938864434,False,True,True,0.0,0.04718623327704459,0.5061803896829364,0.5061803761711928,64,0.006405463069670578,6.332993500228213e-09,3.141592653589793,1.04845110628019e-09,1.4141422986452068e-16,1.1548399925231934e-08 +-2.2,1000,-26.247292491607038,False,True,True,0.0,0.12811471153720297,0.5061803962256087,0.5061803797200729,72,0.006405464932361293,-3.7252899054403214e-10,3.141592653589793,-1.0416060534617549e-09,5.819683196852094e-17,2.60770320892334e-09 +-2.0,1000,-20.41870802749699,False,True,True,0.0,0.04451615822436562,0.5061805847572153,0.5061803901554497,72,0.006405467912526148,6.705522508262946e-09,3.141592653589793,-1.685241785718106e-09,5.147974261448244e-17,5.587935447692871e-09 
+-1.8,1000,-15.312223032051941,False,True,True,0.0,0.059722977123875934,0.5061804375646054,0.5061803724919841,64,0.006405464187225117,6.705522497138897e-09,3.141592653589793,-7.037070322298489e-10,5.496198006222366e-20,1.4528632164001466e-08 +-1.6,1000,-11.070121326376672,False,True,True,0.0,0.09693197372717448,0.5061803779625097,0.5061803869228898,66,0.006405464187289837,7.450580852989141e-10,3.141592653589793,-1.3830135033043504e-09,8.56096193610469e-17,5.587935447692871e-09 +-1.4,1000,-7.596125440570532,False,True,True,0.0,0.12721944255953882,0.5061804044008309,0.5061803826463895,60,0.006405463442166021,6.332993466179699e-09,3.141592653589793,-6.011841779906064e-10,5.156920288365188e-20,1.4528632164001466e-08 +-1.2,1000,-4.964492453104809,False,True,True,0.0,0.09573942929391736,0.5061802449421218,0.5061803772030707,45,0.006405461952075808,-3.725290313170885e-09,3.141592653589793,1.0746569004840868e-10,3.512928744321728e-18,-3.3527612686157227e-09 +-1.0,1000,-3.1909419635062233,False,True,True,0.0,0.07161062210798277,0.5061803226284596,0.5061803873585208,38,0.006405463069689714,-3.7252901740235233e-10,3.141592653589793,-2.554527967711467e-10,4.283831296408895e-18,-3.7252902984619143e-10 +-0.8,1000,-2.041775523240173,False,True,True,0.0,0.03898874297738102,0.5061803928637026,0.5061803736719832,53,0.0064054641872906175,2.235174205492222e-09,3.141592653589793,-9.360623100763075e-10,2.5386010979895183e-17,5.587935447692871e-09 +-0.6,1000,-1.2661664357086386,False,True,True,0.0,0.04348688572645185,0.5061803593349434,0.5061803911885855,39,0.006405464187261628,-1.1175870924464786e-09,3.141592653589793,-1.1379804931435958e-09,7.80924036976462e-17,2.60770320892334e-09 +-0.4,1000,-0.9509936597230134,False,True,True,0.0,0.023212567997676382,0.5061804958974122,0.5061803751540379,33,0.006405466794972953,2.235174184477996e-09,3.141592653589793,-5.587399012436598e-10,5.618705648087311e-18,2.60770320892334e-09 
+-0.2,1000,-0.7434836884788546,False,True,True,0.0,0.05201352546127101,0.5061802311343896,0.5061803836739631,33,0.006405459716947406,3.72529033006736e-09,3.141592653589793,-6.049830900636273e-10,2.7017444329947324e-18,1.4528632164001466e-08 +0.2,1000,-0.7089548699592554,False,True,True,0.0,0.07992176505053844,0.5061802853746817,0.5061803785407057,29,0.006405463069667497,-4.097819338191227e-09,3.141592653589793,-2.5558292365238523e-11,9.476055626733445e-18,-9.313225746154785e-09 +0.4,1000,-0.9262348565369444,False,True,True,0.0,0.07340281337879744,0.5061801048369083,0.5061803701308746,36,0.0064054589718517685,-5.960464483716591e-09,3.141592653589793,3.771516562512385e-10,2.3957465013995297e-18,-9.313225746154785e-09 +0.6,1000,-1.291092027086773,False,True,True,0.0,0.06528790667653088,0.5061803373473284,0.5061803985447866,35,0.006405463442209814,-3.7252902652573987e-10,3.141592653589793,-2.7515847379568975e-09,3.8888076928953065e-16,8.568167686462403e-09 +0.8,1000,-2.0232725243750065,False,True,True,0.0,0.09619122369767205,0.5061801562641868,0.5061803889796581,35,0.006405460461992022,-6.705522519284098e-09,3.141592653589793,1.1283152120904625e-09,1.0477346385245527e-17,-9.313225746154785e-09 +1.0,1000,-3.249426494107614,False,True,True,0.0,0.05502944439649604,0.506180174890772,0.5061803632193332,28,0.006405460461995511,-4.842877366327935e-09,3.141592653589793,1.8514226927789852e-09,9.345289941362796e-17,-9.313225746154785e-09 +1.2,1000,-4.926484654512598,False,True,True,0.0,0.0568099461454934,0.5061805847580787,0.5061804116499549,56,0.006405467912547992,6.705522530277809e-09,3.141592653589793,1.4685628015349798e-09,2.2879066083687827e-16,2.60770320892334e-09 +1.4,1000,-7.243503594774466,False,True,True,0.0,0.059684390465311196,0.5061802006034344,0.5061803730898378,48,0.006405461207040993,-5.2154064090631e-09,3.141592653589793,2.914898657812987e-09,2.8640812986024495e-16,-9.313225746154785e-09 
+1.6,1000,-10.704069689877846,False,True,True,0.0,0.04834698158118269,0.5061805549561506,0.5061803780978102,48,0.006405467912558124,3.725290301739339e-09,3.141592653589793,5.681177268281424e-11,9.216676250708282e-18,8.568167686462403e-09 +1.8,1000,-15.021161373030642,False,True,True,0.0,0.15171412881844212,0.506180373690536,0.5061803811797553,62,0.006405465304855084,-4.097819324677683e-09,3.141592653589793,1.9199560131900285e-10,2.3715443009791715e-18,-1.2293457984924317e-08 +2.0,1000,-20.40921908557481,False,True,True,0.0,0.07448810226761332,0.5061801637123092,0.506180367192882,69,0.006405460461929869,-5.960464522383459e-09,3.141592653589793,5.2376481487397e-10,2.612317548143768e-19,-9.313225746154785e-09 +2.2,1000,-26.858552909293163,False,True,True,0.0,0.08846808770490033,0.5061802669317897,0.5061803742609381,67,0.0064054626971794554,-4.470348326161044e-09,3.141592653589793,-5.268220644177364e-11,1.2485855954876393e-17,-9.313225746154785e-09 +2.4,1000,-34.10956364757607,False,True,True,0.0,0.022351874770603217,0.5061802970965431,0.5061803898655771,53,0.0064054619520849026,1.4901161141245712e-09,3.141592653589793,6.635625868848167e-10,3.301384138455892e-17,-3.3527612686157227e-09 +2.6,1000,-43.12637743083727,False,True,True,0.0,0.022224791347980263,0.5061802594808459,0.5061803733737987,74,0.006405462697170225,-5.215406396288877e-09,3.141592653589793,2.819684850179424e-09,2.640733406152887e-16,-9.313225746154785e-09 +2.8,1000,-51.562579629710065,False,True,True,0.0,0.018906459212302967,0.50618046200496,0.5061803947278676,81,0.006405467540006873,-4.097819346798474e-09,3.141592653589793,-3.0833124433975087e-10,2.578432689459278e-17,-9.313225746154785e-09 +3.0,1000,-61.59754187854701,False,True,True,0.0,0.04183692112565104,0.5061807172336616,0.506180377809199,81,0.006405471265376376,6.705522598866922e-09,3.141592653589793,8.122248724120419e-10,1.0993140120519228e-16,2.60770320892334e-09 
+3.2,1000,-72.44771384996812,False,True,True,0.0,0.06477941596987213,0.5061802557557464,0.5061803823654366,74,0.0064054626971750345,-5.587935419564533e-09,3.141592653589793,1.9369413762899453e-09,9.496457265876749e-17,-9.313225746154785e-09 +3.4,1000,-84.84054554719033,False,True,True,0.0,0.13899047070914516,0.5061805515953285,0.5061803702917079,95,0.0064054671675145974,6.332993525384099e-09,3.141592653589793,-1.8161599302287996e-09,6.995795731220202e-17,1.1548399925231934e-08 +3.6,1000,-99.03821260210033,False,True,True,0.0,0.10907571151021335,0.5061804729978503,0.5061803745561013,80,0.006405467912509673,-4.470348404382391e-09,3.141592653589793,1.6467231776463866e-09,7.196260532683787e-17,-1.2293457984924317e-08 +3.8,1000,-114.62895827522934,False,True,True,0.0,0.06798906701257133,0.5061803666045842,0.5061803802936732,87,0.006405464559815971,-1.862645126844988e-09,3.141592653589793,2.3515530433872256e-11,1.324361560911492e-18,-9.313225746154785e-09 +4.0,1000,-132.17925516170808,False,True,True,0.0,0.04989305668532442,0.5061804003151354,0.5061803981103341,94,0.006405464187312162,2.9802322875081236e-09,3.141592653589793,-4.409969009665059e-10,1.0220735473318251e-18,2.60770320892334e-09 +4.2,1000,-149.79590326881618,False,True,True,0.0,0.06420953571796462,0.5061801672590882,0.5061804094421025,86,0.006405460834545489,-7.078051522863686e-09,3.141592653589793,-1.4648617313570162e-09,2.3602406935602784e-16,-9.313225746154785e-09 +4.4,1000,-169.10819531699298,False,True,True,0.0,0.06915686570342756,0.5061804075831603,0.5061803851568343,101,0.006405464559825649,2.235174214324116e-09,3.141592653589793,-4.685608694438834e-10,3.002314571171238e-18,-3.7252902984619143e-10 +4.6,1000,-190.51515540574488,False,True,True,0.0,0.06483929604291866,0.5061803000923462,0.5061803723462447,100,0.006405463442157123,-4.097819380454051e-09,3.141592653589793,-1.5768652569469423e-10,1.6101022868354686e-17,-9.313225746154785e-09 
+4.8,1000,-214.09044275559302,False,True,True,0.0,0.07360469274256216,0.5061801894261649,0.5061803782274984,111,0.00640546120700554,-6.332993535127815e-09,3.141592653589793,2.342901268521995e-09,1.4613693539290164e-16,-9.313225746154785e-09 +5.0,1000,-236.49471856370974,False,True,True,0.0,0.09860162085007293,0.5061802484881839,0.5061803702930571,110,0.006405462324673357,-4.8428773314692705e-09,3.141592653589793,3.2865034461598076e-10,1.2111498359442902e-18,-9.313225746154785e-09 +5.2,1000,-260.73834795555433,False,True,True,0.0,0.03738842042169099,0.5061802935533088,0.5061804026659423,106,0.0064054615795588465,2.607703208307686e-09,3.141592653589793,2.5421102594448817e-10,1.3260289374133263e-17,2.60770320892334e-09 +5.4,1000,-287.19830749389234,False,True,True,0.0,0.017613615840673433,0.5061802531210802,0.5061803695496717,87,0.0064054604619756435,2.9802322411090277e-09,3.141592653589793,-5.718673989285513e-10,3.749531605259033e-18,5.587935447692871e-09 +5.6,1000,-320.16434928532004,False,True,True,0.0,0.11729633075382247,0.5061801115586345,0.506180411946928,114,0.006405460461939276,-1.1175870930687242e-08,3.141592653589793,-1.3850123352920015e-09,3.131501949480866e-16,-1.825392246246338e-08 +5.8,1000,-355.6572080393701,False,True,True,0.0,0.04693236399618705,0.5061802527530244,0.5061804022187253,105,0.006405461206927917,-1.0740595898712059e-16,3.141592653589793,-1.612793810777009e-09,1.3005519553863715e-16,2.60770320892334e-09 +6.0,1000,-378.9248813757,False,True,True,0.0,0.08697120518641979,0.5061802160528165,0.506180395160026,112,0.006405460089515055,7.450581353757364e-10,3.141592653589793,-8.177655116217891e-10,2.7621748942757905e-17,2.60770320892334e-09 +6.2,1000,-404.52378383348787,False,True,True,0.0,0.06973174951792462,0.5061803896821424,0.5061803585073775,106,0.006405463069650484,6.332993478595197e-09,3.141592653589793,-1.132804685469061e-09,1.247527911505283e-17,1.4528632164001466e-08 
+6.4,1000,-444.2450983061969,False,True,True,0.0,0.034736638064925655,0.5061802449415832,0.5061803845658058,113,0.006405461952062179,-3.725290327388888e-09,3.141592653589793,-1.4988869354537192e-09,1.7510988629462403e-16,2.60770320892334e-09 +6.6,1000,-475.52406969162865,False,True,True,0.0,0.07511631210611806,0.5061805033474476,0.5061803957604245,115,0.00640546679495914,2.9802322303207554e-09,3.141592653589793,-5.304069803275809e-10,2.700110532858189e-18,2.60770320892334e-09 +6.8,1000,-505.6831233050859,False,True,True,0.0,0.04250633198888281,0.5061802449443455,0.5061803792767874,101,0.006405461952132075,-3.725290254593995e-09,3.141592653589793,8.014520417888978e-10,9.198747698436165e-18,-6.3329935073852546e-09 +7.0,1000,-536.3146492869728,False,True,True,0.0,0.019325640052557352,0.5061804517410001,0.5061803707358075,119,0.006405465677417313,2.2351742187317973e-09,3.141592653589793,1.8329518402686446e-10,8.274824814834698e-18,5.587935447692871e-09 +7.2,1000,-579.0225294284534,False,True,True,0.0,0.07847714162824887,0.5061803153576706,0.5061803826463298,118,0.006405462697106351,3.725289853210901e-10,3.141592653589793,-3.75765904185748e-10,5.729552750614198e-18,5.587935447692871e-09 +7.4,1000,-618.6891117750764,False,True,True,0.0,0.07144082876028292,0.5061804215702023,0.5061803786761969,119,0.006405466422359186,-3.72529038232621e-09,3.141592653589793,1.8473859890819444e-09,1.087601512694491e-16,-9.313225746154785e-09 +7.6,1000,-662.4141983457926,False,True,True,0.0,0.023522436618805542,0.5061801711623604,0.5061803936869775,119,0.006405460461916518,-5.215406478570683e-09,3.141592653589793,1.722405059099551e-09,7.210376671370831e-17,-9.313225746154785e-09 +7.8,1000,-713.8000696242717,False,True,True,0.0,0.09774070831011139,0.5061804854483976,0.5061803894289839,125,0.006405465304833477,7.078051548183195e-09,3.141592653589793,-2.2254760618224197e-10,1.1773744427030181e-17,1.4528632164001466e-08 
+8.0,1000,-764.1336533706678,False,True,True,0.0,0.026734437740757653,0.5061804517408183,0.5061803986985118,115,0.006405465677412712,2.2351742138692053e-09,3.141592653589793,-1.1409973977925358e-10,5.986114526926799e-19,2.60770320892334e-09 +8.2,1000,-805.0949741666458,False,True,True,0.0,0.048558789304353776,0.5061801206478683,0.5061803939870445,133,0.0064054571092100835,2.980232237310556e-09,3.141592653589793,-2.5143955306559857e-09,2.4561531014745363e-16,1.4528632164001466e-08 +8.4,1000,-848.7395825065809,False,True,True,0.0,0.11097865765295069,0.5061805366955694,0.5061803794153125,118,0.006405467167550156,4.842877446430854e-09,3.141592653589793,-1.6491464096718405e-09,6.784478547630207e-17,8.568167686462403e-09 +8.6,1000,-887.2162616604179,False,True,True,0.0,0.05851825904212463,0.5061807241362293,0.5061803975272646,125,0.006405472382908859,2.980232244133426e-09,3.141592653589793,-1.6680146020667287e-09,9.384381874204282e-17,2.60770320892334e-09 +8.8,1000,-928.8054947793502,False,True,True,0.0,0.07207121227676261,0.5061803000969053,0.5061803888360837,132,0.006405463442272476,-4.0978192557579124e-09,3.141592653589793,1.3655636634967372e-09,4.567593652678882e-17,-9.313225746154785e-09 +9.0,1000,-962.4101910101691,False,True,True,0.0,0.05091958865523381,0.5061803038214252,0.5061803913358177,142,0.006405463442253001,-3.7252902533800727e-09,3.141592653589793,4.702130691612331e-10,4.771086209033783e-19,-9.313225746154785e-09 +9.2,1000,-1000.7557220633695,False,True,True,0.0,0.07860302366492519,0.5061804774524495,0.5061803761784904,133,0.006405466422432433,1.8626451419723115e-09,3.141592653589793,-1.5670290239215962e-09,9.532553156130331e-17,5.587935447692871e-09 +9.4,1000,-1049.4634626690959,False,True,True,0.0,0.12451004198411238,0.506180208599964,0.5061803901569926,132,0.006405460089457573,1.2330935246921918e-17,3.141592653589793,-5.933499699860521e-10,1.7603209272071574e-17,5.587935447692871e-09 
+9.6,1000,-1101.5308714644862,False,True,True,0.0,0.01606287807226182,0.5061803779615363,0.5061803810342791,132,0.006405464187265208,7.450580589808775e-10,3.141592653589793,-5.706372529324489e-10,1.2307320638454357e-17,5.587935447692871e-09 +9.8,1000,-1140.1771830985397,False,True,True,0.0,0.07037546558387904,0.5061805845743531,0.5061803881048084,135,0.006405468285032058,5.215406365144191e-09,3.141592653589793,1.090824366816693e-09,1.299860452034071e-16,2.60770320892334e-09 +10.0,1000,-1188.108585510169,False,True,True,0.0,0.0724183656275269,0.5061803705096194,0.506180402817447,132,0.0064054641872313975,-3.236233469847761e-17,3.141592653589793,-2.0912073248736993e-09,2.1865740445138894e-16,8.568167686462403e-09 diff --git a/scripts/play_ppo.py b/scripts/play_ppo.py index a7dbbf2..f2ad02f 100644 --- a/scripts/play_ppo.py +++ b/scripts/play_ppo.py @@ -6,41 +6,240 @@ uv run python scripts/play_ppo.py --model-path models/ppo_AntiPendulumEnv.zip uv run python scripts/play_ppo.py --model-path models/ppo.zip --render-mode plot --episodes 3 + uv run python scripts/play_ppo.py --model-path models/ppo.zip --speed-sweep --render-mode none """ import argparse +import collections +import csv +import dataclasses import logging +import statistics +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np from crane_controller.crane_factory import build_crane from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv -from crane_controller.ppo_agent import ProximalPolicyOptimizationAgent +from crane_controller.experiment_config import load_training_sidecar +from crane_controller.ppo_agent import EpisodeResult, ProximalPolicyOptimizationAgent LOGGER = logging.getLogger(__name__) +_neg = [-round(s, 1) for s in np.arange(10.0, 0.0, -0.2)] +_pos = [round(s, 1) for s in np.arange(0.2, 10.2, 0.2)] +SWEEP_SPEEDS = _neg + _pos # 100 values: -10.0 … -0.2 +0.2 … +10.0 + + +def _save_sweep_png(results: list[EpisodeResult], stem: str, out_dir: Path) -> 
None: + """Save a 3×3 summary figure for a speed sweep — one subplot per metric.""" + buckets: dict[float, list[EpisodeResult]] = collections.defaultdict(list) + for r in results: + buckets[r.start_speed].append(r) + xs = sorted(buckets) + + nocrash_pct = [100.0 * sum(r.no_crash for r in buckets[sp]) / len(buckets[sp]) for sp in xs] + rew_per_step = [statistics.mean(r.ep_reward / r.ep_steps for r in buckets[sp]) for sp in xs] + energy_frac = [statistics.mean(r.energy_final / (0.5 * sp**2) for r in buckets[sp]) for sp in xs] + settle_step = [statistics.mean(r.t_min_settle_step for r in buckets[sp]) for sp in xs] + x_pos_m = [statistics.mean(r.x_pos_final for r in buckets[sp]) for sp in xs] + x_vel_f = [statistics.mean(r.x_vel_final for r in buckets[sp]) for sp in xs] + x_acc_f = [statistics.mean(r.acc_final for r in buckets[sp]) for sp in xs] + theta_f = [statistics.mean(r.theta_final for r in buckets[sp]) for sp in xs] + thdot_f = [statistics.mean(r.theta_dot_final for r in buckets[sp]) for sp in xs] + + panels: list[tuple[str, list[float]]] = [ + ("nocrash%↑", nocrash_pct), + ("rew/step↑", rew_per_step), + ("energy_frac↓", energy_frac), + ("settle_step↓", settle_step), + ("x_pos_m↓", x_pos_m), + ("x_vel_f", x_vel_f), + ("x_acc_f", x_acc_f), + ("theta_f", theta_f), + ("thdot_f↓", thdot_f), + ] + + fig, axes = plt.subplots(3, 3, figsize=(15, 10)) + for ax, (title, ys) in zip(axes.flat, panels, strict=True): + ax.plot(xs, ys, "o-", linewidth=1.5, markersize=5) + ax.axvline(0, color="gray", linestyle="--", linewidth=0.8) + ax.set_title(title, fontsize=10) + ax.set_xlabel("start_speed (m/s)", fontsize=8) + ax.tick_params(labelsize=8) + ax.grid(visible=True, alpha=0.3) + + fig.suptitle(stem, fontsize=12) # pyright: ignore[reportAttributeAccessIssue] + fig.tight_layout() + png_path = out_dir / f"{stem}_sweep.png" + fig.savefig(str(png_path), dpi=150, bbox_inches="tight") + plt.close(fig) + LOGGER.info("Sweep PNG: %s", png_path) + + +def _log_sweep_table(results: 
def main() -> None:
    """Parse CLI arguments and run a trained PPO agent.

    The model sidecar is loaded before the full argument parser is built so
    that its values can serve as defaults for ``--randomize-start``,
    ``--continuous-actions`` and ``--max-episode-steps``.  Episodes are run for
    ``--start-speed`` or, with ``--speed-sweep``, for every speed in
    ``SWEEP_SPEEDS``.  Per-episode PNGs, a results CSV and (for sweeps) a
    summary table/figure are written next to the model file.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    # Pre-parse --model-path so the sidecar can seed argument defaults.
    # The flag is intentionally NOT required here: a required flag would make
    # this pre-parser exit with a usage error on a bare ``--help`` before the
    # real parser gets a chance to print its help text.  The full parser below
    # still enforces that --model-path is present.
    pre_parser = argparse.ArgumentParser(add_help=False)
    _ = pre_parser.add_argument("--model-path", type=str, default=None)
    pre_args, _ = pre_parser.parse_known_args()
    config = load_training_sidecar(pre_args.model_path) if pre_args.model_path is not None else None
    # ``None`` when no model path was given; getattr() below then yields ``None`` defaults.
    sidecar_training = config.training if config is not None else None

    parser = argparse.ArgumentParser(description="Run a trained PPO agent on the crane anti-pendulum task.")
    _ = parser.add_argument("--model-path", type=str, required=True, help="Path to a trained .zip model")
    _ = parser.add_argument("--render-mode", type=str, default="play-back", help="Render mode for playback")
    _ = parser.add_argument("--episodes", type=int, default=1, help="Number of episodes to run per speed")
    _ = parser.add_argument(
        "--randomize-start",
        action=argparse.BooleanOptionalAction,
        default=getattr(sidecar_training, "randomize_start", None),
        help="Randomise initial pendulum speed each episode (default from model sidecar).",
    )
    _ = parser.add_argument(
        "--start-speed",
        type=float,
        default=1.0,
        help="Initial pendulum speed for playback (default 1.0). Ignored when --speed-sweep is set.",
    )
    # BooleanOptionalAction derives the ``--no-…`` spelling itself; listing it
    # explicitly would additionally register a confusing ``--no-no-…`` option.
    _ = parser.add_argument(
        "--continuous-actions",
        action=argparse.BooleanOptionalAction,
        default=getattr(sidecar_training, "continuous_actions", None),
        help="Use Box(-1,1) action space (default from model sidecar).",
    )
    _ = parser.add_argument(
        "--max-episode-steps",
        type=int,
        default=None,
        help=(
            "Override TimeLimit for play episodes (default: value from model sidecar). "
            "Pass 3000 when playing old pre-trained models that have no max_episode_steps "
            "in their sidecar (those default to 100 otherwise)."
        ),
    )
    _ = parser.add_argument(
        "--save-png",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Save 7-panel trajectory plot per episode alongside the model (default True).",
    )
    _ = parser.add_argument(
        "--save-csv",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Write per-episode metrics to {stem}_play_results.csv alongside the model (default True).",
    )
    _ = parser.add_argument(
        "--speed-sweep",
        action="store_true",
        default=False,
        help=f"Run over speeds {SWEEP_SPEEDS} instead of --start-speed.",
    )
    args = parser.parse_args()

    if config is None:
        # Defensive: parse_args() has already rejected a missing --model-path,
        # so this only triggers if the two parsers ever disagree.
        parser.error("the following arguments are required: --model-path")

    # An explicit --max-episode-steps wins over the value stored in the sidecar.
    mep = args.max_episode_steps if args.max_episode_steps is not None else config.training.max_episode_steps
    speeds = SWEEP_SPEEDS if args.speed_sweep else [args.start_speed]

    agent = ProximalPolicyOptimizationAgent.load(
        AntiPendulumEnv,
        model_path=args.model_path,
        env_kwargs={
            "crane": build_crane,
            "start_speed": speeds[0],
            "randomize_start": args.randomize_start,
            "render_mode": args.render_mode,
            "reward_fac": config.reward,
            "rail_limit": config.training.rail_limit,
            "reward_limit": config.training.reward_limit,
            "continuous_actions": args.continuous_actions,
        },
        max_episode_steps=mep,
    )

    model_path = Path(args.model_path)
    stem = model_path.stem
    all_results: list[EpisodeResult] = []

    for speed in speeds:
        # The environment is built once; the start speed is switched in place per sweep point.
        agent.env.unwrapped.start_speed = speed  # type: ignore[attr-defined]
        for episode in range(args.episodes):
            LOGGER.info("Episode %s/%s speed=%+.1f", episode + 1, args.episodes, speed)
            png_path: str | None = None
            if args.save_png:
                png_path = str(model_path.parent / f"{stem}_play_ss{speed:+.1f}_ep{episode + 1}.png")
            result = agent.do_one_episode(seed=episode + 1, save_png=png_path)
            result.start_speed = speed  # override wire-CM value with the explicitly set speed
            LOGGER.info(
                " steps=%d rew=%.2f no_crash=%s t_min=[%.2f→%.2f@%d] x_pos=%+.4fm theta=%.3f",
                result.ep_steps,
                result.ep_reward,
                result.no_crash,
                result.t_min_start,
                result.t_min_final,
                result.t_min_settle_step,
                result.x_pos_final,
                result.theta_final,
            )
            if png_path is not None:
                LOGGER.info(" PNG: %s", png_path)
            all_results.append(result)

    if args.save_csv and all_results:
        csv_path = model_path.parent / f"{stem}_play_results.csv"
        fieldnames = [f.name for f in dataclasses.fields(EpisodeResult)]
        with csv_path.open("w", newline="") as fh:
            writer = csv.DictWriter(fh, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(dataclasses.asdict(r) for r in all_results)
        LOGGER.info("Play CSV: %s", csv_path)

    if args.speed_sweep and all_results:
        _log_sweep_table(all_results, stem, args.model_path)
def _load_csv(path: str) -> dict[str, list[float]]:
    """Parse a ``*_play_results.csv`` file into numeric columns.

    Every cell becomes a float: the literal strings ``True`` / ``False`` map to
    ``1.0`` / ``0.0``, anything ``float()`` accepts is converted, and all other
    cells (including missing ones) become NaN.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    dict[str, list[float]]
        Column name → values in file row order.
    """

    def to_number(cell: object) -> float:
        """Convert a single CSV cell according to the rules above."""
        if cell == "True":
            return 1.0
        if cell == "False":
            return 0.0
        try:
            return float(cell)  # type: ignore[arg-type]
        except (ValueError, TypeError):
            return float("nan")

    columns: dict[str, list[float]] = {}
    with Path(path).open(newline="") as handle:
        reader = csv.DictReader(handle)
        for record in reader:
            for name, cell in record.items():
                if name not in columns:
                    columns[name] = []
                columns[name].append(to_number(cell))
    return columns
def _plot_detail(label: str, cols: dict[str, list[float]], out_path: str) -> None:
    """9-panel single-model detail figure matching play_ppo.py's _save_sweep_png layout.

    Each panel plots one per-episode quantity against ``start_speed``.  Columns
    that are absent from *cols* are drawn as empty (all-NaN) panels instead of
    raising, so CSVs lacking some columns can still be plotted.

    Parameters
    ----------
    label : str
        Figure title (normally the model stem).
    cols : dict[str, list[float]]
        Column name → per-episode values, as returned by ``_load_csv``.
    out_path : str
        Destination PNG path.
    """
    nan = float("nan")
    n_rows = max((len(values) for values in cols.values()), default=0)

    def column(name: str) -> list[float]:
        """Return column *name* padded with NaN to ``n_rows`` (all NaN when missing)."""
        values = list(cols.get(name, []))
        return values + [nan] * (n_rows - len(values))

    speeds = column("start_speed")
    no_crash_pct = [v * 100.0 for v in column("no_crash")]

    # Reward per step; NaN when the step count is zero or unknown.
    rew_per_step = [
        r / s if s and not math.isnan(s) else nan
        for r, s in zip(column("ep_reward"), column("ep_steps"), strict=True)
    ]

    # Final energy relative to 0.5 * speed**2; undefined (NaN) for a zero start speed.
    energy_frac = [
        e / (0.5 * sp * sp) if abs(sp) > _ZERO_SPEED_THRESHOLD else nan
        for e, sp in zip(column("energy_final"), speeds, strict=True)
    ]

    panels: list[tuple[str, list[float]]] = [
        ("nocrash% ↑", no_crash_pct),
        ("rew/step ↑", rew_per_step),
        ("energy_frac ↓", energy_frac),
        ("settle_step ↓", column("t_min_settle_step")),
        ("x_pos_m ↓", column("x_pos_final")),
        ("x_vel_f", column("x_vel_final")),
        ("x_acc_f", column("acc_final")),
        ("theta_f", column("theta_final")),
        ("thdot_f ↓", column("theta_dot_final")),
    ]

    fig, axes = plt.subplots(3, 3, figsize=(15, 10))
    fig.suptitle(label, fontsize=12)

    for ax, (title, ys) in zip(axes.flat, panels, strict=True):
        ax.plot(speeds, ys, linewidth=1.2, marker=".", markersize=2, alpha=0.8)
        ax.set_title(title, fontsize=9)
        ax.grid(visible=True, alpha=0.3)
        ax.tick_params(labelsize=7)
        ax.set_xlabel("start speed (m/s)", fontsize=7)

    fig.tight_layout()
    fig.savefig(out_path, dpi=150, bbox_inches="tight")
    plt.close(fig)
    LOGGER.info("Detail PNG: %s", out_path)
def _load_csv(path: str) -> tuple[list[float], dict[str, list[float]]]:
    """Read a training log CSV; return (timesteps, {column: values}) with NaN for missing entries.

    Rows whose ``t`` cell is absent or not numeric are skipped entirely, so
    every returned column has exactly one value per returned timestep.
    Ragged rows are tolerated: a cell missing from a short row becomes NaN and
    surplus cells beyond the header are ignored.

    Parameters
    ----------
    path : str
        Location of the ``*_log.csv`` file.

    Returns
    -------
    tuple[list[float], dict[str, list[float]]]
        The ``t`` column, and every other column keyed by header name.
    """
    ts: list[float] = []
    cols: dict[str, list[float]] = {}
    with Path(path).open(newline="") as f:
        for row in csv.DictReader(f):
            try:
                # DictReader yields None for cells missing from a short row,
                # hence TypeError must be handled alongside ValueError.
                t = float(row["t"])
            except (KeyError, TypeError, ValueError):
                continue
            ts.append(t)
            for key, val in row.items():
                # Surplus cells of an over-long row are collected under the key None.
                if key is None or key == "t":
                    continue
                try:
                    number = float(val)
                except (TypeError, ValueError):
                    number = float("nan")
                cols.setdefault(key, []).append(number)
    return ts, cols
produce the training curve PNG.""" + logging.basicConfig(level=logging.INFO, format="%(message)s") + + parser = argparse.ArgumentParser(description="Plot training curves from *_log.csv files.") + parser.add_argument("csvs", nargs="+", help="One or more training log CSV paths") + parser.add_argument( + "--output", + type=str, + default=None, + help="Output PNG path (default: first CSV stem + _training.png in the same directory)", + ) + args = parser.parse_args() + + datasets: list[tuple[str, list[float], dict[str, list[float]]]] = [] + for csv_path in args.csvs: + stem = Path(csv_path).stem + label = stem.removesuffix("_log") + ts, cols = _load_csv(csv_path) + datasets.append((label, ts, cols)) + LOGGER.info("Loaded %s (%d rows)", label, len(ts)) + + fig, axes = plt.subplots(4, 3, figsize=(15, 12)) + fmt = mticker.FuncFormatter(lambda x, _: f"{x:,.0f}") + + linestyles = ["-", "--", "-.", ":"] + for ax, (title, col_key) in zip(axes.flat, PANELS, strict=True): + for i, (label, ts, cols) in enumerate(datasets): + ys = cols.get(col_key, [float("nan")] * len(ts)) + ax.plot(ts, ys, linewidth=1.5, label=label, linestyle=linestyles[i % len(linestyles)]) + ax.set_title(title, fontsize=10) + ax.set_xlabel("steps", fontsize=8) + ax.xaxis.set_major_formatter(fmt) + ax.tick_params(labelsize=7) + ax.grid(visible=True, alpha=0.3) + + if len(datasets) > 1: + handles, labels = axes.flat[0].get_legend_handles_labels() + fig.legend(handles, labels, loc="upper right", fontsize=9, framealpha=0.8) + + suptitle = ", ".join(d[0] for d in datasets) + fig.suptitle(suptitle, fontsize=11) # pyright: ignore[reportAttributeAccessIssue] + fig.tight_layout() + + if args.output: + out_path = args.output + else: + first_stem = Path(args.csvs[0]).stem.removesuffix("_log") + out_path = str(Path(args.csvs[0]).parent / f"{first_stem}_training.png") + + fig.savefig(out_path, dpi=150, bbox_inches="tight") + plt.close(fig) + LOGGER.info("Training PNG: %s", out_path) + + +if __name__ == "__main__": + main() 
diff --git a/scripts/train_ppo.py b/scripts/train_ppo.py index dcef917..0e99ed8 100644 --- a/scripts/train_ppo.py +++ b/scripts/train_ppo.py @@ -5,6 +5,9 @@ .. code-block:: bash uv run python scripts/train_ppo.py + uv run python scripts/train_ppo.py --config experiments/baseline.yaml + uv run python scripts/train_ppo.py --config experiments/baseline.yaml --steps 500000 + uv run python scripts/train_ppo.py --reward-fac 1.0 0.0015 0.0 0.005 0.01 uv run python scripts/train_ppo.py --steps 500000 --n-envs 8 --save-path models/ppo.zip uv run python scripts/train_ppo.py --resume-from models/ppo_AntiPendulumEnv.zip --steps 50000 uv run python scripts/train_ppo.py --dry-run @@ -16,22 +19,44 @@ from crane_controller.crane_factory import build_crane from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv +from crane_controller.experiment_config import ( + ExperimentConfig, + RewardConfig, + TrainingConfig, + load_experiment_config, + load_training_sidecar, + save_training_sidecar, +) from crane_controller.ppo_agent import ProximalPolicyOptimizationAgent LOGGER = logging.getLogger(__name__) -def main() -> None: +def main() -> None: # noqa: PLR0915 """Parse CLI arguments and train a PPO agent.""" logging.basicConfig(level=logging.INFO, format="%(message)s") + + # Pre-parse --config before the main parser so YAML values can seed defaults. 
+ pre_parser = argparse.ArgumentParser(add_help=False) + _ = pre_parser.add_argument("--config", type=str, default=None) + pre_args, _ = pre_parser.parse_known_args() + config = load_experiment_config(pre_args.config) + parser = argparse.ArgumentParser(description="Train a PPO agent on the crane anti-pendulum task.") - _ = parser.add_argument("--steps", type=int, default=100_000, help="Total training timesteps") - _ = parser.add_argument("--n-envs", type=int, default=4, help="Number of parallel environments") + _ = parser.add_argument( + "--steps", + type=int, + default=config.training.steps, + help="Timesteps for this run (default from --config or 100 000). Does not cap total across resumes.", + ) + _ = parser.add_argument( + "--n-envs", type=int, default=config.training.n_envs, help="Number of parallel environments" + ) _ = parser.add_argument("--render-mode", type=str, default="none", help="Render mode during training") _ = parser.add_argument( "--save-path", type=str, - default="models/ppo_AntiPendulumEnv.zip", + default=config.training.save_path, help="Where to save the trained model", ) _ = parser.add_argument( @@ -43,7 +68,7 @@ def main() -> None: _ = parser.add_argument( "--gamma", type=float, - default=0.99, + default=config.training.gamma, help="Discount factor for future rewards (default 0.99). 
Try 0.999 for longer planning horizon.", ) _ = parser.add_argument( @@ -51,8 +76,109 @@ def main() -> None: action="store_true", help="Run 1000 steps with live reward-tracking plot, without saving the model.", ) + _ = parser.add_argument( + "--config", + type=str, + default=None, + metavar="PATH", + help="Path to a YAML experiment config file.", + ) + _ = parser.add_argument( + "--reward-fac", + type=float, + nargs=5, + default=None, + metavar=("ENERGY", "POSITIONAL", "TIME", "POSITION", "ACCELERATION"), + help="Override all five reward weights (beats --config).", + ) + _ = parser.add_argument( + "--seed", + type=int, + default=None, + help="Random seed for PPO initialisation. Omit for non-deterministic training.", + ) + _ = parser.add_argument( + "--ent-coef", + type=float, + default=config.training.ent_coef, + help="Entropy bonus coefficient (default 0.0). Try 0.005-0.01 to reduce seed sensitivity.", + ) + _ = parser.add_argument( + "--learning-rate", + type=float, + default=config.training.learning_rate, + help="Adam learning rate (default 3e-4).", + ) + _ = parser.add_argument( + "--clip-range", + type=float, + default=config.training.clip_range, + help="PPO clipping parameter (default 0.2). Lower = more conservative updates.", + ) + _ = parser.add_argument( + "--n-steps", + type=int, + default=config.training.n_steps, + help="Timesteps per env before each gradient update (default 2048). Try 8192 for more stable gradients.", + ) + _ = parser.add_argument( + "--randomize-start", + action="store_true", + default=config.training.randomize_start, + help="Randomise initial pendulum speed each episode (default False).", + ) + _ = parser.add_argument( + "--rail-limit", + type=float, + default=config.training.rail_limit, + help="Half-span of the crane rail in metres (default 10.0). " + "Reduce to e.g. 
2.0 for earlier truncation and tighter credit assignment.", + ) + _ = parser.add_argument( + "--start-speed", + type=float, + default=config.training.start_speed, + help="Initial pendulum speed (default 1.0). Upper bound of training range when --randomize-start is set.", + ) + _ = parser.add_argument( + "--continuous-actions", + "--no-continuous-actions", + action=argparse.BooleanOptionalAction, + default=config.training.continuous_actions, + help="Use Box(-1,1) action space for PPO (default True). Pass --no-continuous-actions for Discrete(3).", + ) + _ = parser.add_argument( + "--max-episode-steps", + type=int, + default=config.training.max_episode_steps, + help="TimeLimit cap per episode (default 1000).", + ) args = parser.parse_args() + # Resolve final reward config: explicit --reward-fac beats loaded YAML/defaults. + reward_config = RewardConfig(*args.reward_fac) if args.reward_fac is not None else config.reward + + experiment_config = ExperimentConfig( + reward=reward_config, + training=TrainingConfig( + steps=args.steps, + n_envs=args.n_envs, + gamma=args.gamma, + save_path=args.save_path, + seed=args.seed, + ent_coef=args.ent_coef, + learning_rate=args.learning_rate, + clip_range=args.clip_range, + n_steps=args.n_steps, + randomize_start=args.randomize_start, + rail_limit=args.rail_limit, + start_speed=args.start_speed, + continuous_actions=args.continuous_actions, + max_episode_steps=args.max_episode_steps, + ), + config_source=pre_args.config, + ) + if args.dry_run: agent = ProximalPolicyOptimizationAgent( AntiPendulumEnv, @@ -65,20 +191,59 @@ def main() -> None: ) agent.do_training(1000, progress_bar=False) elif args.resume_from: + # Warn when --steps overrides a YAML value on resume — the cap is not enforced. + if pre_args.config is not None and args.steps != config.training.steps: + LOGGER.warning( + "--steps %d overrides the experiment config value (%d). 
" + "Training will run %d steps from the checkpoint — not a cumulative cap.", + args.steps, + config.training.steps, + args.steps, + ) + + # Priority: --reward-fac > --config > sidecar from checkpoint > defaults + if args.reward_fac is not None: + resume_reward = RewardConfig(*args.reward_fac) + elif pre_args.config is not None: + resume_reward = config.reward + else: + try: + resume_reward = load_training_sidecar(args.resume_from).reward + except FileNotFoundError: + LOGGER.warning("No sidecar found for %s; using config defaults.", args.resume_from) + resume_reward = config.reward + resume_config = ExperimentConfig( + reward=resume_reward, + training=TrainingConfig( + steps=args.steps, + n_envs=args.n_envs, + gamma=args.gamma, + save_path=args.save_path, + continuous_actions=args.continuous_actions, + ), + config_source=pre_args.config, + ) Path(args.save_path).parent.mkdir(parents=True, exist_ok=True) agent = ProximalPolicyOptimizationAgent.resume( AntiPendulumEnv, model_path=args.resume_from, env_kwargs={ "crane": build_crane, - "start_speed": 1.0, + "start_speed": args.start_speed, + "randomize_start": args.randomize_start, "render_mode": args.render_mode, - "reward_fac": (1.0, 0.0015, 0.0), + "reward_fac": resume_config.reward, + "rail_limit": args.rail_limit, + "reward_limit": resume_config.training.reward_limit, + "continuous_actions": args.continuous_actions, }, save_path=args.save_path, n_envs=args.n_envs, + max_episode_steps=resume_config.training.max_episode_steps, ) - agent.do_training(args.steps, reset_num_timesteps=False) + csv_path = str(Path(args.save_path).with_name(Path(args.save_path).stem + "_log.csv")) + agent.do_training(args.steps, reset_num_timesteps=False, csv_path=csv_path) + _ = save_training_sidecar(args.save_path, resume_config) vecnorm_path = Path(args.save_path).parent / f"{Path(args.save_path).stem}_vecnorm.pkl" LOGGER.info("Model saved to %s", args.save_path) LOGGER.info("VecNormalize stats saved to %s", vecnorm_path) @@ -89,14 
class EpRewardLogCallback(BaseCallback):
    """Logs episode stats and PPO diagnostics via tqdm.write() every log_interval steps.

    Prints one line per interval with three ``|``-separated families::

        [ 600,192/3,000,000] ep_len_mean↑=100 rew/step↑=-0.046
        | kl↓=0.0163 expl_var↑=0.346 value_loss↓=0.041 entropy~=-1.188 clip_frac↓=0.175
        | rail_hit%↓=12% t_min↓=0.820s |x|↓=0.019m |xv|↓=0.001 E↓=0.0001 |ω|↓=0.001 |θ-π|↓=0.002

    - Family 1: policy performance (ep_len_mean, rew/step)
    - Family 2: PPO diagnostics (kl, expl_var, value_loss, entropy, clip_frac)
    - Family 3: task quality — rail crash rate plus mean physical end-states for survived episodes:
        - rail_hit%  truncated before max_episode_steps (crane hit the rail)
        - t_min      mean minimum-time-to-stop at episode end (s)
        - |x|        mean absolute crane position at episode end (m)
        - |xv|       mean absolute crane velocity at episode end
        - E          mean load kinetic energy at episode end
        - |ω|        mean absolute load angular velocity at episode end
        - |θ-π|      mean absolute deviation of the reported ``theta`` from π at episode end

    Family 3 values are read from the ``info`` dict of finished episodes
    (keys ``steps``, ``t_min``, ``x_pos``, ``x_vel``, ``energy``, ``theta_dot``,
    ``theta``); a metric with no samples in the interval is printed as ``---``.

    If *csv_path* is given, all rows are written to a CSV file at the end of
    training for post-training analysis and plotting.  The CSV also includes
    ``policy_gradient_loss`` which is omitted from the terminal line.

    Parameters
    ----------
    total_timesteps : int
        Total training timesteps (used for the progress label).
    log_interval : int
        Minimum timesteps between log lines (default 50 000).
    csv_path : str or None
        Path to write a CSV log file at the end of training (default None).
    max_episode_steps : int
        TimeLimit cap passed to the environment (default 1000).  Used to
        distinguish rail hits (ep_len < max_episode_steps) from survived
        episodes (ep_len >= max_episode_steps).
    """

    def __init__(
        self,
        total_timesteps: int,
        log_interval: int = 50_000,
        csv_path: str | None = None,
        max_episode_steps: int = 1000,
    ) -> None:
        super().__init__(verbose=0)  # pyright: ignore[reportCallIssue]
        self._total = total_timesteps
        self._log_interval = log_interval
        self._last_log: int = 0
        self._csv_path = csv_path
        self._max_episode_steps = max_episode_steps
        self._rows: list[dict[str, float]] = []
        self._reset_interval_counters()

    def _reset_interval_counters(self) -> None:
        """Zero the per-interval episode accumulators (at construction and after each log line)."""
        self._ep_count: int = 0
        self._rail_hits: int = 0
        self._surv_t_min_sum: float = 0.0
        self._surv_t_min_n: int = 0
        self._surv_x_pos_sum: float = 0.0
        self._surv_x_pos_n: int = 0
        self._surv_x_vel_sum: float = 0.0
        self._surv_x_vel_n: int = 0
        self._surv_energy_sum: float = 0.0
        self._surv_energy_n: int = 0
        self._surv_theta_dot_sum: float = 0.0
        self._surv_theta_dot_n: int = 0
        self._surv_theta_dev_sum: float = 0.0
        self._surv_theta_dev_n: int = 0

    @staticmethod
    def _mean(total: float, count: int) -> float:
        """Return ``total / count``, or NaN when there are no samples."""
        return total / count if count > 0 else float("nan")

    @staticmethod
    def _fmt(value: float, spec: str, unit: str = "") -> str:
        """Format *value* with *spec* and *unit*; NaN is rendered as ``---``."""
        return "---" if np.isnan(value) else f"{value:{spec}}{unit}"

    def _diag(self, key: str) -> float | None:
        """Read a value from SB3's internal logger; returns None if not yet available."""
        try:
            val = self.model.logger.name_to_value.get(key)  # pyright: ignore[reportAttributeAccessIssue,reportUnknownMemberType]
            return float(val) if val is not None else None
        except AttributeError:
            return None

    def _record_episode(self, info: dict[str, Any]) -> None:
        """Fold one finished episode's ``info`` dict into the interval accumulators."""
        self._ep_count += 1
        ep_steps = int(info.get("steps", 0))
        if ep_steps < self._max_episode_steps:
            # Ended before the time limit: counted as a rail hit, end state not averaged.
            self._rail_hits += 1
            return

        t_min = info.get("t_min")
        if t_min is not None:
            self._surv_t_min_sum += float(t_min)
            self._surv_t_min_n += 1
        x_pos = info.get("x_pos")
        if x_pos is not None:
            self._surv_x_pos_sum += abs(float(x_pos))
            self._surv_x_pos_n += 1
        x_vel = info.get("x_vel")
        if x_vel is not None:
            self._surv_x_vel_sum += abs(float(x_vel))
            self._surv_x_vel_n += 1
        energy = info.get("energy")
        if energy is not None:
            self._surv_energy_sum += float(energy)
            self._surv_energy_n += 1
        theta_dot = info.get("theta_dot")
        if theta_dot is not None:
            self._surv_theta_dot_sum += abs(float(theta_dot))
            self._surv_theta_dot_n += 1
        theta = info.get("theta")
        if theta is not None:
            self._surv_theta_dev_sum += abs(float(theta) - np.pi)
            self._surv_theta_dev_n += 1

    @staticmethod
    def _diagnostics_segment(
        kl: float | None,
        ev: float | None,
        vl: float | None,
        ent: float | None,
        clip: float | None,
    ) -> str:
        """Build the Family 2 segment of the log line; empty when no diagnostic is available yet."""
        parts: list[str] = []
        if kl is not None:
            parts.append(f"kl↓={kl:.4f}")  # lower is healthier (<0.02 OK)
        if ev is not None:
            parts.append(f"expl_var↑={ev:.3f}")  # higher is healthier (>0.5 OK)
        if vl is not None:
            parts.append(f"value_loss↓={vl:.4f}")  # lower is healthier (<0.1 OK)
        if ent is not None:
            parts.append(f"entropy~={ent:.3f}")  # decays toward 0 over training
        if clip is not None:
            parts.append(f"clip_frac↓={clip:.3f}")  # lower is healthier (<0.15 OK)
        return " | " + " ".join(parts) if parts else ""

    def _on_step(self) -> bool:
        """Accumulate finished episodes and emit a log line once per ``log_interval`` timesteps."""
        _locals: dict[str, Any] = self.locals  # pyright: ignore[reportAttributeAccessIssue,reportUnknownMemberType]
        dones = _locals.get("dones")
        infos = _locals.get("infos")
        if dones is not None and infos is not None:
            for done, info in zip(dones, infos, strict=False):
                if done:
                    self._record_episode(info)

        t: int = self.num_timesteps  # pyright: ignore[reportAttributeAccessIssue,reportUnknownMemberType]
        buf = self.model.ep_info_buffer  # pyright: ignore[reportAttributeAccessIssue,reportUnknownMemberType]
        if t - self._last_log < self._log_interval or buf is None or len(buf) == 0:
            return True

        nan = float("nan")
        mean_rew = float(np.mean([ep["r"] for ep in buf]))
        mean_len = float(np.mean([ep["l"] for ep in buf]))
        rew_per_step = mean_rew / mean_len if mean_len else nan
        kl = self._diag("train/approx_kl")
        ev = self._diag("train/explained_variance")
        vl = self._diag("train/value_loss")
        ent = self._diag("train/entropy_loss")
        clip = self._diag("train/clip_fraction")
        pgl = self._diag("train/policy_gradient_loss")

        # Family 1
        line = f"[{t:>8,}/{self._total:,}] ep_len_mean↑={mean_len:.0f} rew/step↑={rew_per_step:+.3f}"

        # Family 2
        line += self._diagnostics_segment(kl, ev, vl, ent, clip)

        # Family 3 — physical end-state means for survived episodes
        t_min_m = self._mean(self._surv_t_min_sum, self._surv_t_min_n)
        x_pos_m = self._mean(self._surv_x_pos_sum, self._surv_x_pos_n)
        x_vel_m = self._mean(self._surv_x_vel_sum, self._surv_x_vel_n)
        energy_m = self._mean(self._surv_energy_sum, self._surv_energy_n)
        theta_dot_m = self._mean(self._surv_theta_dot_sum, self._surv_theta_dot_n)
        theta_dev_m = self._mean(self._surv_theta_dev_sum, self._surv_theta_dev_n)
        rail_pct = 100.0 * self._rail_hits / self._ep_count if self._ep_count > 0 else nan

        if self._ep_count > 0:
            line += (
                f" | rail_hit%↓={rail_pct:.0f}%"
                f" t_min↓={self._fmt(t_min_m, '.3f', 's')}"
                f" |x|↓={self._fmt(x_pos_m, '.4f', 'm')}"
                f" |xv|↓={self._fmt(x_vel_m, '.4f')}"
                f" E↓={self._fmt(energy_m, '.4f')}"
                f" |ω|↓={self._fmt(theta_dot_m, '.4f')}"
                f" |θ-π|↓={self._fmt(theta_dev_m, '.4f')}"
            )

        tqdm.write(line)

        self._rows.append(
            {
                "t": float(t),
                "ep_len_mean": mean_len,
                "rew_per_step": rew_per_step,
                "approx_kl": kl if kl is not None else nan,
                "explained_variance": ev if ev is not None else nan,
                "value_loss": vl if vl is not None else nan,
                "entropy_loss": ent if ent is not None else nan,
                "clip_fraction": clip if clip is not None else nan,
                "policy_gradient_loss": pgl if pgl is not None else nan,
                "rail_hit_pct": rail_pct,
                "mean_t_min": t_min_m,
                "mean_x_pos_abs": x_pos_m,
                "mean_x_vel_abs": x_vel_m,
                "mean_energy": energy_m,
                "mean_theta_dot_abs": theta_dot_m,
                "mean_theta_dev": theta_dev_m,
            }
        )

        # Start a fresh accumulation window for the next log line.
        self._reset_interval_counters()
        self._last_log = t
        return True

    def _on_training_end(self) -> None:
        """Write all collected rows to ``csv_path`` (if one was given and any row exists)."""
        if self._csv_path and self._rows:
            Path(self._csv_path).parent.mkdir(parents=True, exist_ok=True)
            fieldnames = list(self._rows[0].keys())
            with Path(self._csv_path).open("w", newline="") as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(self._rows)
A negative value causes a random speed - in the range ``[-|start_speed|, |start_speed|]`` each episode - (default 1.0). + Initial pendulum speed for each episode (default 1.0). + randomize_start : bool, optional + If True, sample the initial pendulum speed uniformly from + ``[min_speed, abs(start_speed)]`` with random sign each reset. + Discourages the policy from overfitting to a single starting trajectory + (default False - deterministic). render_mode : str, optional One of the modes listed in ``metadata["render_modes"]`` (default ``"none"``). - size : float, optional - Axis length in all directions (default 10.0). + rail_limit : float, optional + Half-span of the crane rail in metres (default 10.0). The crane spans + ``+-rail_limit``; an episode is truncated when ``|x| > rail_limit``. seed : int or None, optional Seed for repeatable random numbers (default None). reward_limit : float, optional @@ -87,6 +93,14 @@ class AntiPendulumEnv(gym.Env[AntiPendulumObs, int]): When provided, activates discrete observation mode with the given category boundaries. Expected keys: ``"angles"``, ``"pos"``, ``"speed"``, ``"distance"``, ``"sector"`` (default None). + reward_fac : RewardConfig, optional + Weights for the reward contributions. Defaults to the canonical PPO + values - see :class:`.RewardConfig`. + continuous_actions : bool, optional + If True, the action space is ``Box([-1], [1])`` and an action value + in ``[-1, 1]`` is scaled by ``acc`` to produce the crane acceleration. + If False, the action space is ``Discrete(3)`` with mapping + ``0=-acc, 1=0, 2=+acc`` (Q-agent compatible, default False). 
""" metadata: ClassVar[dict[str, object]] = { # pyright: ignore[reportIncompatibleVariableOverride] # Gymnasium metadata typing is loose @@ -108,13 +122,15 @@ def __init__( # noqa: PLR0913 - environment API needs explicit parameters crane: Callable[..., Crane], acc: float = 0.1, start_speed: float = 1.0, + randomize_start: bool = False, # noqa: FBT001, FBT002 render_mode: str = "none", - size: float = 10.0, + rail_limit: float = 10.0, seed: int | None = None, reward_limit: float = 50.0, dt: float = 1.0, discrete: dict[str, tuple[float | int, ...]] | None = None, - reward_fac: tuple[float, float, float] = (1.0, 0.0015, 0.001), + reward_fac: RewardConfig | None = None, + continuous_actions: bool = False, # noqa: FBT001, FBT002 ) -> None: """Initialize the anti-pendulum environment. @@ -127,7 +143,8 @@ def __init__( # noqa: PLR0913 - environment API needs explicit parameters self.wire: Wire = wire # type: ignore[assignment] # boom_by_name returns Boom; at runtime this is Wire assert render_mode in self.metadata["render_modes"], f"render_mode: {render_mode}" # type: ignore[operator] # metadata values are typed as object self.render_mode = render_mode - self.reward_fac = reward_fac + self.reward_fac: RewardConfig = reward_fac if reward_fac is not None else RewardConfig() + self.continuous_actions = continuous_actions self.reward_stats: list[list[float]] = [] self._playback: list[list[float]] = [] self.rewards: list[float] = [] @@ -137,29 +154,35 @@ def __init__( # noqa: PLR0913 - environment API needs explicit parameters self.traces: dict[str, list[float]] = {"c_x": [], "c_v": [], "l_x": [], "l_v": [], "acc": []} self.obeservation_space: spaces.Box | spaces.Discrete # pyright: ignore[reportMissingTypeArgument] # Discrete type arg not needed here - # Continuous observations are crane position, crane velocity, wire polar angle, and load x-velocity. + # Continuous observations: crane position, crane velocity, wire polar angle, pure angular velocity theta_dot. 
self.min_speed = 0.1 # np.sqrt(2*reward_limit) # starting with less does not make sense (goal already reached) max_speed = np.sqrt(9.81 * self.wire.length) # speed for pendulum at +/- 90 deg. Polar as deflection from -z if discrete is not None: self.observation_space, self.discrete = self._init_discrete(discrete) else: self.discrete = {} - self.spaces_min = np.array((-size, -max_speed, 0.0, -max_speed), float) - self.spaces_max = np.array((size, max_speed, np.pi, max_speed), float) + self.spaces_min = np.array((-rail_limit, -max_speed, 0.0, -max_speed), float) + self.spaces_max = np.array((rail_limit, max_speed, np.pi, max_speed), float) self.observation_space = spaces.Box(self.spaces_min, self.spaces_max, shape=(4,), dtype=np.float64) self.nresets: int = 0 self.acc = acc self.start_speed = start_speed - self.size = size - self.figsize: tuple[float, float] = (-size, size) # The matplotlib animation window + self.randomize_start = randomize_start + self.initial_speed: float = start_speed + self.rail_limit = rail_limit + self.figsize: tuple[float, float] = (-rail_limit, rail_limit) # The matplotlib animation window self.reward_limit = reward_limit self.nsuccess: int = 0 self.reward = 0.0 # a basic reward (pendulum energy + distance measure) self.dt = dt + self._prev_theta_dot: float | None = None - # We have 1 acceleration action which can each be min, zero or max, corresponding to acceleration of crane - self.action_space = spaces.Discrete(3, start=0, seed=42, dtype=np.int64) + if continuous_actions: + self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32) + else: + # Discrete actions: 0 = -acc (left), 1 = 0 (coast), 2 = +acc (right) + self.action_space = spaces.Discrete(3, start=0, seed=42, dtype=np.int64) self.action_to_acc = {0: -self.acc, 1: 0.0, 2: self.acc} self.steps: int = 0 self.time: float = 0.0 @@ -234,36 +257,45 @@ def show_animation(self) -> None: ) ani.do_animation() - def show_plot(self, episode: int) -> None: + def 
show_plot(self, episode: int, save_path: str | None = None) -> None: """Plot detailed traces for a single episode. Parameters ---------- episode : int Episode number used in the plot title. + save_path : str or None, optional + If set, save the figure to this path and close it instead of calling + ``plt.show()`` (default None — interactive display). """ - _, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(16, 12)) + if not self.traces["l_v"]: + return + fig, (ax1, ax2, ax3, ax4, ax5, ax6) = plt.subplots(6, 1, figsize=(16, 18), sharex=True) times = self.dt * np.arange(len(self.traces["c_x"])) damping = self.traces["l_v"][0] * np.exp(-times / self.wire.damping_time) ax1.plot(times, self.traces["l_x"], label="load angle", color="blue") - ax1y2 = ax1.twinx() - ax1y2.plot(times, self.traces["l_v"], label="load speed", color="red") - ax1y2.plot(times, damping, label="damping", color="green") - ax2.plot(times, self.traces["c_x"], label="crane pos", color="blue") - ax2y2 = ax2.twinx() - ax2y2.plot(times, self.traces["c_v"], label="crane speed", color="red") - ax3.plot(times[: len(self.rewards)], self.rewards, label="rewards") - ax4.plot(times, self.traces["acc"], label="x-acceleration", color="green") - lines1, labels1 = ax1.get_legend_handles_labels() - lines2, labels2 = ax1y2.get_legend_handles_labels() - ax1.legend(lines1 + lines2, labels1 + labels2) - lines3, labels3 = ax2.get_legend_handles_labels() - lines4, labels4 = ax2y2.get_legend_handles_labels() - ax2.legend(lines3 + lines4, labels3 + labels4, loc="upper left") - _ = ax3.legend() - _ = ax4.legend() - _ = plt.suptitle(f"Detailed plot of episode {episode}, reward:{self.reward}") - plt.show() + ax2.plot(times, self.traces["l_v"], label="load speed", color="red") + ax2.plot(times, damping, label="damping", color="green") + ax3.plot(times, self.traces["c_x"], label="crane pos", color="blue") + ax3.axhline(0, color="gray", linestyle="--", linewidth=0.8, alpha=0.7, label="origin") + ax4.plot(times, 
self.traces["c_v"], label="crane speed", color="red") + ax5.plot(times[: len(self.rewards)], self.rewards, label="rewards") + ax6.plot(times, self.traces["acc"], label="x-acceleration", color="green") + ax6.set_xlabel("time [s]") + for ax in (ax1, ax2, ax3, ax4, ax5, ax6): + _ = ax.legend() + _ = plt.suptitle( + f"Detailed plot of episode {episode}, reward:{self.reward}, start_speed:{self.initial_speed:.3f}" # pyright: ignore[reportUnknownMemberType] + ) + fig.tight_layout() + if save_path is not None: + from pathlib import Path # noqa: PLC0415 + + Path(save_path).parent.mkdir(parents=True, exist_ok=True) + fig.savefig(save_path) + plt.close(fig) + else: + plt.show() for key in self.traces: self.traces[key] = [] self.rewards = [] @@ -284,7 +316,7 @@ def _get_continuous_obs(self) -> tuple[np.ndarray, int]: self.crane.position[0], self.crane.velocity[0], self.wire.boom[1], - self.wire.cm_v[0], # pyright: ignore[reportUnknownMemberType] # dynamic attr on Wire + (self.wire.cm_v[0] - self.wire.origin_v[0]) / self.wire.length, # pyright: ignore[reportUnknownMemberType] # dynamic attr on Wire ), float, ) @@ -341,7 +373,28 @@ def _get_obs(self, acc: float = 0.0) -> tuple[np.ndarray | tuple[int, ...], floa positional = -self.wire.length * (abs(self.crane.position[0]) + self.crane.velocity[0] ** 2 / self.acc) else: positional = 0.0 # if the crane moves towards the origo we do not subtract reward - self.reward = sum(f * r for f, r in zip(self.reward_fac, (energy, positional, -self.time), strict=True)) + position = -abs(self.crane.position[0]) + acc_penalty = -abs(acc) + rc = self.reward_fac + self.reward = ( + rc.energy * energy + + rc.positional * positional + + rc.time * (-self.time) + + rc.position * position + + rc.acceleration * acc_penalty + ) + theta = self.wire.boom[1] + theta_dot = (self.wire.cm_v[0] - self.wire.origin_v[0]) / self.wire.length # pyright: ignore[reportUnknownMemberType] + theta_ddot = (theta_dot - self._prev_theta_dot) / self.dt if 
self._prev_theta_dot is not None else 0.0 + self._prev_theta_dot = theta_dot + self.reward += ( # pyright: ignore[reportUnknownMemberType] + rc.angle * (-(theta**2)) + + rc.angular_velocity * (-(theta_dot**2)) + + rc.crane_velocity * (-(self.crane.velocity[0] ** 2)) + + rc.crane_acceleration * (-(acc**2)) + + rc.angular_acceleration * (-(theta_ddot**2)) + + rc.t_min_crane * (-self._t_min_crane()) + ) obs: tuple[int, ...] | np.ndarray if len(self.discrete): @@ -358,7 +411,27 @@ def _get_obs(self, acc: float = 0.0) -> tuple[np.ndarray | tuple[int, ...], floa self.traces["l_v"].append(self.wire.cm_v[0]) # pyright: ignore[reportUnknownMemberType] # dynamic attr on Wire self.traces["acc"].append(acc) - return (obs, self.reward, err) + return (obs, self.reward, err) # pyright: ignore[reportUnknownMemberType] + + def _t_min_crane(self) -> float: + """Minimum time for the crane to reach x=0 at rest under bang-bang control. + + Returns + ------- + float + Optimal time-to-origin in seconds; zero when crane is already at rest + at the origin. + """ + x0 = self.crane.position[0] + v0 = self.crane.velocity[0] + a = self.acc + if (x0 >= 0 and v0 >= 0) or (x0 <= 0 and v0 <= 0): # moving away from origin + return (abs(v0) + 2.0 * np.sqrt(max(0.0, abs(x0) * a + 0.5 * v0**2))) / a + # moving toward origin + if abs(x0) >= 0.5 * v0**2 / a: # no overshoot + return (-abs(v0) + 2.0 * np.sqrt(max(0.0, abs(x0) * a + 0.5 * v0**2))) / a + # overshoot: brake to rest past the origin (|v0|/a), then travel back the overshoot distance v0^2/(2a) - |x0| + return (abs(v0) + 2.0 * np.sqrt(max(0.0, 0.5 * v0**2 - abs(x0) * a))) / a def low_reward(self) -> float: """Return the lowest possible reward for the current mode. 
@@ -374,7 +447,16 @@ def low_reward(self) -> float: return -float(self.discrete["energies"][-1]) def _get_info(self, reward: float, steps: int) -> dict[str, float | int]: - return {"steps": steps, "reward": reward} + return { + "steps": steps, + "reward": reward, + "t_min": self._t_min_crane(), + "x_pos": self.crane.position[0], + "x_vel": self.crane.velocity[0], + "energy": 0.5 * float(self.wire.cm_v[0]) ** 2, # pyright: ignore[reportUnknownMemberType] + "theta": float(self.wire.boom[1]), + "theta_dot": (float(self.wire.cm_v[0]) - float(self.wire.origin_v[0])) / float(self.wire.length), # pyright: ignore[reportUnknownMemberType] + } def reset_crane(self) -> None: """Reset the crane to its initial physical state. @@ -416,7 +498,7 @@ def reset( self._reward_point = self._reward_plot_init("b.") else: # reset between episodes. Data are available - self.reward_stats.append([self.steps, self.reward]) + self.reward_stats.append([self.steps, self.reward]) # pyright: ignore[reportUnknownMemberType] if self.render_mode == "data": self._reward_point.set_data([r[0] for r in self.reward_stats], [r[1] for r in self.reward_stats]) plt.pause(1e-10) @@ -431,15 +513,14 @@ def reset( self.nresets += 1 if self.start_speed == 0.0: # run in 'start' mode, learning how to start the pendulum action assert self.wire.cm_v[0] == 0.0, f"Load speed expected zero. 
Found {self.wire.cm_v[0]}" # pyright: ignore[reportUnknownMemberType] # dynamic attr on Wire - elif self.start_speed < 0.0: # random speed in 'stop' mode [-,+] range - speed = self.np_random.uniform( - -(-self.start_speed - self.min_speed), - (-self.start_speed - self.min_speed), - ) - speed = speed + self.min_speed if speed >= 0 else speed - self.min_speed - self.wire.cm_v[0] = speed # pyright: ignore[reportUnknownMemberType] # dynamic attr on Wire - else: # fixed speed in 'stop' mode (more control) + elif self.randomize_start: + speed = self.np_random.uniform(self.min_speed, abs(self.start_speed)) + sign = 1.0 if self.np_random.random() > 0.5 else -1.0 # noqa: PLR2004 + self.wire.cm_v[0] = speed * sign # pyright: ignore[reportUnknownMemberType] # dynamic attr on Wire + else: self.wire.cm_v[0] = self.start_speed # pyright: ignore[reportUnknownMemberType] # dynamic attr on Wire + self.initial_speed = float(self.wire.cm_v[0]) # pyright: ignore[reportUnknownMemberType] # dynamic attr on Wire + self._prev_theta_dot = None obs, self.reward, _ = self._get_obs() if self.render_mode == "play-back": self._append_playback(0.0) @@ -448,29 +529,37 @@ def reset( info = self._get_info(self.reward, self.steps) return obs, info - def step(self, action: int) -> tuple[tuple[int, ...] | np.ndarray, float, bool, bool, dict[str, float | int]]: + def step( + self, action: int | np.ndarray + ) -> tuple[tuple[int, ...] | np.ndarray, float, bool, bool, dict[str, float | int]]: """Advance the environment by one time step. Parameters ---------- - action : int - Action index selecting the crane acceleration. + action : int or np.ndarray + Discrete mode: index 0 = -acc, 1 = 0, 2 = +acc. + Continuous mode: float in ``[-1, 1]`` scaled by ``acc``. Returns ------- tuple[tuple[int, ...] | np.ndarray, float, bool, bool, dict[str, float | int]] ``(observation, reward, terminated, truncated, info)``. 
""" - action_idx = action - if action_idx not in self.action_to_acc: - action_idx += 1 - acc = self.action_to_acc[action_idx] + if self.continuous_actions: + acc = float(np.asarray(action).flat[0]) * self.acc + else: + action_idx = int(action) + if action_idx not in self.action_to_acc: + action_idx += 1 + acc = self.action_to_acc[action_idx] self.crane.d_velocity[0] = acc self.steps += 1 _ = self.crane.do_step(self.time, self.dt) self.time += self.dt obs, self.reward, truncated = self._get_obs(acc) + if truncated and self.reward_fac.terminal_penalty != 0.0: + self.reward += self.reward_fac.terminal_penalty if self.render_mode != "none": self.rewards.append(self.reward) @@ -485,11 +574,20 @@ def step(self, action: int) -> tuple[tuple[int, ...] | np.ndarray, float, bool, if terminated: self.nsuccess += 1 info = self._get_info(self.reward, self.steps) + if truncated > 0: + info["crash"] = True return obs, self.reward, terminated, (truncated > 0), info - def render(self) -> None: - """Render the current episode.""" + def render(self, save_path: str | None = None) -> None: + """Render the current episode. + + Parameters + ---------- + save_path : str or None, optional + If set and render_mode is ``"plot"``, save the figure to this path + instead of showing it interactively (default None). 
+ """ if self.render_mode == "play-back": self.show_animation() elif self.render_mode == "plot": - self.show_plot(self.nresets) + self.show_plot(self.nresets, save_path=save_path) diff --git a/src/crane_controller/experiment_config.py b/src/crane_controller/experiment_config.py new file mode 100644 index 0000000..467b07d --- /dev/null +++ b/src/crane_controller/experiment_config.py @@ -0,0 +1,358 @@ +"""Experiment configuration dataclasses and serialisation utilities.""" + +from __future__ import annotations + +import dataclasses +import json +import logging +from dataclasses import dataclass, field +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Mapping + +import yaml + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True, slots=True) +class RewardConfig: + """Weights for the reward contributions in AntiPendulumEnv. + + Parameters + ---------- + energy : float + Weight for the pendulum energy term (default 1.0). + positional : float + Weight for the away-from-origin penalty (default 0.0015). + time : float + Weight for the time penalty (default 0.0). + position : float + Weight for the distance-from-origin penalty ``-|x|`` (default 0.005). + acceleration : float + Weight for the actuation-effort penalty ``-|acc|`` (default 0.01). + terminal_penalty : float + One-time reward added on truncation (out-of-bounds crash). Set to a + negative value to penalise early episode termination; 0.0 disables it + (default 0.0). + angle : float + Weight for the squared pendulum angle penalty ``-theta^2`` (default 0.0). + angular_velocity : float + Weight for the squared angular velocity penalty ``-theta_dot^2`` (default 0.0). + Uses pure angular velocity ``(cm_v[0] - origin_v[0]) / wire.length``, + excluding crane translation. + crane_velocity : float + Weight for the squared crane velocity penalty ``-x_dot^2`` (default 0.0). 
+ crane_acceleration : float + Weight for the squared crane acceleration penalty ``-x_ddot^2`` (default 0.0). + Equals the control action squared (acc = action * self.acc). + angular_acceleration : float + Weight for the squared angular acceleration penalty ``-theta_ddot^2`` (default 0.0). + Computed as a one-step finite difference of theta_dot; zero on the first step + after each reset. + t_min_crane : float + Weight for the minimum-time-to-origin penalty ``-t_min`` (default 0.0). + ``t_min`` is the optimal bang-bang time for the crane to reach ``x=0`` at rest + given current position and velocity. Captures both crane position and velocity + in a single physically grounded signal. + """ + + energy: float = 1.0 + positional: float = 0.0015 + time: float = 0.0 + position: float = 0.005 + acceleration: float = 0.01 + terminal_penalty: float = 0.0 + angle: float = 0.0 + angular_velocity: float = 0.0 + crane_velocity: float = 0.0 + crane_acceleration: float = 0.0 + angular_acceleration: float = 0.0 + t_min_crane: float = 0.0 + + @classmethod + def from_dict(cls, d: Mapping[str, object]) -> RewardConfig: + """Instantiate from a mapping, filling missing keys with defaults. + + Parameters + ---------- + d : dict[str, object] + Mapping of field names to weight values. Unknown keys are ignored. + + Returns + ------- + RewardConfig + Populated instance. 
+ """ + defaults = cls() + return cls( + energy=float(d.get("energy", defaults.energy)), # type: ignore[arg-type] + positional=float(d.get("positional", defaults.positional)), # type: ignore[arg-type] + time=float(d.get("time", defaults.time)), # type: ignore[arg-type] + position=float(d.get("position", defaults.position)), # type: ignore[arg-type] + acceleration=float(d.get("acceleration", defaults.acceleration)), # type: ignore[arg-type] + terminal_penalty=float(d.get("terminal_penalty", defaults.terminal_penalty)), # type: ignore[arg-type] + angle=float(d.get("angle", defaults.angle)), # type: ignore[arg-type] + angular_velocity=float(d.get("angular_velocity", defaults.angular_velocity)), # type: ignore[arg-type] + crane_velocity=float(d.get("crane_velocity", defaults.crane_velocity)), # type: ignore[arg-type] + crane_acceleration=float(d.get("crane_acceleration", defaults.crane_acceleration)), # type: ignore[arg-type] + angular_acceleration=float(d.get("angular_acceleration", defaults.angular_acceleration)), # type: ignore[arg-type] + t_min_crane=float(d.get("t_min_crane", defaults.t_min_crane)), # type: ignore[arg-type] + ) + + +@dataclass(frozen=True, slots=True) +class TrainingConfig: + """Hyperparameters for a PPO training run. + + Parameters + ---------- + steps : int + Total training timesteps (default 100 000). + n_envs : int + Number of parallel environments (default 4). + gamma : float + Discount factor (default 0.99). + save_path : str + Where to save the trained model (default ``models/ppo_AntiPendulumEnv.zip``). + seed : int or None + Random seed passed to PPO for reproducibility (default None - non-deterministic). + ent_coef : float + Entropy bonus coefficient (default 0.0). Increase to 0.005-0.01 to encourage + exploration and reduce sensitivity to random seed. + learning_rate : float + Adam learning rate (default 3e-4, the SB3 default). + clip_range : float + PPO clipping parameter (default 0.2). Lower values give more conservative updates. 
+ n_steps : int + Timesteps collected per environment before each gradient update (default 2048, + the SB3 default). Increasing to 8192 gives ~11 episodes per update instead of + ~3, producing more stable gradient estimates for long-horizon tasks. + randomize_start : bool + If True, sample the initial pendulum speed uniformly from + ``[min_speed, abs(start_speed)]`` with random sign each episode. + Discourages the policy from overfitting to a single starting trajectory + (default False - deterministic, matching evaluation behaviour). + rail_limit : float + Half-span of the crane rail in metres (default 10.0). The crane spans + ``+-rail_limit``; an episode is truncated when ``|x| > rail_limit``. + Reducing to e.g. 2.0 triggers earlier termination and tightens the + credit-assignment gap for the terminal penalty. + start_speed : float + Initial pendulum speed passed to the environment (default 1.0). With + ``randomize_start=True`` the actual per-episode speed is sampled from + ``+-[min_speed, start_speed]``, so this acts as the upper bound of the + training speed range. + continuous_actions : bool + If True, use a ``Box([-1], [1])`` action space so PPO can output any + acceleration in ``[-acc, +acc]``. If False, use ``Discrete(3)`` for + Q-learning compatibility (default True). + reward_limit : float + Per-step reward threshold at which an episode is terminated as solved + (default 50.0, effectively disabled). 0.0 is the theoretical maximum; + setting to a large positive value (e.g. 50.0) disables early + termination so the episode always runs to ``max_episode_steps``. + max_episode_steps : int + Maximum steps per episode enforced via a TimeLimit wrapper (default + 1000). Replaces the previous hardcoded value of 3000; shorter episodes + let the discount factor propagate rail-penalty credit meaningfully + (e.g. ``0.99^100 ≈ 0.37`` vs ``0.99^3000 ≈ 10^-13``). 
+ """ + + steps: int = 100_000 + n_envs: int = 4 + gamma: float = 0.99 + save_path: str = "models/ppo_AntiPendulumEnv.zip" + seed: int | None = None + ent_coef: float = 0.0 + learning_rate: float = 3e-4 + clip_range: float = 0.2 + n_steps: int = 2048 + randomize_start: bool = False + rail_limit: float = 10.0 + start_speed: float = 1.0 + continuous_actions: bool = True + reward_limit: float = 50.0 + max_episode_steps: int = 1000 + + @classmethod + def from_dict(cls, d: Mapping[str, object]) -> TrainingConfig: + """Instantiate from a mapping, filling missing keys with defaults. + + Parameters + ---------- + d : dict[str, object] + Mapping of field names to values. Unknown keys are ignored. + + Returns + ------- + TrainingConfig + Populated instance. + """ + defaults = cls() + seed_raw = d.get("seed", defaults.seed) + return cls( + steps=int(d.get("steps", defaults.steps)), # type: ignore[arg-type,call-overload] + n_envs=int(d.get("n_envs", defaults.n_envs)), # type: ignore[arg-type,call-overload] + gamma=float(d.get("gamma", defaults.gamma)), # type: ignore[arg-type] + save_path=str(d.get("save_path", defaults.save_path)), + seed=int(seed_raw) if isinstance(seed_raw, int) else None, + ent_coef=float(d.get("ent_coef", defaults.ent_coef)), # type: ignore[arg-type] + learning_rate=float(d.get("learning_rate", defaults.learning_rate)), # type: ignore[arg-type] + clip_range=float(d.get("clip_range", defaults.clip_range)), # type: ignore[arg-type] + n_steps=int(d.get("n_steps", defaults.n_steps)), # type: ignore[arg-type,call-overload] + randomize_start=bool(d.get("randomize_start", defaults.randomize_start)), + rail_limit=float(d.get("rail_limit", defaults.rail_limit)), # type: ignore[arg-type] + start_speed=float(d.get("start_speed", defaults.start_speed)), # type: ignore[arg-type] + continuous_actions=bool(d.get("continuous_actions", defaults.continuous_actions)), + reward_limit=float(d.get("reward_limit", defaults.reward_limit)), # type: ignore[arg-type] + 
max_episode_steps=int(d.get("max_episode_steps", defaults.max_episode_steps)), # type: ignore[arg-type,call-overload] + ) + + +@dataclass(frozen=True, slots=True) +class ExperimentConfig: + """Top-level experiment configuration. + + Parameters + ---------- + reward : RewardConfig + Reward weights (defaults to canonical PPO values). + training : TrainingConfig + Training hyperparameters (defaults to script defaults). + config_source : str or None + Path of the YAML file this config was loaded from (default None). + """ + + reward: RewardConfig = field(default_factory=RewardConfig) + training: TrainingConfig = field(default_factory=TrainingConfig) + config_source: str | None = None + + @classmethod + def from_dict(cls, d: Mapping[str, object], *, config_source: str | None = None) -> ExperimentConfig: + """Instantiate from a nested mapping. + + Parameters + ---------- + d : dict[str, object] + Mapping with optional ``reward`` and ``training`` sub-dicts. + A ``config_source`` key in ``d`` is also restored when present. + config_source : str or None + Path to record as the origin; takes precedence over the key in ``d`` + (default None). + + Returns + ------- + ExperimentConfig + Populated instance. + """ + reward_raw = d.get("reward", {}) + training_raw = d.get("training", {}) + resolved_source: str | None = config_source + if resolved_source is None: + raw = d.get("config_source") + if isinstance(raw, str): + resolved_source = raw + return cls( + reward=RewardConfig.from_dict(reward_raw if isinstance(reward_raw, dict) else {}), + training=TrainingConfig.from_dict(training_raw if isinstance(training_raw, dict) else {}), + config_source=resolved_source, + ) + + +def load_experiment_config(config_path: str | Path | None) -> ExperimentConfig: + """Load an experiment config from a YAML file. + + Parameters + ---------- + config_path : str, Path, or None + Path to the YAML config file. Returns an all-default + :class:`ExperimentConfig` when ``None``. 
+ + Returns + ------- + ExperimentConfig + Loaded configuration; missing YAML keys fall back to dataclass defaults. + + Raises + ------ + FileNotFoundError + When ``config_path`` is not ``None`` but the file does not exist. + """ + if config_path is None: + return ExperimentConfig() + path = Path(config_path) + if not path.exists(): + raise FileNotFoundError(f"Config file not found: {path}") + with path.open() as fh: + raw: dict[str, object] = yaml.safe_load(fh) or {} + logger.info("Loaded experiment config from %s", path) + return ExperimentConfig.from_dict(raw, config_source=str(path)) + + +def _meta_path(model_path: str | Path) -> Path: + """Return the sidecar path for a model file. + + Parameters + ---------- + model_path : str or Path + Path to the model ``.zip`` file. + + Returns + ------- + Path + Sibling file with the same stem and ``_meta.json`` suffix. + """ + p = Path(model_path) + return p.parent / f"{p.stem}_meta.json" + + +def save_training_sidecar(model_path: str | Path, config: ExperimentConfig) -> Path: + """Write a JSON sidecar alongside a saved model. + + Parameters + ---------- + model_path : str or Path + Path to the model ``.zip`` file. + config : ExperimentConfig + Experiment configuration to serialise. + + Returns + ------- + Path + Path of the written sidecar file. + """ + sidecar = _meta_path(model_path) + payload = dataclasses.asdict(config) + _ = sidecar.write_text(json.dumps(payload, indent=2)) + logger.info("Training sidecar written to %s", sidecar) + return sidecar + + +def load_training_sidecar(model_path: str | Path) -> ExperimentConfig: + """Read the JSON sidecar for a saved model. + + Parameters + ---------- + model_path : str or Path + Path to the model ``.zip`` file. + + Returns + ------- + ExperimentConfig + Configuration stored in the sidecar. + + Raises + ------ + FileNotFoundError + When the sidecar file does not exist alongside the model. 
+ """ + sidecar = _meta_path(model_path) + if not sidecar.exists(): + raise FileNotFoundError( + f"No sidecar found for model {str(model_path)!r}. Re-train with the current scripts to generate one." + ) + raw: dict[str, object] = json.loads(sidecar.read_text()) + return ExperimentConfig.from_dict(raw) diff --git a/src/crane_controller/ppo_agent.py b/src/crane_controller/ppo_agent.py index b9533a8..04246d3 100644 --- a/src/crane_controller/ppo_agent.py +++ b/src/crane_controller/ppo_agent.py @@ -2,6 +2,7 @@ from __future__ import annotations +import dataclasses import logging from pathlib import Path from typing import TYPE_CHECKING, Any @@ -14,16 +15,59 @@ from stable_baselines3.common.evaluation import evaluate_policy from stable_baselines3.common.vec_env import VecNormalize +from crane_controller.callbacks import EpRewardLogCallback +from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv # noqa: TC001 + if TYPE_CHECKING: from collections.abc import Callable - from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv - plt.rcParams["figure.figsize"] = (10, 5) logger = logging.getLogger(__name__) +_T_MIN_SETTLE_EPS = 0.05 + + +def _t_min_settle(trace: list[float]) -> int: + """Return the first step at which t_min permanently stays at/below the plateau. + + Scans from the end of *trace* to find the last step that was still above + ``trace[-1] + _T_MIN_SETTLE_EPS``. Returns that index + 2 (1-indexed step + after the last unsettled step), or 1 if already settled from the start. 
+ """ + if not trace: + return 0 + threshold = trace[-1] + _T_MIN_SETTLE_EPS + for i in range(len(trace) - 1, -1, -1): + if trace[i] > threshold: + return i + 2 + return 1 + + +@dataclasses.dataclass +class EpisodeResult: + """Structured result returned by :meth:`ProximalPolicyOptimizationAgent.do_one_episode`.""" + + start_speed: float + ep_steps: int + ep_reward: float + terminated: bool + truncated: bool + no_crash: bool + t_min_start: float + t_min_min: float + t_min_final: float + t_min_mean_last100: float + t_min_settle_step: int + x_pos_final: float + x_vel_final: float + theta_final: float + theta_dot_final: float + energy_final: float + acc_final: float + + class ProximalPolicyOptimizationAgent: """Agent that learns a policy via the PPO algorithm. @@ -52,28 +96,55 @@ class ProximalPolicyOptimizationAgent: Discount factor for future rewards (default 0.99). Higher values (e.g. 0.999) extend the effective planning horizon, which can improve policy quality on long episodes at the cost of slower value function convergence. + seed : int or None, optional + Random seed passed to PPO for reproducibility (default None). + ent_coef : float, optional + Entropy bonus coefficient (default 0.0). + learning_rate : float, optional + Adam learning rate (default 3e-4). + clip_range : float, optional + PPO clipping parameter (default 0.2). Lower = more conservative updates. + n_steps : int, optional + Timesteps collected per environment before each gradient update (default 2048). + Increasing to 8192 gives ~11 complete episodes per update instead of ~3, + producing more stable gradient estimates for long-horizon tasks. 
""" - def __init__( + def __init__( # noqa: PLR0913 self, env: Callable[..., AntiPendulumEnv], n_envs: int = 4, env_kwargs: dict[str, Any] | None = None, save_path: str | None = None, - max_episode_steps: int = 3000, + max_episode_steps: int = 1000, gamma: float = 0.99, + seed: int | None = None, + ent_coef: float = 0.0, + learning_rate: float = 3e-4, + clip_range: float = 0.2, + n_steps: int = 2048, ) -> None: """Set up the agent for training. Use :meth:`load` for inference.""" self.save_path = save_path + self._max_episode_steps = max_episode_steps + _mep = max_episode_steps raw_vec_env = make_vec_env( - env_id=env, + env_id=lambda **kw: TimeLimit(env(**kw), max_episode_steps=_mep), n_envs=n_envs, env_kwargs=env_kwargs, - wrapper_class=TimeLimit, # type: ignore[arg-type] - wrapper_kwargs={"max_episode_steps": max_episode_steps}, ) self.vec_env = VecNormalize(raw_vec_env, norm_obs=True, norm_reward=True) - self.model = PPO("MlpPolicy", self.vec_env, gamma=gamma, verbose=1 if n_envs == 1 else 0) + self.model = PPO( + "MlpPolicy", + self.vec_env, + gamma=gamma, + seed=seed, + ent_coef=ent_coef, + learning_rate=learning_rate, + clip_range=clip_range, + n_steps=n_steps, + verbose=1 if n_envs == 1 else 0, + ) self.env: AntiPendulumEnv = self.vec_env.venv.envs[0] # type: ignore[attr-defined] @classmethod @@ -82,6 +153,7 @@ def load( env: Callable[..., AntiPendulumEnv], model_path: str | Path, env_kwargs: dict[str, Any] | None = None, + max_episode_steps: int = 1000, ) -> ProximalPolicyOptimizationAgent: """Load a trained agent for inference. @@ -100,12 +172,12 @@ def load( Agent configured for inference with VecNormalize in evaluation mode. 
""" instance = object.__new__(cls) + instance._max_episode_steps = max_episode_steps # noqa: SLF001 + _mep = max_episode_steps raw_vec_env = make_vec_env( - env_id=env, + env_id=lambda **kw: TimeLimit(env(**kw), max_episode_steps=_mep), n_envs=1, env_kwargs=env_kwargs, - wrapper_class=TimeLimit, # type: ignore[arg-type] - wrapper_kwargs={"max_episode_steps": 3000}, ) stats_path = cls._stats_path(str(model_path)) if stats_path.exists(): @@ -127,7 +199,7 @@ def resume( env_kwargs: dict[str, Any] | None = None, save_path: str | None = None, n_envs: int = 4, - max_episode_steps: int = 3000, + max_episode_steps: int = 1000, ) -> ProximalPolicyOptimizationAgent: """Load a saved agent to continue training. @@ -153,12 +225,12 @@ def resume( """ instance = object.__new__(cls) instance.save_path = save_path + instance._max_episode_steps = max_episode_steps # noqa: SLF001 + _mep = max_episode_steps raw_vec_env = make_vec_env( - env_id=env, + env_id=lambda **kw: TimeLimit(env(**kw), max_episode_steps=_mep), n_envs=n_envs, env_kwargs=env_kwargs, - wrapper_class=TimeLimit, # type: ignore[arg-type] - wrapper_kwargs={"max_episode_steps": max_episode_steps}, ) stats_path = cls._stats_path(str(model_path)) if stats_path.exists(): @@ -226,6 +298,8 @@ def do_training( *, progress_bar: bool = True, reset_num_timesteps: bool = True, + log_interval: int = 50_000, + csv_path: str | None = None, ) -> None: """Train the PPO model. @@ -239,11 +313,28 @@ def do_training( Whether to reset the internal timestep counter before training. Set to False when resuming to preserve the learning rate schedule (default True). + log_interval : int, optional + Timesteps between ep_rew_mean log lines printed alongside the + progress bar (default 50 000). Ignored when progress_bar=False. + csv_path : str or None, optional + Path to write a CSV log file with per-interval metrics at the end + of training (default None). 
""" + cb = ( + EpRewardLogCallback( + total_timesteps, + log_interval, + csv_path=csv_path, + max_episode_steps=self._max_episode_steps, + ) + if progress_bar + else None + ) _ = self.model.learn( total_timesteps, progress_bar=progress_bar, reset_num_timesteps=reset_num_timesteps, + callback=cb, ) if self.save_path is not None and self.env.render_mode != "play-back": self.model.save(self.save_path) @@ -265,18 +356,67 @@ def evaluate(self, n_episodes: int = 10) -> None: self.vec_env.norm_reward = True logger.info("Mean:%s, stdev:%s", mean_reward, std_reward) - def do_one_episode(self, seed: int = 1) -> None: + def do_one_episode( + self, + seed: int = 1, + save_png: str | None = None, + ) -> EpisodeResult: """Run one episode on the non-vectorised, trained environment. Parameters ---------- seed : int, optional Random seed for the environment reset (default 1). + save_png : str or None, optional + If set, save a 7-panel trajectory plot to this path (default None). + + Returns + ------- + EpisodeResult + Per-episode metrics including t_min stats, final crane state, and outcome. 
""" - obs, _ = self.env.reset(seed=seed) + obs, reset_info = self.env.reset(seed=seed) + nan = float("nan") + start_speed: float = self.env.unwrapped.initial_speed # type: ignore[attr-defined] + t_min_start_val: float = float(reset_info.get("t_min", nan)) terminated = truncated = False + ep_steps = 0 + ep_reward = 0.0 + t_min_trace: list[float] = [] + info: dict[str, float | int] = {} + last_action: np.ndarray | int = 0 while not terminated and not truncated: norm_obs = self.vec_env.normalize_obs(np.asarray(obs)) action, _states = self.model.predict(np.asarray(norm_obs), deterministic=True) - obs, _rewards, terminated, truncated, _ = self.env.step(int(action)) - self.env.render() + last_action = action + obs, reward, terminated, truncated, info = self.env.step(action) + ep_steps += 1 + ep_reward += float(reward) + if "t_min" in info: + t_min_trace.append(float(info["t_min"])) + self.env.unwrapped.render(save_path=save_png) # type: ignore[attr-defined, call-arg] + env_u = self.env.unwrapped + energy_final = 0.5 * float(env_u.wire.cm_v[0]) ** 2 # type: ignore[attr-defined] + if env_u.continuous_actions: # type: ignore[attr-defined] + acc_final = float(np.asarray(last_action).flat[0]) * float(env_u.acc) # type: ignore[attr-defined] + else: + acc_final = float(env_u.action_to_acc[int(last_action)]) # type: ignore[attr-defined] + return EpisodeResult( + start_speed=start_speed, + ep_steps=ep_steps, + ep_reward=ep_reward, + terminated=terminated, + truncated=truncated, + no_crash=not info.get("crash", False), + t_min_start=t_min_start_val, + t_min_min=min(t_min_trace) if t_min_trace else nan, + t_min_final=t_min_trace[-1] if t_min_trace else nan, + t_min_mean_last100=float(np.mean(t_min_trace[-100:])) if t_min_trace else nan, + t_min_settle_step=_t_min_settle(t_min_trace), + x_pos_final=float(info.get("x_pos", nan)), + x_vel_final=float(info.get("x_vel", nan)), + theta_final=float(obs[2]), + theta_dot_final=float(obs[3]), + energy_final=energy_final, + 
acc_final=acc_final, + ) diff --git a/tests/test_environment.py b/tests/test_environment.py index 43a5d0e..185b3b4 100644 --- a/tests/test_environment.py +++ b/tests/test_environment.py @@ -5,9 +5,11 @@ import matplotlib.pyplot as plt import numpy as np import pytest +from gymnasium import spaces from py_crane.crane import Crane from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv +from crane_controller.experiment_config import RewardConfig from crane_controller.q_agent import QLearningAgent logger = logging.getLogger(__name__) @@ -58,6 +60,7 @@ def test_environment( render_mode="plot" if show else "none", reward_limit=reward_limit, discrete=QLearningAgent.DEFAULT_DISCRETE.copy(), + continuous_actions=False, ) assert env.action_space.n == 3 # type: ignore[attr-defined,unused-ignore] assert env.action_space.start == 0 # type: ignore[attr-defined,unused-ignore] @@ -79,16 +82,26 @@ def test_environment( def test_init(crane: Callable[..., Crane], *, show: bool) -> None: """Test the initialization of the environment.""" - env = AntiPendulumEnv(crane, seed=1, start_speed=-1.0, render_mode="play-back" if show else "data") + env = AntiPendulumEnv( + crane, + seed=1, + start_speed=1.0, + randomize_start=False, + render_mode="play-back" if show else "data", + continuous_actions=False, + ) rnd_u = env.np_random.uniform(2, 8) rnd_r = env.np_random.random() assert rnd_u == 5.07092974820154, f"Returns pseudo-random numbers when seed is given. Got {rnd_u} for seed 1" assert rnd_r == 0.9504636963259353, f"Returns pseudo-random numbers when seed is given. 
def test_rail_limit_stored(crane: Callable[..., Crane]) -> None:
    """A custom rail_limit is kept on the env and appears as the +/- bound of observation slot 0."""
    limit = 5.0
    env = AntiPendulumEnv(crane, rail_limit=limit)
    assert env.rail_limit == limit
    assert env.spaces_min[0] == -limit
    assert env.spaces_max[0] == limit
def test_reward_terms_zero_by_default(crane: Callable[..., Crane]) -> None:
    """crane_velocity defaults to zero weight, and a positive weight lowers the step reward.

    Two environments that differ only in the ``crane_velocity`` weight take the
    same accelerating action; the weighted one must receive the smaller reward.
    """
    rc_energy = RewardConfig(energy=1.0, positional=0.0, time=0.0, position=0.0, acceleration=0.0)
    rc_crane_vel = RewardConfig(
        energy=1.0, positional=0.0, time=0.0, position=0.0, acceleration=0.0, crane_velocity=100.0
    )
    # The baseline config relies on the default weight being exactly zero.
    assert rc_energy.crane_velocity == 0.0
    # Pin the start state (as the sibling tests do) so both environments begin
    # identically and the reward gap comes from the weight alone.
    env1 = AntiPendulumEnv(
        crane, start_speed=1.0, randomize_start=False, reward_fac=rc_energy, continuous_actions=False
    )
    env2 = AntiPendulumEnv(
        crane, start_speed=1.0, randomize_start=False, reward_fac=rc_crane_vel, continuous_actions=False
    )
    _ = env1.reset()
    _ = env2.reset()
    _, r1, _, _, _ = env1.step(2)
    _, r2, _, _, _ = env2.step(2)
    assert r1 > r2, f"crane_velocity=0 should give higher reward than crane_velocity=100; got r1={r1}, r2={r2}"
def test_terminal_penalty_on_truncation(crane: Callable[..., Crane]) -> None:
    """The step that truncates the episode (crane leaves the short rail) carries the terminal penalty."""
    rc = RewardConfig(energy=1.0, terminal_penalty=-50.0)
    env = AntiPendulumEnv(crane, start_speed=1.0, rail_limit=0.15, reward_fac=rc, continuous_actions=False)
    _ = env.reset()
    max_steps = 50
    for _ in range(max_steps):
        # Action 2 is repeated until the environment reports truncation.
        step_result = env.step(2)
        reward, truncated = step_result[1], step_result[3]
        if not truncated:
            continue
        assert reward < -40.0, f"Expected terminal penalty, got reward={reward}"
        break
    else:
        # Loop ran to completion without ever seeing a truncated step.
        raise AssertionError("Expected at least one truncated step within 50 steps")
def test_t_min_crane_reward_term(crane: Callable[..., Crane]) -> None:
    """With only t_min_crane weighted, a displaced crane is penalised; t_min is 0 at the origin at rest."""
    only_t_min = RewardConfig(energy=0.0, positional=0.0, position=0.0, acceleration=0.0, t_min_crane=1.0)
    env = AntiPendulumEnv(crane, start_speed=1.0, reward_fac=only_t_min, continuous_actions=False)
    _ = env.reset()

    # Move the crane off the origin (at rest) so that t_min becomes positive.
    env.crane.position[0] = 1.0
    env.crane.velocity[0] = 0.0
    step_result = env.step(1)  # coast
    reward = step_result[1]
    assert reward < 0.0, f"Expected negative reward from t_min penalty, got {reward}"

    # Back at the origin and at rest, the helper must report exactly zero.
    env.crane.position[0] = 0.0
    env.crane.velocity[0] = 0.0
    t_min = env._t_min_crane()  # type: ignore[reportPrivateUsage]
    assert t_min == 0.0, f"Expected t_min=0 at origin at rest, got {t_min}"
def test_reward_config_defaults() -> None:
    """A default-constructed RewardConfig carries the expected weight for every field."""
    expected_defaults = {
        "energy": 1.0,
        "positional": 0.0015,
        "time": 0.0,
        "position": 0.005,
        "acceleration": 0.01,
        "terminal_penalty": 0.0,
        "angle": 0.0,
        "angular_velocity": 0.0,
        "crane_velocity": 0.0,
        "crane_acceleration": 0.0,
        "angular_acceleration": 0.0,
        "t_min_crane": 0.0,
    }
    rc = RewardConfig()
    for field_name, expected in expected_defaults.items():
        assert getattr(rc, field_name) == expected
def test_training_config_defaults() -> None:
    """A default-constructed TrainingConfig carries the expected value for every field."""
    tc = TrainingConfig()
    # Fields compared by value.
    expected_values = {
        "steps": 100_000,
        "n_envs": 4,
        "gamma": 0.99,
        "save_path": "models/ppo_AntiPendulumEnv.zip",
        "ent_coef": 0.0,
        "learning_rate": 3e-4,
        "clip_range": 0.2,
        "n_steps": 2048,
        "rail_limit": 10.0,
        "reward_limit": 50.0,
        "max_episode_steps": 1000,
    }
    for field_name, expected in expected_values.items():
        assert getattr(tc, field_name) == expected
    # Singletons are compared by identity so that 0/1 cannot pass for a bool.
    assert tc.seed is None
    assert tc.randomize_start is False
    assert tc.continuous_actions is True
def test_save_training_sidecar_creates_file(tmp_path: Path) -> None:
    """Saving a default config writes a sidecar file with a ``.json`` suffix and returns its path."""
    written = save_training_sidecar(tmp_path / "model.zip", ExperimentConfig())
    assert written.exists()
    assert written.suffix == ".json"
def test_load_training_sidecar_round_trip(tmp_path: Path) -> None:
    """A saved sidecar restores reward weights, training settings and config_source."""
    model_path = tmp_path / "model.zip"
    original = ExperimentConfig(
        reward=RewardConfig(position=0.1, acceleration=0.05),
        training=TrainingConfig(steps=250000, gamma=0.999),
        # A non-default origin, so the restore of the ``config_source`` key is
        # actually exercised (None would round-trip trivially).
        config_source="experiments/round_trip.yaml",
    )
    _ = save_training_sidecar(model_path, original)
    restored = load_training_sidecar(model_path)
    assert restored.reward == original.reward
    assert restored.training == original.training
    assert restored.config_source == original.config_source