# nmWTAI-Platform/ML/nmWTAI-ML/scripts/evaluate_time_conditioned.py

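"""Evaluate the time-conditioned forward surrogate model.

The time-conditioned model takes "sample parameters + a single time point" as input and
predicts pressure/derivative, so this script expands every full curve into point-level
batched inference and then folds the results back into per-sample error statistics,
in-domain/out-of-domain group summaries, and worst-case plots.
The evaluation checks how well the model generalizes over variable time sampling and
over the PSO parameter domain.
"""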

from __future__ import annotations

import argparse
import csv
import json
import random
import sys
from pathlib import Path
from typing import Iterable

import joblib
import matplotlib.pyplot as plt
import numpy as np
import torch

ROOT = Path(__file__).resolve().parents[1]
sys.path.append(str(ROOT))

from src.common.experiment_paths import normalize_tag, processed_path_for_tag
from src.data.param_features import inverse_transform_param_features
from src.models.time_conditioned_surrogate import TimeConditionedSurrogate
from src.training.train_forward import get_part_slices, infer_curve_layout


# Default value of the --seed option (seeds the sample selection for plots).
DEFAULT_RANDOM_SEED = 42

# Default lower/upper bounds of the PSO parameter domain, stored as
# "<param>_min" / "<param>_max" entries; each one can be overridden on the CLI.
DEFAULT_PSO_DOMAIN = dict(
    k_min=0.001,
    k_max=10.0,
    skin_min=-10.0,
    skin_max=10.0,
    wellboreC_min=1.0e-4,
    wellboreC_max=2.0,
    phi_min=0.01,
    phi_max=0.5,
    h_min=2.0,
    h_max=50.0,
)


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the time-conditioned surrogate evaluation.

    The options cover the dataset / checkpoint / output locations, the inference
    batch size and device, the random seed, how many random / best / worst samples
    to plot, the top-k size of the worst-case analysis, and the PSO-domain bounds
    (whose defaults come from ``DEFAULT_PSO_DOMAIN``).

    Returns:
        The namespace built by ``argparse`` from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Evaluate a time-conditioned point-wise surrogate")

    # General options, registered in the order they should appear in --help.
    general_options = (
        ("--processed", {"type": str, "default": None, "help": "Processed dataset path"}),
        ("--tag", {"type": str, "default": None, "help": "Experiment tag for auto naming"}),
        ("--model", {"type": str, "default": None, "help": "Model checkpoint path"}),
        ("--output-dir", {"type": str, "default": None, "help": "Optional evaluation output directory"}),
        ("--batch-size", {"type": int, "default": 65536, "help": "Point batch size for inference"}),
        ("--device", {"type": str, "default": None, "help": "Override device, e.g. cpu or cuda"}),
        ("--seed", {"type": int, "default": DEFAULT_RANDOM_SEED}),
        ("--n-random-plots", {"type": int, "default": 5}),
        ("--n-best-plots", {"type": int, "default": 5}),
        ("--n-worst-plots", {"type": int, "default": 10}),
        ("--top-k-analysis", {"type": int, "default": 300}),
    )
    for flag, spec in general_options:
        parser.add_argument(flag, **spec)

    # PSO-domain bounds: each flag takes its default from the matching domain entry.
    pso_bound_options = (
        ("--pso-k-min", "k_min"),
        ("--pso-k-max", "k_max"),
        ("--pso-h-min", "h_min"),
        ("--pso-h-max", "h_max"),
        ("--pso-skin-min", "skin_min"),
        ("--pso-skin-max", "skin_max"),
        ("--pso-wellboreC-min", "wellboreC_min"),
        ("--pso-wellboreC-max", "wellboreC_max"),
        ("--pso-phi-min", "phi_min"),
        ("--pso-phi-max", "phi_max"),
    )
    for flag, domain_key in pso_bound_options:
        parser.add_argument(flag, type=float, default=DEFAULT_PSO_DOMAIN[domain_key])

    return parser.parse_args()


def default_model_path(tag: str | None) -> Path:
    """Return the default checkpoint path for an experiment tag.

    A truthy ``tag`` selects the tagged directory
    ``models/time_conditioned_surrogate_<tag>/``; ``None`` or an empty tag falls
    back to the untagged ``models/time_conditioned_surrogate/`` directory.
    """
    if not tag:
        return Path("models/time_conditioned_surrogate/time_conditioned_surrogate_best.pt")
    run_dir = Path("models") / f"time_conditioned_surrogate_{tag}"
    return run_dir / "time_conditioned_surrogate_best.pt"


def default_output_dir(tag: str | None) -> Path:
    """Return the default output directory of this evaluation script.

    A truthy ``tag`` yields ``results/evaluation_time_conditioned_<tag>``;
    ``None`` or an empty tag yields ``results/evaluation_time_conditioned``.
    """
    if not tag:
        return Path("results/evaluation_time_conditioned")
    results_root = Path("results")
    return results_root / f"evaluation_time_conditioned_{tag}"


def percentile_summary(values: np.ndarray) -> dict:
    """Summarize a distribution by its extremes and a fixed set of percentiles.

    The input is cast to float64 and flattened before any statistic is taken.

    Returns:
        A dict with the keys ``min``, ``p05``, ``p25``, ``median``, ``p75``,
        ``p90``, ``p95`` and ``max`` (in that order). Every value is ``None``
        when the input holds no elements.
    """
    flat = np.asarray(values, dtype=np.float64).reshape(-1)
    labels = ("min", "p05", "p25", "median", "p75", "p90", "p95", "max")
    if flat.size == 0:
        return dict.fromkeys(labels)

    percentile_levels = (
        ("p05", 5),
        ("p25", 25),
        ("median", 50),
        ("p75", 75),
        ("p90", 90),
        ("p95", 95),
    )
    summary = {"min": float(np.min(flat))}
    for label, level in percentile_levels:
        summary[label] = float(np.percentile(flat, level))
    summary["max"] = float(np.max(flat))
    return summary


def point_metrics(true: np.ndarray, pred: np.ndarray) -> dict:
    """Compute pooled error metrics over all predicted points.

    Both inputs are cast to float64 and the residual is ``pred - true``.

    Returns:
        A dict with ``rmse``, ``mae``, ``bias`` (mean signed residual) and the
        90th / 95th percentiles of the absolute residual (``p90_abs``, ``p95_abs``).
    """
    target = np.asarray(true, dtype=np.float64)
    residual = np.asarray(pred, dtype=np.float64) - target
    magnitude = np.abs(residual)

    metrics = {}
    metrics["rmse"] = float(np.sqrt(np.mean(residual**2)))
    metrics["mae"] = float(np.mean(magnitude))
    metrics["bias"] = float(np.mean(residual))
    metrics["p90_abs"] = float(np.percentile(magnitude, 90))
    metrics["p95_abs"] = float(np.percentile(magnitude, 95))
    return metrics


def sample_metrics(true_p: np.ndarray, pred_p: np.ndarray, true_d: np.ndarray, pred_d: np.ndarray) -> list[dict]:
    """Compute per-sample error metrics, one row per entry along the first axis.

    Each row holds the sample index, the RMSE and MAE of the pressure and the
    derivative residuals (``pred - true``), and a combined ``score`` defined as
    ``rmse_p + 2.0 * rmse_d``.
    """

    def _rmse(residual: np.ndarray) -> float:
        return float(np.sqrt(np.mean(residual**2)))

    def _mae(residual: np.ndarray) -> float:
        return float(np.mean(np.abs(residual)))

    rows: list[dict] = []
    n_samples = true_p.shape[0]
    for idx in range(n_samples):
        residual_p = pred_p[idx] - true_p[idx]
        residual_d = pred_d[idx] - true_d[idx]
        entry = {
            "idx": idx,
            "rmse_p": _rmse(residual_p),
            "rmse_d": _rmse(residual_d),
            "mae_p": _mae(residual_p),
            "mae_d": _mae(residual_d),
        }
        # The derivative error is weighted twice as much as the pressure error.
        entry["score"] = float(entry["rmse_p"] + 2.0 * entry["rmse_d"])
        rows.append(entry)
    return rows


def write_csv(path: Path, rows: list[dict], fieldnames: list[str] | None = None) -> None:
    """Write dict rows to a CSV file encoded as UTF-8 with BOM.

    The parent directory is created when missing. The header is ``fieldnames``
    when given (and non-empty), otherwise the keys of the first row; row keys
    that are not part of the header are dropped. With no rows, an empty file is
    written and ``fieldnames`` is not used.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    if rows:
        header = fieldnames or list(rows[0])
        with path.open("w", newline="", encoding="utf-8-sig") as handle:
            table = csv.DictWriter(handle, fieldnames=header, extrasaction="ignore")
            table.writeheader()
            table.writerows(rows)
    else:
        path.write_text("", encoding="utf-8-sig")


def iter_batches(total: int, batch_size: int) -> Iterable[tuple[int, int]]:
    """Yield consecutive half-open ``(start, end)`` index ranges covering ``[0, total)``.

    ``batch_size`` is clamped to at least 1; the last range may be shorter than
    the others. Nothing is yielded when ``total`` is not positive.
    """
    step = max(1, int(batch_size))
    limit = int(total)
    lower = 0
    while lower < limit:
        yield lower, min(lower + step, limit)
        lower += step


def load_model(model_path: Path, device: torch.device) -> tuple[TimeConditionedSurrogate, dict]:
    """Rebuild the surrogate network from a checkpoint and prepare it for inference.

    The checkpoint is deserialized onto the CPU, the network is constructed from
    the dimensions and hyper-parameters stored in it (``use_schedule`` defaults
    to ``True`` when the key is absent), the saved weights are loaded, and the
    model is switched to eval mode and moved to ``device``.

    Returns:
        ``(model, checkpoint)`` where ``checkpoint`` is the loaded dict.
    """
    checkpoint = torch.load(model_path, map_location="cpu")

    # Constructor arguments are taken from the checkpoint, coerced to plain Python types.
    integer_fields = ("param_dim", "schedule_dim", "time_dim", "hidden_dim", "n_blocks")
    network_kwargs = {field: int(checkpoint[field]) for field in integer_fields}
    network_kwargs["dropout"] = float(checkpoint["dropout"])
    network_kwargs["use_schedule"] = bool(checkpoint.get("use_schedule", True))

    model = TimeConditionedSurrogate(**network_kwargs)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()
    model.to(device)
    return model, checkpoint


def predict_scaled_points(
    model: TimeConditionedSurrogate,
    params_x: np.ndarray,
    schedule_x: np.ndarray,
    time_x: np.ndarray,
    device: torch.device,
    batch_size: int,
) -> np.ndarray:
    """Run batched point-wise inference in the scaled (model input/output) space.

    Every curve is flattened into ``n_samples * n_time`` points. Each point pairs
    one row of ``time_x`` with the parameter (and, when the model uses it, the
    schedule) row of the sample it belongs to.

    The per-sample rows are gathered batch by batch instead of being repeated
    ``n_time`` times up front, so peak memory for the inputs scales with
    ``batch_size`` rather than with the total number of points.

    Args:
        model: Callable as ``model(params, time, schedule)``; its ``use_schedule``
            attribute decides whether the schedule tensor is built or ``None`` is passed.
        params_x: Per-sample parameter features, one row per sample.
        schedule_x: Per-sample schedule features, one row per sample.
        time_x: Time features of shape ``(n_samples, n_time, time_dim)``.
        device: Device on which the input tensors are created.
        batch_size: Number of points per forward pass.

    Returns:
        A float32 array of shape ``(n_samples, n_time, 2)`` with the model outputs.
    """
    n_samples, n_time, time_dim = time_x.shape
    n_points = n_samples * n_time
    params_rows = np.asarray(params_x)
    schedule_rows = np.asarray(schedule_x)
    time_flat = time_x.reshape(n_points, time_dim)

    pred_flat = np.empty((n_points, 2), dtype=np.float32)
    use_schedule = bool(model.use_schedule)
    with torch.no_grad():
        for start, end in iter_batches(n_points, batch_size):
            # Flattened point i belongs to sample i // n_time, which selects the
            # same rows as np.repeat(x, n_time, axis=0)[start:end] would.
            sample_of_point = np.arange(start, end) // n_time
            params_t = torch.tensor(params_rows[sample_of_point], dtype=torch.float32, device=device)
            time_t = torch.tensor(time_flat[start:end], dtype=torch.float32, device=device)
            if use_schedule:
                schedule_t = torch.tensor(schedule_rows[sample_of_point], dtype=torch.float32, device=device)
            else:
                schedule_t = None
            pred_flat[start:end] = model(params_t, time_t, schedule_t).detach().cpu().numpy()
    return pred_flat.reshape(n_samples, n_time, 2)


def inverse_curve_part(values_scaled: np.ndarray, scaler_curve: object, part_slice: slice) -> np.ndarray:
    """Undo the curve scaling for one segment of the curve vector.

    The entries of ``scaler_curve.mean_`` and ``scaler_curve.scale_`` selected by
    ``part_slice`` are broadcast as a single row over ``values_scaled``; the
    result is ``values_scaled * scale + mean`` computed in float32.
    """
    offset = np.asarray(scaler_curve.mean_[part_slice], dtype=np.float32).reshape(1, -1)
    gain = np.asarray(scaler_curve.scale_[part_slice], dtype=np.float32).reshape(1, -1)
    scaled = values_scaled.astype(np.float32)
    return scaled * gain + offset


def recover_raw_params(data: dict) -> dict[str, np.ndarray]:
    """Map the scaled test parameter features back to named raw parameter columns.

    ``data["X_params_test"]`` is first un-scaled with ``data["scaler_params"]`` and
    then passed through ``inverse_transform_param_features`` together with the
    ``param_feature_transform`` entry of the dataset metadata. Column names come
    from ``meta["param_names"]`` and fall back to
    ``k, skin, wellboreC, phi, h, Cf``; names beyond the number of available
    columns are ignored.

    Returns:
        A dict mapping each parameter name to its float64 column.
    """
    meta = data.get("meta", {}) or {}
    scaler = data["scaler_params"]
    features = scaler.inverse_transform(data["X_params_test"])
    raw = inverse_transform_param_features(features, meta.get("param_feature_transform"))

    fallback_names = ["k", "skin", "wellboreC", "phi", "h", "Cf"]
    names = list(meta.get("param_names") or fallback_names)

    recovered: dict[str, np.ndarray] = {}
    for column, name in zip(range(raw.shape[1]), names):
        recovered[name] = raw[:, column].astype(np.float64)
    return recovered


def build_pso_mask(params: dict[str, np.ndarray], args: argparse.Namespace) -> np.ndarray:
    """Flag the samples whose parameters all lie inside the PSO bounds.

    For each of ``k``, ``skin``, ``wellboreC``, ``phi`` and ``h`` the value must
    satisfy ``pso_<name>_min <= value <= pso_<name>_max`` (both ends inclusive),
    with the bounds read from ``args``.

    Returns:
        A boolean array that is ``True`` where every bound is satisfied.
    """
    inside = None
    for name in ("k", "skin", "wellboreC", "phi", "h"):
        lower = float(getattr(args, f"pso_{name}_min"))
        upper = float(getattr(args, f"pso_{name}_max"))
        values = params[name]
        within = (values >= lower) & (values <= upper)
        inside = within if inside is None else inside & within
    return inside


def summarize_group(score: np.ndarray, rmse_p: np.ndarray, rmse_d: np.ndarray, mask: np.ndarray) -> dict:
    """Summarize the error metrics of the samples selected by ``mask``.

    Returns:
        A dict with the group size ``n``, percentile summaries of ``score``,
        ``rmse_p`` and ``rmse_d`` inside the group, and the fraction of group
        members whose score exceeds 1, 2 and 5. The fractions are ``None`` when
        the group is empty.
    """
    selector = np.asarray(mask, dtype=bool)
    group_scores = score[selector]
    has_members = bool(np.any(selector))

    def _share_above(threshold: float):
        # An empty group has no meaningful ratio.
        if not has_members:
            return None
        return float(np.mean(group_scores > threshold))

    return {
        "n": int(np.sum(selector)),
        "score": percentile_summary(group_scores),
        "rmse_p": percentile_summary(rmse_p[selector]),
        "rmse_d": percentile_summary(rmse_d[selector]),
        "score_gt_1_ratio": _share_above(1.0),
        "score_gt_2_ratio": _share_above(2.0),
        "score_gt_5_ratio": _share_above(5.0),
    }


def build_domain_summary(sample_rows: list[dict], params: dict[str, np.ndarray], pso_mask: np.ndarray) -> dict:
    """Build grouped error summaries around the PSO parameter domain.

    Groups: all samples, samples inside / outside the PSO domain, and in-domain
    subsets with ``skin < -5``, ``skin < -8`` and ``skin < -5`` combined with
    ``wellboreC > 0.1``. The ``top100`` entry counts, among the (up to) 100
    highest-score samples, how many are outside the domain or hit the listed
    parameter conditions.
    """
    score = np.asarray([row["score"] for row in sample_rows], dtype=np.float64)
    rmse_p = np.asarray([row["rmse_p"] for row in sample_rows], dtype=np.float64)
    rmse_d = np.asarray([row["rmse_d"] for row in sample_rows], dtype=np.float64)
    skin = params["skin"]
    wellboreC = params["wellboreC"]

    def _group(mask: np.ndarray) -> dict:
        return summarize_group(score, rmse_p, rmse_d, mask)

    everything = np.ones_like(pso_mask, dtype=bool)
    low_skin = pso_mask & (skin < -5.0)
    very_low_skin = pso_mask & (skin < -8.0)
    low_skin_high_c = pso_mask & (skin < -5.0) & (wellboreC > 0.1)

    # Indices of the highest-score samples, worst first.
    ranking = np.argsort(-score)
    top100 = ranking[: min(100, ranking.size)]
    top100_counts = {
        "outside_pso_domain": int(np.sum(~pso_mask[top100])),
        "k_lt_0_001": int(np.sum(params["k"][top100] < 0.001)),
        "k_gt_10": int(np.sum(params["k"][top100] > 10.0)),
        "h_gt_50": int(np.sum(params["h"][top100] > 50.0)),
        "pso_skin_lt_minus_5_wellboreC_gt_0_1": int(np.sum(low_skin_high_c[top100])),
    }

    return {
        "all": _group(everything),
        "pso_domain": _group(pso_mask),
        "outside_pso_domain": _group(~pso_mask),
        "pso_skin_lt_minus_5": _group(low_skin),
        "pso_skin_lt_minus_8": _group(very_low_skin),
        "pso_skin_lt_minus_5_wellboreC_gt_0_1": _group(low_skin_high_c),
        "top100": top100_counts,
    }


def summarize_params_for_indices(params: dict[str, np.ndarray], indices: np.ndarray) -> dict:
    """Return percentile summaries of the known parameters at the given sample indices.

    Only the parameters named ``k``, ``skin``, ``wellboreC``, ``phi``, ``h`` and
    ``Cf`` are summarized; any other entry of ``params`` is skipped.
    """
    reported = {"k", "skin", "wellboreC", "phi", "h", "Cf"}
    summary = {}
    for name, values in params.items():
        if name not in reported:
            continue
        picked = values[np.asarray(indices, dtype=int)]
        summary[name] = percentile_summary(picked)
    return summary


def build_worst_case_summary(
    sample_rows: list[dict],
    params: dict[str, np.ndarray],
    pso_mask: np.ndarray,
    top_k: int,
) -> dict:
    """Summarize the score distribution and the parameters of the worst samples.

    Returns:
        A dict with the effective ``top_k``, overall score statistics (percentile
        summary plus counts above 1, 2 and 5), PSO-domain counts (overall and
        within the 100 highest-score samples), and parameter summaries for all
        samples, the worst ``top_k``, the worst 100 and the best 100.
    """
    score = np.asarray([row["score"] for row in sample_rows], dtype=np.float64)
    worst_first = np.argsort(-score)
    best_first = np.argsort(score)

    top = worst_first[: min(int(top_k), worst_first.size)]
    worst100 = worst_first[: min(100, worst_first.size)]
    best100 = best_first[: min(100, best_first.size)]

    score_metrics = {
        "score": percentile_summary(score),
        "n_score_gt_1": int(np.sum(score > 1.0)),
        "n_score_gt_2": int(np.sum(score > 2.0)),
        "n_score_gt_5": int(np.sum(score > 5.0)),
    }
    domain_counts = {
        "n_inside": int(np.sum(pso_mask)),
        "n_outside": int(np.sum(~pso_mask)),
        "top100_outside": int(np.sum(~pso_mask[worst100])),
        "top100_k_lt_0_001": int(np.sum(params["k"][worst100] < 0.001)),
        "top100_k_gt_10": int(np.sum(params["k"][worst100] > 10.0)),
        "top100_h_gt_50": int(np.sum(params["h"][worst100] > 50.0)),
    }
    param_summaries = {
        "all": summarize_params_for_indices(params, np.arange(score.size)),
        "worst_top_k": summarize_params_for_indices(params, top),
        "worst100": summarize_params_for_indices(params, worst100),
        "best100": summarize_params_for_indices(params, best100),
    }

    return {
        "top_k": int(top.size),
        "metrics": score_metrics,
        "pso_domain": domain_counts,
        "params": param_summaries,
    }


def build_worst_case_rows(
    sample_rows: list[dict],
    params: dict[str, np.ndarray],
    data: dict,
    true_p: np.ndarray,
    true_d: np.ndarray,
    pred_p: np.ndarray,
    pred_d: np.ndarray,
    pso_mask: np.ndarray,
    top_k: int,
) -> tuple[list[dict], list[dict]]:
    """Build detail rows for the ``top_k`` highest-score samples, worst first.

    Returns:
        ``(case_rows, residual_rows)``. A case row carries the rank, sample
        index, per-sample metrics, the PSO-domain flag, the family name (empty
        string when ``data`` has no ``family_name_test``), the available raw
        parameters and any ``sched_<name>`` schedule metadata. A residual row
        carries mean / std / RMSE of the pressure and derivative residuals, the
        ``|mean| / rmse`` shift ratios, and the first / last residual values.
    """

    def _residual_stats(residual: np.ndarray) -> tuple[float, float, float]:
        # (mean, std, rmse) of one residual curve.
        rmse = float(np.sqrt(np.mean(residual**2)))
        return float(np.mean(residual)), float(np.std(residual)), rmse

    score = np.asarray([row["score"] for row in sample_rows], dtype=np.float64)
    n_keep = min(int(top_k), len(sample_rows))
    worst_first = np.argsort(-score)[:n_keep]

    meta = data.get("meta", {}) or {}
    schedule_meta_names = list(meta.get("schedule_meta_names") or [])
    family = data.get("family_name_test")
    schedule_meta = data.get("schedule_meta_test")

    case_rows: list[dict] = []
    residual_rows: list[dict] = []
    for rank, idx in enumerate(worst_first, start=1):
        metrics = sample_rows[idx]
        sample_score = float(score[idx])

        case = {
            "rank": rank,
            "idx": int(idx),
            "score": sample_score,
            "rmse_p": float(metrics["rmse_p"]),
            "rmse_d": float(metrics["rmse_d"]),
            "mae_p": float(metrics["mae_p"]),
            "mae_d": float(metrics["mae_d"]),
            "in_pso_domain": int(bool(pso_mask[idx])),
            "family": "" if family is None else str(family[idx]),
        }
        for name in ("k", "skin", "wellboreC", "phi", "h", "Cf"):
            if name in params:
                case[name] = float(params[name][idx])
        if schedule_meta is not None:
            for column, name in enumerate(schedule_meta_names):
                # Names beyond the stored metadata columns have no value to report.
                if column >= schedule_meta.shape[1]:
                    break
                case[f"sched_{name}"] = float(schedule_meta[idx, column])
        case_rows.append(case)

        p_res = pred_p[idx] - true_p[idx]
        d_res = pred_d[idx] - true_d[idx]
        p_mean, p_std, p_rmse = _residual_stats(p_res)
        d_mean, d_std, d_rmse = _residual_stats(d_res)
        residual_rows.append(
            {
                "rank": rank,
                "idx": int(idx),
                "score": sample_score,
                "p_res_mean": p_mean,
                "p_res_std": p_std,
                "p_res_rmse": p_rmse,
                # The denominator is floored to avoid dividing by a zero RMSE.
                "p_shift_ratio": float(abs(p_mean) / max(p_rmse, 1.0e-12)),
                "d_res_mean": d_mean,
                "d_res_std": d_std,
                "d_res_rmse": d_rmse,
                "d_shift_ratio": float(abs(d_mean) / max(d_rmse, 1.0e-12)),
                "p_res_first": float(p_res[0]),
                "p_res_last": float(p_res[-1]),
                "d_res_first": float(d_res[0]),
                "d_res_last": float(d_res[-1]),
            }
        )
    return case_rows, residual_rows


def plot_sample(
    output_path: Path,
    idx: int,
    t: np.ndarray,
    true_p: np.ndarray,
    pred_p: np.ndarray,
    true_d: np.ndarray,
    pred_d: np.ndarray,
    title: str,
) -> None:
    """Save a 2x2 comparison figure for one sample.

    Top row: pressure true-vs-predicted overlay and its residual. Bottom row:
    the same pair for the derivative. Every panel uses a logarithmic time axis.
    The parent directory of ``output_path`` is created when missing. ``idx`` is
    accepted for call-site symmetry but is not used inside the function.
    """
    x = np.asarray(t, dtype=np.float64)

    def _draw_overlay(ax, truth: np.ndarray, estimate: np.ndarray, heading: str) -> None:
        ax.plot(x, truth, label="True", linewidth=2)
        ax.plot(x, estimate, label="Pred", linewidth=2)
        ax.set_title(heading)
        ax.set_xscale("log")
        ax.grid(True, alpha=0.3)
        ax.legend()

    def _draw_residual(ax, residual: np.ndarray, heading: str) -> None:
        ax.plot(x, residual, linewidth=1.5)
        # Zero line as a visual reference for the sign of the residual.
        ax.axhline(0.0, linestyle="--", linewidth=1)
        ax.set_title(heading)
        ax.set_xscale("log")
        ax.grid(True, alpha=0.3)

    fig, axes = plt.subplots(2, 2, figsize=(13, 8))
    fig.suptitle(title)

    _draw_overlay(axes[0, 0], true_p, pred_p, "Log Pressure")
    _draw_residual(axes[0, 1], pred_p - true_p, "Pressure Residual")
    _draw_overlay(axes[1, 0], true_d, pred_d, "Log Derivative")
    _draw_residual(axes[1, 1], pred_d - true_d, "Derivative Residual")

    for panel in axes.flat:
        panel.set_xlabel("Time")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Leave the top 5% of the figure free for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close(fig)


def write_plots(
    output_dir: Path,
    sample_rows: list[dict],
    t_curve: np.ndarray,
    true_p: np.ndarray,
    true_d: np.ndarray,
    pred_p: np.ndarray,
    pred_d: np.ndarray,
    args: argparse.Namespace,
) -> None:
    """Plot random, best-score and worst-score samples into ``output_dir / "plots"``.

    The number of plots per group comes from ``args.n_random_plots``,
    ``args.n_best_plots`` and ``args.n_worst_plots``, each capped at the number
    of samples. The random group is drawn with the ``random`` module seeded by
    ``args.seed``.
    """
    plot_dir = output_dir / "plots"
    score = np.asarray([row["score"] for row in sample_rows], dtype=np.float64)
    n_available = len(sample_rows)

    random.seed(int(args.seed))
    n_random = min(int(args.n_random_plots), n_available)
    n_best = min(int(args.n_best_plots), n_available)
    n_worst = min(int(args.n_worst_plots), n_available)

    ascending = np.argsort(score)
    descending = np.argsort(-score)
    best = ascending[:n_best].tolist()
    worst = descending[:n_worst].tolist()
    random_idx = random.sample(range(n_available), n_random)

    def _render(idx: int, filename: str, heading: str) -> None:
        plot_sample(
            plot_dir / filename,
            idx,
            t_curve[idx],
            true_p[idx],
            pred_p[idx],
            true_d[idx],
            pred_d[idx],
            heading,
        )

    for idx in random_idx:
        _render(idx, f"sample_{idx:04d}.png", f"Random sample {idx} | score={score[idx]:.4f}")
    for idx in best:
        _render(idx, f"best_sample_{idx:04d}.png", f"Best sample {idx} | score={score[idx]:.4f}")
    for idx in worst:
        _render(idx, f"worst_sample_{idx:04d}.png", f"Worst sample {idx} | score={score[idx]:.4f}")


def main() -> None:
    """Evaluate the point-wise time-conditioned surrogate on the test split.

    Errors are computed both in the scaled (training) space and in the raw log
    space obtained by undoing the curve scaler. The run writes, under the output
    directory: ``summary_metrics.json``, ``sample_metrics.csv``,
    ``worst_case_analysis.csv``, ``worst_residual_analysis.csv``,
    ``worst_case_summary.json``, ``worst_residual_summary.json`` and, when the
    dataset stores time coordinates matching the curve shape, figures in ``plots/``.

    Raises:
        KeyError: If the processed dataset lacks any of the time-conditioned
            test fields required for evaluation.
    """
    args = parse_args()
    tag = normalize_tag(args.tag)
    processed_path = Path(args.processed) if args.processed is not None else processed_path_for_tag(tag)
    model_path = Path(args.model) if args.model is not None else default_model_path(tag)
    output_dir = Path(args.output_dir) if args.output_dir is not None else default_output_dir(tag)
    output_dir.mkdir(parents=True, exist_ok=True)

    device_name = args.device or ("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device(device_name)

    print("Loading processed dataset...")
    data = joblib.load(processed_path)
    required = ["X_params_test", "X_schedule_test", "X_time_test", "Y_curve_test"]
    missing = [key for key in required if key not in data]
    if missing:
        # The time-conditioned model needs per-point time features; older processed
        # datasets that lack these fields cannot be evaluated.
        raise KeyError(f"processed dataset is missing time-conditioned fields: {missing}")

    print("Loading model...")
    model, checkpoint = load_model(model_path, device)
    curve_layout = checkpoint.get("curve_layout") or infer_curve_layout(data)
    slices = get_part_slices(curve_layout)

    x_params = np.asarray(data["X_params_test"], dtype=np.float32)
    x_schedule = np.asarray(data["X_schedule_test"], dtype=np.float32)
    x_time = np.asarray(data["X_time_test"], dtype=np.float32)
    y_curve = np.asarray(data["Y_curve_test"], dtype=np.float32)
    scaler_curve = data["scaler_curve"]

    print(
        f"test={x_params.shape[0]}, n_time={x_time.shape[1]}, "
        f"param_dim={x_params.shape[1]}, schedule_dim={x_schedule.shape[1]}, time_dim={x_time.shape[-1]}"
    )
    print(f"device={device}, batch_size={args.batch_size}")

    pred_scaled = predict_scaled_points(
        model=model,
        params_x=x_params,
        schedule_x=x_schedule,
        time_x=x_time,
        device=device,
        batch_size=int(args.batch_size),
    )

    p_slice = slices["log_pressure"]
    d_slice = slices["log_derivative"]
    # The time-conditioned model predicts only the pressure and derivative channels;
    # it does not directly predict the auxiliary slope.
    true_p_scaled = y_curve[:, p_slice]
    true_d_scaled = y_curve[:, d_slice]
    pred_p_scaled = pred_scaled[:, :, 0]
    pred_d_scaled = pred_scaled[:, :, 1]

    true_p = inverse_curve_part(true_p_scaled, scaler_curve, p_slice)
    true_d = inverse_curve_part(true_d_scaled, scaler_curve, d_slice)
    pred_p = inverse_curve_part(pred_p_scaled, scaler_curve, p_slice)
    pred_d = inverse_curve_part(pred_d_scaled, scaler_curve, d_slice)

    # "scaled" metrics measure the error in the space the model was trained in;
    # "raw" metrics measure it after undoing the curve scaler.
    summary = {
        "processed_path": str(processed_path),
        "model_path": str(model_path),
        "device": str(device),
        "checkpoint": {
            "hidden_dim": int(checkpoint["hidden_dim"]),
            "n_blocks": int(checkpoint["n_blocks"]),
            "dropout": float(checkpoint["dropout"]),
            "use_schedule": bool(checkpoint.get("use_schedule", True)),
        },
        "scaled_log_pressure": point_metrics(true_p_scaled, pred_p_scaled),
        "scaled_log_derivative": point_metrics(true_d_scaled, pred_d_scaled),
        "raw_log_pressure": point_metrics(true_p, pred_p),
        "raw_log_derivative": point_metrics(true_d, pred_d),
    }

    rows = sample_metrics(true_p=true_p, pred_p=pred_p, true_d=true_d, pred_d=pred_d)
    params = recover_raw_params(data)
    pso_mask = build_pso_mask(params, args)
    domain_summary = build_domain_summary(rows, params, pso_mask)
    # The PSO domain is reported separately to judge whether the model is stable
    # inside the parameter range commonly used by automatic fitting.
    summary["pso_domain"] = {
        "bounds": {
            "k": [float(args.pso_k_min), float(args.pso_k_max)],
            "skin": [float(args.pso_skin_min), float(args.pso_skin_max)],
            "wellboreC": [float(args.pso_wellboreC_min), float(args.pso_wellboreC_max)],
            "phi": [float(args.pso_phi_min), float(args.pso_phi_max)],
            "h": [float(args.pso_h_min), float(args.pso_h_max)],
        },
        "metrics": domain_summary,
    }

    case_rows, residual_rows = build_worst_case_rows(
        sample_rows=rows,
        params=params,
        data=data,
        true_p=true_p,
        true_d=true_d,
        pred_p=pred_p,
        pred_d=pred_d,
        pso_mask=pso_mask,
        top_k=int(args.top_k_analysis),
    )
    worst_case_summary = build_worst_case_summary(
        sample_rows=rows,
        params=params,
        pso_mask=pso_mask,
        top_k=int(args.top_k_analysis),
    )

    # Outputs are split into overall metrics, per-sample metrics, worst cases and
    # worst residuals so that error sources can be traced layer by layer.
    (output_dir / "summary_metrics.json").write_text(
        json.dumps(summary, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    write_csv(output_dir / "sample_metrics.csv", rows)
    write_csv(output_dir / "worst_case_analysis.csv", case_rows)
    write_csv(output_dir / "worst_residual_analysis.csv", residual_rows)
    (output_dir / "worst_case_summary.json").write_text(
        json.dumps(worst_case_summary, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )

    def _median_or_none(values: list[float]) -> float | None:
        # np.median of an empty list is NaN, which json.dumps would emit as the
        # non-standard token NaN; report null for an empty selection instead.
        return float(np.median(values)) if values else None

    p_shift = [r["p_shift_ratio"] for r in residual_rows]
    d_shift = [r["d_shift_ratio"] for r in residual_rows]
    residual_summary = {
        # The shift ratio shows whether a bad sample is mostly a whole-curve offset.
        # The "top300" keys cover all analysed rows (their count is "top_k").
        "top_k": int(len(residual_rows)),
        "top300_p_shift_ratio_median": _median_or_none(p_shift),
        "top300_d_shift_ratio_median": _median_or_none(d_shift),
        "top100_p_shift_ratio_median": _median_or_none(p_shift[:100]),
        "top100_d_shift_ratio_median": _median_or_none(d_shift[:100]),
        "top20": residual_rows[:20],
    }
    (output_dir / "worst_residual_summary.json").write_text(
        json.dumps(residual_summary, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )

    t_curve = np.asarray(data.get("T_curve_test"), dtype=np.float32)
    if t_curve.ndim == 2 and t_curve.shape == true_p.shape:
        # Curves are plotted only when the dataset stores real time coordinates
        # whose shape matches the curves.
        write_plots(output_dir, rows, t_curve, true_p, true_d, pred_p, pred_d, args)

    print("\nEvaluation complete.")
    print(f"raw_log_pressure RMSE={summary['raw_log_pressure']['rmse']:.6f}, MAE={summary['raw_log_pressure']['mae']:.6f}")
    print(f"raw_log_derivative RMSE={summary['raw_log_derivative']['rmse']:.6f}, MAE={summary['raw_log_derivative']['mae']:.6f}")
    pso_stats = domain_summary["pso_domain"]
    if pso_stats["n"] > 0:
        print(
            "PSO-domain: "
            f"n={pso_stats['n']}, "
            f"median={pso_stats['score']['median']:.6f}, "
            f"p95={pso_stats['score']['p95']:.6f}, "
            f"score>1={100.0 * pso_stats['score_gt_1_ratio']:.3f}%"
        )
    else:
        # With no in-domain samples the summary values are None and cannot be
        # formatted as floats; report the empty group instead of crashing.
        print("PSO-domain: n=0 (no test samples inside the configured PSO bounds)")
    print(f"Artifacts written to: {output_dir}")


if __name__ == "__main__":
    main()