from __future__ import annotations import argparse import csv import json import random import sys from pathlib import Path from typing import Iterable import joblib import matplotlib.pyplot as plt import numpy as np import torch ROOT = Path(__file__).resolve().parents[1] sys.path.append(str(ROOT)) from src.common.experiment_paths import normalize_tag, processed_path_for_tag from src.data.param_features import inverse_transform_param_features from src.models.time_conditioned_surrogate import TimeConditionedSurrogate from src.training.train_forward import get_part_slices, infer_curve_layout DEFAULT_RANDOM_SEED = 42 DEFAULT_PSO_DOMAIN = { "k_min": 0.001, "k_max": 10.0, "skin_min": -10.0, "skin_max": 10.0, "wellboreC_min": 1.0e-4, "wellboreC_max": 2.0, "phi_min": 0.01, "phi_max": 0.5, "h_min": 2.0, "h_max": 50.0, } def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Evaluate a time-conditioned point-wise surrogate") parser.add_argument("--processed", type=str, default=None, help="Processed dataset path") parser.add_argument("--tag", type=str, default=None, help="Experiment tag for auto naming") parser.add_argument("--model", type=str, default=None, help="Model checkpoint path") parser.add_argument("--output-dir", type=str, default=None, help="Optional evaluation output directory") parser.add_argument("--batch-size", type=int, default=65536, help="Point batch size for inference") parser.add_argument("--device", type=str, default=None, help="Override device, e.g. cpu or cuda") parser.add_argument("--seed", type=int, default=DEFAULT_RANDOM_SEED) parser.add_argument("--n-random-plots", type=int, default=5) parser.add_argument("--n-best-plots", type=int, default=5) parser.add_argument("--n-worst-plots", type=int, default=10) parser.add_argument("--top-k-analysis", type=int, default=300) parser.add_argument("--pso-k-min", type=float, default=DEFAULT_PSO_DOMAIN["k_min"]) parser.add_argument("--pso-k-max", type=float, default=DEFAULT_PSO_DOMAIN["k_max"]) parser.add_argument("--pso-h-min", type=float, default=DEFAULT_PSO_DOMAIN["h_min"]) parser.add_argument("--pso-h-max", type=float, default=DEFAULT_PSO_DOMAIN["h_max"]) parser.add_argument("--pso-skin-min", type=float, default=DEFAULT_PSO_DOMAIN["skin_min"]) parser.add_argument("--pso-skin-max", type=float, default=DEFAULT_PSO_DOMAIN["skin_max"]) parser.add_argument("--pso-wellboreC-min", type=float, default=DEFAULT_PSO_DOMAIN["wellboreC_min"]) parser.add_argument("--pso-wellboreC-max", type=float, default=DEFAULT_PSO_DOMAIN["wellboreC_max"]) parser.add_argument("--pso-phi-min", type=float, default=DEFAULT_PSO_DOMAIN["phi_min"]) parser.add_argument("--pso-phi-max", type=float, default=DEFAULT_PSO_DOMAIN["phi_max"]) return parser.parse_args() def default_model_path(tag: str | None) -> Path: if tag: return Path("models") / f"time_conditioned_surrogate_{tag}" / "time_conditioned_surrogate_best.pt" return Path("models/time_conditioned_surrogate/time_conditioned_surrogate_best.pt") def default_output_dir(tag: str | None) -> Path: if tag: return Path("results") / f"evaluation_time_conditioned_{tag}" return Path("results/evaluation_time_conditioned") def percentile_summary(values: np.ndarray) -> dict: x = np.asarray(values, dtype=np.float64).reshape(-1) if x.size == 0: return { "min": None, "p05": None, "p25": None, "median": None, "p75": None, "p90": None, "p95": None, "max": None, } return { "min": float(np.min(x)), "p05": float(np.percentile(x, 5)), "p25": float(np.percentile(x, 25)), "median": float(np.percentile(x, 50)), "p75": float(np.percentile(x, 75)), "p90": float(np.percentile(x, 90)), "p95": float(np.percentile(x, 95)), "max": float(np.max(x)), } def point_metrics(true: np.ndarray, pred: np.ndarray) -> dict: err = np.asarray(pred, dtype=np.float64) - np.asarray(true, dtype=np.float64) abs_err = np.abs(err) return { "rmse": float(np.sqrt(np.mean(err**2))), "mae": float(np.mean(abs_err)), "bias": float(np.mean(err)), "p90_abs": float(np.percentile(abs_err, 90)), "p95_abs": float(np.percentile(abs_err, 95)), } def sample_metrics(true_p: np.ndarray, pred_p: np.ndarray, true_d: np.ndarray, pred_d: np.ndarray) -> list[dict]: rows: list[dict] = [] for idx in range(true_p.shape[0]): p_err = pred_p[idx] - true_p[idx] d_err = pred_d[idx] - true_d[idx] rmse_p = float(np.sqrt(np.mean(p_err**2))) rmse_d = float(np.sqrt(np.mean(d_err**2))) mae_p = float(np.mean(np.abs(p_err))) mae_d = float(np.mean(np.abs(d_err))) rows.append( { "idx": idx, "rmse_p": rmse_p, "rmse_d": rmse_d, "mae_p": mae_p, "mae_d": mae_d, "score": float(rmse_p + 2.0 * rmse_d), } ) return rows def write_csv(path: Path, rows: list[dict], fieldnames: list[str] | None = None) -> None: path.parent.mkdir(parents=True, exist_ok=True) if not rows: path.write_text("", encoding="utf-8-sig") return names = fieldnames or list(rows[0].keys()) with path.open("w", newline="", encoding="utf-8-sig") as f: writer = csv.DictWriter(f, fieldnames=names, extrasaction="ignore") writer.writeheader() writer.writerows(rows) def iter_batches(total: int, batch_size: int) -> Iterable[tuple[int, int]]: batch = max(1, int(batch_size)) for start in range(0, int(total), batch): yield start, min(start + batch, int(total)) def load_model(model_path: Path, device: torch.device) -> tuple[TimeConditionedSurrogate, dict]: checkpoint = torch.load(model_path, map_location="cpu") model = TimeConditionedSurrogate( param_dim=int(checkpoint["param_dim"]), schedule_dim=int(checkpoint["schedule_dim"]), time_dim=int(checkpoint["time_dim"]), hidden_dim=int(checkpoint["hidden_dim"]), n_blocks=int(checkpoint["n_blocks"]), dropout=float(checkpoint["dropout"]), use_schedule=bool(checkpoint.get("use_schedule", True)), ) model.load_state_dict(checkpoint["model_state_dict"]) model.eval() model.to(device) return model, checkpoint def predict_scaled_points( model: TimeConditionedSurrogate, params_x: np.ndarray, schedule_x: np.ndarray, time_x: np.ndarray, device: torch.device, batch_size: int, ) -> np.ndarray: n_samples, n_time, time_dim = time_x.shape params_flat = np.repeat(params_x, n_time, axis=0) schedule_flat = np.repeat(schedule_x, n_time, axis=0) time_flat = time_x.reshape(n_samples * n_time, time_dim) pred_flat = np.empty((n_samples * n_time, 2), dtype=np.float32) use_schedule = bool(model.use_schedule) with torch.no_grad(): for start, end in iter_batches(len(time_flat), batch_size): params_t = torch.tensor(params_flat[start:end], dtype=torch.float32, device=device) time_t = torch.tensor(time_flat[start:end], dtype=torch.float32, device=device) if use_schedule: schedule_t = torch.tensor(schedule_flat[start:end], dtype=torch.float32, device=device) else: schedule_t = None pred_flat[start:end] = model(params_t, time_t, schedule_t).detach().cpu().numpy() return pred_flat.reshape(n_samples, n_time, 2) def inverse_curve_part(values_scaled: np.ndarray, scaler_curve: object, part_slice: slice) -> np.ndarray: mean = np.asarray(scaler_curve.mean_[part_slice], dtype=np.float32) scale = np.asarray(scaler_curve.scale_[part_slice], dtype=np.float32) return values_scaled.astype(np.float32) * scale.reshape(1, -1) + mean.reshape(1, -1) def recover_raw_params(data: dict) -> dict[str, np.ndarray]: meta = data.get("meta", {}) or {} features = data["scaler_params"].inverse_transform(data["X_params_test"]) raw = inverse_transform_param_features(features, meta.get("param_feature_transform")) names = list(meta.get("param_names") or ["k", "skin", "wellboreC", "phi", "h", "Cf"]) return {name: raw[:, idx].astype(np.float64) for idx, name in enumerate(names[: raw.shape[1]])} def build_pso_mask(params: dict[str, np.ndarray], args: argparse.Namespace) -> np.ndarray: return ( (params["k"] >= float(args.pso_k_min)) & (params["k"] <= float(args.pso_k_max)) & (params["skin"] >= float(args.pso_skin_min)) & (params["skin"] <= float(args.pso_skin_max)) & (params["wellboreC"] >= float(args.pso_wellboreC_min)) & (params["wellboreC"] <= float(args.pso_wellboreC_max)) & (params["phi"] >= float(args.pso_phi_min)) & (params["phi"] <= float(args.pso_phi_max)) & (params["h"] >= float(args.pso_h_min)) & (params["h"] <= float(args.pso_h_max)) ) def summarize_group(score: np.ndarray, rmse_p: np.ndarray, rmse_d: np.ndarray, mask: np.ndarray) -> dict: m = np.asarray(mask, dtype=bool) return { "n": int(np.sum(m)), "score": percentile_summary(score[m]), "rmse_p": percentile_summary(rmse_p[m]), "rmse_d": percentile_summary(rmse_d[m]), "score_gt_1_ratio": float(np.mean(score[m] > 1.0)) if np.any(m) else None, "score_gt_2_ratio": float(np.mean(score[m] > 2.0)) if np.any(m) else None, "score_gt_5_ratio": float(np.mean(score[m] > 5.0)) if np.any(m) else None, } def build_domain_summary(sample_rows: list[dict], params: dict[str, np.ndarray], pso_mask: np.ndarray) -> dict: score = np.asarray([r["score"] for r in sample_rows], dtype=np.float64) rmse_p = np.asarray([r["rmse_p"] for r in sample_rows], dtype=np.float64) rmse_d = np.asarray([r["rmse_d"] for r in sample_rows], dtype=np.float64) skin = params["skin"] wellboreC = params["wellboreC"] order = np.argsort(-score) top100 = order[: min(100, order.size)] return { "all": summarize_group(score, rmse_p, rmse_d, np.ones_like(pso_mask, dtype=bool)), "pso_domain": summarize_group(score, rmse_p, rmse_d, pso_mask), "outside_pso_domain": summarize_group(score, rmse_p, rmse_d, ~pso_mask), "pso_skin_lt_minus_5": summarize_group(score, rmse_p, rmse_d, pso_mask & (skin < -5.0)), "pso_skin_lt_minus_8": summarize_group(score, rmse_p, rmse_d, pso_mask & (skin < -8.0)), "pso_skin_lt_minus_5_wellboreC_gt_0_1": summarize_group( score, rmse_p, rmse_d, pso_mask & (skin < -5.0) & (wellboreC > 0.1), ), "top100": { "outside_pso_domain": int(np.sum(~pso_mask[top100])), "k_lt_0_001": int(np.sum(params["k"][top100] < 0.001)), "k_gt_10": int(np.sum(params["k"][top100] > 10.0)), "h_gt_50": int(np.sum(params["h"][top100] > 50.0)), "pso_skin_lt_minus_5_wellboreC_gt_0_1": int( np.sum((pso_mask & (skin < -5.0) & (wellboreC > 0.1))[top100]) ), }, } def summarize_params_for_indices(params: dict[str, np.ndarray], indices: np.ndarray) -> dict: return { name: percentile_summary(values[np.asarray(indices, dtype=int)]) for name, values in params.items() if name in {"k", "skin", "wellboreC", "phi", "h", "Cf"} } def build_worst_case_summary( sample_rows: list[dict], params: dict[str, np.ndarray], pso_mask: np.ndarray, top_k: int, ) -> dict: score = np.asarray([r["score"] for r in sample_rows], dtype=np.float64) order_worst = np.argsort(-score) order_best = np.argsort(score) top = order_worst[: min(int(top_k), order_worst.size)] worst100 = order_worst[: min(100, order_worst.size)] best100 = order_best[: min(100, order_best.size)] return { "top_k": int(top.size), "metrics": { "score": percentile_summary(score), "n_score_gt_1": int(np.sum(score > 1.0)), "n_score_gt_2": int(np.sum(score > 2.0)), "n_score_gt_5": int(np.sum(score > 5.0)), }, "pso_domain": { "n_inside": int(np.sum(pso_mask)), "n_outside": int(np.sum(~pso_mask)), "top100_outside": int(np.sum(~pso_mask[worst100])), "top100_k_lt_0_001": int(np.sum(params["k"][worst100] < 0.001)), "top100_k_gt_10": int(np.sum(params["k"][worst100] > 10.0)), "top100_h_gt_50": int(np.sum(params["h"][worst100] > 50.0)), }, "params": { "all": summarize_params_for_indices(params, np.arange(score.size)), "worst_top_k": summarize_params_for_indices(params, top), "worst100": summarize_params_for_indices(params, worst100), "best100": summarize_params_for_indices(params, best100), }, } def build_worst_case_rows( sample_rows: list[dict], params: dict[str, np.ndarray], data: dict, true_p: np.ndarray, true_d: np.ndarray, pred_p: np.ndarray, pred_d: np.ndarray, pso_mask: np.ndarray, top_k: int, ) -> tuple[list[dict], list[dict]]: score = np.asarray([r["score"] for r in sample_rows], dtype=np.float64) order = np.argsort(-score)[: min(int(top_k), len(sample_rows))] family = data.get("family_name_test") schedule_meta = data.get("schedule_meta_test") schedule_meta_names = list((data.get("meta", {}) or {}).get("schedule_meta_names") or []) case_rows: list[dict] = [] residual_rows: list[dict] = [] for rank, idx in enumerate(order, 1): p_res = pred_p[idx] - true_p[idx] d_res = pred_d[idx] - true_d[idx] p_rmse = float(np.sqrt(np.mean(p_res**2))) d_rmse = float(np.sqrt(np.mean(d_res**2))) p_mean = float(np.mean(p_res)) d_mean = float(np.mean(d_res)) p_std = float(np.std(p_res)) d_std = float(np.std(d_res)) row = { "rank": rank, "idx": int(idx), "score": float(score[idx]), "rmse_p": float(sample_rows[idx]["rmse_p"]), "rmse_d": float(sample_rows[idx]["rmse_d"]), "mae_p": float(sample_rows[idx]["mae_p"]), "mae_d": float(sample_rows[idx]["mae_d"]), "in_pso_domain": int(bool(pso_mask[idx])), "family": str(family[idx]) if family is not None else "", } for name in ["k", "skin", "wellboreC", "phi", "h", "Cf"]: if name in params: row[name] = float(params[name][idx]) if schedule_meta is not None: for midx, name in enumerate(schedule_meta_names): if midx < schedule_meta.shape[1]: row[f"sched_{name}"] = float(schedule_meta[idx, midx]) case_rows.append(row) residual_rows.append( { "rank": rank, "idx": int(idx), "score": float(score[idx]), "p_res_mean": p_mean, "p_res_std": p_std, "p_res_rmse": p_rmse, "p_shift_ratio": float(abs(p_mean) / max(p_rmse, 1.0e-12)), "d_res_mean": d_mean, "d_res_std": d_std, "d_res_rmse": d_rmse, "d_shift_ratio": float(abs(d_mean) / max(d_rmse, 1.0e-12)), "p_res_first": float(p_res[0]), "p_res_last": float(p_res[-1]), "d_res_first": float(d_res[0]), "d_res_last": float(d_res[-1]), } ) return case_rows, residual_rows def plot_sample( output_path: Path, idx: int, t: np.ndarray, true_p: np.ndarray, pred_p: np.ndarray, true_d: np.ndarray, pred_d: np.ndarray, title: str, ) -> None: x = np.asarray(t, dtype=np.float64) fig, axes = plt.subplots(2, 2, figsize=(13, 8)) fig.suptitle(title) axes[0, 0].plot(x, true_p, label="True", linewidth=2) axes[0, 0].plot(x, pred_p, label="Pred", linewidth=2) axes[0, 0].set_title("Log Pressure") axes[0, 0].set_xscale("log") axes[0, 0].grid(True, alpha=0.3) axes[0, 0].legend() axes[0, 1].plot(x, pred_p - true_p, linewidth=1.5) axes[0, 1].axhline(0.0, linestyle="--", linewidth=1) axes[0, 1].set_title("Pressure Residual") axes[0, 1].set_xscale("log") axes[0, 1].grid(True, alpha=0.3) axes[1, 0].plot(x, true_d, label="True", linewidth=2) axes[1, 0].plot(x, pred_d, label="Pred", linewidth=2) axes[1, 0].set_title("Log Derivative") axes[1, 0].set_xscale("log") axes[1, 0].grid(True, alpha=0.3) axes[1, 0].legend() axes[1, 1].plot(x, pred_d - true_d, linewidth=1.5) axes[1, 1].axhline(0.0, linestyle="--", linewidth=1) axes[1, 1].set_title("Derivative Residual") axes[1, 1].set_xscale("log") axes[1, 1].grid(True, alpha=0.3) for ax in axes.ravel(): ax.set_xlabel("Time") output_path.parent.mkdir(parents=True, exist_ok=True) plt.tight_layout(rect=[0, 0, 1, 0.95]) plt.savefig(output_path, dpi=150, bbox_inches="tight") plt.close(fig) def write_plots( output_dir: Path, sample_rows: list[dict], t_curve: np.ndarray, true_p: np.ndarray, true_d: np.ndarray, pred_p: np.ndarray, pred_d: np.ndarray, args: argparse.Namespace, ) -> None: plot_dir = output_dir / "plots" score = np.asarray([r["score"] for r in sample_rows], dtype=np.float64) random.seed(int(args.seed)) n_random = min(int(args.n_random_plots), len(sample_rows)) n_best = min(int(args.n_best_plots), len(sample_rows)) n_worst = min(int(args.n_worst_plots), len(sample_rows)) best = np.argsort(score)[:n_best].tolist() worst = np.argsort(-score)[:n_worst].tolist() random_idx = random.sample(range(len(sample_rows)), n_random) for idx in random_idx: plot_sample( plot_dir / f"sample_{idx:04d}.png", idx, t_curve[idx], true_p[idx], pred_p[idx], true_d[idx], pred_d[idx], f"Random sample {idx} | score={score[idx]:.4f}", ) for idx in best: plot_sample( plot_dir / f"best_sample_{idx:04d}.png", idx, t_curve[idx], true_p[idx], pred_p[idx], true_d[idx], pred_d[idx], f"Best sample {idx} | score={score[idx]:.4f}", ) for idx in worst: plot_sample( plot_dir / f"worst_sample_{idx:04d}.png", idx, t_curve[idx], true_p[idx], pred_p[idx], true_d[idx], pred_d[idx], f"Worst sample {idx} | score={score[idx]:.4f}", ) def main() -> None: args = parse_args() tag = normalize_tag(args.tag) processed_path = Path(args.processed) if args.processed is not None else processed_path_for_tag(tag) model_path = Path(args.model) if args.model is not None else default_model_path(tag) output_dir = Path(args.output_dir) if args.output_dir is not None else default_output_dir(tag) output_dir.mkdir(parents=True, exist_ok=True) device_name = args.device or ("cuda" if torch.cuda.is_available() else "cpu") device = torch.device(device_name) print("Loading processed dataset...") data = joblib.load(processed_path) required = ["X_params_test", "X_schedule_test", "X_time_test", "Y_curve_test"] missing = [key for key in required if key not in data] if missing: raise KeyError(f"processed dataset is missing time-conditioned fields: {missing}") print("Loading model...") model, checkpoint = load_model(model_path, device) curve_layout = checkpoint.get("curve_layout") or infer_curve_layout(data) slices = get_part_slices(curve_layout) x_params = np.asarray(data["X_params_test"], dtype=np.float32) x_schedule = np.asarray(data["X_schedule_test"], dtype=np.float32) x_time = np.asarray(data["X_time_test"], dtype=np.float32) y_curve = np.asarray(data["Y_curve_test"], dtype=np.float32) scaler_curve = data["scaler_curve"] print( f"test={x_params.shape[0]}, n_time={x_time.shape[1]}, " f"param_dim={x_params.shape[1]}, schedule_dim={x_schedule.shape[1]}, time_dim={x_time.shape[-1]}" ) print(f"device={device}, batch_size={args.batch_size}") pred_scaled = predict_scaled_points( model=model, params_x=x_params, schedule_x=x_schedule, time_x=x_time, device=device, batch_size=int(args.batch_size), ) p_slice = slices["log_pressure"] d_slice = slices["log_derivative"] true_p_scaled = y_curve[:, p_slice] true_d_scaled = y_curve[:, d_slice] pred_p_scaled = pred_scaled[:, :, 0] pred_d_scaled = pred_scaled[:, :, 1] true_p = inverse_curve_part(true_p_scaled, scaler_curve, p_slice) true_d = inverse_curve_part(true_d_scaled, scaler_curve, d_slice) pred_p = inverse_curve_part(pred_p_scaled, scaler_curve, p_slice) pred_d = inverse_curve_part(pred_d_scaled, scaler_curve, d_slice) summary = { "processed_path": str(processed_path), "model_path": str(model_path), "device": str(device), "checkpoint": { "hidden_dim": int(checkpoint["hidden_dim"]), "n_blocks": int(checkpoint["n_blocks"]), "dropout": float(checkpoint["dropout"]), "use_schedule": bool(checkpoint.get("use_schedule", True)), }, "scaled_log_pressure": point_metrics(true_p_scaled, pred_p_scaled), "scaled_log_derivative": point_metrics(true_d_scaled, pred_d_scaled), "raw_log_pressure": point_metrics(true_p, pred_p), "raw_log_derivative": point_metrics(true_d, pred_d), } rows = sample_metrics(true_p=true_p, pred_p=pred_p, true_d=true_d, pred_d=pred_d) params = recover_raw_params(data) pso_mask = build_pso_mask(params, args) domain_summary = build_domain_summary(rows, params, pso_mask) summary["pso_domain"] = { "bounds": { "k": [float(args.pso_k_min), float(args.pso_k_max)], "skin": [float(args.pso_skin_min), float(args.pso_skin_max)], "wellboreC": [float(args.pso_wellboreC_min), float(args.pso_wellboreC_max)], "phi": [float(args.pso_phi_min), float(args.pso_phi_max)], "h": [float(args.pso_h_min), float(args.pso_h_max)], }, "metrics": domain_summary, } case_rows, residual_rows = build_worst_case_rows( sample_rows=rows, params=params, data=data, true_p=true_p, true_d=true_d, pred_p=pred_p, pred_d=pred_d, pso_mask=pso_mask, top_k=int(args.top_k_analysis), ) worst_case_summary = build_worst_case_summary( sample_rows=rows, params=params, pso_mask=pso_mask, top_k=int(args.top_k_analysis), ) (output_dir / "summary_metrics.json").write_text( json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8", ) write_csv(output_dir / "sample_metrics.csv", rows) write_csv(output_dir / "worst_case_analysis.csv", case_rows) write_csv(output_dir / "worst_residual_analysis.csv", residual_rows) (output_dir / "worst_case_summary.json").write_text( json.dumps(worst_case_summary, indent=2, ensure_ascii=False), encoding="utf-8", ) residual_summary = { "top_k": int(len(residual_rows)), "top300_p_shift_ratio_median": float(np.median([r["p_shift_ratio"] for r in residual_rows])), "top300_d_shift_ratio_median": float(np.median([r["d_shift_ratio"] for r in residual_rows])), "top100_p_shift_ratio_median": float(np.median([r["p_shift_ratio"] for r in residual_rows[:100]])), "top100_d_shift_ratio_median": float(np.median([r["d_shift_ratio"] for r in residual_rows[:100]])), "top20": residual_rows[:20], } (output_dir / "worst_residual_summary.json").write_text( json.dumps(residual_summary, indent=2, ensure_ascii=False), encoding="utf-8", ) t_curve = np.asarray(data.get("T_curve_test"), dtype=np.float32) if t_curve.ndim == 2 and t_curve.shape == true_p.shape: write_plots(output_dir, rows, t_curve, true_p, true_d, pred_p, pred_d, args) print("\nEvaluation complete.") print(f"raw_log_pressure RMSE={summary['raw_log_pressure']['rmse']:.6f}, MAE={summary['raw_log_pressure']['mae']:.6f}") print(f"raw_log_derivative RMSE={summary['raw_log_derivative']['rmse']:.6f}, MAE={summary['raw_log_derivative']['mae']:.6f}") print( "PSO-domain: " f"n={domain_summary['pso_domain']['n']}, " f"median={domain_summary['pso_domain']['score']['median']:.6f}, " f"p95={domain_summary['pso_domain']['score']['p95']:.6f}, " f"score>1={100.0 * domain_summary['pso_domain']['score_gt_1_ratio']:.3f}%" ) print(f"Artifacts written to: {output_dir}") if __name__ == "__main__": main()