From 88b47c33598e36e6ad02fdb164261c2daa174fc1 Mon Sep 17 00:00:00 2001 From: 1294271022 <1294271022@qq.com> Date: Mon, 22 Jun 2026 18:25:31 +0800 Subject: [PATCH] =?UTF-8?q?1=E3=80=81=E7=A7=BB=E9=99=A4=E6=97=B6=E9=97=B4?= =?UTF-8?q?=E6=9D=A1=E4=BB=B6=E6=A8=A1=E5=9E=8B=202=E3=80=81=E7=A7=BB?= =?UTF-8?q?=E9=99=A4=E4=B8=8D=E7=A1=AE=E5=AE=9A=E6=80=A7=E9=87=8F=E5=8C=96?= =?UTF-8?q?=E7=9B=B8=E5=85=B3=E4=BB=A3=E7=A0=81=203=E3=80=81=E7=A7=BB?= =?UTF-8?q?=E9=99=A4160=E7=BB=B4slope=204=E3=80=81=E6=95=B4=E7=90=86?= =?UTF-8?q?=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ML/nmWTAI-ML/configs/data_gen.yaml | 1 - .../configs/data_gen_case_neighborhood.yaml | 1 - .../configs/data_gen_family_random.yaml | 1 - .../configs/data_gen_family_random_hard.yaml | 1 - .../configs/data_gen_family_random_v2_q.yaml | 1 - ML/nmWTAI-ML/scripts/analyze_uq_results.py | 455 ------------ .../scripts/analyze_uq_with_metadata.py | 371 ---------- ML/nmWTAI-ML/scripts/compare_single_case.py | 42 +- ML/nmWTAI-ML/scripts/evaluate_forward.py | 48 +- .../scripts/evaluate_forward_ensemble.py | 419 ----------- .../scripts/evaluate_time_conditioned.py | 684 ------------------ .../scripts/finetune_forward_local_ranking.py | 11 +- .../generate_autofit_neighborhood_dataset.py | 112 ++- ML/nmWTAI-ML/scripts/preprocess_dataset.py | 24 +- ML/nmWTAI-ML/scripts/q_sweep_local_ranking.py | 2 + .../scripts/replay_pso_trace_screening.py | 6 +- ML/nmWTAI-ML/scripts/score_pso_candidates.py | 6 +- ML/nmWTAI-ML/scripts/train_forward.py | 2 - .../scripts/train_forward_ensemble.py | 156 ---- .../scripts/train_time_conditioned.py | 104 --- .../scripts/validate_autofit_local_ranking.py | 20 +- .../validate_autofit_local_ranking_batch.py | 2 + ML/nmWTAI-ML/src/common/config.py | 3 +- ML/nmWTAI-ML/src/data/curve_processing.py | 55 +- ML/nmWTAI-ML/src/data/dataset_generation.py | 4 +- ML/nmWTAI-ML/src/data/preprocess.py | 148 +++- ML/nmWTAI-ML/src/data/runner_client.py | 121 +++- ML/nmWTAI-ML/src/data/time_features.py | 49 -- .../src/evaluation/autofit_objective.py | 2 +- ML/nmWTAI-ML/src/models/forward_surrogate.py | 54 +- .../src/models/time_conditioned_surrogate.py | 193 ----- ML/nmWTAI-ML/src/training/train_forward.py | 34 +- .../src/training/train_time_conditioned.py | 552 -------------- .../nmCalculation/nmCalculationAutoFitPSO.cpp | 4 +- 34 files changed, 464 insertions(+), 3224 deletions(-) delete mode 100644 ML/nmWTAI-ML/scripts/analyze_uq_results.py delete mode 100644 ML/nmWTAI-ML/scripts/analyze_uq_with_metadata.py delete mode 100644 ML/nmWTAI-ML/scripts/evaluate_forward_ensemble.py delete mode 100644 ML/nmWTAI-ML/scripts/evaluate_time_conditioned.py delete mode 100644 ML/nmWTAI-ML/scripts/train_forward_ensemble.py delete mode 100644 ML/nmWTAI-ML/scripts/train_time_conditioned.py delete mode 100644 ML/nmWTAI-ML/src/data/time_features.py delete mode 100644 ML/nmWTAI-ML/src/models/time_conditioned_surrogate.py delete mode 100644 ML/nmWTAI-ML/src/training/train_time_conditioned.py diff --git a/ML/nmWTAI-ML/configs/data_gen.yaml b/ML/nmWTAI-ML/configs/data_gen.yaml index 64fe359..e51520b 100644 --- a/ML/nmWTAI-ML/configs/data_gen.yaml +++ b/ML/nmWTAI-ML/configs/data_gen.yaml @@ -59,7 +59,6 @@ curve_processing: # 双对数曲线清洗与重采样设置 n_time_points: 160 # 每条双对数曲线统一重采样的时间点数 min_valid_points: 40 # 有效曲线至少需要的原始点数 feature_epsilon: 1.0e-12 # 取对数和除法时使用的最小保护值 - use_slope_feature: true # 是否额外拼接压力曲线斜率特征 max_time_cap: 3000.0 # 曲线重采样允许的最大时间上限 min_time_floor: 1.0e-6 # 曲线重采样允许的最小时间下限 fixed_time_range: null # 固定重采样时间范围,null 表示按样本自身范围 diff --git a/ML/nmWTAI-ML/configs/data_gen_case_neighborhood.yaml b/ML/nmWTAI-ML/configs/data_gen_case_neighborhood.yaml index 32cc81b..22363c2 100644 --- a/ML/nmWTAI-ML/configs/data_gen_case_neighborhood.yaml +++ b/ML/nmWTAI-ML/configs/data_gen_case_neighborhood.yaml @@ -59,7 +59,6 @@ curve_processing: # 双对数曲线清洗与重采样设置 n_time_points: 160 # 每条双对数曲线统一重采样的时间点数 min_valid_points: 40 # 有效曲线至少需要的原始点数 feature_epsilon: 1.0e-12 # 取对数和除法时使用的最小保护值 - use_slope_feature: true # 是否额外拼接压力曲线斜率特征 max_time_cap: 3000.0 # 曲线重采样允许的最大时间上限 min_time_floor: 1.0e-6 # 曲线重采样允许的最小时间下限 fixed_time_range: null # 固定重采样时间范围,null 表示按样本自身范围 diff --git a/ML/nmWTAI-ML/configs/data_gen_family_random.yaml b/ML/nmWTAI-ML/configs/data_gen_family_random.yaml index c2c56d2..d4bb75b 100644 --- a/ML/nmWTAI-ML/configs/data_gen_family_random.yaml +++ b/ML/nmWTAI-ML/configs/data_gen_family_random.yaml @@ -59,7 +59,6 @@ curve_processing: # 双对数曲线清洗与重采样设置 n_time_points: 160 # 每条双对数曲线统一重采样的时间点数 min_valid_points: 40 # 有效曲线至少需要的原始点数 feature_epsilon: 1.0e-12 # 取对数和除法时使用的最小保护值 - use_slope_feature: true # 是否额外拼接压力曲线斜率特征 max_time_cap: 3000.0 # 曲线重采样允许的最大时间上限 min_time_floor: 1.0e-6 # 曲线重采样允许的最小时间下限 fixed_time_range: null # 固定重采样时间范围,null 表示按样本自身范围 diff --git a/ML/nmWTAI-ML/configs/data_gen_family_random_hard.yaml b/ML/nmWTAI-ML/configs/data_gen_family_random_hard.yaml index b147aab..9fd308c 100644 --- a/ML/nmWTAI-ML/configs/data_gen_family_random_hard.yaml +++ b/ML/nmWTAI-ML/configs/data_gen_family_random_hard.yaml @@ -77,7 +77,6 @@ curve_processing: # 双对数曲线清洗与重采样设置 n_time_points: 160 # 每条双对数曲线统一重采样的时间点数 min_valid_points: 40 # 有效曲线至少需要的原始点数 feature_epsilon: 1.0e-12 # 取对数和除法时使用的最小保护值 - use_slope_feature: true # 是否额外拼接压力曲线斜率特征 max_time_cap: 3000.0 # 曲线重采样允许的最大时间上限 min_time_floor: 1.0e-6 # 曲线重采样允许的最小时间下限 fixed_time_range: null # 固定重采样时间范围,null 表示按样本自身范围 diff --git a/ML/nmWTAI-ML/configs/data_gen_family_random_v2_q.yaml b/ML/nmWTAI-ML/configs/data_gen_family_random_v2_q.yaml index 56487a5..86682cb 100644 --- a/ML/nmWTAI-ML/configs/data_gen_family_random_v2_q.yaml +++ b/ML/nmWTAI-ML/configs/data_gen_family_random_v2_q.yaml @@ -77,7 +77,6 @@ curve_processing: # 双对数曲线清洗与重采样设置 n_time_points: 160 # 每条双对数曲线统一重采样的时间点数 min_valid_points: 40 # 有效曲线至少需要的原始点数 feature_epsilon: 1.0e-12 # 取对数和除法时使用的最小保护值 - use_slope_feature: true # 是否额外拼接压力曲线斜率特征 max_time_cap: 3000.0 # 曲线重采样允许的最大时间上限 min_time_floor: 1.0e-6 # 曲线重采样允许的最小时间下限 fixed_time_range: null # 固定重采样时间范围,null 表示按样本自身范围 diff --git a/ML/nmWTAI-ML/scripts/analyze_uq_results.py b/ML/nmWTAI-ML/scripts/analyze_uq_results.py deleted file mode 100644 index d616282..0000000 --- a/ML/nmWTAI-ML/scripts/analyze_uq_results.py +++ /dev/null @@ -1,455 +0,0 @@ -""" -分析集成代理模型不确定性结果, -评估 uncertainty 是否能够识别高误差样本, -并生成 retention curve、风险样本统计和 summary 输出。 -""" - -from __future__ import annotations - -import argparse -import csv -import json -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -import matplotlib.pyplot as plt -import numpy as np - - -# 获取当前脚本所在目录的上一级项目根目录。 -# 这里假设脚本位于项目的某个子目录中,因此 parents[1] 指向项目根目录。 -ROOT = Path(__file__).resolve().parents[1] - -# 将项目根目录加入 Python 搜索路径,方便后续导入项目内部模块。 -# 当前脚本虽然没有直接导入内部模块,但保留该设置可以兼容项目结构。 -sys.path.append(str(ROOT)) - -Row = dict[str, str] -OutputRow = dict[str, Any] - - -@dataclass(frozen=True) -class Metrics: - """保存样本级误差和不确定性指标,减少 main 函数中的局部变量数量。""" - - rmse: np.ndarray - mae: np.ndarray - r2: np.ndarray - unc: np.ndarray - - -@dataclass(frozen=True) -class UncertaintyThresholds: - """保存低/高不确定性阈值。""" - - low: float - high: float - - -@dataclass(frozen=True) -class RiskRows: - """保存几类需要导出的风险样本。""" - - top_uncertain: list[Row] - high_error_high_unc: list[Row] - high_error_low_unc: list[Row] - - -def parse_args() -> argparse.Namespace: - """解析不确定性评估结果 CSV、输出目录以及分位数分箱参数。""" - parser = argparse.ArgumentParser( - description="Analyze ensemble UQ outputs for fallback usefulness" - ) - - parser.add_argument( - "--input-csv", - type=str, - default=None, - help="Path to sample_uncertainty_metrics.csv", - ) - parser.add_argument( - "--output-dir", - type=str, - default=None, - help="Optional output directory", - ) - parser.add_argument( - "--tag", - type=str, - default="family_random_50k", - help="Experiment tag", - ) - parser.add_argument( - "--quantiles", - type=str, - default="0,5,10,20,30,40,50", - help="Comma-separated uncertainty removal percentages", - ) - parser.add_argument( - "--top-k", - type=int, - default=100, - help="Top-K risky samples to export", - ) - parser.add_argument( - "--high-error-rmse", - type=float, - default=1.0, - help="High-error RMSE threshold", - ) - parser.add_argument( - "--low-unc-quantile", - type=float, - default=50.0, - help="Low uncertainty quantile", - ) - parser.add_argument( - "--high-unc-quantile", - type=float, - default=90.0, - help="High uncertainty quantile", - ) - - return parser.parse_args() - - -def parse_quantiles(text: str) -> list[float]: - """解析逗号分隔的剔除比例,用于不确定性保留曲线分析。""" - values = [] - - for item in str(text).split(","): - item = item.strip() - if not item: - continue - - q = float(item) - if q < 0 or q >= 100: - raise ValueError(f"非法 quantile 百分比: {q}") - - values.append(q) - - if 0.0 not in values: - values = [0.0] + values - - return sorted(set(values)) - - -def default_input_csv(tag: str) -> Path: - """根据实验标签定位集成评估生成的 sample_uncertainty_metrics.csv。""" - return Path("results") / f"evaluation_{tag}_ensemble_uq" / "sample_uncertainty_metrics.csv" - - -def default_output_dir(tag: str) -> Path: - """根据实验标签生成当前分析脚本默认的输出目录。""" - return Path("results") / f"evaluation_{tag}_ensemble_uq_analysis" - - -def resolve_paths(args: argparse.Namespace) -> tuple[Path, Path]: - """根据命令行参数解析输入 CSV 和输出目录。""" - tag = str(args.tag).strip() - input_csv = Path(args.input_csv) if args.input_csv else default_input_csv(tag) - output_dir = Path(args.output_dir) if args.output_dir else default_output_dir(tag) - return input_csv, output_dir - - -def load_rows(csv_path: Path) -> list[Row]: - """读取 CSV 为字典行列表,保留每个样本的不确定性和误差字段。""" - with open(csv_path, "r", encoding="utf-8-sig", newline="") as file: - rows = list(csv.DictReader(file)) - - if not rows: - raise ValueError(f"CSV 没有数据: {csv_path}") - - return rows - - -def as_float(rows: list[Row], key: str) -> np.ndarray: - """将 CSV 字段转为 float 数组,用于后续统计分析。""" - return np.array([float(row[key]) for row in rows], dtype=np.float64) - - -def extract_metrics(rows: list[Row]) -> Metrics: - """从 CSV 行中提取样本级误差和不确定性指标。""" - return Metrics( - rmse=as_float(rows, "overall_rmse"), - mae=as_float(rows, "overall_mae"), - r2=as_float(rows, "overall_r2"), - unc=as_float(rows, "unc_mean_std"), - ) - - -def safe_mean(x: np.ndarray) -> float: - """计算均值;没有有效数据时返回 NaN。""" - return float(np.mean(x)) if x.size else np.nan - - -def safe_median(x: np.ndarray) -> float: - """计算中位数;没有有效数据时返回 NaN。""" - return float(np.median(x)) if x.size else np.nan - - -def safe_percentile(x: np.ndarray, q: float) -> float: - """计算百分位数;没有有效数据时返回 NaN。""" - return float(np.percentile(x, q)) if x.size else np.nan - - -def save_csv(path: Path, rows: list[OutputRow] | list[Row]) -> None: - """把字典行写入 CSV。""" - if not rows: - return - - with open(path, "w", encoding="utf-8-sig", newline="") as file: - writer = csv.DictWriter(file, fieldnames=list(rows[0].keys())) - writer.writeheader() - writer.writerows(rows) - - -def build_thresholds( - unc: np.ndarray, - low_unc_quantile: float, - high_unc_quantile: float, -) -> UncertaintyThresholds: - """根据样本不确定性分布计算低/高不确定性阈值。""" - return UncertaintyThresholds( - low=float(np.percentile(unc, low_unc_quantile)), - high=float(np.percentile(unc, high_unc_quantile)), - ) - - -def build_retention_curve(metrics: Metrics, quantiles: list[float]) -> list[OutputRow]: - """构造按不确定性由高到低剔除样本后的误差统计曲线。""" - curve_rows: list[OutputRow] = [] - - for removed_pct in quantiles: - threshold = float(np.percentile(metrics.unc, 100.0 - removed_pct)) - keep_mask = metrics.unc <= threshold - - kept_rmse = metrics.rmse[keep_mask] - kept_mae = metrics.mae[keep_mask] - kept_r2 = metrics.r2[keep_mask] - - curve_rows.append( - { - "removed_pct": float(removed_pct), - "retained_pct": float(100.0 * np.mean(keep_mask)), - "n_retained": int(np.sum(keep_mask)), - "rmse_mean": safe_mean(kept_rmse), - "rmse_median": safe_median(kept_rmse), - "rmse_p90": safe_percentile(kept_rmse, 90), - "mae_mean": safe_mean(kept_mae), - "mae_median": safe_median(kept_mae), - "r2_mean": safe_mean(kept_r2), - "r2_median": safe_median(kept_r2), - "unc_threshold": threshold, - } - ) - - return curve_rows - - -def select_top_uncertain_rows(rows: list[Row], unc: np.ndarray, top_k: int) -> list[Row]: - """按不确定性从大到小导出 Top-K 样本。""" - top_uncertain_indices = np.argsort(-unc)[: min(top_k, len(rows))] - return [rows[int(index)] for index in top_uncertain_indices] - - -def classify_risky_rows( - rows: list[Row], - thresholds: UncertaintyThresholds, - high_error_rmse: float, -) -> tuple[list[Row], list[Row]]: - """识别高误差高不确定性样本,以及高误差低不确定性样本。""" - high_error_high_unc_rows = [] - high_error_low_unc_rows = [] - - for row in rows: - row_rmse = float(row["overall_rmse"]) - row_unc = float(row["unc_mean_std"]) - - if row_rmse >= high_error_rmse and row_unc >= thresholds.high: - high_error_high_unc_rows.append(row) - - if row_rmse >= high_error_rmse and row_unc <= thresholds.low: - high_error_low_unc_rows.append(row) - - high_error_high_unc_rows.sort( - key=lambda row: float(row["overall_rmse"]), - reverse=True, - ) - high_error_low_unc_rows.sort( - key=lambda row: float(row["overall_rmse"]), - reverse=True, - ) - - return high_error_high_unc_rows, high_error_low_unc_rows - - -def build_risk_rows( - rows: list[Row], - metrics: Metrics, - thresholds: UncertaintyThresholds, - high_error_rmse: float, - top_k: int, -) -> RiskRows: - """构造所有需要导出的风险样本集合。""" - high_error_high_unc_rows, high_error_low_unc_rows = classify_risky_rows( - rows=rows, - thresholds=thresholds, - high_error_rmse=high_error_rmse, - ) - - return RiskRows( - top_uncertain=select_top_uncertain_rows(rows, metrics.unc, top_k), - high_error_high_unc=high_error_high_unc_rows, - high_error_low_unc=high_error_low_unc_rows, - ) - - -def build_summary( - input_csv: Path, - metrics: Metrics, - thresholds: UncertaintyThresholds, - risk_rows: RiskRows, - args: argparse.Namespace, -) -> dict[str, Any]: - """汇总全局统计信息和关键阈值,方便后续写入 JSON。""" - return { - "input_csv": str(input_csv), - "n_samples": int(metrics.rmse.size), - "global": { - "rmse_mean": safe_mean(metrics.rmse), - "rmse_median": safe_median(metrics.rmse), - "rmse_p90": safe_percentile(metrics.rmse, 90), - "mae_mean": safe_mean(metrics.mae), - "r2_mean": safe_mean(metrics.r2), - "unc_mean": safe_mean(metrics.unc), - "unc_median": safe_median(metrics.unc), - "unc_p90": safe_percentile(metrics.unc, 90), - }, - "thresholds": { - "high_error_rmse": float(args.high_error_rmse), - "low_unc_quantile": float(args.low_unc_quantile), - "low_unc_threshold": thresholds.low, - "high_unc_quantile": float(args.high_unc_quantile), - "high_unc_threshold": thresholds.high, - }, - "counts": { - "top_uncertain_exported": len(risk_rows.top_uncertain), - "high_error_high_unc": len(risk_rows.high_error_high_unc), - "high_error_low_unc": len(risk_rows.high_error_low_unc), - }, - } - - -def plot_retention_curve(curve_rows: list[OutputRow], output_path: Path) -> None: - """绘制按不确定性由高到低剔除样本后的误差变化。""" - removed = np.array( - [float(row["removed_pct"]) for row in curve_rows], - dtype=np.float64, - ) - retained = np.array( - [float(row["retained_pct"]) for row in curve_rows], - dtype=np.float64, - ) - rmse = np.array([float(row["rmse_mean"]) for row in curve_rows], dtype=np.float64) - mae = np.array([float(row["mae_mean"]) for row in curve_rows], dtype=np.float64) - r2 = np.array([float(row["r2_mean"]) for row in curve_rows], dtype=np.float64) - - fig, axes = plt.subplots(1, 3, figsize=(15, 4.5)) - - axes[0].plot(retained, rmse, marker="o") - axes[0].set_title("Retained vs RMSE") - axes[0].set_xlabel("Retained Samples (%)") - axes[0].set_ylabel("RMSE mean") - axes[0].grid(True, alpha=0.3) - - axes[1].plot(retained, mae, marker="o") - axes[1].set_title("Retained vs MAE") - axes[1].set_xlabel("Retained Samples (%)") - axes[1].set_ylabel("MAE mean") - axes[1].grid(True, alpha=0.3) - - axes[2].plot(retained, r2, marker="o") - axes[2].set_title("Retained vs R2") - axes[2].set_xlabel("Retained Samples (%)") - axes[2].set_ylabel("R2 mean") - axes[2].grid(True, alpha=0.3) - - removed_grid = ",".join(str(int(value)) for value in removed) - fig.suptitle(f"Uncertainty Filtering Curves | removed grid={removed_grid}%") - - plt.tight_layout(rect=[0, 0, 1, 0.94]) - plt.savefig(output_path, dpi=150, bbox_inches="tight") - plt.close() - - -def save_outputs( - output_dir: Path, - curve_rows: list[OutputRow], - risk_rows: RiskRows, - summary: dict[str, Any], -) -> None: - """保存 JSON、CSV 和不确定性过滤曲线图。""" - with open(output_dir / "uq_fallback_summary.json", "w", encoding="utf-8") as file: - json.dump(summary, file, ensure_ascii=False, indent=2) - - save_csv(output_dir / "uncertainty_filter_curve.csv", curve_rows) - save_csv(output_dir / "top_uncertain_samples.csv", risk_rows.top_uncertain) - save_csv(output_dir / "high_error_high_unc_samples.csv", risk_rows.high_error_high_unc) - save_csv(output_dir / "high_error_low_unc_samples.csv", risk_rows.high_error_low_unc) - - plot_retention_curve(curve_rows, output_dir / "uncertainty_filter_curve.png") - - -def print_summary(output_dir: Path, summary: dict[str, Any]) -> None: - """在控制台打印简要结果,便于命令行运行后快速判断分析是否完成。""" - print("UQ fallback analysis complete.") - print(f"Output dir: {output_dir}") - print( - f"Global RMSE mean={summary['global']['rmse_mean']:.6f}, " - f"unc mean={summary['global']['unc_mean']:.6f}" - ) - print( - f"High-error & high-unc count={summary['counts']['high_error_high_unc']}, " - f"high-error & low-unc count={summary['counts']['high_error_low_unc']}" - ) - - -def main() -> None: - """分析集成不确定性与真实误差的相关性,并生成分析产物。""" - args = parse_args() - input_csv, output_dir = resolve_paths(args) - output_dir.mkdir(parents=True, exist_ok=True) - - rows = load_rows(input_csv) - metrics = extract_metrics(rows) - quantiles = parse_quantiles(args.quantiles) - thresholds = build_thresholds( - unc=metrics.unc, - low_unc_quantile=float(args.low_unc_quantile), - high_unc_quantile=float(args.high_unc_quantile), - ) - curve_rows = build_retention_curve(metrics, quantiles) - risk_rows = build_risk_rows( - rows=rows, - metrics=metrics, - thresholds=thresholds, - high_error_rmse=float(args.high_error_rmse), - top_k=int(args.top_k), - ) - summary = build_summary( - input_csv=input_csv, - metrics=metrics, - thresholds=thresholds, - risk_rows=risk_rows, - args=args, - ) - - save_outputs(output_dir, curve_rows, risk_rows, summary) - print_summary(output_dir, summary) - - -if __name__ == "__main__": - main() diff --git a/ML/nmWTAI-ML/scripts/analyze_uq_with_metadata.py b/ML/nmWTAI-ML/scripts/analyze_uq_with_metadata.py deleted file mode 100644 index 6643840..0000000 --- a/ML/nmWTAI-ML/scripts/analyze_uq_with_metadata.py +++ /dev/null @@ -1,371 +0,0 @@ -""" -将集成学习不确定性量化样本指标与处理后的数据集元数据进行关联。 - -该脚本为每个 UQ 指标行补充调度元数据、类别标签以及可选的源标签,然后输出分组汇总结果和困难样本。 -""" - -from __future__ import annotations - -import argparse -import csv -import json -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -import joblib -import numpy as np - -ROOT = Path(__file__).resolve().parents[1] -if str(ROOT) not in sys.path: - sys.path.insert(0, str(ROOT)) - - -def load_experiment_path_helpers(): - """Load project path helpers after adding project root to sys.path.""" - # pylint: disable=import-error,import-outside-toplevel - from src.common.experiment_paths import normalize_tag, processed_path_for_tag - - return normalize_tag, processed_path_for_tag - - -normalize_tag_func, processed_path_func = load_experiment_path_helpers() - - -@dataclass(frozen=True) -class AnalysisPaths: - """Resolved input and output paths used by the metadata analysis.""" - - tag: str - processed_path: Path - uq_csv: Path - output_dir: Path - - -@dataclass(frozen=True) -class ProcessedMetadata: - """Metadata arrays loaded from the processed dataset.""" - - schedule_meta_test: np.ndarray - family_name_test: np.ndarray - source_name_test: np.ndarray | None - source_id_test: np.ndarray | None - meta_names: list[str] - name_to_col: dict[str, int] - - -def parse_args() -> argparse.Namespace: - """解析 UQ 指标 CSV、processed 数据和输出路径,用于合并样本元数据分析。""" - parser = argparse.ArgumentParser( - description="Join UQ sample metrics with saved metadata" - ) - parser.add_argument( - "--processed", - type=str, - default=None, - help="Processed dataset path", - ) - parser.add_argument( - "--uq-csv", - type=str, - default=None, - help="sample_uncertainty_metrics.csv path", - ) - parser.add_argument( - "--tag", - type=str, - default="family_random_50k", - help="Experiment tag", - ) - parser.add_argument( - "--output-dir", - type=str, - default=None, - help="Output directory", - ) - parser.add_argument("--high-error-rmse", type=float, default=1.0) - return parser.parse_args() - - -def default_uq_csv(tag: str) -> Path: - """根据实验标签定位默认的不确定性指标 CSV。""" - return ( - Path("results") - / f"evaluation_{tag}_ensemble_uq" - / "sample_uncertainty_metrics.csv" - ) - - -def default_output_dir(tag: str) -> Path: - """根据实验标签生成当前分析脚本默认的输出目录。""" - return Path("results") / f"evaluation_{tag}_ensemble_uq_metadata_analysis" - - -def resolve_paths(args: argparse.Namespace) -> AnalysisPaths: - """根据命令行参数和实验标签解析输入输出路径。""" - tag = normalize_tag_func(args.tag) - fallback_tag = tag or "family_random_50k" - - processed_path = ( - Path(args.processed) - if args.processed is not None - else processed_path_func(tag) - ) - uq_csv = ( - Path(args.uq_csv) - if args.uq_csv is not None - else default_uq_csv(fallback_tag) - ) - output_dir = ( - Path(args.output_dir) - if args.output_dir is not None - else default_output_dir(fallback_tag) - ) - - return AnalysisPaths( - tag=tag, - processed_path=processed_path, - uq_csv=uq_csv, - output_dir=output_dir, - ) - - -def save_csv(path: Path, rows: list[dict[str, Any]]) -> None: - """把字典行写入 CSV;当没有行时不写文件。""" - if not rows: - return - - with open(path, "w", encoding="utf-8-sig", newline="") as f: - writer = csv.DictWriter(f, fieldnames=list(rows[0].keys())) - writer.writeheader() - writer.writerows(rows) - - -def load_uq_rows(uq_csv: Path) -> list[dict[str, str]]: - """读取 sample_uncertainty_metrics.csv。""" - with open(uq_csv, "r", encoding="utf-8-sig", newline="") as f: - rows = list(csv.DictReader(f)) - - if not rows: - raise ValueError(f"UQ CSV 没有数据: {uq_csv}") - - return rows - - -def load_processed_metadata(processed_path: Path) -> ProcessedMetadata: - """读取 processed 数据中的测试集元数据。""" - data = joblib.load(processed_path) - required_keys = {"schedule_meta_test", "family_name_test"} - - if not required_keys.issubset(data): - raise RuntimeError( - "Processed dataset does not contain schedule metadata. " - "Re-run preprocess on a metadata-rich raw HDF5 first." - ) - - schedule_meta_test = np.asarray(data["schedule_meta_test"], dtype=np.float32) - family_name_test = np.asarray(data["family_name_test"]).astype(str) - source_name_test = None - source_id_test = None - - if "source_name_test" in data: - source_name_test = np.asarray(data["source_name_test"]).astype(str) - if "source_id_test" in data: - source_id_test = np.asarray(data["source_id_test"]) - - meta_names = data["meta"].get("schedule_meta_names") or [] - name_to_col = {name: i for i, name in enumerate(meta_names)} - - return ProcessedMetadata( - schedule_meta_test=schedule_meta_test, - family_name_test=family_name_test, - source_name_test=source_name_test, - source_id_test=source_id_test, - meta_names=meta_names, - name_to_col=name_to_col, - ) - - -def enrich_uq_rows( - uq_rows: list[dict[str, str]], - metadata: ProcessedMetadata, -) -> list[dict[str, Any]]: - """把 UQ 指标与 family/source/schedule metadata 拼接到同一行。""" - enriched_rows: list[dict[str, Any]] = [] - - for row in uq_rows: - idx = int(row["idx"]) - meta_row = metadata.schedule_meta_test[idx] - enriched: dict[str, Any] = dict(row) - - enriched["family_name"] = metadata.family_name_test[idx] - - if metadata.source_name_test is not None: - enriched["source_name"] = metadata.source_name_test[idx] - - if metadata.source_id_test is not None: - enriched["source_id"] = int(metadata.source_id_test[idx]) - - for name, col in metadata.name_to_col.items(): - enriched[name] = float(meta_row[col]) - - enriched_rows.append(enriched) - - return enriched_rows - - -def summarize_group( - rows: list[dict[str, Any]], - group_key: str, -) -> list[dict[str, Any]]: - """对一个样本分组计算误差、不确定性和样本数量等汇总指标。""" - groups: dict[str, list[dict[str, Any]]] = {} - - for row in rows: - groups.setdefault(str(row[group_key]), []).append(row) - - summaries = [] - sorted_groups = sorted( - groups.items(), - key=lambda item: len(item[1]), - reverse=True, - ) - - for key, group_rows in sorted_groups: - rmse = np.array( - [float(row["overall_rmse"]) for row in group_rows], - dtype=np.float64, - ) - unc = np.array( - [float(row["unc_mean_std"]) for row in group_rows], - dtype=np.float64, - ) - summaries.append( - { - group_key: key, - "n_samples": len(group_rows), - "rmse_mean": float(np.mean(rmse)), - "rmse_median": float(np.median(rmse)), - "unc_mean": float(np.mean(unc)), - "unc_median": float(np.median(unc)), - } - ) - - return summaries - - -def save_group_summaries( - output_dir: Path, - enriched_rows: list[dict[str, Any]], - metadata: ProcessedMetadata, -) -> None: - """按 family/source/n_prod 等维度保存分组统计。""" - save_csv( - output_dir / "summary_by_family.csv", - summarize_group(enriched_rows, "family_name"), - ) - - if metadata.source_name_test is not None: - save_csv( - output_dir / "summary_by_source.csv", - summarize_group(enriched_rows, "source_name"), - ) - - if "n_prod" in metadata.name_to_col: - for row in enriched_rows: - row["n_prod_group"] = int(round(float(row["n_prod"]))) - - save_csv( - output_dir / "summary_by_n_prod.csv", - summarize_group(enriched_rows, "n_prod_group"), - ) - - -def find_high_error_low_unc( - enriched_rows: list[dict[str, Any]], - high_error_rmse: float, -) -> list[dict[str, Any]]: - """筛出高误差但低不确定性的危险样本。""" - unc_values = np.array( - [float(row["unc_mean_std"]) for row in enriched_rows], - dtype=np.float64, - ) - low_unc_threshold = float(np.median(unc_values)) - - risky_rows = [ - row - for row in enriched_rows - if float(row["overall_rmse"]) >= high_error_rmse - and float(row["unc_mean_std"]) <= low_unc_threshold - ] - risky_rows.sort(key=lambda row: float(row["overall_rmse"]), reverse=True) - - return risky_rows - - -def write_json_summary( - output_dir: Path, - paths: AnalysisPaths, - metadata: ProcessedMetadata, - enriched_rows: list[dict[str, Any]], - high_error_low_unc: list[dict[str, Any]], -) -> None: - """保存 metadata join 的整体摘要信息。""" - summary = { - "tag": paths.tag, - "processed_path": str(paths.processed_path), - "uq_csv": str(paths.uq_csv), - "n_samples": len(enriched_rows), - "meta_names": metadata.meta_names, - "has_source_name": bool(metadata.source_name_test is not None), - "high_error_low_unc_count": len(high_error_low_unc), - } - - with open(output_dir / "metadata_join_summary.json", "w", encoding="utf-8") as f: - json.dump(summary, f, ensure_ascii=False, indent=2) - - -def run_analysis(args: argparse.Namespace) -> tuple[Path, int]: - """执行完整 UQ metadata join 分析流程。""" - paths = resolve_paths(args) - paths.output_dir.mkdir(parents=True, exist_ok=True) - - metadata = load_processed_metadata(paths.processed_path) - uq_rows = load_uq_rows(paths.uq_csv) - enriched_rows = enrich_uq_rows(uq_rows, metadata) - - save_csv(paths.output_dir / "uq_samples_with_metadata.csv", enriched_rows) - save_group_summaries(paths.output_dir, enriched_rows, metadata) - - high_error_low_unc = find_high_error_low_unc( - enriched_rows=enriched_rows, - high_error_rmse=float(args.high_error_rmse), - ) - save_csv( - paths.output_dir / "high_error_low_unc_with_metadata.csv", - high_error_low_unc, - ) - - write_json_summary( - output_dir=paths.output_dir, - paths=paths, - metadata=metadata, - enriched_rows=enriched_rows, - high_error_low_unc=high_error_low_unc, - ) - - return paths.output_dir, len(high_error_low_unc) - - -def main() -> None: - """命令行入口:拼接 UQ 结果和元数据,并输出汇总文件。""" - output_dir, risky_count = run_analysis(parse_args()) - - print("UQ metadata join analysis complete.") - print(f"Output dir: {output_dir}") - print(f"High-error low-unc count={risky_count}") - - -if __name__ == "__main__": - main() diff --git a/ML/nmWTAI-ML/scripts/compare_single_case.py b/ML/nmWTAI-ML/scripts/compare_single_case.py index 8b75ee5..ade13cd 100644 --- a/ML/nmWTAI-ML/scripts/compare_single_case.py +++ b/ML/nmWTAI-ML/scripts/compare_single_case.py @@ -41,7 +41,7 @@ from src.common.experiment_paths import ( normalize_tag, processed_path_for_tag, ) -from src.data.curve_processing import clean_curve_for_dataset, is_valid_curve, resample_curve_to_features +from src.data.curve_processing import clean_curve_for_dataset, is_valid_curve, resample_curve_to_model_features from src.data.param_features import param_feature_transform_from_meta, transform_param_features from src.data.params import Params, Schedule from src.data.runner_client import CppRunner, read_result_bin @@ -164,29 +164,25 @@ def calc_metrics( def infer_curve_layout(meta: dict, curve_dim: int) -> dict: - """从元数据读取曲线分段布局;旧数据没有布局时按压力/导数/斜率三等分回退。""" - # curve_layout 描述了拼接曲线的结构布局 - # 包括每一段 pressure/derivative/slope 的起止位置 + """从元数据读取布局;缺失时兼容推断双通道或旧三通道布局。""" curve_layout = meta.get("curve_layout") if curve_layout is not None: return curve_layout - # 兼容早期 processed 文件:没有显式 layout 时仍按 pressure/derivative/slope 三段切分。 - n_time_points = curve_dim // 3 + n_parts = 3 if curve_dim % 3 == 0 else 2 + n_time_points = curve_dim // n_parts + names = ["log_pressure", "log_derivative"] + (["slope"] if n_parts == 3 else []) return { "n_time_points": int(n_time_points), "parts": [ - {"name": "log_pressure", "start": 0, "end": n_time_points}, - {"name": "log_derivative", "start": n_time_points, "end": 2 * n_time_points}, - {"name": "slope", "start": 2 * n_time_points, "end": 3 * n_time_points}, + {"name": name, "start": i * n_time_points, "end": (i + 1) * n_time_points} + for i, name in enumerate(names) ], } def split_curve_by_layout(curve: np.ndarray, layout: dict) -> dict[str, np.ndarray]: - """按照 curve_layout 将拼接曲线拆成 log_pressure、log_derivative 和 slope 三段。""" - # parts 用于保存拆分后的不同曲线段 - # 例如 log_pressure / derivative / slope + """按照 curve_layout 拆分曲线。""" parts: dict[str, np.ndarray] = {} for part in layout["parts"]: start = int(part["start"]) @@ -310,7 +306,12 @@ def build_params_from_args(args: argparse.Namespace, cfg: Config, schedule: Sche ) -def run_solver_and_extract_curve(cfg: Config, params: Params, well_index: int) -> tuple[np.ndarray, dict]: +def run_solver_and_extract_curve( + cfg: Config, + params: Params, + well_index: int, + target_curve_dim: int, +) -> tuple[np.ndarray, dict]: """调用 C++ 求解器运行一次正演,并把双对数输出重采样为模型曲线向量。""" # 创建 C++ 求解器客户端 # 实际底层会调用数值试井求解程序 @@ -351,7 +352,9 @@ def run_solver_and_extract_curve(cfg: Config, params: Params, well_index: int) - # 将原始不规则时间曲线重采样到固定特征网格 # 这是代理模型训练时使用的统一输入格式 - curve_feat = resample_curve_to_features(cfg, t_clean, p_clean, d_clean) + curve_feat = resample_curve_to_model_features( + cfg, t_clean, p_clean, d_clean, target_curve_dim + ) raw = { "t": t_clean.tolist(), "p": p_clean.tolist(), @@ -449,16 +452,15 @@ def plot_comparison( # 用于衡量双对数曲线形态是否一致 autofit = dual_log_objective(curve_true, curve_pred, curve_layout) - part_names = ["log_pressure", "log_derivative", "slope"] + part_names = [str(part["name"]) for part in curve_layout["parts"]] title_map = { "log_pressure": "Log Pressure", "log_derivative": "Log |Derivative|", - "slope": "Slope of Log Pressure vs Log Time", } r2_text = "nan" if np.isnan(overall["r2"]) else f"{overall['r2']:.4f}" - fig, axes = plt.subplots(3, 2, figsize=(14, 12)) + fig, axes = plt.subplots(len(part_names), 2, figsize=(14, 4 * len(part_names))) fig.suptitle( "Solver vs Surrogate\n" f"k={params.k:.6g}, skin={params.skin:.6g}, wellboreC={params.wellboreC:.6g}, " @@ -472,7 +474,7 @@ def plot_comparison( summary = {"overall": overall, "autofit": autofit, "parts": {}} - # 分别对 pressure / derivative / slope 三部分绘图 + # 分别对各曲线通道绘图。 for row, name in enumerate(part_names): # 左列画曲线重合程度,右列画逐时间点残差,便于定位误差集中在哪个时期。 y_true = true_parts[name] @@ -533,7 +535,9 @@ def main() -> None: print("Running solver...") # 运行真实数值求解器 # 得到 ground truth 曲线 - curve_solver, raw_solver = run_solver_and_extract_curve(cfg, params, args.well_index) + curve_solver, raw_solver = run_solver_and_extract_curve( + cfg, params, args.well_index, int(processed["meta"]["curve_dim"]) + ) print("Running surrogate...") model, use_schedule, device = load_model(model_path) diff --git a/ML/nmWTAI-ML/scripts/evaluate_forward.py b/ML/nmWTAI-ML/scripts/evaluate_forward.py index eb11e8b..273ee10 100644 --- a/ML/nmWTAI-ML/scripts/evaluate_forward.py +++ b/ML/nmWTAI-ML/scripts/evaluate_forward.py @@ -119,7 +119,7 @@ def calc_metrics( def split_curve_by_layout(curve: np.ndarray, layout: dict) -> dict[str, np.ndarray]: - """按照 curve_layout 将拼接曲线拆成 log_pressure、log_derivative 和 slope 三段。""" + """按照 curve_layout 拆分曲线。""" parts: dict[str, np.ndarray] = {} for part in layout["parts"]: start = int(part["start"]) @@ -225,7 +225,6 @@ def build_composite_score(overall_m: dict, part_ms: dict) -> float: + 0.5 * overall_m["mae"] + 0.8 * part_ms["log_pressure"]["abs_bias"] + 0.8 * part_ms["log_derivative"]["rmse"] - + 0.2 * part_ms["slope"]["rmse"] ) @@ -245,7 +244,8 @@ def plot_sample( nrmse_text = "nan" if np.isnan(overall["nrmse"]) else f"{overall['nrmse']:.4f}" r2_text = "nan" if np.isnan(overall["r2"]) else f"{overall['r2']:.4f}" - fig, axes = plt.subplots(3, 2, figsize=(14, 12)) + plot_order = [str(part["name"]) for part in curve_layout["parts"]] + fig, axes = plt.subplots(len(plot_order), 2, figsize=(14, 4 * len(plot_order))) fig.suptitle( f"{title_prefix} | Sample #{idx} | " f"Overall RMSE={overall['rmse']:.4f}, MAE={overall['mae']:.4f}, " @@ -254,11 +254,10 @@ def plot_sample( f"R2={r2_text}" ) - plot_order = ["log_pressure", "log_derivative", "slope"] title_map = { "log_pressure": "Log Pressure", "log_derivative": "Log |Derivative|", - "slope": "Slope of Log Pressure vs Log Time", + "slope": "Legacy Slope", } for row, name in enumerate(plot_order): @@ -275,7 +274,7 @@ def plot_sample( ax_l.plot(x, y_true, label="True", linewidth=2, alpha=0.85) ax_l.plot(x, y_pred, label="Predicted", linewidth=2, alpha=0.85) ax_l.set_title( - f"{title_map[name]} | RMSE={m['rmse']:.4f}, MAE={m['mae']:.4f}, " + f"{title_map.get(name, name)} | RMSE={m['rmse']:.4f}, MAE={m['mae']:.4f}, " f"Bias={m['bias']:.4f}, NRMSE={nrmse_text}, R2={r2_text}" ) ax_l.set_xlabel("Resampled Time Index") @@ -286,7 +285,7 @@ def plot_sample( ax_r = axes[row, 1] ax_r.plot(x, err, linewidth=1.5, alpha=0.85) ax_r.axhline(0.0, linestyle="--", linewidth=1) - ax_r.set_title(f"{title_map[name]} Error") + ax_r.set_title(f"{title_map.get(name, name)} Error") ax_r.set_xlabel("Resampled Time Index") ax_r.set_ylabel("Pred - True") ax_r.grid(True, alpha=0.3) @@ -311,18 +310,19 @@ def save_sample_metrics_csv(sample_scores: list[dict], path: Path) -> None: def infer_curve_layout(meta: dict, curve_dim: int) -> dict: - """从元数据读取曲线分段布局;旧数据没有布局时按压力/导数/斜率三等分回退。""" + """从元数据读取布局;缺失时根据 320/480 等常用维度兼容推断。""" curve_layout = meta.get("curve_layout") if curve_layout is not None: return curve_layout - n_time_points = curve_dim // 3 + n_parts = 3 if curve_dim % 3 == 0 else 2 + n_time_points = curve_dim // n_parts + names = ["log_pressure", "log_derivative"] + (["slope"] if n_parts == 3 else []) return { "n_time_points": int(n_time_points), "parts": [ - {"name": "log_pressure", "start": 0, "end": n_time_points}, - {"name": "log_derivative", "start": n_time_points, "end": 2 * n_time_points}, - {"name": "slope", "start": 2 * n_time_points, "end": 3 * n_time_points}, + {"name": name, "start": i * n_time_points, "end": (i + 1) * n_time_points} + for i, name in enumerate(names) ], } @@ -470,15 +470,12 @@ def main() -> None: print("Inference complete.") overall_metric_list: list[dict] = [] - part_metric_lists = { - "log_pressure": [], - "log_derivative": [], - "slope": [], - } + part_names = [str(part["name"]) for part in curve_layout["parts"]] + part_metric_lists = {name: [] for name in part_names} sample_scores: list[dict] = [] for idx, (curve_true, curve_pred) in enumerate(zip(all_true, all_pred)): - # 同时记录整体指标和三段指标,便于判断误差来自压力、导数还是辅助 slope。 + # 同时记录整体指标和各曲线通道指标。 overall_m = calc_metrics(curve_true, curve_pred) overall_metric_list.append(overall_m) @@ -486,7 +483,7 @@ def main() -> None: pred_parts = split_curve_by_layout(curve_pred, curve_layout) part_ms: dict[str, dict] = {} - for name in ["log_pressure", "log_derivative", "slope"]: + for name in part_names: part_m = calc_metrics(true_parts[name], pred_parts[name]) part_metric_lists[name].append(part_m) part_ms[name] = part_m @@ -513,20 +510,12 @@ def main() -> None: "log_derivative_abs_bias": part_ms["log_derivative"]["abs_bias"], "log_derivative_nrmse": part_ms["log_derivative"]["nrmse"], "log_derivative_r2": part_ms["log_derivative"]["r2"], - "slope_rmse": part_ms["slope"]["rmse"], - "slope_mae": part_ms["slope"]["mae"], - "slope_bias": part_ms["slope"]["bias"], - "slope_abs_bias": part_ms["slope"]["abs_bias"], - "slope_nrmse": part_ms["slope"]["nrmse"], - "slope_r2": part_ms["slope"]["r2"], "overall_valid_nrmse": overall_m["valid_nrmse"], "overall_valid_r2": overall_m["valid_r2"], "log_pressure_valid_nrmse": part_ms["log_pressure"]["valid_nrmse"], "log_pressure_valid_r2": part_ms["log_pressure"]["valid_r2"], "log_derivative_valid_nrmse": part_ms["log_derivative"]["valid_nrmse"], "log_derivative_valid_r2": part_ms["log_derivative"]["valid_r2"], - "slope_valid_nrmse": part_ms["slope"]["valid_nrmse"], - "slope_valid_r2": part_ms["slope"]["valid_r2"], } ) @@ -538,7 +527,6 @@ def main() -> None: "overall": summarize_metric_dicts(overall_metric_list, "overall"), "log_pressure": summarize_metric_dicts(part_metric_lists["log_pressure"], "log_pressure"), "log_derivative": summarize_metric_dicts(part_metric_lists["log_derivative"], "log_derivative"), - "slope": summarize_metric_dicts(part_metric_lists["slope"], "slope"), "checkpoint": { "model_path": str(model_path), "processed_path": str(processed_path), @@ -559,7 +547,9 @@ def main() -> None: n_worst = min(args.n_worst_plots, len(sample_scores)) best_indices = np.argsort(score_composite)[:n_best].tolist() - worst_indices = np.argsort(score_composite)[-n_worst:].tolist() + worst_indices = ( + np.argsort(score_composite)[-n_worst:].tolist() if n_worst > 0 else [] + ) random_indices = random.sample(range(len(sample_scores)), n_random) # 三类图各有用途:best 看上限,worst 定位失败模式,random 检查整体观感。 diff --git a/ML/nmWTAI-ML/scripts/evaluate_forward_ensemble.py b/ML/nmWTAI-ML/scripts/evaluate_forward_ensemble.py deleted file mode 100644 index 1636986..0000000 --- a/ML/nmWTAI-ML/scripts/evaluate_forward_ensemble.py +++ /dev/null @@ -1,419 +0,0 @@ -"""评估正演代理模型集成并估计预测不确定性。 - -脚本按多个随机种子加载同结构模型,计算集成均值预测、成员间标准差和逐样本误差, -导出不确定性-误差相关性统计、散点图和高不确定性样本,用于判断集成方差是否可作为 -自动拟合候选筛选或风险提示信号。 -""" -# pylint: disable=import-error,wrong-import-position -# pylint: disable=too-many-locals,too-many-arguments,too-many-positional-arguments -# pylint: disable=too-many-statements - - -from __future__ import annotations - -import argparse -import csv -import json -import sys -from pathlib import Path - -ROOT = Path(__file__).resolve().parents[1] -sys.path.append(str(ROOT)) - -import joblib -import matplotlib.pyplot as plt -import numpy as np -import torch - -from src.common.experiment_paths import normalize_tag, processed_path_for_tag -from src.models.forward_surrogate import ForwardSurrogate - - -def parse_seed_list(seed_text: str) -> list[int]: - """解析逗号分隔的随机种子列表,用于训练或评估模型集成。""" - seeds = [] - for item in str(seed_text).split(","): - item = item.strip() - if not item: - continue - seeds.append(int(item)) - if not seeds: - raise ValueError("至少需要一个 seed") - return seeds - - -def default_model_root(tag: str | None, use_schedule: bool) -> Path: - """根据实验标签推导集成模型成员所在的根目录。""" - suffix = "" if use_schedule else "_no_schedule" - if tag: - return Path("models") / f"forward_surrogate_{tag}_ensemble{suffix}" - return Path("models") / f"forward_surrogate_ensemble{suffix}" - - -def default_output_dir(tag: str | None, use_schedule: bool) -> Path: - """根据实验标签生成当前分析脚本默认的输出目录。""" - suffix = "" if use_schedule else "_no_schedule" - if tag: - return Path("results") / f"evaluation_{tag}_ensemble_uq{suffix}" - return Path("results") / f"evaluation_ensemble_uq{suffix}" - - -def parse_args() -> argparse.Namespace: - """解析深度集成评估所需的数据、成员模型清单、样本数量和绘图数量。""" - parser = argparse.ArgumentParser(description="Evaluate deep-ensemble UQ for forward surrogate") - parser.add_argument("--processed", type=str, default=None, help="Processed dataset path") - parser.add_argument("--tag", type=str, default=None, help="Experiment tag") - parser.add_argument( - "--model-root", - type=str, - default=None, - help="Root dir that contains seed_* members", - ) - parser.add_argument("--output-dir", type=str, default=None, help="Evaluation output dir") - parser.add_argument("--seeds", type=str, default="41,42,43", help="Comma-separated seed list") - parser.add_argument("--no-schedule", action="store_true") - parser.add_argument("--n-top-uncertain-plots", type=int, default=5) - return parser.parse_args() - - -def calc_metrics( - y_true: np.ndarray, - y_pred: np.ndarray, - eps_range: float = 1e-3, - eps_var: float = 1e-6, -) -> dict: - """计算 RMSE、MAE、Bias、NRMSE、R2 等回归指标。""" - err = y_pred - y_true - mse = np.mean(err**2) - rmse = float(np.sqrt(mse)) - mae = float(np.mean(np.abs(err))) - bias = float(np.mean(err)) - value_range = float(np.max(y_true) - np.min(y_true)) - ss_tot = float(np.sum((y_true - np.mean(y_true)) ** 2)) - ss_res = float(np.sum(err**2)) - valid_nrmse = value_range > eps_range - valid_r2 = ss_tot > eps_var - nrmse = float(rmse / value_range) if valid_nrmse else np.nan - r2 = float(1.0 - ss_res / ss_tot) if valid_r2 else np.nan - return { - "rmse": rmse, - "mae": mae, - "bias": bias, - "abs_bias": float(abs(bias)), - "nrmse": nrmse, - "r2": r2, - "valid_nrmse": bool(valid_nrmse), - "valid_r2": bool(valid_r2), - } - - -def infer_curve_layout(meta: dict, curve_dim: int) -> dict: - """从元数据读取曲线分段布局;旧数据没有布局时按压力/导数/斜率三等分回退。""" - curve_layout = meta.get("curve_layout") - if curve_layout is not None: - return curve_layout - n_time_points = curve_dim // 3 - return { - "n_time_points": int(n_time_points), - "parts": [ - {"name": "log_pressure", "start": 0, "end": n_time_points}, - {"name": "log_derivative", "start": n_time_points, "end": 2 * n_time_points}, - {"name": "slope", "start": 2 * n_time_points, "end": 3 * n_time_points}, - ], - } - - -def split_curve_by_layout(curve: np.ndarray, layout: dict) -> dict[str, np.ndarray]: - """按照 curve_layout 将拼接曲线拆成 log_pressure、log_derivative 和 slope 三段。""" - parts: dict[str, np.ndarray] = {} - for part in layout["parts"]: - start = int(part["start"]) - end = int(part["end"]) - parts[str(part["name"])] = curve[start:end] - return parts - - -def load_member(checkpoint_path: Path, device: torch.device) -> tuple[ForwardSurrogate, dict]: - """加载集成中的一个模型成员、对应归一化器和模型配置。""" - checkpoint = torch.load(checkpoint_path, map_location="cpu") - model = ForwardSurrogate( - param_dim=int(checkpoint["param_dim"]), - schedule_dim=int(checkpoint["schedule_dim"]), - curve_dim=int(checkpoint["curve_dim"]), - hidden_dim=int(checkpoint["hidden_dim"]), - dropout=float(checkpoint["dropout"]), - use_schedule=bool(checkpoint.get("use_schedule", True)), - ).to(device) - model.load_state_dict(checkpoint["model_state_dict"]) - model.eval() - return model, checkpoint - - -def safe_mean(x: np.ndarray) -> float: - """计算忽略 NaN 后的均值;没有有效数据时返回 NaN。""" - return float(np.mean(x)) if x.size else np.nan - - -def safe_median(x: np.ndarray) -> float: - """计算忽略 NaN 后的中位数;没有有效数据时返回 NaN。""" - return float(np.median(x)) if x.size else np.nan - - -def safe_percentile(x: np.ndarray, q: float) -> float: - """计算忽略 NaN 后的百分位数;没有有效数据时返回 NaN。""" - return float(np.percentile(x, q)) if x.size else np.nan - - -def pearson_corr(x: np.ndarray, y: np.ndarray) -> float: - """计算 Pearson 相关系数;样本数不足或方差为零时返回 NaN。""" - if x.size == 0 or y.size == 0: - return np.nan - if np.allclose(np.std(x), 0.0) or np.allclose(np.std(y), 0.0): - return np.nan - return float(np.corrcoef(x, y)[0, 1]) - - -def plot_uncertainty_scatter(sample_rows: list[dict], output_path: Path) -> None: - """绘制预测不确定性与真实误差的散点图,检查二者是否相关。""" - rmse = np.array([row["overall_rmse"] for row in sample_rows], dtype=np.float64) - unc = np.array([row["unc_mean_std"] for row in sample_rows], dtype=np.float64) - plt.figure(figsize=(7, 5)) - plt.scatter(unc, rmse, s=10, alpha=0.35) - plt.xlabel("Predictive Uncertainty (mean std)") - plt.ylabel("Overall RMSE") - plt.title(f"Uncertainty vs Error | Pearson={pearson_corr(unc, rmse):.4f}") - plt.grid(True, alpha=0.3) - plt.tight_layout() - plt.savefig(output_path, dpi=150, bbox_inches="tight") - plt.close() - - -def plot_uncertain_sample( - idx: int, - curve_true: np.ndarray, - curve_mean: np.ndarray, - curve_std: np.ndarray, - curve_layout: dict, - output_dir: Path, - unc_score: float, - rmse: float, -) -> None: - """绘制高不确定性样本的真实曲线、集成均值和成员分布。""" - true_parts = split_curve_by_layout(curve_true, curve_layout) - mean_parts = split_curve_by_layout(curve_mean, curve_layout) - std_parts = split_curve_by_layout(curve_std, curve_layout) - - title_map = { - "log_pressure": "Log Pressure", - "log_derivative": "Log |Derivative|", - "slope": "Slope of Log Pressure vs Log Time", - } - - fig, axes = plt.subplots(3, 1, figsize=(12, 10)) - fig.suptitle( - f"High-Uncertainty Sample #{idx} | unc_mean_std={unc_score:.4f}, overall_rmse={rmse:.4f}" - ) - - for ax, name in zip(axes, ["log_pressure", "log_derivative", "slope"]): - y_true = true_parts[name] - y_mean = mean_parts[name] - y_std = std_parts[name] - x = np.arange(len(y_true)) - ax.plot(x, y_true, label="True", linewidth=2) - ax.plot(x, y_mean, label="Ensemble mean", linewidth=2) - ax.fill_between( - x, - y_mean - 2.0 * y_std, - y_mean + 2.0 * y_std, - alpha=0.2, - label="mean ± 2 std", - ) - ax.set_title(title_map[name]) - ax.grid(True, alpha=0.3) - ax.legend() - - plt.tight_layout(rect=[0, 0, 1, 0.96]) - plt.savefig(output_dir / f"top_uncertain_sample_{idx:04d}.png", dpi=150, bbox_inches="tight") - plt.close() - - -def main() -> None: - """汇总多个代理模型成员的均值和方差,用于测试集误差与不确定性评估。""" - args = parse_args() - tag = normalize_tag(args.tag) - use_schedule = not args.no_schedule - seeds = parse_seed_list(args.seeds) - - processed_path = ( - Path(args.processed) - if args.processed is not None - else processed_path_for_tag(tag) - ) - model_root = ( - Path(args.model_root) - if args.model_root is not None - else default_model_root(tag, use_schedule) - ) - output_dir = ( - Path(args.output_dir) - if args.output_dir is not None - else default_output_dir(tag, use_schedule) - ) - output_dir.mkdir(parents=True, exist_ok=True) - - # 集成评估必须使用同一份 processed 数据,保证各成员输入标准化口径一致。 - data = joblib.load(processed_path) - x_params_test = data["X_params_test"] - x_schedule_test = data["X_schedule_test"] - y_curve_test = data["Y_curve_test"] - scaler_curve = data["scaler_curve"] - curve_layout = infer_curve_layout(data["meta"], int(data["meta"]["curve_dim"])) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - members: list[tuple[int, ForwardSurrogate]] = [] - member_paths = [] - first_use_schedule = None - - for seed in seeds: - ckpt_path = model_root / f"seed_{seed}" / "forward_surrogate_best.pt" - model, checkpoint = load_member(ckpt_path, device) - member_paths.append(str(ckpt_path)) - members.append((seed, model)) - cur_use_schedule = bool(checkpoint.get("use_schedule", True)) - if first_use_schedule is None: - first_use_schedule = cur_use_schedule - elif first_use_schedule != cur_use_schedule: - # 混用带/不带 schedule 的模型会导致输入含义不同,不能直接求均值和方差。 - raise RuntimeError("Ensemble 成员的 use_schedule 设置不一致") - - all_true = [] - all_mean = [] - all_std = [] - sample_rows = [] - - with torch.no_grad(): - for idx in range(len(x_params_test)): - params_t = torch.tensor( - x_params_test[idx : idx + 1], - dtype=torch.float32, - device=device, - ) - schedule_t = torch.tensor( - x_schedule_test[idx : idx + 1], - dtype=torch.float32, - device=device, - ) - member_preds = [] - for _, model in members: - # 每个成员独立预测后先反标准化;集成均值和标准差都在原始曲线尺度上计算。 - if first_use_schedule: - pred_scaled = model(params_t, schedule_t).cpu().numpy() - else: - pred_scaled = model(params_t, None).cpu().numpy() - pred = scaler_curve.inverse_transform(pred_scaled)[0].astype(np.float32) - member_preds.append(pred) - - member_preds = np.stack(member_preds, axis=0) - curve_true = scaler_curve.inverse_transform( - y_curve_test[idx : idx + 1] - )[0].astype(np.float32) - curve_mean = member_preds.mean(axis=0).astype(np.float32) - curve_std = member_preds.std(axis=0, ddof=0).astype(np.float32) - - metrics = calc_metrics(curve_true, curve_mean) - parts_std = split_curve_by_layout(curve_std, curve_layout) - - # std 作为经验不确定性指标,后续会和真实 RMSE 做相关性分析。 - sample_rows.append( - { - "idx": idx, - "overall_rmse": metrics["rmse"], - "overall_mae": metrics["mae"], - "overall_bias": metrics["bias"], - "overall_r2": metrics["r2"], - "unc_mean_std": float(np.mean(curve_std)), - "unc_max_std": float(np.max(curve_std)), - "unc_log_pressure_mean_std": float(np.mean(parts_std["log_pressure"])), - "unc_log_derivative_mean_std": float(np.mean(parts_std["log_derivative"])), - "unc_slope_mean_std": float(np.mean(parts_std["slope"])), - } - ) - - all_true.append(curve_true) - all_mean.append(curve_mean) - all_std.append(curve_std) - - all_true = np.stack(all_true, axis=0) - all_mean = np.stack(all_mean, axis=0) - all_std = np.stack(all_std, axis=0) - - overall_metrics = [calc_metrics(t, p) for t, p in zip(all_true, all_mean)] - rmse = np.array([m["rmse"] for m in overall_metrics], dtype=np.float64) - mae = np.array([m["mae"] for m in overall_metrics], dtype=np.float64) - r2_valid = np.array([m["r2"] for m in overall_metrics if m["valid_r2"]], dtype=np.float64) - unc = np.array([row["unc_mean_std"] for row in sample_rows], dtype=np.float64) - - # 聚合输出分为预测质量和不确定性质量两部分,方便单独比较 ensemble 是否有价值。 - summary = { - "ensemble": { - "member_count": len(members), - "member_paths": member_paths, - "use_schedule": bool(first_use_schedule), - "processed_path": str(processed_path), - }, - "prediction": { - "rmse_mean": safe_mean(rmse), - "rmse_median": safe_median(rmse), - "rmse_p90": safe_percentile(rmse, 90), - "mae_mean": safe_mean(mae), - "mae_median": safe_median(mae), - "r2_mean_valid": safe_mean(r2_valid), - "r2_median_valid": safe_median(r2_valid), - }, - "uncertainty": { - "unc_mean": safe_mean(unc), - "unc_median": safe_median(unc), - "unc_p90": safe_percentile(unc, 90), - "unc_vs_rmse_pearson": pearson_corr(unc, rmse), - }, - } - - with open(output_dir / "ensemble_uq_summary.json", "w", encoding="utf-8") as f: - json.dump(summary, f, ensure_ascii=False, indent=2) - - with open( - output_dir / "sample_uncertainty_metrics.csv", - "w", - newline="", - encoding="utf-8-sig", - ) as f: - writer = csv.DictWriter(f, fieldnames=list(sample_rows[0].keys())) - writer.writeheader() - writer.writerows(sample_rows) - - plot_uncertainty_scatter(sample_rows, output_dir / "uncertainty_vs_error.png") - - top_k = min(args.n_top_uncertain_plots, len(sample_rows)) - top_uncertain = sorted(sample_rows, key=lambda row: row["unc_mean_std"], reverse=True)[:top_k] - # 只绘制最高不确定性的样本,重点检查模型成员分歧最大的区域。 - for row in top_uncertain: - idx = int(row["idx"]) - plot_uncertain_sample( - idx=idx, - curve_true=all_true[idx], - curve_mean=all_mean[idx], - curve_std=all_std[idx], - curve_layout=curve_layout, - output_dir=output_dir, - unc_score=float(row["unc_mean_std"]), - rmse=float(row["overall_rmse"]), - ) - - print("Ensemble UQ evaluation complete.") - print(f"Output dir: {output_dir}") - print(f"RMSE mean={summary['prediction']['rmse_mean']:.6f}") - print(f"UQ-RMSE Pearson={summary['uncertainty']['unc_vs_rmse_pearson']:.6f}") - - -if __name__ == "__main__": - main() diff --git a/ML/nmWTAI-ML/scripts/evaluate_time_conditioned.py b/ML/nmWTAI-ML/scripts/evaluate_time_conditioned.py deleted file mode 100644 index 1ee6b5f..0000000 --- a/ML/nmWTAI-ML/scripts/evaluate_time_conditioned.py +++ /dev/null @@ -1,684 +0,0 @@ -"""评估时间条件正演代理模型。 - -时间条件模型以“样本参数 + 单个时间点”为输入预测压力/导数,因此本脚本会把整条曲线 -展开为点级批量推理,再还原为样本级误差统计、域内/域外分组摘要和最差案例图。 -该评估用于检查模型在可变时间采样与 PSO 参数域上的泛化能力。 -""" - -from __future__ import annotations - -# pylint: disable=import-error,wrong-import-position,line-too-long -# pylint: disable=too-many-arguments,too-many-positional-arguments -# pylint: disable=too-many-locals,too-many-statements -# pylint: disable=invalid-name,unused-argument,too-many-function-args - - -import argparse -import csv -import json -import random -import sys -from pathlib import Path -from typing import Iterable - -import joblib -import matplotlib.pyplot as plt -import numpy as np -import torch - -ROOT = Path(__file__).resolve().parents[1] -sys.path.append(str(ROOT)) - -from src.common.experiment_paths import normalize_tag, processed_path_for_tag -from src.data.param_features import inverse_transform_param_features -from src.models.time_conditioned_surrogate import TimeConditionedSurrogate -from src.training.train_forward import get_part_slices, infer_curve_layout - - -DEFAULT_RANDOM_SEED = 42 -DEFAULT_PSO_DOMAIN = { - "k_min": 0.001, - "k_max": 10.0, - "skin_min": -10.0, - "skin_max": 10.0, - "wellboreC_min": 1.0e-4, - "wellboreC_max": 2.0, - "phi_min": 0.01, - "phi_max": 0.5, - "h_min": 2.0, - "h_max": 50.0, -} - - -def parse_args() -> argparse.Namespace: - """解析时间条件代理模型评估所需的数据、checkpoint、样本数和输出目录。""" - parser = argparse.ArgumentParser(description="Evaluate a time-conditioned point-wise surrogate") - parser.add_argument("--processed", type=str, default=None, help="Processed dataset path") - parser.add_argument("--tag", type=str, default=None, help="Experiment tag for auto naming") - parser.add_argument("--model", type=str, default=None, help="Model checkpoint path") - parser.add_argument("--output-dir", type=str, default=None, help="Optional evaluation output directory") - parser.add_argument("--batch-size", type=int, default=65536, help="Point batch size for inference") - parser.add_argument("--device", type=str, default=None, help="Override device, e.g. cpu or cuda") - parser.add_argument("--seed", type=int, default=DEFAULT_RANDOM_SEED) - parser.add_argument("--n-random-plots", type=int, default=5) - parser.add_argument("--n-best-plots", type=int, default=5) - parser.add_argument("--n-worst-plots", type=int, default=10) - parser.add_argument("--top-k-analysis", type=int, default=300) - parser.add_argument("--pso-k-min", type=float, default=DEFAULT_PSO_DOMAIN["k_min"]) - parser.add_argument("--pso-k-max", type=float, default=DEFAULT_PSO_DOMAIN["k_max"]) - parser.add_argument("--pso-h-min", type=float, default=DEFAULT_PSO_DOMAIN["h_min"]) - parser.add_argument("--pso-h-max", type=float, default=DEFAULT_PSO_DOMAIN["h_max"]) - parser.add_argument("--pso-skin-min", type=float, default=DEFAULT_PSO_DOMAIN["skin_min"]) - parser.add_argument("--pso-skin-max", type=float, default=DEFAULT_PSO_DOMAIN["skin_max"]) - parser.add_argument("--pso-wellboreC-min", type=float, default=DEFAULT_PSO_DOMAIN["wellboreC_min"]) - parser.add_argument("--pso-wellboreC-max", type=float, default=DEFAULT_PSO_DOMAIN["wellboreC_max"]) - parser.add_argument("--pso-phi-min", type=float, default=DEFAULT_PSO_DOMAIN["phi_min"]) - parser.add_argument("--pso-phi-max", type=float, default=DEFAULT_PSO_DOMAIN["phi_max"]) - return parser.parse_args() - - -def default_model_path(tag: str | None) -> Path: - """根据实验标签和是否使用流量制度输入推导默认模型检查点路径。""" - if tag: - return Path("models") / f"time_conditioned_surrogate_{tag}" / "time_conditioned_surrogate_best.pt" - return Path("models/time_conditioned_surrogate/time_conditioned_surrogate_best.pt") - - -def default_output_dir(tag: str | None) -> Path: - """根据实验标签生成当前分析脚本默认的输出目录。""" - if tag: - return Path("results") / f"evaluation_time_conditioned_{tag}" - return Path("results/evaluation_time_conditioned") - - -def percentile_summary(values: np.ndarray) -> dict: - """计算数组的均值、中位数和若干百分位数,用于误差分布汇总。""" - x = np.asarray(values, dtype=np.float64).reshape(-1) - if x.size == 0: - return { - "min": None, - "p05": None, - "p25": None, - "median": None, - "p75": None, - "p90": None, - "p95": None, - "max": None, - } - return { - "min": float(np.min(x)), - "p05": float(np.percentile(x, 5)), - "p25": float(np.percentile(x, 25)), - "median": float(np.percentile(x, 50)), - "p75": float(np.percentile(x, 75)), - "p90": float(np.percentile(x, 90)), - "p95": float(np.percentile(x, 95)), - "max": float(np.max(x)), - } - - -def point_metrics(true: np.ndarray, pred: np.ndarray) -> dict: - """计算逐时间点预测的误差指标。""" - err = np.asarray(pred, dtype=np.float64) - np.asarray(true, dtype=np.float64) - abs_err = np.abs(err) - return { - "rmse": float(np.sqrt(np.mean(err**2))), - "mae": float(np.mean(abs_err)), - "bias": float(np.mean(err)), - "p90_abs": float(np.percentile(abs_err, 90)), - "p95_abs": float(np.percentile(abs_err, 95)), - } - - -def sample_metrics(true_p: np.ndarray, pred_p: np.ndarray, true_d: np.ndarray, pred_d: np.ndarray) -> list[dict]: - """计算单条曲线样本的整体误差指标。""" - rows: list[dict] = [] - for idx in range(true_p.shape[0]): - p_err = pred_p[idx] - true_p[idx] - d_err = pred_d[idx] - true_d[idx] - rmse_p = float(np.sqrt(np.mean(p_err**2))) - rmse_d = float(np.sqrt(np.mean(d_err**2))) - mae_p = float(np.mean(np.abs(p_err))) - mae_d = float(np.mean(np.abs(d_err))) - rows.append( - { - "idx": idx, - "rmse_p": rmse_p, - "rmse_d": rmse_d, - "mae_p": mae_p, - "mae_d": mae_d, - "score": float(rmse_p + 2.0 * rmse_d), - } - ) - return rows - - -def write_csv(path: Path, rows: list[dict], fieldnames: list[str] | None = None) -> None: - """按字段名写出 CSV 明细或汇总结果。""" - path.parent.mkdir(parents=True, exist_ok=True) - if not rows: - path.write_text("", encoding="utf-8-sig") - return - names = fieldnames or list(rows[0].keys()) - with path.open("w", newline="", encoding="utf-8-sig") as f: - writer = csv.DictWriter(f, fieldnames=names, extrasaction="ignore") - writer.writeheader() - writer.writerows(rows) - - -def iter_batches(total: int, batch_size: int) -> Iterable[tuple[int, int]]: - """把样本数量切分为批量索引范围,避免评估时一次性占用过多内存。""" - batch = max(1, int(batch_size)) - for start in range(0, int(total), batch): - yield start, min(start + batch, int(total)) - - -def load_model(model_path: Path, device: torch.device) -> tuple[TimeConditionedSurrogate, dict]: - """加载模型检查点,按保存的维度和超参数重建网络并切换到评估模式。""" - checkpoint = torch.load(model_path, map_location="cpu") - model = TimeConditionedSurrogate( - param_dim=int(checkpoint["param_dim"]), - schedule_dim=int(checkpoint["schedule_dim"]), - time_dim=int(checkpoint["time_dim"]), - hidden_dim=int(checkpoint["hidden_dim"]), - n_blocks=int(checkpoint["n_blocks"]), - dropout=float(checkpoint["dropout"]), - use_schedule=bool(checkpoint.get("use_schedule", True)), - ) - model.load_state_dict(checkpoint["model_state_dict"]) - model.eval() - model.to(device) - return model, checkpoint - - -def predict_scaled_points( - model: TimeConditionedSurrogate, - params_x: np.ndarray, - schedule_x: np.ndarray, - time_x: np.ndarray, - device: torch.device, - batch_size: int, -) -> np.ndarray: - """在标准化空间中批量预测时间条件模型的逐点输出。""" - n_samples, n_time, time_dim = time_x.shape - params_flat = np.repeat(params_x, n_time, axis=0) - schedule_flat = np.repeat(schedule_x, n_time, axis=0) - time_flat = time_x.reshape(n_samples * n_time, time_dim) - - pred_flat = np.empty((n_samples * n_time, 2), dtype=np.float32) - use_schedule = bool(model.use_schedule) - with torch.no_grad(): - for start, end in iter_batches(len(time_flat), batch_size): - params_t = torch.tensor(params_flat[start:end], dtype=torch.float32, device=device) - time_t = torch.tensor(time_flat[start:end], dtype=torch.float32, device=device) - if use_schedule: - schedule_t = torch.tensor(schedule_flat[start:end], dtype=torch.float32, device=device) - else: - schedule_t = None - pred_flat[start:end] = model(params_t, time_t, schedule_t).detach().cpu().numpy() - return pred_flat.reshape(n_samples, n_time, 2) - - -def inverse_curve_part(values_scaled: np.ndarray, scaler_curve: object, part_slice: slice) -> np.ndarray: - """使用曲线 scaler 的对应分段参数把预测值恢复到原始尺度。""" - mean = np.asarray(scaler_curve.mean_[part_slice], dtype=np.float32) - scale = np.asarray(scaler_curve.scale_[part_slice], dtype=np.float32) - return values_scaled.astype(np.float32) * scale.reshape(1, -1) + mean.reshape(1, -1) - - -def recover_raw_params(data: dict) -> dict[str, np.ndarray]: - """把标准化后的参数特征反变换回原始物理参数。""" - meta = data.get("meta", {}) or {} - features = data["scaler_params"].inverse_transform(data["X_params_test"]) - raw = inverse_transform_param_features(features, meta.get("param_feature_transform")) - names = list(meta.get("param_names") or ["k", "skin", "wellboreC", "phi", "h", "Cf"]) - return {name: raw[:, idx].astype(np.float64) for idx, name in enumerate(names[: raw.shape[1]])} - - -def build_pso_mask(params: dict[str, np.ndarray], args: argparse.Namespace) -> np.ndarray: - """根据元数据标记出 PSO/自动拟合相关样本,用于单独统计。""" - return ( - (params["k"] >= float(args.pso_k_min)) - & (params["k"] <= float(args.pso_k_max)) - & (params["skin"] >= float(args.pso_skin_min)) - & (params["skin"] <= float(args.pso_skin_max)) - & (params["wellboreC"] >= float(args.pso_wellboreC_min)) - & (params["wellboreC"] <= float(args.pso_wellboreC_max)) - & (params["phi"] >= float(args.pso_phi_min)) - & (params["phi"] <= float(args.pso_phi_max)) - & (params["h"] >= float(args.pso_h_min)) - & (params["h"] <= float(args.pso_h_max)) - ) - - -def summarize_group(score: np.ndarray, rmse_p: np.ndarray, rmse_d: np.ndarray, mask: np.ndarray) -> dict: - """对一个样本分组计算误差、不确定性和样本数量等汇总指标。""" - m = np.asarray(mask, dtype=bool) - return { - "n": int(np.sum(m)), - "score": percentile_summary(score[m]), - "rmse_p": percentile_summary(rmse_p[m]), - "rmse_d": percentile_summary(rmse_d[m]), - "score_gt_1_ratio": float(np.mean(score[m] > 1.0)) if np.any(m) else None, - "score_gt_2_ratio": float(np.mean(score[m] > 2.0)) if np.any(m) else None, - "score_gt_5_ratio": float(np.mean(score[m] > 5.0)) if np.any(m) else None, - } - - -def build_domain_summary(sample_rows: list[dict], params: dict[str, np.ndarray], pso_mask: np.ndarray) -> dict: - """按数据域或流量制度类别构建评估摘要。""" - score = np.asarray([r["score"] for r in sample_rows], dtype=np.float64) - rmse_p = np.asarray([r["rmse_p"] for r in sample_rows], dtype=np.float64) - rmse_d = np.asarray([r["rmse_d"] for r in sample_rows], dtype=np.float64) - skin = params["skin"] - wellboreC = params["wellboreC"] - - order = np.argsort(-score) - top100 = order[: min(100, order.size)] - return { - "all": summarize_group(score, rmse_p, rmse_d, np.ones_like(pso_mask, dtype=bool)), - "pso_domain": summarize_group(score, rmse_p, rmse_d, pso_mask), - "outside_pso_domain": summarize_group(score, rmse_p, rmse_d, ~pso_mask), - "pso_skin_lt_minus_5": summarize_group(score, rmse_p, rmse_d, pso_mask & (skin < -5.0)), - "pso_skin_lt_minus_8": summarize_group(score, rmse_p, rmse_d, pso_mask & (skin < -8.0)), - "pso_skin_lt_minus_5_wellboreC_gt_0_1": summarize_group( - score, - rmse_p, - rmse_d, - pso_mask & (skin < -5.0) & (wellboreC > 0.1), - ), - "top100": { - "outside_pso_domain": int(np.sum(~pso_mask[top100])), - "k_lt_0_001": int(np.sum(params["k"][top100] < 0.001)), - "k_gt_10": int(np.sum(params["k"][top100] > 10.0)), - "h_gt_50": int(np.sum(params["h"][top100] > 50.0)), - "pso_skin_lt_minus_5_wellboreC_gt_0_1": int( - np.sum((pso_mask & (skin < -5.0) & (wellboreC > 0.1))[top100]) - ), - }, - } - - -def summarize_params_for_indices(params: dict[str, np.ndarray], indices: np.ndarray) -> dict: - """统计指定样本索引对应的物理参数范围和中心值。""" - return { - name: percentile_summary(values[np.asarray(indices, dtype=int)]) - for name, values in params.items() - if name in {"k", "skin", "wellboreC", "phi", "h", "Cf"} - } - - -def build_worst_case_summary( - sample_rows: list[dict], - params: dict[str, np.ndarray], - pso_mask: np.ndarray, - top_k: int, -) -> dict: - """挑选误差最大的样本并整理其指标、参数和元数据。""" - score = np.asarray([r["score"] for r in sample_rows], dtype=np.float64) - order_worst = np.argsort(-score) - order_best = np.argsort(score) - top = order_worst[: min(int(top_k), order_worst.size)] - worst100 = order_worst[: min(100, order_worst.size)] - best100 = order_best[: min(100, order_best.size)] - - return { - "top_k": int(top.size), - "metrics": { - "score": percentile_summary(score), - "n_score_gt_1": int(np.sum(score > 1.0)), - "n_score_gt_2": int(np.sum(score > 2.0)), - "n_score_gt_5": int(np.sum(score > 5.0)), - }, - "pso_domain": { - "n_inside": int(np.sum(pso_mask)), - "n_outside": int(np.sum(~pso_mask)), - "top100_outside": int(np.sum(~pso_mask[worst100])), - "top100_k_lt_0_001": int(np.sum(params["k"][worst100] < 0.001)), - "top100_k_gt_10": int(np.sum(params["k"][worst100] > 10.0)), - "top100_h_gt_50": int(np.sum(params["h"][worst100] > 50.0)), - }, - "params": { - "all": summarize_params_for_indices(params, np.arange(score.size)), - "worst_top_k": summarize_params_for_indices(params, top), - "worst100": summarize_params_for_indices(params, worst100), - "best100": summarize_params_for_indices(params, best100), - }, - } - - -def build_worst_case_rows( - sample_rows: list[dict], - params: dict[str, np.ndarray], - data: dict, - true_p: np.ndarray, - true_d: np.ndarray, - pred_p: np.ndarray, - pred_d: np.ndarray, - pso_mask: np.ndarray, - top_k: int, -) -> tuple[list[dict], list[dict]]: - """生成最差样本明细行,供 CSV 输出和人工检查。""" - score = np.asarray([r["score"] for r in sample_rows], dtype=np.float64) - order = np.argsort(-score)[: min(int(top_k), len(sample_rows))] - family = data.get("family_name_test") - schedule_meta = data.get("schedule_meta_test") - schedule_meta_names = list((data.get("meta", {}) or {}).get("schedule_meta_names") or []) - - case_rows: list[dict] = [] - residual_rows: list[dict] = [] - for rank, idx in enumerate(order, 1): - p_res = pred_p[idx] - true_p[idx] - d_res = pred_d[idx] - true_d[idx] - p_rmse = float(np.sqrt(np.mean(p_res**2))) - d_rmse = float(np.sqrt(np.mean(d_res**2))) - p_mean = float(np.mean(p_res)) - d_mean = float(np.mean(d_res)) - p_std = float(np.std(p_res)) - d_std = float(np.std(d_res)) - - row = { - "rank": rank, - "idx": int(idx), - "score": float(score[idx]), - "rmse_p": float(sample_rows[idx]["rmse_p"]), - "rmse_d": float(sample_rows[idx]["rmse_d"]), - "mae_p": float(sample_rows[idx]["mae_p"]), - "mae_d": float(sample_rows[idx]["mae_d"]), - "in_pso_domain": int(bool(pso_mask[idx])), - "family": str(family[idx]) if family is not None else "", - } - for name in ["k", "skin", "wellboreC", "phi", "h", "Cf"]: - if name in params: - row[name] = float(params[name][idx]) - if schedule_meta is not None: - for midx, name in enumerate(schedule_meta_names): - if midx < schedule_meta.shape[1]: - row[f"sched_{name}"] = float(schedule_meta[idx, midx]) - case_rows.append(row) - - residual_rows.append( - { - "rank": rank, - "idx": int(idx), - "score": float(score[idx]), - "p_res_mean": p_mean, - "p_res_std": p_std, - "p_res_rmse": p_rmse, - "p_shift_ratio": float(abs(p_mean) / max(p_rmse, 1.0e-12)), - "d_res_mean": d_mean, - "d_res_std": d_std, - "d_res_rmse": d_rmse, - "d_shift_ratio": float(abs(d_mean) / max(d_rmse, 1.0e-12)), - "p_res_first": float(p_res[0]), - "p_res_last": float(p_res[-1]), - "d_res_first": float(d_res[0]), - "d_res_last": float(d_res[-1]), - } - ) - return case_rows, residual_rows - - -def plot_sample( - output_path: Path, - idx: int, - t: np.ndarray, - true_p: np.ndarray, - pred_p: np.ndarray, - true_d: np.ndarray, - pred_d: np.ndarray, - title: str, -) -> None: - """绘制单个样本的真实曲线、预测曲线和误差曲线,并保存为图片。""" - x = np.asarray(t, dtype=np.float64) - fig, axes = plt.subplots(2, 2, figsize=(13, 8)) - fig.suptitle(title) - - axes[0, 0].plot(x, true_p, label="True", linewidth=2) - axes[0, 0].plot(x, pred_p, label="Pred", linewidth=2) - axes[0, 0].set_title("Log Pressure") - axes[0, 0].set_xscale("log") - axes[0, 0].grid(True, alpha=0.3) - axes[0, 0].legend() - - axes[0, 1].plot(x, pred_p - true_p, linewidth=1.5) - axes[0, 1].axhline(0.0, linestyle="--", linewidth=1) - axes[0, 1].set_title("Pressure Residual") - axes[0, 1].set_xscale("log") - axes[0, 1].grid(True, alpha=0.3) - - axes[1, 0].plot(x, true_d, label="True", linewidth=2) - axes[1, 0].plot(x, pred_d, label="Pred", linewidth=2) - axes[1, 0].set_title("Log Derivative") - axes[1, 0].set_xscale("log") - axes[1, 0].grid(True, alpha=0.3) - axes[1, 0].legend() - - axes[1, 1].plot(x, pred_d - true_d, linewidth=1.5) - axes[1, 1].axhline(0.0, linestyle="--", linewidth=1) - axes[1, 1].set_title("Derivative Residual") - axes[1, 1].set_xscale("log") - axes[1, 1].grid(True, alpha=0.3) - - for ax in axes.ravel(): - ax.set_xlabel("Time") - - output_path.parent.mkdir(parents=True, exist_ok=True) - plt.tight_layout(rect=[0, 0, 1, 0.95]) - plt.savefig(output_path, dpi=150, bbox_inches="tight") - plt.close(fig) - - -def write_plots( - output_dir: Path, - sample_rows: list[dict], - t_curve: np.ndarray, - true_p: np.ndarray, - true_d: np.ndarray, - pred_p: np.ndarray, - pred_d: np.ndarray, - args: argparse.Namespace, -) -> None: - """批量绘制代表性样本或最差样本的曲线对比图。""" - plot_dir = output_dir / "plots" - score = np.asarray([r["score"] for r in sample_rows], dtype=np.float64) - random.seed(int(args.seed)) - n_random = min(int(args.n_random_plots), len(sample_rows)) - n_best = min(int(args.n_best_plots), len(sample_rows)) - n_worst = min(int(args.n_worst_plots), len(sample_rows)) - best = np.argsort(score)[:n_best].tolist() - worst = np.argsort(-score)[:n_worst].tolist() - random_idx = random.sample(range(len(sample_rows)), n_random) - - for idx in random_idx: - plot_sample( - plot_dir / f"sample_{idx:04d}.png", - idx, - t_curve[idx], - true_p[idx], - pred_p[idx], - true_d[idx], - pred_d[idx], - f"Random sample {idx} | score={score[idx]:.4f}", - ) - for idx in best: - plot_sample( - plot_dir / f"best_sample_{idx:04d}.png", - idx, - t_curve[idx], - true_p[idx], - pred_p[idx], - true_d[idx], - pred_d[idx], - f"Best sample {idx} | score={score[idx]:.4f}", - ) - for idx in worst: - plot_sample( - plot_dir / f"worst_sample_{idx:04d}.png", - idx, - t_curve[idx], - true_p[idx], - pred_p[idx], - true_d[idx], - pred_d[idx], - f"Worst sample {idx} | score={score[idx]:.4f}", - ) - - -def main() -> None: - """评估按时间点预测的代理模型,分别统计缩放域和原始对数域误差。""" - args = parse_args() - tag = normalize_tag(args.tag) - processed_path = Path(args.processed) if args.processed is not None else processed_path_for_tag(tag) - model_path = Path(args.model) if args.model is not None else default_model_path(tag) - output_dir = Path(args.output_dir) if args.output_dir is not None else default_output_dir(tag) - output_dir.mkdir(parents=True, exist_ok=True) - - device_name = args.device or ("cuda" if torch.cuda.is_available() else "cpu") - device = torch.device(device_name) - - print("Loading processed dataset...") - data = joblib.load(processed_path) - required = ["X_params_test", "X_schedule_test", "X_time_test", "Y_curve_test"] - missing = [key for key in required if key not in data] - if missing: - # 时间条件模型依赖逐点时间特征;旧版 processed 数据没有这些字段时不能评估。 - raise KeyError(f"processed dataset is missing time-conditioned fields: {missing}") - - print("Loading model...") - model, checkpoint = load_model(model_path, device) - curve_layout = checkpoint.get("curve_layout") or infer_curve_layout(data) - slices = get_part_slices(curve_layout) - - x_params = np.asarray(data["X_params_test"], dtype=np.float32) - x_schedule = np.asarray(data["X_schedule_test"], dtype=np.float32) - x_time = np.asarray(data["X_time_test"], dtype=np.float32) - y_curve = np.asarray(data["Y_curve_test"], dtype=np.float32) - scaler_curve = data["scaler_curve"] - - print( - f"test={x_params.shape[0]}, n_time={x_time.shape[1]}, " - f"param_dim={x_params.shape[1]}, schedule_dim={x_schedule.shape[1]}, time_dim={x_time.shape[-1]}" - ) - print(f"device={device}, batch_size={args.batch_size}") - - pred_scaled = predict_scaled_points( - model=model, - params_x=x_params, - schedule_x=x_schedule, - time_x=x_time, - device=device, - batch_size=int(args.batch_size), - ) - - p_slice = slices["log_pressure"] - d_slice = slices["log_derivative"] - # 时间条件模型只预测 pressure/derivative 两个通道,不直接预测辅助 slope。 - true_p_scaled = y_curve[:, p_slice] - true_d_scaled = y_curve[:, d_slice] - pred_p_scaled = pred_scaled[:, :, 0] - pred_d_scaled = pred_scaled[:, :, 1] - - true_p = inverse_curve_part(true_p_scaled, scaler_curve, p_slice) - true_d = inverse_curve_part(true_d_scaled, scaler_curve, d_slice) - pred_p = inverse_curve_part(pred_p_scaled, scaler_curve, p_slice) - pred_d = inverse_curve_part(pred_d_scaled, scaler_curve, d_slice) - - # scaled 指标看模型训练空间内的误差,raw 指标看真实物理/工程尺度下的误差。 - summary = { - "processed_path": str(processed_path), - "model_path": str(model_path), - "device": str(device), - "checkpoint": { - "hidden_dim": int(checkpoint["hidden_dim"]), - "n_blocks": int(checkpoint["n_blocks"]), - "dropout": float(checkpoint["dropout"]), - "use_schedule": bool(checkpoint.get("use_schedule", True)), - }, - "scaled_log_pressure": point_metrics(true_p_scaled, pred_p_scaled), - "scaled_log_derivative": point_metrics(true_d_scaled, pred_d_scaled), - "raw_log_pressure": point_metrics(true_p, pred_p), - "raw_log_derivative": point_metrics(true_d, pred_d), - } - - rows = sample_metrics(true_p=true_p, pred_p=pred_p, true_d=true_d, pred_d=pred_d) - params = recover_raw_params(data) - pso_mask = build_pso_mask(params, args) - domain_summary = build_domain_summary(rows, params, pso_mask) - # PSO-domain 单独统计,用来判断模型在自动拟合常用参数范围内是否稳定。 - summary["pso_domain"] = { - "bounds": { - "k": [float(args.pso_k_min), float(args.pso_k_max)], - "skin": [float(args.pso_skin_min), float(args.pso_skin_max)], - "wellboreC": [float(args.pso_wellboreC_min), float(args.pso_wellboreC_max)], - "phi": [float(args.pso_phi_min), float(args.pso_phi_max)], - "h": [float(args.pso_h_min), float(args.pso_h_max)], - }, - "metrics": domain_summary, - } - - case_rows, residual_rows = build_worst_case_rows( - sample_rows=rows, - params=params, - data=data, - true_p=true_p, - true_d=true_d, - pred_p=pred_p, - pred_d=pred_d, - pso_mask=pso_mask, - top_k=int(args.top_k_analysis), - ) - worst_case_summary = build_worst_case_summary( - sample_rows=rows, - params=params, - pso_mask=pso_mask, - top_k=int(args.top_k_analysis), - ) - - # 输出分为整体指标、逐样本指标、最差案例和最差残差,便于逐层排查误差来源。 - (output_dir / "summary_metrics.json").write_text( - json.dumps(summary, indent=2, ensure_ascii=False), - encoding="utf-8", - ) - write_csv(output_dir / "sample_metrics.csv", rows) - write_csv(output_dir / "worst_case_analysis.csv", case_rows) - write_csv(output_dir / "worst_residual_analysis.csv", residual_rows) - (output_dir / "worst_case_summary.json").write_text( - json.dumps(worst_case_summary, indent=2, ensure_ascii=False), - encoding="utf-8", - ) - - residual_summary = { - # 残差 shift ratio 用于观察坏样本是否主要表现为整条曲线偏移。 - "top_k": int(len(residual_rows)), - "top300_p_shift_ratio_median": float(np.median([r["p_shift_ratio"] for r in residual_rows])), - "top300_d_shift_ratio_median": float(np.median([r["d_shift_ratio"] for r in residual_rows])), - "top100_p_shift_ratio_median": float(np.median([r["p_shift_ratio"] for r in residual_rows[:100]])), - "top100_d_shift_ratio_median": float(np.median([r["d_shift_ratio"] for r in residual_rows[:100]])), - "top20": residual_rows[:20], - } - (output_dir / "worst_residual_summary.json").write_text( - json.dumps(residual_summary, indent=2, ensure_ascii=False), - encoding="utf-8", - ) - - t_curve = np.asarray(data.get("T_curve_test"), dtype=np.float32) - if t_curve.ndim == 2 and t_curve.shape == true_p.shape: - # 只有保存了与曲线形状一致的真实时间坐标时,才绘制时间轴曲线图。 - write_plots(output_dir, rows, t_curve, true_p, true_d, pred_p, pred_d, args) - - print("\nEvaluation complete.") - print(f"raw_log_pressure RMSE={summary['raw_log_pressure']['rmse']:.6f}, MAE={summary['raw_log_pressure']['mae']:.6f}") - print(f"raw_log_derivative RMSE={summary['raw_log_derivative']['rmse']:.6f}, MAE={summary['raw_log_derivative']['mae']:.6f}") - print( - "PSO-domain: " - f"n={domain_summary['pso_domain']['n']}, " - f"median={domain_summary['pso_domain']['score']['median']:.6f}, " - f"p95={domain_summary['pso_domain']['score']['p95']:.6f}, " - f"score>1={100.0 * domain_summary['pso_domain']['score_gt_1_ratio']:.3f}%" - ) - print(f"Artifacts written to: {output_dir}") - - -if __name__ == "__main__": - main() diff --git a/ML/nmWTAI-ML/scripts/finetune_forward_local_ranking.py b/ML/nmWTAI-ML/scripts/finetune_forward_local_ranking.py index aad86e5..beca0aa 100644 --- a/ML/nmWTAI-ML/scripts/finetune_forward_local_ranking.py +++ b/ML/nmWTAI-ML/scripts/finetune_forward_local_ranking.py @@ -78,17 +78,18 @@ def set_seed(seed: int) -> None: def infer_curve_layout(meta: dict, curve_dim: int) -> dict: - """从元数据读取曲线分段布局;旧数据没有布局时按压力/导数/斜率三等分回退。""" + """从元数据读取布局;缺失时兼容推断双通道或旧三通道布局。""" curve_layout = meta.get("curve_layout") if curve_layout is not None: return curve_layout - n_time_points = curve_dim // 3 + n_parts = 3 if curve_dim % 3 == 0 else 2 + n_time_points = curve_dim // n_parts + names = ["log_pressure", "log_derivative"] + (["slope"] if n_parts == 3 else []) return { "n_time_points": int(n_time_points), "parts": [ - {"name": "log_pressure", "start": 0, "end": n_time_points}, - {"name": "log_derivative", "start": n_time_points, "end": 2 * n_time_points}, - {"name": "slope", "start": 2 * n_time_points, "end": 3 * n_time_points}, + {"name": name, "start": i * n_time_points, "end": (i + 1) * n_time_points} + for i, name in enumerate(names) ], } diff --git a/ML/nmWTAI-ML/scripts/generate_autofit_neighborhood_dataset.py b/ML/nmWTAI-ML/scripts/generate_autofit_neighborhood_dataset.py index 210862b..8b91f9c 100644 --- a/ML/nmWTAI-ML/scripts/generate_autofit_neighborhood_dataset.py +++ b/ML/nmWTAI-ML/scripts/generate_autofit_neighborhood_dataset.py @@ -32,10 +32,18 @@ import numpy as np from src.common.config import Config from src.common.experiment_paths import config_for_stage, normalize_tag -from src.data.curve_processing import clean_curve_for_dataset, is_valid_curve, resample_curve_to_features +from src.data.curve_processing import ( + clean_curve_for_dataset, + is_valid_curve, + resample_curve_to_features_with_time, +) from src.data.params import Params, Schedule, generate_params_dataset from src.data.runner_client import CppRunner, read_result_bin -from src.data.schedule_encoding import encode_schedule_to_timegrid +from src.data.schedule_features import ( + build_schedule_model_vector, + schedule_meta_feature_names, + use_schedule_meta_features, +) from src.evaluation.autofit_objective import dual_log_objective @@ -269,7 +277,13 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--config", type=str, default=None) parser.add_argument( "--stage", - choices=["fixed_case", "case_neighborhood", "family_random", "family_random_hard"], + choices=[ + "fixed_case", + "case_neighborhood", + "family_random", + "family_random_hard", + "family_random_v2_q", + ], default="family_random", ) parser.add_argument("--output", type=str, default=None, help="Output HDF5 path") @@ -289,6 +303,12 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--max-perturbed-dims", type=int, default=3) parser.add_argument("--balance-families", action="store_true", default=True) parser.add_argument("--objective-bins", type=int, default=3) + parser.add_argument( + "--early-stop-valid-factor", + type=float, + default=1.25, + help="Stop an anchor when total valid neighbors reaches this multiple of neighbors-per-anchor, even if some spans are sparse", + ) parser.add_argument("--solver-timeout", type=int, default=120) parser.add_argument("--well-index", type=int, default=0) parser.add_argument( @@ -333,21 +353,16 @@ def resolve_output_path(cfg: Config, args: argparse.Namespace) -> Path: return (cfg.paths.samples_dir / f"{tag}.h5").resolve() -def build_schedule_vector(cfg: Config, schedule: Schedule) -> np.ndarray: +def build_schedule_vector( + cfg: Config, + schedule: Schedule, + family_name: str = "unknown", +) -> np.ndarray: """把 Schedule 编码成正演代理模型可直接接收的流量制度特征向量。""" - enc = encode_schedule_to_timegrid( + return build_schedule_model_vector( cfg, - section_index=int(schedule.sectionIndex), - time_q=schedule.timeQ, - q=schedule.q, - n_sections=len(schedule.timeQ), - ) - return np.concatenate( - [ - np.asarray(enc.x_sched, dtype=np.float32).reshape(-1), - np.asarray(enc.x_sec, dtype=np.float32).reshape(-1), - ], - axis=0, + schedule, + family_name=family_name, ).astype(np.float32) @@ -357,7 +372,7 @@ def run_solver_and_extract_curve( params: Params, well_index: int, timeout: int, -) -> tuple[np.ndarray, dict]: +) -> tuple[np.ndarray, np.ndarray, dict]: """调用 C++ 求解器运行一次正演,并把双对数输出重采样为模型曲线向量。""" ok = runner.run_simulation(params, timeout=timeout, override_schedule=params.schedule, include_schedule=True) result = read_result_bin(runner.result_bin) if runner.result_bin.exists() else None @@ -382,7 +397,7 @@ def run_solver_and_extract_curve( if not valid: raise RuntimeError(f"curve_invalid_{reason}") - curve_feat = resample_curve_to_features(cfg, t_clean, p_clean, d_clean) + curve_feat, curve_time = resample_curve_to_features_with_time(cfg, t_clean, p_clean, d_clean) raw = { "t": t_clean.tolist(), "p": p_clean.tolist(), @@ -390,7 +405,7 @@ def run_solver_and_extract_curve( "n_steps": int(result["nSteps"]), "n_wells": int(result["nWells"]), } - return curve_feat, raw + return curve_feat, curve_time, raw def params_to_array(params: Params) -> np.ndarray: @@ -570,11 +585,11 @@ def select_objective_stratified_indices(objectives: np.ndarray, k: int, objectiv def select_multiscale_rows( - rows: list[tuple[np.ndarray, np.ndarray, dict[str, float], float]], + rows: list[tuple[np.ndarray, np.ndarray, dict[str, float], float, np.ndarray]], k: int, objective_bins: int, span_fracs: list[float], -) -> list[tuple[np.ndarray, np.ndarray, dict[str, float], float]]: +) -> list[tuple[np.ndarray, np.ndarray, dict[str, float], float, np.ndarray]]: """从不同扰动尺度的候选中挑选代表性行,形成多尺度邻域数据。""" if len(rows) <= k or len(span_fracs) <= 1: indices = select_objective_stratified_indices( @@ -659,7 +674,9 @@ def create_output_file( # anchor_* 保存中心样本;neighbor_* 保存围绕该中心扰动后的候选及其真实目标函数。 f.create_dataset("anchor_params", shape=(0, param_dim), maxshape=(None, param_dim), dtype=np.float32, chunks=(64, param_dim)) f.create_dataset("anchor_schedule", shape=(0, schedule_dim), maxshape=(None, schedule_dim), dtype=np.float32, chunks=(64, schedule_dim)) + n_time_points = int(curve_dim // 2) f.create_dataset("anchor_curve", shape=(0, curve_dim), maxshape=(None, curve_dim), dtype=np.float32, chunks=(64, curve_dim)) + f.create_dataset("anchor_curve_time", shape=(0, n_time_points), maxshape=(None, n_time_points), dtype=np.float32, chunks=(64, n_time_points)) f.create_dataset("anchor_schedule_meta", shape=(0, len(SCHEDULE_META_NAMES)), maxshape=(None, len(SCHEDULE_META_NAMES)), dtype=np.float32, chunks=(64, len(SCHEDULE_META_NAMES))) f.create_dataset("anchor_family_name", shape=(0,), maxshape=(None,), dtype=h5py.string_dtype(encoding="utf-8"), chunks=(64,)) f.create_dataset("anchor_section_index", shape=(0,), maxshape=(None,), dtype=np.int32, chunks=(64,)) @@ -669,6 +686,7 @@ def create_output_file( f.create_dataset("neighbor_anchor_id", shape=(0,), maxshape=(None,), dtype=np.int32, chunks=(256,)) f.create_dataset("neighbor_params", shape=(0, param_dim), maxshape=(None, param_dim), dtype=np.float32, chunks=(256, param_dim)) f.create_dataset("neighbor_curve", shape=(0, curve_dim), maxshape=(None, curve_dim), dtype=np.float32, chunks=(256, curve_dim)) + f.create_dataset("neighbor_curve_time", shape=(0, n_time_points), maxshape=(None, n_time_points), dtype=np.float32, chunks=(256, n_time_points)) f.create_dataset("neighbor_objective", shape=(0,), maxshape=(None,), dtype=np.float32, chunks=(256,)) f.create_dataset("neighbor_objective_p", shape=(0,), maxshape=(None,), dtype=np.float32, chunks=(256,)) f.create_dataset("neighbor_objective_d", shape=(0,), maxshape=(None,), dtype=np.float32, chunks=(256,)) @@ -683,6 +701,7 @@ def append_anchor( anchor_params: np.ndarray, schedule_vec: np.ndarray, curve: np.ndarray, + curve_time: np.ndarray, schedule_meta: np.ndarray, family_name: str, section_index: int, @@ -696,6 +715,7 @@ def append_anchor( ("anchor_params", anchor_params.reshape(1, -1)), ("anchor_schedule", schedule_vec.reshape(1, -1)), ("anchor_curve", curve.reshape(1, -1)), + ("anchor_curve_time", curve_time.reshape(1, -1)), ("anchor_schedule_meta", schedule_meta.reshape(1, -1)), ]: ds = f[name] @@ -719,6 +739,7 @@ def append_neighbors( anchor_id: int, params_list: list[np.ndarray], curve_list: list[np.ndarray], + curve_time_list: list[np.ndarray], obj_list: list[dict[str, float]], span_frac_list: list[float], ) -> None: @@ -742,6 +763,7 @@ def append_neighbors( resize_1d("neighbor_anchor_id")[start:end] = int(anchor_id) resize_2d("neighbor_params")[start:end] = np.stack(params_list, axis=0).astype(np.float32) resize_2d("neighbor_curve")[start:end] = np.stack(curve_list, axis=0).astype(np.float32) + resize_2d("neighbor_curve_time")[start:end] = np.stack(curve_time_list, axis=0).astype(np.float32) resize_1d("neighbor_objective")[start:end] = np.asarray([x["dual_log_objective"] for x in obj_list], dtype=np.float32) resize_1d("neighbor_objective_p")[start:end] = np.asarray([x["log_pressure_objective"] for x in obj_list], dtype=np.float32) resize_1d("neighbor_objective_d")[start:end] = np.asarray([x["log_derivative_objective"] for x in obj_list], dtype=np.float32) @@ -758,6 +780,8 @@ def main() -> None: span_fracs = resolve_span_fracs(args) schedule_dim = int(np.prod(cfg.schedule_grid_shape)) + int(cfg.sec_feat_dim) + if use_schedule_meta_features(cfg): + schedule_dim += len(schedule_meta_feature_names(cfg)) curve_dim = int(cfg.curve_dim) param_dim = len(cfg.raw["params"]["all_physical_param_names"]) local_search_names = search_param_names(cfg) @@ -809,7 +833,7 @@ def main() -> None: use_server=bool(args.use_runner_server), ) try: - anchor_curve, _ = run_solver_and_extract_curve( + anchor_curve, anchor_curve_time, _ = run_solver_and_extract_curve( runner=anchor_runner, cfg=cfg, params=anchor_params, @@ -822,12 +846,17 @@ def main() -> None: anchor_runner.close() continue - schedule_vec = build_schedule_vector(cfg, anchor_params.schedule) + schedule_vec = build_schedule_vector( + cfg, + anchor_params.schedule, + family_name=family_name, + ) anchor_id = append_anchor( f=f, anchor_params=params_to_array(anchor_params), schedule_vec=schedule_vec, curve=anchor_curve, + curve_time=anchor_curve_time, schedule_meta=schedule_meta, family_name=family_name, section_index=int(anchor_params.schedule.sectionIndex), @@ -841,7 +870,9 @@ def main() -> None: f"use_server={bool(args.use_runner_server)}" ) - valid_neighbor_rows: list[tuple[np.ndarray, np.ndarray, dict[str, float], float]] = [] + valid_neighbor_rows: list[ + tuple[np.ndarray, np.ndarray, dict[str, float], float, np.ndarray] + ] = [] valid_span_counter: Counter[str] = Counter() max_attempts = max( int(args.neighbors_per_anchor), @@ -849,9 +880,13 @@ def main() -> None: ) * len(span_fracs) span_targets = build_span_targets(int(args.neighbors_per_anchor), span_fracs) span_stop_targets = { - float(span_frac): max(int(target), int(target) * max(int(args.objective_bins), 1)) + float(span_frac): int(target) for span_frac, target in span_targets.items() } + total_stop_target = max( + int(args.neighbors_per_anchor), + int(np.ceil(float(args.neighbors_per_anchor) * float(args.early_stop_valid_factor))), + ) for attempt in range(max_attempts): span_frac = float(span_fracs[attempt % len(span_fracs)]) @@ -863,19 +898,27 @@ def main() -> None: max_perturbed_dims=int(args.max_perturbed_dims), ) try: - cand_curve, _ = run_solver_and_extract_curve( + cand_curve, cand_curve_time, _ = run_solver_and_extract_curve( runner=anchor_runner, cfg=cfg, params=cand, well_index=int(args.well_index), timeout=int(args.solver_timeout), ) + n_time_points = curve_dim // 2 obj = dual_log_objective(anchor_curve, cand_curve, {"parts": [ - {"name": "log_pressure", "start": 0, "end": curve_dim // 3}, - {"name": "log_derivative", "start": curve_dim // 3, "end": 2 * curve_dim // 3}, - {"name": "slope", "start": 2 * curve_dim // 3, "end": curve_dim}, + {"name": "log_pressure", "start": 0, "end": n_time_points}, + {"name": "log_derivative", "start": n_time_points, "end": curve_dim}, ]}) - valid_neighbor_rows.append((params_to_array(cand), cand_curve.astype(np.float32), obj, span_frac)) + valid_neighbor_rows.append( + ( + params_to_array(cand), + cand_curve.astype(np.float32), + obj, + span_frac, + cand_curve_time.astype(np.float32), + ) + ) summary["neighbor_span_counter_valid"][f"{span_frac:g}"] += 1 valid_span_counter[f"{span_frac:g}"] += 1 except Exception as exc: @@ -891,6 +934,13 @@ def main() -> None: f"valid={len(valid_neighbor_rows)} per_span={dict(sorted(valid_span_counter.items()))}" ) break + if len(valid_neighbor_rows) >= total_stop_target: + print( + f"[anchor {anchor_id:03d}] early-stop-total attempts={attempt + 1}/{max_attempts} " + f"valid={len(valid_neighbor_rows)} target={total_stop_target} " + f"per_span={dict(sorted(valid_span_counter.items()))}" + ) + break if (attempt + 1) % max(12, len(span_fracs) * 4) == 0: print( @@ -909,6 +959,7 @@ def main() -> None: neighbor_curve_rows = [x[1] for x in selected_rows] neighbor_obj_rows = [x[2] for x in selected_rows] neighbor_span_rows = [float(x[3]) for x in selected_rows] + neighbor_curve_time_rows = [x[4] for x in selected_rows] for span_frac in neighbor_span_rows: summary["neighbor_span_counter_kept"][f"{span_frac:g}"] += 1 @@ -918,6 +969,7 @@ def main() -> None: anchor_id=anchor_id, params_list=neighbor_params_rows, curve_list=neighbor_curve_rows, + curve_time_list=neighbor_curve_time_rows, obj_list=neighbor_obj_rows, span_frac_list=neighbor_span_rows, ) diff --git a/ML/nmWTAI-ML/scripts/preprocess_dataset.py b/ML/nmWTAI-ML/scripts/preprocess_dataset.py index 871fa08..3b82b65 100644 --- a/ML/nmWTAI-ML/scripts/preprocess_dataset.py +++ b/ML/nmWTAI-ML/scripts/preprocess_dataset.py @@ -17,7 +17,7 @@ ROOT = Path(__file__).resolve().parents[1] sys.path.append(str(ROOT)) from src.common.experiment_paths import normalize_tag, processed_path_for_tag -from src.data.preprocess import preprocess_dataset +from src.data.preprocess import migrate_processed_dataset_without_slope, preprocess_dataset def main() -> None: @@ -29,7 +29,7 @@ def main() -> None: "--input", type=str, required=True, - help="Path to the generated .h5 dataset", + help="Path to generated .h5 dataset or legacy processed .pkl", ) parser.add_argument( "--output", @@ -61,14 +61,18 @@ def main() -> None: else processed_path_for_tag(tag) ) - preprocess_dataset( - input_path=Path(args.input), - output_path=output_path, - test_size=args.test_size, - val_size=args.val_size, - random_seed=args.seed, - use_param_feature_transform=not args.no_param_feature_transform, - ) + input_path = Path(args.input) + if input_path.suffix.lower() == ".pkl": + migrate_processed_dataset_without_slope(input_path, output_path) + else: + preprocess_dataset( + input_path=input_path, + output_path=output_path, + test_size=args.test_size, + val_size=args.val_size, + random_seed=args.seed, + use_param_feature_transform=not args.no_param_feature_transform, + ) if __name__ == "__main__": diff --git a/ML/nmWTAI-ML/scripts/q_sweep_local_ranking.py b/ML/nmWTAI-ML/scripts/q_sweep_local_ranking.py index d0bc700..c42aedd 100644 --- a/ML/nmWTAI-ML/scripts/q_sweep_local_ranking.py +++ b/ML/nmWTAI-ML/scripts/q_sweep_local_ranking.py @@ -411,6 +411,7 @@ def main() -> None: params=target_params, well_index=int(args.well_index), timeout=int(args.solver_timeout), + target_curve_dim=int(processed["meta"]["curve_dim"]), ) except Exception as exc: failed_case_rows.append( @@ -445,6 +446,7 @@ def main() -> None: params=cand, well_index=int(args.well_index), timeout=int(args.solver_timeout), + target_curve_dim=int(processed["meta"]["curve_dim"]), ) pred_curve = predict_surrogate_curve( processed=processed, diff --git a/ML/nmWTAI-ML/scripts/replay_pso_trace_screening.py b/ML/nmWTAI-ML/scripts/replay_pso_trace_screening.py index 44f2e92..cb63bad 100644 --- a/ML/nmWTAI-ML/scripts/replay_pso_trace_screening.py +++ b/ML/nmWTAI-ML/scripts/replay_pso_trace_screening.py @@ -34,7 +34,7 @@ from src.common.experiment_paths import ( normalize_tag, processed_path_for_tag, ) -from src.data.curve_processing import clean_curve_for_dataset, is_valid_curve, resample_curve_to_features +from src.data.curve_processing import clean_curve_for_dataset, is_valid_curve, resample_curve_to_model_features from src.data.params import Params, Schedule from src.evaluation.autofit_objective import dual_log_objective @@ -151,10 +151,8 @@ def build_target_curve_from_meta(cfg: Config, processed: dict, meta: dict) -> np if not valid: raise RuntimeError(f"Target loglog curve from meta is invalid: {reason}") - curve = resample_curve_to_features(cfg, t_clean, p_clean, d_clean) curve_dim = int(processed["meta"]["curve_dim"]) - if curve.size != curve_dim: - raise RuntimeError(f"Target curve dim mismatch after resample: {curve.size} != {curve_dim}") + curve = resample_curve_to_model_features(cfg, t_clean, p_clean, d_clean, curve_dim) return curve.astype(np.float32) diff --git a/ML/nmWTAI-ML/scripts/score_pso_candidates.py b/ML/nmWTAI-ML/scripts/score_pso_candidates.py index aef1106..bbe48a6 100644 --- a/ML/nmWTAI-ML/scripts/score_pso_candidates.py +++ b/ML/nmWTAI-ML/scripts/score_pso_candidates.py @@ -29,7 +29,7 @@ from src.common.experiment_paths import ( normalize_tag, processed_path_for_tag, ) -from src.data.curve_processing import clean_curve_for_dataset, is_valid_curve, resample_curve_to_features +from src.data.curve_processing import clean_curve_for_dataset, is_valid_curve, resample_curve_to_model_features from src.data.params import Params, Schedule from src.evaluation.autofit_objective import dual_log_objective @@ -114,10 +114,8 @@ def build_target_curve_from_meta(cfg: Config, processed: dict, meta: dict) -> np valid, reason = is_valid_curve(cfg, t_clean, p_clean, d_clean) if not valid: raise RuntimeError(f"Target loglog curve from meta is invalid: {reason}") - curve = resample_curve_to_features(cfg, t_clean, p_clean, d_clean) curve_dim = int(processed["meta"]["curve_dim"]) - if curve.size != curve_dim: - raise RuntimeError(f"Target curve dim mismatch after resample: {curve.size} != {curve_dim}") + curve = resample_curve_to_model_features(cfg, t_clean, p_clean, d_clean, curve_dim) return curve.astype(np.float32) diff --git a/ML/nmWTAI-ML/scripts/train_forward.py b/ML/nmWTAI-ML/scripts/train_forward.py index 8206f8d..8b5febe 100644 --- a/ML/nmWTAI-ML/scripts/train_forward.py +++ b/ML/nmWTAI-ML/scripts/train_forward.py @@ -55,7 +55,6 @@ def main() -> None: parser.add_argument("--w-pressure", type=float, default=1.0) parser.add_argument("--w-derivative", type=float, default=2.0) - parser.add_argument("--w-slope", type=float, default=0.0) parser.add_argument("--w-bias-pressure", type=float, default=0.15) parser.add_argument("--w-bias-derivative", type=float, default=0.05) parser.add_argument("--w-derivative-shape", type=float, default=0.10) @@ -102,7 +101,6 @@ def main() -> None: weights=LossWeights( pressure=args.w_pressure, derivative=args.w_derivative, - slope=args.w_slope, bias_pressure=args.w_bias_pressure, bias_derivative=args.w_bias_derivative, derivative_shape=args.w_derivative_shape, diff --git a/ML/nmWTAI-ML/scripts/train_forward_ensemble.py b/ML/nmWTAI-ML/scripts/train_forward_ensemble.py deleted file mode 100644 index a2b580a..0000000 --- a/ML/nmWTAI-ML/scripts/train_forward_ensemble.py +++ /dev/null @@ -1,156 +0,0 @@ -"""训练多个随机种子的正演代理模型集成。 - -脚本基于同一份 processed 数据和训练超参数循环启动多次 `train_forward`,每个成员使用 -独立 seed 和输出目录。所得模型集合用于后续不确定性估计、误差风险分析和 fallback 筛选。 -""" - -# pylint: disable=import-error,wrong-import-position,duplicate-code - -from __future__ import annotations - -import argparse -import json -import sys -from pathlib import Path - -ROOT = Path(__file__).resolve().parents[1] -sys.path.append(str(ROOT)) - -from src.common.experiment_paths import normalize_tag, processed_path_for_tag -from src.training.train_forward import ( - LossConfig, - LossWeights, - ModelConfig, - OptimConfig, - SampleReweightConfig, - TrainConfig, - TrainRuntime, - train_forward, -) - - -def parse_seed_list(seed_text: str) -> list[int]: - """解析逗号分隔的随机种子列表,用于训练或评估模型集成。""" - seeds = [] - for item in str(seed_text).split(","): - item = item.strip() - if not item: - continue - seeds.append(int(item)) - if not seeds: - raise ValueError("至少需要一个 seed") - return seeds - - -def default_output_root(tag: str | None, use_schedule: bool) -> Path: - """根据实验标签生成集成训练的默认模型输出根目录。""" - suffix = "" if use_schedule else "_no_schedule" - if tag: - return Path("models") / f"forward_surrogate_{tag}_ensemble{suffix}" - return Path("models") / f"forward_surrogate_ensemble{suffix}" - - -def main() -> None: - """按多个随机种子重复训练正演代理模型,并写出集成清单。""" - parser = argparse.ArgumentParser(description="Train a deep-ensemble forward surrogate for UQ") - parser.add_argument("--processed", type=str, default=None, help="Processed dataset path") - parser.add_argument("--tag", type=str, default=None, help="Experiment tag for auto naming") - parser.add_argument("--output-root", type=str, default=None, help="Optional ensemble root directory") - parser.add_argument("--seeds", type=str, default="41,42,43", help="Comma-separated seed list") - parser.add_argument("--batch-size", type=int, default=256) - parser.add_argument("--epochs", type=int, default=220) - parser.add_argument("--lr", type=float, default=0.001) - parser.add_argument("--weight-decay", type=float, default=0.0005) - parser.add_argument("--hidden-dim", type=int, default=256) - parser.add_argument("--dropout", type=float, default=0.1) - parser.add_argument("--w-pressure", type=float, default=1.0) - parser.add_argument("--w-derivative", type=float, default=2.0) - parser.add_argument("--w-slope", type=float, default=0.0) - parser.add_argument("--w-bias-pressure", type=float, default=0.15) - parser.add_argument("--w-bias-derivative", type=float, default=0.05) - parser.add_argument("--w-derivative-shape", type=float, default=0.10) - parser.add_argument("--w-autofit-pressure", type=float, default=0.0) - parser.add_argument("--w-autofit-derivative", type=float, default=0.0) - parser.add_argument("--huber-beta", type=float, default=0.05) - parser.add_argument("--use-sample-reweight", action="store_true", default=True) - parser.add_argument("--no-sample-reweight", action="store_false", dest="use_sample_reweight") - parser.add_argument("--sample-reweight-alpha", type=float, default=0.4) - parser.add_argument("--sample-weight-min", type=float, default=1.0) - parser.add_argument("--sample-weight-max", type=float, default=2.5) - parser.add_argument("--no-schedule", action="store_true") - args = parser.parse_args() - - tag = normalize_tag(args.tag) - use_schedule = not args.no_schedule - seeds = parse_seed_list(args.seeds) - processed_path = Path(args.processed) if args.processed is not None else processed_path_for_tag(tag) - output_root = Path(args.output_root) if args.output_root is not None else default_output_root(tag, use_schedule) - output_root.mkdir(parents=True, exist_ok=True) - - # manifest 记录每个成员模型的 seed、checkpoint 和 metrics,供不确定性评估脚本批量加载。 - manifest = { - "tag": tag, - "processed_path": str(processed_path), - "use_schedule": use_schedule, - "seeds": seeds, - "members": [], - } - - for seed in seeds: - member_dir = output_root / f"seed_{seed}" - print(f"\n=== Training ensemble member seed={seed} -> {member_dir} ===") - # 除随机种子和输出目录外,各成员使用相同数据和损失权重,形成深度集成。 - cfg = TrainConfig( - processed_path=processed_path, - output_dir=member_dir, - runtime=TrainRuntime(seed=seed), - optim=OptimConfig( - batch_size=args.batch_size, - epochs=args.epochs, - lr=args.lr, - weight_decay=args.weight_decay, - ), - model=ModelConfig( - hidden_dim=args.hidden_dim, - dropout=args.dropout, - use_schedule=use_schedule, - ), - loss=LossConfig( - weights=LossWeights( - pressure=args.w_pressure, - derivative=args.w_derivative, - slope=args.w_slope, - bias_pressure=args.w_bias_pressure, - bias_derivative=args.w_bias_derivative, - derivative_shape=args.w_derivative_shape, - autofit_pressure=args.w_autofit_pressure, - autofit_derivative=args.w_autofit_derivative, - ), - huber_beta=args.huber_beta, - ), - sample_reweight=SampleReweightConfig( - enabled=args.use_sample_reweight, - alpha=args.sample_reweight_alpha, - weight_min=args.sample_weight_min, - weight_max=args.sample_weight_max, - ), - ) - train_forward(cfg) - manifest["members"].append( - { - "seed": seed, - "dir": str(member_dir), - "checkpoint": str(member_dir / "forward_surrogate_best.pt"), - "metrics": str(member_dir / "metrics.json"), - } - ) - - with open(output_root / "ensemble_manifest.json", "w", encoding="utf-8") as f: - json.dump(manifest, f, ensure_ascii=False, indent=2) - - print("\nEnsemble training complete.") - print(f"Manifest saved: {output_root / 'ensemble_manifest.json'}") - - -if __name__ == "__main__": - main() diff --git a/ML/nmWTAI-ML/scripts/train_time_conditioned.py b/ML/nmWTAI-ML/scripts/train_time_conditioned.py deleted file mode 100644 index 162e6a3..0000000 --- a/ML/nmWTAI-ML/scripts/train_time_conditioned.py +++ /dev/null @@ -1,104 +0,0 @@ -"""训练时间条件正演代理模型。 - -脚本把预处理数据展开为按时间点监督的训练任务,并将命令行参数封装为 -`TimeConditionedTrainConfig`。模型学习在给定参数、制度和时间特征时预测压力/导数, -适合处理可变时间采样或逐点推理场景。 -""" - -# pylint: disable=import-error,wrong-import-position - -from __future__ import annotations - -import argparse -import sys -from pathlib import Path - -ROOT = Path(__file__).resolve().parents[1] -sys.path.append(str(ROOT)) - -from src.common.experiment_paths import normalize_tag, processed_path_for_tag -from src.training.train_time_conditioned import ( - RiskWeightConfig, - TimeConditionedTrainConfig, - TimeLossConfig, - TimeModelConfig, - TimeOptimConfig, - TimeRuntimeConfig, - train_time_conditioned, -) - - -def main() -> None: - """读取训练参数并训练按时间点展开的时间条件代理模型。""" - parser = argparse.ArgumentParser(description="Train a time-conditioned point-wise forward surrogate") - parser.add_argument("--processed", type=str, default=None) - parser.add_argument("--tag", type=str, default=None) - parser.add_argument("--output-dir", type=str, default=None) - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--batch-size", type=int, default=4096) - parser.add_argument("--epochs", type=int, default=120) - parser.add_argument("--lr", type=float, default=1.0e-3) - parser.add_argument("--weight-decay", type=float, default=1.0e-4) - parser.add_argument("--hidden-dim", type=int, default=256) - parser.add_argument("--n-blocks", type=int, default=4) - parser.add_argument("--dropout", type=float, default=0.05) - parser.add_argument("--w-pressure", type=float, default=1.0) - parser.add_argument("--w-derivative", type=float, default=2.0) - parser.add_argument("--huber-beta", type=float, default=0.05) - parser.add_argument("--no-schedule", action="store_true") - parser.add_argument( - "--sample-weight-mode", - choices=["none", "risk_region"], - default="none", - help="Optional risk-region sample weighting; default keeps the original unweighted training behavior", - ) - parser.add_argument("--risk-weight", type=float, default=2.5) - parser.add_argument("--skin-lt-minus8-weight", type=float, default=3.5) - parser.add_argument("--sample-weight-min", type=float, default=1.0) - parser.add_argument("--sample-weight-max", type=float, default=4.0) - args = parser.parse_args() - - tag = normalize_tag(args.tag) - processed_path = Path(args.processed) if args.processed is not None else processed_path_for_tag(tag) - # 时间条件模型使用独立目录,避免覆盖固定长度曲线代理模型的 checkpoint。 - if args.output_dir is not None: - output_dir = Path(args.output_dir) - elif tag: - output_dir = Path("models") / f"time_conditioned_surrogate_{tag}" - else: - output_dir = Path("models") / "time_conditioned_surrogate" - - cfg = TimeConditionedTrainConfig( - processed_path=processed_path, - output_dir=output_dir, - runtime=TimeRuntimeConfig(seed=int(args.seed)), - optim=TimeOptimConfig( - batch_size=int(args.batch_size), - epochs=int(args.epochs), - lr=float(args.lr), - weight_decay=float(args.weight_decay), - ), - model=TimeModelConfig( - hidden_dim=int(args.hidden_dim), - n_blocks=int(args.n_blocks), - dropout=float(args.dropout), - use_schedule=not bool(args.no_schedule), - ), - loss=TimeLossConfig( - w_pressure=float(args.w_pressure), - w_derivative=float(args.w_derivative), - huber_beta=float(args.huber_beta), - ), - risk_weight=RiskWeightConfig( - mode=str(args.sample_weight_mode), - risk_weight=float(args.risk_weight), - skin_lt_minus8_weight=float(args.skin_lt_minus8_weight), - weight_min=float(args.sample_weight_min), - weight_max=float(args.sample_weight_max), - ), - ) - train_time_conditioned(cfg) - - -if __name__ == "__main__": - main() diff --git a/ML/nmWTAI-ML/scripts/validate_autofit_local_ranking.py b/ML/nmWTAI-ML/scripts/validate_autofit_local_ranking.py index b1cd3f4..7bc0d9b 100644 --- a/ML/nmWTAI-ML/scripts/validate_autofit_local_ranking.py +++ b/ML/nmWTAI-ML/scripts/validate_autofit_local_ranking.py @@ -26,7 +26,7 @@ import torch from src.common.config import Config from src.common.experiment_paths import config_for_stage, model_checkpoint_for_tag, normalize_tag, processed_path_for_tag -from src.data.curve_processing import clean_curve_for_dataset, is_valid_curve, resample_curve_to_features +from src.data.curve_processing import clean_curve_for_dataset, is_valid_curve, resample_curve_to_model_features from src.data.param_features import param_feature_transform_from_meta, transform_param_features from src.data.params import Params, Schedule from src.data.runner_client import CppRunner, read_result_bin @@ -87,17 +87,18 @@ def parse_args() -> argparse.Namespace: def infer_curve_layout(meta: dict, curve_dim: int) -> dict: - """从元数据读取曲线分段布局;旧数据没有布局时按压力/导数/斜率三等分回退。""" + """从元数据读取布局;缺失时兼容推断双通道或旧三通道布局。""" curve_layout = meta.get("curve_layout") if curve_layout is not None: return curve_layout - n_time_points = curve_dim // 3 + n_parts = 3 if curve_dim % 3 == 0 else 2 + n_time_points = curve_dim // n_parts + names = ["log_pressure", "log_derivative"] + (["slope"] if n_parts == 3 else []) return { "n_time_points": int(n_time_points), "parts": [ - {"name": "log_pressure", "start": 0, "end": n_time_points}, - {"name": "log_derivative", "start": n_time_points, "end": 2 * n_time_points}, - {"name": "slope", "start": 2 * n_time_points, "end": 3 * n_time_points}, + {"name": name, "start": i * n_time_points, "end": (i + 1) * n_time_points} + for i, name in enumerate(names) ], } @@ -218,6 +219,7 @@ def run_solver_and_extract_curve( params: Params, well_index: int, timeout: int, + target_curve_dim: int, ) -> tuple[np.ndarray, dict]: """调用 C++ 求解器运行一次正演,并把双对数输出重采样为模型曲线向量。""" ok = runner.run_simulation(params, timeout=timeout, override_schedule=params.schedule, include_schedule=True) @@ -243,7 +245,9 @@ def run_solver_and_extract_curve( if not valid: raise RuntimeError(f"Curve failed validation: {reason}") - curve_feat = resample_curve_to_features(cfg, t_clean, p_clean, d_clean) + curve_feat = resample_curve_to_model_features( + cfg, t_clean, p_clean, d_clean, target_curve_dim + ) raw = { "t": t_clean.tolist(), "p": p_clean.tolist(), @@ -429,6 +433,7 @@ def main() -> None: params=target_params, well_index=args.well_index, timeout=args.solver_timeout, + target_curve_dim=int(processed["meta"]["curve_dim"]), ) if not args.reuse_runner: @@ -458,6 +463,7 @@ def main() -> None: params=cand, well_index=args.well_index, timeout=args.solver_timeout, + target_curve_dim=int(processed["meta"]["curve_dim"]), ) pred_curve = predict_surrogate_curve( processed=processed, diff --git a/ML/nmWTAI-ML/scripts/validate_autofit_local_ranking_batch.py b/ML/nmWTAI-ML/scripts/validate_autofit_local_ranking_batch.py index d2560c0..700187d 100644 --- a/ML/nmWTAI-ML/scripts/validate_autofit_local_ranking_batch.py +++ b/ML/nmWTAI-ML/scripts/validate_autofit_local_ranking_batch.py @@ -240,6 +240,7 @@ def main() -> None: params=target_params, well_index=int(args.well_index), timeout=int(args.solver_timeout), + target_curve_dim=int(processed["meta"]["curve_dim"]), ) except Exception as exc: failed_targets.append({"attempt_id": attempt, "reason": str(exc)}) @@ -279,6 +280,7 @@ def main() -> None: params=cand, well_index=int(args.well_index), timeout=int(args.solver_timeout), + target_curve_dim=int(processed["meta"]["curve_dim"]), ) pred_curve = predict_surrogate_curve( processed=processed, diff --git a/ML/nmWTAI-ML/src/common/config.py b/ML/nmWTAI-ML/src/common/config.py index 292aba6..c653e0f 100644 --- a/ML/nmWTAI-ML/src/common/config.py +++ b/ML/nmWTAI-ML/src/common/config.py @@ -87,8 +87,7 @@ class Config: def curve_dim(self) -> int: """返回配置中的曲线输出维度。""" n_time_points = int(self.get("curve_processing", "n_time_points", default=160)) - use_slope = bool(self.get("curve_processing", "use_slope_feature", default=True)) - return (2 + (1 if use_slope else 0)) * n_time_points + return 2 * n_time_points @property def sec_feat_dim(self) -> int: diff --git a/ML/nmWTAI-ML/src/data/curve_processing.py b/ML/nmWTAI-ML/src/data/curve_processing.py index 0459dc2..d91f215 100644 --- a/ML/nmWTAI-ML/src/data/curve_processing.py +++ b/ML/nmWTAI-ML/src/data/curve_processing.py @@ -4,8 +4,8 @@ C++ 数值试井求解器输出的双对数曲线通常是不等长时间序列,本模块负责在进入 机器学习数据集之前完成质量检查、无效点过滤、时间排序、去重、对数变换和重采样。 -输出曲线特征采用固定布局:log_pressure、log_derivative,以及可选的 slope -辅助通道按时间顺序拼接。该布局会被预处理、训练、评估脚本共同使用,因此这里 +输出曲线特征采用固定布局:log_pressure、log_derivative 按时间顺序拼接。 +该布局会被预处理、训练、评估脚本共同使用,因此这里 的维度和顺序必须与 Config.curve_dim、meta.curve_layout 保持一致。 """ @@ -167,15 +167,12 @@ def resample_curve_to_features_with_time( """把变长曲线转换为模型训练使用的固定长度特征。 输入曲线先转换为 log_pressure 和 log_abs_derivative,然后插值到几何时间网格。 - 如果配置启用 slope 通道,会额外计算 log_pressure 对 log_time 的中心差分斜率。 - Returns: tuple[np.ndarray, np.ndarray]: 第一个数组是按布局拼接的曲线特征,形状为 [cfg.curve_dim];第二个数组是该样本使用的时间网格,形状为 [n_time_points]。 """ eps = float(cfg.get("curve_processing", "feature_epsilon", default=1e-12)) n_time_points = int(cfg.get("curve_processing", "n_time_points", default=160)) - use_slope = bool(cfg.get("curve_processing", "use_slope_feature", default=True)) t = np.asarray(t, dtype=np.float64).reshape(-1) p = np.asarray(p, dtype=np.float64).reshape(-1) @@ -193,27 +190,43 @@ def resample_curve_to_features_with_time( lp = np.interp(grid, t, logp).astype(np.float32) ld = np.interp(grid, t, logd).astype(np.float32) - feats = [lp, ld] - - if use_slope: - # slope 作为辅助特征保留;即使训练权重设为 0,也能保持数据布局兼容。 - logt = np.log(np.maximum(grid, eps)).astype(np.float64) - s = np.zeros((n_time_points,), dtype=np.float32) - if n_time_points >= 3: - denom = logt[2:] - logt[:-2] - denom = np.maximum(denom, 1e-12) - s[1:-1] = ((lp[2:] - lp[:-2]) / denom).astype(np.float32) - s[0] = s[1] - s[-1] = s[-2] - feats.append(s) - - x = np.concatenate(feats, axis=0).astype(np.float32) + x = np.concatenate([lp, ld], axis=0).astype(np.float32) if x.size != cfg.curve_dim: raise RuntimeError(f"curve feature dim mismatch: got {x.size}, expect {cfg.curve_dim}") return x, grid.astype(np.float32) def resample_curve_to_features(cfg: Config, t: np.ndarray, p: np.ndarray, d: np.ndarray) -> np.ndarray: - """将不等长双对数曲线插值为固定长度的压力、导数和斜率拼接向量。""" + """将不等长双对数曲线插值为固定长度的压力和导数拼接向量。""" features, _ = resample_curve_to_features_with_time(cfg, t, p, d) return features + + +def resample_curve_to_model_features( + cfg: Config, + t: np.ndarray, + p: np.ndarray, + d: np.ndarray, + target_curve_dim: int, +) -> np.ndarray: + """按模型维度重采样;仅为旧 480 维模型临时补出 slope 通道。""" + features, grid = resample_curve_to_features_with_time(cfg, t, p, d) + target_curve_dim = int(target_curve_dim) + if features.size == target_curve_dim: + return features + + n_time_points = grid.size + if target_curve_dim != features.size + n_time_points: + raise RuntimeError( + f"无法把 {features.size} 维双通道曲线适配为模型要求的 {target_curve_dim} 维" + ) + + lp = features[:n_time_points] + logt = np.log(np.maximum(grid.astype(np.float64), 1e-12)) + slope = np.zeros((n_time_points,), dtype=np.float32) + if n_time_points >= 3: + denom = np.maximum(logt[2:] - logt[:-2], 1e-12) + slope[1:-1] = ((lp[2:] - lp[:-2]) / denom).astype(np.float32) + slope[0] = slope[1] + slope[-1] = slope[-2] + return np.concatenate([features, slope], axis=0).astype(np.float32) diff --git a/ML/nmWTAI-ML/src/data/dataset_generation.py b/ML/nmWTAI-ML/src/data/dataset_generation.py index 658b3e4..b5d922e 100644 --- a/ML/nmWTAI-ML/src/data/dataset_generation.py +++ b/ML/nmWTAI-ML/src/data/dataset_generation.py @@ -473,7 +473,7 @@ def _worker_simulate_parallel(args): if not ok: return task_idx, None, reason, None - # 模型监督目标固定为 pressure/derivative/slope 三段拼接向量,同时保留时间坐标。 + # 模型监督目标固定为 pressure/derivative 两段拼接向量,同时保留时间坐标。 curve_feat, curve_time = resample_curve_to_features_with_time(cfg, t, p_curve, d_curve) sec = int(np.clip(int(sch.sectionIndex), 1, max(len(sch.timeQ), 1))) @@ -549,7 +549,7 @@ class HDF5Appender: self.param_dim = int(param_dim) self.schedule_dim = int(schedule_dim) self.curve_dim = int(curve_dim) - self.time_dim = int(curve_dim) // 3 + self.time_dim = int(curve_dim) // 2 self.f = h5py.File(self.filepath, "w") self._n = 0 diff --git a/ML/nmWTAI-ML/src/data/preprocess.py b/ML/nmWTAI-ML/src/data/preprocess.py index 3e37368..f36ebb6 100644 --- a/ML/nmWTAI-ML/src/data/preprocess.py +++ b/ML/nmWTAI-ML/src/data/preprocess.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- """原始 HDF5 样本预处理。 数据生成阶段写出的 HDF5 文件仍处在“原始样本”状态:参数尚未做特征变换,输入和 @@ -12,6 +12,7 @@ payload 中保留 curve_layout、参数变换配置、流量制度元数据和 from __future__ import annotations +import copy from pathlib import Path import h5py @@ -21,7 +22,6 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from src.data.param_features import build_param_feature_transform, transform_param_features -from src.data.time_features import build_default_curve_time, build_time_point_features @@ -64,6 +64,97 @@ def _validate_optional_length(name: str, arr: np.ndarray | None, expected_n: int raise ValueError(f"{name} row count does not match main arrays: {len(arr)} != {expected_n}") +def _curve_keep_indices(meta: dict, curve_dim: int) -> np.ndarray: + """返回旧 processed 曲线中 pressure/derivative 两段的列索引。""" + layout = meta.get("curve_layout") or {} + parts = {str(part["name"]): part for part in layout.get("parts", [])} + if "log_pressure" in parts and "log_derivative" in parts: + indices = [] + for name in ("log_pressure", "log_derivative"): + part = parts[name] + indices.extend(range(int(part["start"]), int(part["end"]))) + return np.asarray(indices, dtype=np.int64) + + if curve_dim % 3 != 0: + raise ValueError( + "processed 数据没有 curve_layout,且曲线维度不能按旧版三通道布局识别" + ) + n_time_points = curve_dim // 3 + return np.arange(2 * n_time_points, dtype=np.int64) + + +def _slice_standard_scaler(scaler: StandardScaler, keep_indices: np.ndarray) -> StandardScaler: + """裁剪按列独立拟合的 StandardScaler,同时保留其拟合状态。""" + result = copy.deepcopy(scaler) + for name in ("mean_", "scale_", "var_"): + value = getattr(result, name, None) + if value is not None: + setattr(result, name, np.asarray(value)[keep_indices].copy()) + result.n_features_in_ = int(keep_indices.size) + return result + + +def migrate_processed_dataset_without_slope(input_path: Path, output_path: Path) -> None: + """把旧三通道 processed pkl 精确迁移为 pressure/derivative 双通道数据。""" + if not input_path.exists(): + raise FileNotFoundError(f"Input file not found: {input_path}") + + source = joblib.load(input_path) + required = ["Y_curve_train", "Y_curve_val", "Y_curve_test", "scaler_curve", "meta"] + missing = [name for name in required if name not in source] + if missing: + raise KeyError(f"processed 数据缺少字段: {missing}") + + curve_dim = int(source["Y_curve_train"].shape[1]) + keep_indices = _curve_keep_indices(source.get("meta", {}), curve_dim) + if keep_indices.size == curve_dim: + raise ValueError("输入 processed 数据已经不包含 slope,无需迁移") + if keep_indices.size % 2 != 0: + raise ValueError(f"保留后的曲线维度必须能被 2 整除,当前为 {keep_indices.size}") + + payload = dict(source) + for split in ("train", "val", "test"): + key = f"Y_curve_{split}" + payload[key] = np.asarray(source[key][:, keep_indices], dtype=np.float32) + payload["scaler_curve"] = _slice_standard_scaler(source["scaler_curve"], keep_indices) + + # 这些字段属于已停用的 time-conditioned 路径,不进入旧正演模型训练包。 + for key in ( + "T_curve_train", "T_curve_val", "T_curve_test", + "X_time_train", "X_time_val", "X_time_test", "scaler_time", + ): + payload.pop(key, None) + + n_time_points = int(keep_indices.size // 2) + meta = dict(source.get("meta", {})) + meta.update( + { + "curve_dim": int(keep_indices.size), + "raw_curve_dim": curve_dim, + "dropped_legacy_slope": True, + "migrated_from_processed": str(input_path), + "curve_layout": { + "n_time_points": n_time_points, + "parts": [ + {"name": "log_pressure", "start": 0, "end": n_time_points}, + {"name": "log_derivative", "start": n_time_points, "end": 2 * n_time_points}, + ], + }, + } + ) + meta.pop("time_feature_dim", None) + meta.pop("curve_time_source", None) + payload["meta"] = meta + + output_path.parent.mkdir(parents=True, exist_ok=True) + joblib.dump(payload, output_path) + print(f"Processed migration complete, saved to: {output_path}") + print( + f"train={len(payload['Y_curve_train'])}, val={len(payload['Y_curve_val'])}, " + f"test={len(payload['Y_curve_test'])}, curve_dim={curve_dim}->{keep_indices.size}" + ) + + def preprocess_dataset( input_path: Path, output_path: Path, @@ -75,9 +166,8 @@ def preprocess_dataset( """将原始 HDF5 样本整理为训练脚本可直接加载的 joblib 数据包。 处理步骤包括:读取 params/schedule/curve 等核心数组,保留可选元数据,构造参数 - 特征变换,划分 train/val/test,只在训练集上拟合 StandardScaler,并生成时间条件 - 模型需要的逐点时间特征。输出 payload 同时保存 scaler 和 meta,确保后续评估可以 - 把标准化预测恢复到原始曲线尺度,并按 curve_layout 正确拆分压力、导数和 slope。 + 特征变换,划分 train/val/test,并只在训练集上拟合 StandardScaler。输出 payload + 同时保存 scaler 和 meta,确保后续评估可以恢复原始尺度并正确拆分压力和导数。 """ if not input_path.exists(): raise FileNotFoundError(f"Input file not found: {input_path}") @@ -91,13 +181,6 @@ def preprocess_dataset( x_params = _ensure_2d("params", f["params"][:]) x_schedule = _ensure_2d("schedule", f["schedule"][:]) y_curve = _ensure_2d("curve", f["curve"][:]) - if "curve_time" in f: - curve_time = _ensure_2d("curve_time", f["curve_time"][:]) - curve_time_source = "saved_curve_time" - else: - curve_time = None - curve_time_source = "synthetic_unit_time" - # 可选元数据会保留下来,供后续分析和排序验证使用; # 基础正演训练循环不依赖这些字段。 param_names = _decode_attr_list(f.attrs.get("param_names")) @@ -126,11 +209,19 @@ def preprocess_dataset( if len(x_schedule) != n or len(y_curve) != n: raise ValueError("params / schedule / curve row counts do not match") + raw_curve_dim = int(y_curve.shape[1]) + if raw_curve_dim % 3 == 0: + # 兼容已有的 pressure/derivative/slope HDF5;预处理时丢弃未参与训练的 slope。 + n_time_points = raw_curve_dim // 3 + y_curve = y_curve[:, : 2 * n_time_points] + elif raw_curve_dim % 2 == 0: + n_time_points = raw_curve_dim // 2 + else: + raise ValueError( + f"curve 维度 {raw_curve_dim} 无法识别;期望 2*N(压力/导数)或 " + "3*N(旧版压力/导数/slope)" + ) curve_dim = int(y_curve.shape[1]) - n_time_points = curve_dim // 3 - if curve_time is None: - curve_time = build_default_curve_time(n, n_time_points) - _validate_optional_length("curve_time", curve_time, n) param_feature_transform = build_param_feature_transform( param_names=param_names, @@ -170,17 +261,14 @@ def preprocess_dataset( x_params_train = x_params_features[idx_train] x_schedule_train = x_schedule[idx_train] y_curve_train = y_curve[idx_train] - curve_time_train = curve_time[idx_train] x_params_val = x_params_features[idx_val] x_schedule_val = x_schedule[idx_val] y_curve_val = y_curve[idx_val] - curve_time_val = curve_time[idx_val] x_params_test = x_params_features[idx_test] x_schedule_test = x_schedule[idx_test] y_curve_test = y_curve[idx_test] - curve_time_test = curve_time[idx_test] schedule_meta_train = schedule_meta[idx_train] if schedule_meta is not None else None schedule_meta_val = schedule_meta[idx_val] if schedule_meta is not None else None @@ -222,26 +310,19 @@ def preprocess_dataset( scaler_params = StandardScaler() scaler_schedule = StandardScaler() scaler_curve = StandardScaler() - scaler_time = StandardScaler() # 只在训练集上拟合 scaler,避免验证/测试信息泄漏。 x_params_train_s = scaler_params.fit_transform(x_params_train) x_schedule_train_s = scaler_schedule.fit_transform(x_schedule_train) y_curve_train_s = scaler_curve.fit_transform(y_curve_train) - x_time_train = build_time_point_features(curve_time_train) - x_time_train_s = scaler_time.fit_transform(x_time_train.reshape(-1, x_time_train.shape[-1])).reshape(x_time_train.shape) x_params_val_s = scaler_params.transform(x_params_val) x_schedule_val_s = scaler_schedule.transform(x_schedule_val) y_curve_val_s = scaler_curve.transform(y_curve_val) - x_time_val = build_time_point_features(curve_time_val) - x_time_val_s = scaler_time.transform(x_time_val.reshape(-1, x_time_val.shape[-1])).reshape(x_time_val.shape) x_params_test_s = scaler_params.transform(x_params_test) x_schedule_test_s = scaler_schedule.transform(x_schedule_test) y_curve_test_s = scaler_curve.transform(y_curve_test) - x_time_test = build_time_point_features(curve_time_test) - x_time_test_s = scaler_time.transform(x_time_test.reshape(-1, x_time_test.shape[-1])).reshape(x_time_test.shape) output_path.parent.mkdir(parents=True, exist_ok=True) @@ -256,8 +337,8 @@ def preprocess_dataset( "raw_param_dim": int(x_params.shape[1]), "schedule_dim": int(x_schedule.shape[1]), "curve_dim": curve_dim, - "time_feature_dim": int(x_time_train.shape[-1]), - "curve_time_source": curve_time_source, + "raw_curve_dim": raw_curve_dim, + "dropped_legacy_slope": bool(raw_curve_dim != curve_dim), "input_h5": str(input_path), "param_names": param_names, "param_feature_transform": param_feature_transform, @@ -270,7 +351,6 @@ def preprocess_dataset( "parts": [ {"name": "log_pressure", "start": 0, "end": int(n_time_points)}, {"name": "log_derivative", "start": int(n_time_points), "end": int(2 * n_time_points)}, - {"name": "slope", "start": int(2 * n_time_points), "end": int(3 * n_time_points)}, ], }, } @@ -279,22 +359,15 @@ def preprocess_dataset( "X_params_train": x_params_train_s.astype(np.float32), "X_schedule_train": x_schedule_train_s.astype(np.float32), "Y_curve_train": y_curve_train_s.astype(np.float32), - "T_curve_train": curve_time_train.astype(np.float32), - "X_time_train": x_time_train_s.astype(np.float32), "X_params_val": x_params_val_s.astype(np.float32), "X_schedule_val": x_schedule_val_s.astype(np.float32), "Y_curve_val": y_curve_val_s.astype(np.float32), - "T_curve_val": curve_time_val.astype(np.float32), - "X_time_val": x_time_val_s.astype(np.float32), "X_params_test": x_params_test_s.astype(np.float32), "X_schedule_test": x_schedule_test_s.astype(np.float32), "Y_curve_test": y_curve_test_s.astype(np.float32), - "T_curve_test": curve_time_test.astype(np.float32), - "X_time_test": x_time_test_s.astype(np.float32), "scaler_params": scaler_params, "scaler_schedule": scaler_schedule, "scaler_curve": scaler_curve, - "scaler_time": scaler_time, "meta": meta, } @@ -337,8 +410,7 @@ def preprocess_dataset( print( f"train={len(idx_train)}, val={len(idx_val)}, test={len(idx_test)}, " f"param_dim={x_params_features.shape[1]}, schedule_dim={x_schedule.shape[1]}, " - f"curve_dim={y_curve.shape[1]}, time_feature_dim={x_time_train.shape[-1]}, " - f"curve_time_source={curve_time_source}" + f"curve_dim={y_curve.shape[1]}" ) if schedule_meta is not None: print(f"schedule_meta_dim={schedule_meta.shape[1]}, family_name_saved={family_name is not None}") diff --git a/ML/nmWTAI-ML/src/data/runner_client.py b/ML/nmWTAI-ML/src/data/runner_client.py index 8fe5537..968ead7 100644 --- a/ML/nmWTAI-ML/src/data/runner_client.py +++ b/ML/nmWTAI-ML/src/data/runner_client.py @@ -14,10 +14,11 @@ from __future__ import annotations import os +import queue import shutil import struct import subprocess -import time +import threading from pathlib import Path from typing import Any, Dict, Optional @@ -167,6 +168,74 @@ class CppRunner: assert self.proc is not None and self.proc.stdin is not None and self.proc.stdout is not None line = f"{params_path} {result_path}\n" + write_timeout = min(max(float(timeout) * 0.10, 1.0), 5.0) + + def safe_write_once() -> bool: + results: queue.Queue[bool] = queue.Queue(maxsize=1) + proc_stdin = self.proc.stdin if self.proc is not None else None + + def write_line() -> None: + ok = False + try: + assert proc_stdin is not None + proc_stdin.write(line) + proc_stdin.flush() + ok = True + except Exception: + ok = False + try: + results.put(ok) + except Exception: + pass + + writer = threading.Thread(target=write_line, daemon=True) + writer.start() + writer.join(timeout=write_timeout) + if writer.is_alive(): + self.close() + return False + try: + return bool(results.get_nowait()) + except queue.Empty: + return False + + if not safe_write_once(): + self.close() + self._ensure_server_started() + assert self.proc is not None and self.proc.stdin is not None and self.proc.stdout is not None + if not safe_write_once(): + self.close() + return False + + responses_safe: queue.Queue[str] = queue.Queue(maxsize=1) + proc_stdout = self.proc.stdout if self.proc is not None else None + + def read_response_safe() -> None: + try: + assert proc_stdout is not None + responses_safe.put(proc_stdout.readline()) + except Exception: + try: + responses_safe.put("") + except Exception: + pass + + reader_safe = threading.Thread(target=read_response_safe, daemon=True) + reader_safe.start() + reader_safe.join(timeout=max(float(timeout), 1.0)) + if reader_safe.is_alive(): + self.close() + return False + if self.proc is not None and self.proc.poll() is not None: + self.close() + return False + + try: + resp_safe = responses_safe.get_nowait() + except queue.Empty: + resp_safe = "" + return bool(resp_safe.strip().startswith("OK")) + try: self.proc.stdin.write(line) self.proc.stdin.flush() @@ -178,25 +247,38 @@ class CppRunner: self.proc.stdin.write(line) self.proc.stdin.flush() - start = time.time() - while True: - if self.proc.poll() is not None: - self.close() - return False + responses: queue.Queue[str] = queue.Queue(maxsize=1) + def read_response() -> None: try: - resp = self.proc.stdout.readline() + assert self.proc is not None and self.proc.stdout is not None + responses.put(self.proc.stdout.readline()) except Exception: - resp = "" + try: + responses.put("") + except Exception: + pass - if resp: - resp = resp.strip() - if resp.startswith("OK"): - return True - return False + reader = threading.Thread(target=read_response, daemon=True) + reader.start() + reader.join(timeout=max(float(timeout), 1.0)) - if (time.time() - start) > timeout: - return False + if reader.is_alive(): + self.close() + return False + if self.proc is not None and self.proc.poll() is not None: + self.close() + return False + + try: + resp = responses.get_nowait() + except queue.Empty: + resp = "" + + resp = resp.strip() + if resp.startswith("OK"): + return True + return False def run_simulation( self, @@ -227,7 +309,14 @@ class CppRunner: return bool(ok and self.result_bin.exists()) # 非 server 模式保持兼容:每次模拟都启动 runner.exe,并等待 result.bin 生成。 - cmd = [str(self.runner_exe), str(self.dataset_bin), str(self.params_bin), str(self.result_bin)] + cmd = [ + str(self.runner_exe), + str(self.dataset_bin), + str(self.params_bin), + str(self.result_bin), + str(self.runner_dll), + str(self.license_file), + ] result = subprocess.run( cmd, cwd=str(self.runner_exe.parent), diff --git a/ML/nmWTAI-ML/src/data/time_features.py b/ML/nmWTAI-ML/src/data/time_features.py deleted file mode 100644 index 8f4de87..0000000 --- a/ML/nmWTAI-ML/src/data/time_features.py +++ /dev/null @@ -1,49 +0,0 @@ -# -*- coding: utf-8 -*- -"""时间条件模型的时间点特征。 - -正演代理模型一次性输出整条固定长度曲线,而时间条件模型会把曲线拆成逐时间点 -样本。本模块为每个曲线时间点构造 log 时间、相对时间和采样位置等特征,使模型 -能够在同一组物理参数和流量制度下预测任意时间点的压力/导数响应。 -""" - -from __future__ import annotations - -import numpy as np - - -def build_default_curve_time(n_samples: int, n_time_points: int) -> np.ndarray: - """为没有保存真实曲线时间坐标的旧数据集构造默认对数时间网格。""" - # 旧数据只保存曲线值,默认用 1e-6 到 1 的几何网格近似原始双对数时间轴。 - base = np.geomspace(1.0e-6, 1.0, int(n_time_points), dtype=np.float32) - return np.tile(base.reshape(1, -1), (int(n_samples), 1)).astype(np.float32) - - -def build_time_point_features(curve_time: np.ndarray) -> np.ndarray: - """为每个曲线时间点构造 log10(t)、相对时间和归一化索引等时间条件特征。""" - t = np.asarray(curve_time, dtype=np.float32) - if t.ndim != 2: - raise ValueError(f"curve_time must be 2D [N, T], got shape={t.shape}") - - eps = np.float32(1.0e-12) - # 避免 log10(0) 或除以 0;时间比值以每条曲线末端时间归一化。 - t_safe = np.maximum(t, eps) - t_end = np.maximum(t_safe[:, -1:], eps) - t_ratio = np.clip(t_safe / t_end, eps, None) - - n_time = t.shape[1] - if n_time <= 1: - index = np.zeros((n_time,), dtype=np.float32) - else: - index = np.linspace(0.0, 1.0, n_time, dtype=np.float32) - index = np.tile(index.reshape(1, -1), (t.shape[0], 1)) - - # 四个通道同时保留绝对对数时间、相对对数时间、线性比例和采样位置。 - return np.stack( - [ - np.log10(t_safe), - np.log10(t_ratio), - t_ratio, - index, - ], - axis=-1, - ).astype(np.float32) diff --git a/ML/nmWTAI-ML/src/evaluation/autofit_objective.py b/ML/nmWTAI-ML/src/evaluation/autofit_objective.py index 4970e02..15ef6d0 100644 --- a/ML/nmWTAI-ML/src/evaluation/autofit_objective.py +++ b/ML/nmWTAI-ML/src/evaluation/autofit_objective.py @@ -14,7 +14,7 @@ import numpy as np def split_curve_by_layout(curve: np.ndarray, layout: dict) -> dict[str, np.ndarray]: - """按照 curve_layout 将拼接曲线拆成 log_pressure、log_derivative 和 slope 三段。""" + """按照 curve_layout 拆分曲线。""" parts: dict[str, np.ndarray] = {} for part in layout["parts"]: start = int(part["start"]) diff --git a/ML/nmWTAI-ML/src/models/forward_surrogate.py b/ML/nmWTAI-ML/src/models/forward_surrogate.py index c007681..7bb82a2 100644 --- a/ML/nmWTAI-ML/src/models/forward_surrogate.py +++ b/ML/nmWTAI-ML/src/models/forward_surrogate.py @@ -2,7 +2,7 @@ """正演代理模型网络结构。 ForwardSurrogate 输入标准化后的物理参数特征和可选的流量制度编码,输出固定长度 -拼接曲线:log_pressure、log_derivative 和 slope。模型采用“参数分支 + 流量制度 +拼接曲线:log_pressure 和 log_derivative。模型采用“参数分支 + 流量制度 分支 + 融合主干 + 多输出头”的结构,便于分别学习静态地层信息、动态制度信息以及 二者共同决定的曲线形态。 @@ -103,8 +103,8 @@ class ForwardSurrogate(nn.Module): 当 use_schedule=False 时该输入可为空。 输出: - curve_pred: 形状 [B, curve_dim],按 log_pressure、log_derivative、slope - 三段顺序拼接。curve_dim 必须能被 3 整除,以便每段拥有相同时间点数。 + curve_pred: 形状 [B, curve_dim],按 log_pressure、log_derivative 顺序拼接。 + 新模型使用双通道布局;旧版三通道 checkpoint 仍可加载用于兼容部署。 """ def __init__( @@ -119,7 +119,7 @@ class ForwardSurrogate(nn.Module): dropout: float = 0.0, use_schedule: bool = True, ): - """构建参数分支、可选流量制度分支、融合主干和三组曲线输出头。""" + """构建参数分支、可选流量制度分支、融合主干和曲线输出头。""" super().__init__() config = self._coerce_config( config=config, @@ -175,8 +175,13 @@ class ForwardSurrogate(nn.Module): @property def part_dim(self) -> int: - """压力、导数和 slope 每一段的时间点数量。""" - return self.config.curve_dim // 3 + """每一段曲线的时间点数量。""" + return self.config.curve_dim // self.n_curve_parts + + @property + def n_curve_parts(self) -> int: + """返回输出通道数;480 等旧 checkpoint 保持三通道兼容。""" + return 3 if self.config.curve_dim % 3 == 0 else 2 @property def use_schedule(self) -> bool: @@ -186,10 +191,10 @@ class ForwardSurrogate(nn.Module): @staticmethod def _validate_config(config: ForwardSurrogateConfig) -> None: """检查模型配置是否满足网络结构约束。""" - if config.curve_dim % 3 != 0: + if config.curve_dim % 2 != 0 and config.curve_dim % 3 != 0: msg = ( - f"curve_dim={config.curve_dim} 不能被 3 整除;" - "期望为 pressure/derivative/slope 三段" + f"curve_dim={config.curve_dim} 无法识别;" + "期望为 pressure/derivative 双通道或旧版三通道布局" ) raise ValueError(msg) @@ -229,17 +234,17 @@ class ForwardSurrogate(nn.Module): ) def _build_heads(self) -> nn.ModuleDict: - """构建压力、导数和 slope 输出头。""" + """构建压力和导数输出头;仅旧版三通道模型增加 slope 头。""" trunk_out_dim = self.config.fusion_hidden_dims[-1] - return nn.ModuleDict( - { - "pressure_level": self._build_single_head(trunk_out_dim, 1), - "pressure_shape": self._build_single_head(trunk_out_dim, self.part_dim), - "derivative_level": self._build_single_head(trunk_out_dim, 1), - "derivative_shape": self._build_single_head(trunk_out_dim, self.part_dim), - "slope": self._build_single_head(trunk_out_dim, self.part_dim), - } - ) + heads = { + "pressure_level": self._build_single_head(trunk_out_dim, 1), + "pressure_shape": self._build_single_head(trunk_out_dim, self.part_dim), + "derivative_level": self._build_single_head(trunk_out_dim, 1), + "derivative_shape": self._build_single_head(trunk_out_dim, self.part_dim), + } + if self.n_curve_parts == 3: + heads["slope"] = self._build_single_head(trunk_out_dim, self.part_dim) + return nn.ModuleDict(heads) def _build_single_head(self, in_dim: int, out_dim: int) -> nn.Sequential: """构建一个曲线输出头。""" @@ -326,8 +331,8 @@ class ForwardSurrogate(nn.Module): """执行一次前向预测。 参数分支和流量制度分支先分别编码,再在隐空间拼接。融合主干提取共同特征后, - 压力和导数各自通过 level + centered shape 两个输出头生成;slope 作为辅助通道 - 直接由单独输出头预测。返回值仍保持预处理阶段约定的曲线拼接布局。 + 压力和导数各自通过 level + centered shape 两个输出头生成。旧版三通道模型 + 会继续生成 slope,以便已有 checkpoint 保持可加载。 """ fused_feat = self._encode_features(params_x, schedule_x) trunk_feat = self.trunk(fused_feat) @@ -342,6 +347,7 @@ class ForwardSurrogate(nn.Module): "derivative_level", "derivative_shape", ) - slope_pred = self.heads["slope"](trunk_feat) - - return torch.cat([pressure_pred, derivative_pred, slope_pred], dim=1) + outputs = [pressure_pred, derivative_pred] + if "slope" in self.heads: + outputs.append(self.heads["slope"](trunk_feat)) + return torch.cat(outputs, dim=1) diff --git a/ML/nmWTAI-ML/src/models/time_conditioned_surrogate.py b/ML/nmWTAI-ML/src/models/time_conditioned_surrogate.py deleted file mode 100644 index cde54e4..0000000 --- a/ML/nmWTAI-ML/src/models/time_conditioned_surrogate.py +++ /dev/null @@ -1,193 +0,0 @@ -# -*- coding: utf-8 -*- -"""时间条件代理模型网络结构。 - -TimeConditionedSurrogate 不一次性输出完整曲线,而是把“物理参数 + 流量制度 + 某个 -时间点特征”作为输入,预测该时间点的 log_pressure 和 log_derivative。它适合用于 -更灵活的时间采样、局部曲线重建以及需要按时间点加权的训练策略。 - -模型主体由参数编码器、制度编码器、时间编码器、融合投影和若干残差块组成。残差 -块让较深的全连接网络更容易优化,同时保留原始融合特征。 -""" - -from __future__ import annotations - -# pylint: disable=import-error,duplicate-code,too-many-arguments,too-many-positional-arguments - -from dataclasses import dataclass - -import torch -from torch import nn - - -@dataclass(frozen=True) -class TimeConditionedSurrogateConfig: - """时间条件代理模型配置。""" - - param_dim: int - schedule_dim: int - time_dim: int - hidden_dim: int = 256 - n_blocks: int = 4 - dropout: float = 0.05 - use_schedule: bool = True - - -class ResidualBlock(nn.Module): - """时间条件模型使用的全连接残差块,用于在较深网络中稳定传播特征。""" - - def __init__(self, dim: int, dropout: float = 0.0): - """构造两层全连接残差块,并根据 dropout 参数决定是否启用随机失活。""" - super().__init__() - self.net = nn.Sequential( - nn.LayerNorm(dim), - nn.Linear(dim, dim), - nn.GELU(), - nn.Dropout(dropout) if dropout > 0 else nn.Identity(), - nn.Linear(dim, dim), - ) - self.act = nn.GELU() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """在保留原始隐藏表示的基础上叠加两层非线性修正。""" - # 残差连接让块学习修正量,而不是每层都重新表示完整特征。 - return self.act(x + self.net(x)) - - -class TimeConditionedSurrogate(nn.Module): - """逐时间点预测的时间条件代理模型。 - - 与 ForwardSurrogate 一次输出整条曲线不同,该模型每次只预测一个时间点的 - log_pressure 和 log_derivative。训练数据通常由 PointCurveDataset 将 [N, T] 曲线 - 展开为 N*T 个样本,因此 batch 中每一行都带有自己的 time_x。 - """ - - def __init__( - self, - config: TimeConditionedSurrogateConfig | None = None, - *, - param_dim: int | None = None, - schedule_dim: int | None = None, - time_dim: int | None = None, - hidden_dim: int = 256, - n_blocks: int = 4, - dropout: float = 0.05, - use_schedule: bool = True, - ): - """按配置组装时间条件代理模型的编码、融合和输出层。""" - super().__init__() - config = self._coerce_config( - config=config, - param_dim=param_dim, - schedule_dim=schedule_dim, - time_dim=time_dim, - hidden_dim=hidden_dim, - n_blocks=n_blocks, - dropout=dropout, - use_schedule=use_schedule, - ) - - hidden_dim = int(config.hidden_dim) - time_hidden_dim = hidden_dim // 2 - self.use_schedule = bool(config.use_schedule) - - self.param_encoder = self._build_encoder(config.param_dim, hidden_dim) - self.time_encoder = self._build_encoder(config.time_dim, time_hidden_dim) - - if self.use_schedule: - self.schedule_encoder = self._build_encoder(config.schedule_dim, hidden_dim) - fusion_dim = hidden_dim * 2 + time_hidden_dim - else: - self.schedule_encoder = None - fusion_dim = hidden_dim + time_hidden_dim - - self.input_proj = nn.Sequential( - nn.Linear(fusion_dim, hidden_dim), - nn.GELU(), - ) - self.blocks = nn.Sequential( - *[ - ResidualBlock(hidden_dim, dropout=config.dropout) - for _ in range(int(config.n_blocks)) - ] - ) - self.head = nn.Sequential( - nn.LayerNorm(hidden_dim), - nn.Linear(hidden_dim, time_hidden_dim), - nn.GELU(), - nn.Linear(time_hidden_dim, 2), - ) - - @staticmethod - def _coerce_config( - config: TimeConditionedSurrogateConfig | None, - *, - param_dim: int | None, - schedule_dim: int | None, - time_dim: int | None, - hidden_dim: int, - n_blocks: int, - dropout: float, - use_schedule: bool, - ) -> TimeConditionedSurrogateConfig: - """兼容配置对象式构造和旧版关键字参数式构造。""" - if config is not None: - return config - if param_dim is None or schedule_dim is None or time_dim is None: - raise TypeError( - "TimeConditionedSurrogate requires either a TimeConditionedSurrogateConfig " - "or param_dim, schedule_dim and time_dim keyword arguments" - ) - return TimeConditionedSurrogateConfig( - param_dim=int(param_dim), - schedule_dim=int(schedule_dim), - time_dim=int(time_dim), - hidden_dim=int(hidden_dim), - n_blocks=int(n_blocks), - dropout=float(dropout), - use_schedule=bool(use_schedule), - ) - - @staticmethod - def _build_encoder(in_dim: int, out_dim: int) -> nn.Sequential: - """构建 Linear-LayerNorm-GELU 编码器。""" - return nn.Sequential( - nn.Linear(in_dim, out_dim), - nn.LayerNorm(out_dim), - nn.GELU(), - ) - - def forward( - self, - params_x: torch.Tensor, - time_x: torch.Tensor, - schedule_x: torch.Tensor | None = None, - ) -> torch.Tensor: - """融合样本级特征和点级时间特征,输出双通道响应。 - - params_x 与 schedule_x 在同一条曲线的所有时间点上相同,time_x 则随时间点变化。 - 三类特征编码后拼接进入残差主干,最后输出 [B, 2],分别对应标准化空间中的 - log_pressure 和 log_derivative。 - """ - # params_x 和 schedule_x 是样本级特征;time_x 是展开后的点级特征。 - param_feat = self.param_encoder(params_x) - time_feat = self.time_encoder(time_x) - - if self.use_schedule: - if schedule_x is None: - raise ValueError("use_schedule=True,但 forward 没有传入 schedule_x") - schedule_feat = self.schedule_encoder(schedule_x) - fused = torch.cat([param_feat, schedule_feat, time_feat], dim=-1) - else: - fused = torch.cat([param_feat, time_feat], dim=-1) - - # 融合后的特征经过残差主干,输出 log_pressure 和 log_derivative 两个通道。 - hidden = self.input_proj(fused) - hidden = self.blocks(hidden) - return self.head(hidden) - - -def build_time_conditioned_surrogate( - config: TimeConditionedSurrogateConfig, -) -> TimeConditionedSurrogate: - """根据配置创建时间条件代理模型。""" - return TimeConditionedSurrogate(config) diff --git a/ML/nmWTAI-ML/src/training/train_forward.py b/ML/nmWTAI-ML/src/training/train_forward.py index d30b3ac..7f49da8 100644 --- a/ML/nmWTAI-ML/src/training/train_forward.py +++ b/ML/nmWTAI-ML/src/training/train_forward.py @@ -3,7 +3,7 @@ 本模块读取 preprocess.py 生成的 joblib 数据,构造 PyTorch Dataset/DataLoader,训练 ForwardSurrogate,并按验证集损失保存最佳 checkpoint。损失函数不是单一 MSE,而是 -由压力、导数、slope、均值偏置、导数形状约束和可选自动拟合目标组成,目的是让 +由压力、导数、均值偏置、导数形状约束和可选自动拟合目标组成,目的是让 模型既能拟合点值,也能保持对自动试井拟合有意义的曲线形态。 训练过程会保存 history.json、metrics.json 和 forward_surrogate_best.pt,后续评估 @@ -34,7 +34,6 @@ METRIC_KEYS = ( "loss", "loss_pressure", "loss_derivative", - "loss_slope", "loss_bias_pressure", "loss_bias_derivative", "loss_derivative_shape", @@ -88,7 +87,6 @@ class LossWeights: pressure: float = 1.0 derivative: float = 2.0 - slope: float = 0.0 bias_pressure: float = 0.15 bias_derivative: float = 0.05 derivative_shape: float = 0.10 @@ -146,14 +144,12 @@ class CurveStats: @dataclass class LossBatchParts: - """曲线三段的预测值和真实值。""" + """压力和导数的预测值与真实值。""" pred_p: torch.Tensor pred_d: torch.Tensor - pred_s: torch.Tensor true_p: torch.Tensor true_d: torch.Tensor - true_s: torch.Tensor @dataclass @@ -202,24 +198,29 @@ def load_processed_dataset(path: Path) -> dict: def infer_curve_layout(data: dict) -> dict: - """从元数据读取曲线分段布局;旧数据没有布局时按压力/导数/斜率三等分回退。""" + """读取双通道布局,并拒绝未经重新预处理的旧三通道数据。""" meta = data.get("meta", {}) curve_dim = int(meta.get("curve_dim", data["Y_curve_train"].shape[1])) curve_layout = meta.get("curve_layout") if curve_layout is not None: + names = {str(part["name"]) for part in curve_layout.get("parts", [])} + if "slope" in names: + raise ValueError( + "processed 数据仍包含旧版 slope 通道;请先重新运行 " + "scripts/preprocess_dataset.py,预处理会自动裁掉 slope" + ) return curve_layout - if curve_dim % 3 != 0: - raise ValueError(f"curve_dim={curve_dim} 不能按 3 段均分,且 meta 中没有 curve_layout") + if curve_dim % 2 != 0: + raise ValueError(f"curve_dim={curve_dim} 不能按压力/导数两段均分") - n_time_points = curve_dim // 3 + n_time_points = curve_dim // 2 return { "n_time_points": int(n_time_points), "parts": [ {"name": "log_pressure", "start": 0, "end": n_time_points}, {"name": "log_derivative", "start": n_time_points, "end": 2 * n_time_points}, - {"name": "slope", "start": 2 * n_time_points, "end": 3 * n_time_points}, ], } @@ -309,14 +310,12 @@ def split_curve_parts( target: torch.Tensor, slices: dict[str, slice], ) -> LossBatchParts: - """把拼接曲线拆成压力、导数和 slope 三段。""" + """把拼接曲线拆成压力和导数两段。""" return LossBatchParts( pred_p=pred[:, slices["log_pressure"]], pred_d=pred[:, slices["log_derivative"]], - pred_s=pred[:, slices["slope"]], true_p=target[:, slices["log_pressure"]], true_d=target[:, slices["log_derivative"]], - true_s=target[:, slices["slope"]], ) @@ -328,7 +327,6 @@ def compute_basic_loss_vectors( return { "loss_pressure": regression_per_sample(parts.pred_p, parts.true_p, loss_cfg), "loss_derivative": regression_per_sample(parts.pred_d, parts.true_d, loss_cfg), - "loss_slope": mse_per_sample(parts.pred_s, parts.true_s), "loss_bias_pressure": l1_per_sample( parts.pred_p.mean(dim=1, keepdim=True), parts.true_p.mean(dim=1, keepdim=True), @@ -378,7 +376,6 @@ def weighted_total_vector( return ( weights.pressure * loss_vectors["loss_pressure"] + weights.derivative * loss_vectors["loss_derivative"] - + weights.slope * loss_vectors["loss_slope"] + weights.bias_pressure * loss_vectors["loss_bias_pressure"] + weights.bias_derivative * loss_vectors["loss_bias_derivative"] + weights.derivative_shape * loss_vectors["loss_derivative_shape"] @@ -582,7 +579,7 @@ def print_training_config(cfg: TrainConfig, curve_layout: dict) -> None: print(f" use_schedule={cfg.model.use_schedule}") print( f" weights: pressure={weights.pressure}, derivative={weights.derivative}, " - f"slope={weights.slope}, bias_p={weights.bias_pressure}, " + f"bias_p={weights.bias_pressure}, " f"bias_d={weights.bias_derivative}, d_shape={weights.derivative_shape}, " f"autofit_p={weights.autofit_pressure}, autofit_d={weights.autofit_derivative}" ) @@ -601,7 +598,6 @@ def format_metric_line(epoch: int, train_metrics: dict[str, float], val_metrics: f"train={train_metrics['loss']:.6f} " f"(p={train_metrics['loss_pressure']:.6f}, " f"d={train_metrics['loss_derivative']:.6f}, " - f"s={train_metrics['loss_slope']:.6f}, " f"bp={train_metrics['loss_bias_pressure']:.6f}, " f"bd={train_metrics['loss_bias_derivative']:.6f}, " f"ds={train_metrics['loss_derivative_shape']:.6f}, " @@ -612,7 +608,6 @@ def format_metric_line(epoch: int, train_metrics: dict[str, float], val_metrics: f"val={val_metrics['loss']:.6f} " f"(p={val_metrics['loss_pressure']:.6f}, " f"d={val_metrics['loss_derivative']:.6f}, " - f"s={val_metrics['loss_slope']:.6f}, " f"bp={val_metrics['loss_bias_pressure']:.6f}, " f"bd={val_metrics['loss_bias_derivative']:.6f}, " f"ds={val_metrics['loss_derivative_shape']:.6f}, " @@ -629,7 +624,6 @@ def format_final_line(test_metrics: dict[str, float]) -> str: f"[Final] test={test_metrics['loss']:.6f} " f"(p={test_metrics['loss_pressure']:.6f}, " f"d={test_metrics['loss_derivative']:.6f}, " - f"s={test_metrics['loss_slope']:.6f}, " f"bp={test_metrics['loss_bias_pressure']:.6f}, " f"bd={test_metrics['loss_bias_derivative']:.6f}, " f"ds={test_metrics['loss_derivative_shape']:.6f}, " diff --git a/ML/nmWTAI-ML/src/training/train_time_conditioned.py b/ML/nmWTAI-ML/src/training/train_time_conditioned.py deleted file mode 100644 index 70e4d72..0000000 --- a/ML/nmWTAI-ML/src/training/train_time_conditioned.py +++ /dev/null @@ -1,552 +0,0 @@ -# -*- coding: utf-8 -*- -"""时间条件代理模型训练流程。 - -本模块训练逐时间点预测模型:每条曲线会被展开为 N*T 个点级样本,每个样本包含 -同一条曲线共享的物理参数/流量制度特征,以及当前时间点的时间特征。目标是该点的 -log_pressure 和 log_derivative。 - -该训练方式可以对风险参数区域或特殊时间点引入更细粒度的加权策略,也便于后续 -在任意时间网格上重建曲线响应。 -""" - -from __future__ import annotations - -# pylint: disable=import-error,duplicate-code - -import json -import random -from dataclasses import asdict, dataclass, field -from pathlib import Path -from typing import Any - -import joblib -import numpy as np -import torch -import torch.nn.functional as F -from torch.utils.data import DataLoader, Dataset - -from src.data.param_features import inverse_transform_param_features -from src.models.time_conditioned_surrogate import ( - TimeConditionedSurrogate, - TimeConditionedSurrogateConfig, -) -from src.training.train_forward import get_part_slices, infer_curve_layout - - -@dataclass -class PointCurveArrays: - """逐时间点数据集所需数组。""" - - params_x: np.ndarray - schedule_x: np.ndarray - time_x: np.ndarray - curve_y: np.ndarray - layout: dict - sample_weight: np.ndarray | None = None - - -class PointCurveDataset(Dataset): - """把完整曲线展开为逐时间点训练样本。""" - - def __init__(self, arrays: PointCurveArrays): - """保存逐点训练所需的参数、流量制度、时间特征、目标曲线和样本权重。""" - self.params_x = torch.tensor(arrays.params_x, dtype=torch.float32) - self.schedule_x = torch.tensor(arrays.schedule_x, dtype=torch.float32) - self.time_x = torch.tensor(arrays.time_x, dtype=torch.float32) - - slices = get_part_slices(arrays.layout) - pressure_y = arrays.curve_y[:, slices["log_pressure"]] - derivative_y = arrays.curve_y[:, slices["log_derivative"]] - self.y = torch.tensor(np.stack([pressure_y, derivative_y], axis=-1), dtype=torch.float32) - - self.n_samples = int(self.params_x.shape[0]) - self.n_time = int(self.time_x.shape[1]) - sample_weight = arrays.sample_weight - if sample_weight is None: - sample_weight = np.ones((self.n_samples,), dtype=np.float32) - - sample_weight = np.asarray(sample_weight, dtype=np.float32).reshape(-1) - if sample_weight.shape[0] != self.n_samples: - raise ValueError(f"sample_weight length mismatch: {sample_weight.shape[0]} != {self.n_samples}") - self.sample_weight = torch.tensor(sample_weight, dtype=torch.float32) - - def __len__(self) -> int: - """返回数据集或容器中可迭代样本的数量。""" - return self.n_samples * self.n_time - - def __getitem__(self, idx: int): - """按索引取出一个训练样本或数据项。""" - sample_idx = idx // self.n_time - time_idx = idx % self.n_time - return ( - self.params_x[sample_idx], - self.schedule_x[sample_idx], - self.time_x[sample_idx, time_idx], - self.y[sample_idx, time_idx], - self.sample_weight[sample_idx], - ) - - -@dataclass -class TimeModelConfig: - """时间条件代理模型结构配置。""" - - hidden_dim: int = 256 - n_blocks: int = 4 - dropout: float = 0.05 - use_schedule: bool = True - - -@dataclass -class TimeOptimConfig: - """训练轮次和优化器配置。""" - - batch_size: int = 4096 - epochs: int = 120 - lr: float = 1.0e-3 - weight_decay: float = 1.0e-4 - - -@dataclass -class TimeLossConfig: - """时间条件模型点级损失配置。""" - - w_pressure: float = 1.0 - w_derivative: float = 2.0 - huber_beta: float = 0.05 - - -@dataclass -class RiskWeightConfig: - """风险区域样本加权配置。""" - - mode: str = "none" - risk_weight: float = 2.5 - skin_lt_minus8_weight: float = 3.5 - weight_min: float = 1.0 - weight_max: float = 4.0 - - -@dataclass -class TimeRuntimeConfig: - """训练运行时配置。""" - - seed: int = 42 - device: str = "cuda" if torch.cuda.is_available() else "cpu" - - -@dataclass -class TimeConditionedTrainConfig: - """时间条件代理模型训练配置。""" - - processed_path: Path - output_dir: Path - runtime: TimeRuntimeConfig = field(default_factory=TimeRuntimeConfig) - optim: TimeOptimConfig = field(default_factory=TimeOptimConfig) - model: TimeModelConfig = field(default_factory=TimeModelConfig) - loss: TimeLossConfig = field(default_factory=TimeLossConfig) - risk_weight: RiskWeightConfig = field(default_factory=RiskWeightConfig) - - -@dataclass -class DataBundle: - """训练、验证、测试数据加载器与输入维度。""" - - train_loader: DataLoader - val_loader: DataLoader - test_loader: DataLoader - param_dim: int - schedule_dim: int - time_dim: int - - -@dataclass -class TrainArtifacts: - """训练过程中需要跨函数传递的数据。""" - - data: dict - curve_layout: dict - train_weight_summary: dict[str, Any] - bundle: DataBundle - - -def set_global_seed(seed: int) -> None: - """设置 Python、NumPy 和 PyTorch 随机种子,并在 CUDA 可用时同步设置 GPU 随机种子。""" - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(seed) - - -def _smooth_l1_vector(pred: torch.Tensor, target: torch.Tensor, beta: float) -> torch.Tensor: - """计算逐向量 Smooth L1 损失,保留样本维度用于加权。""" - return F.smooth_l1_loss(pred, target, beta=float(beta), reduction="none") - - -def _loss( - pred: torch.Tensor, - target: torch.Tensor, - loss_cfg: TimeLossConfig, - sample_weight: torch.Tensor | None = None, -) -> torch.Tensor: - """计算时间条件模型的点级损失,并按样本权重求平均。""" - loss_p = _smooth_l1_vector(pred[:, 0], target[:, 0], beta=float(loss_cfg.huber_beta)) - loss_d = _smooth_l1_vector(pred[:, 1], target[:, 1], beta=float(loss_cfg.huber_beta)) - loss_vec = float(loss_cfg.w_pressure) * loss_p + float(loss_cfg.w_derivative) * loss_d - if sample_weight is None: - return loss_vec.mean() - - weight = sample_weight.to(loss_vec.device).reshape(-1).clamp_min(0.0) - return (loss_vec * weight).sum() / torch.clamp(weight.sum(), min=1.0e-12) - - -def _move_batch_to_device( - batch: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], - device: str, -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """把一个 batch 的所有张量移动到目标设备。""" - params_x, schedule_x, time_x, target_y, sample_weight = batch - return ( - params_x.to(device), - schedule_x.to(device), - time_x.to(device), - target_y.to(device), - sample_weight.to(device), - ) - - -def _forward_batch( - model: TimeConditionedSurrogate, - batch: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], - cfg: TimeConditionedTrainConfig, - use_weight: bool, -) -> tuple[torch.Tensor, int]: - """完成一个 batch 的前向计算并返回损失和样本数。""" - params_x, schedule_x, time_x, target_y, sample_weight = _move_batch_to_device( - batch, - cfg.runtime.device, - ) - schedule_input = schedule_x if cfg.model.use_schedule else None - pred = model(params_x, time_x, schedule_input) - weight = sample_weight if use_weight else None - loss = _loss(pred, target_y, cfg.loss, sample_weight=weight) - return loss, int(target_y.shape[0]) - - -def _run_loader( - model: TimeConditionedSurrogate, - loader: DataLoader, - cfg: TimeConditionedTrainConfig, - optimizer: torch.optim.Optimizer | None = None, -) -> float: - """执行一个训练或评估 epoch。""" - is_train = optimizer is not None - model.train(mode=is_train) - total = 0.0 - total_n = 0 - grad_context = torch.enable_grad() if is_train else torch.no_grad() - - with grad_context: - for batch in loader: - if is_train: - optimizer.zero_grad() - - loss, batch_size = _forward_batch(model, batch, cfg, use_weight=is_train) - - if is_train: - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) - optimizer.step() - - total += float(loss.detach().cpu()) * batch_size - total_n += batch_size - - return total / max(total_n, 1) - - -def _evaluate( - model: TimeConditionedSurrogate, - loader: DataLoader, - cfg: TimeConditionedTrainConfig, -) -> float: - """在验证集或测试集上评估时间条件模型的平均损失。""" - return _run_loader(model, loader, cfg, optimizer=None) - - -def _raw_params_from_processed_split(data: dict, split: str) -> dict[str, np.ndarray]: - """从预处理数据中读取某个划分的原始参数,用于构造样本权重。""" - key = f"X_params_{split}" - features = data["scaler_params"].inverse_transform(data[key]) - transform = data.get("meta", {}).get("param_feature_transform") - raw = inverse_transform_param_features(features, transform) - default_names = ["k", "skin", "wellboreC", "phi", "h", "Cf"] - names = list(data.get("meta", {}).get("param_names") or default_names) - return { - name: raw[:, idx].astype(np.float64) - for idx, name in enumerate(names[: raw.shape[1]]) - } - - -def _build_sample_weight( - data: dict, - cfg: TimeConditionedTrainConfig, - split: str = "train", -) -> np.ndarray: - """根据原始物理参数生成样本权重,使关键参数区域得到更多关注。""" - mode = str(cfg.risk_weight.mode or "none").lower() - n_samples = int(data[f"X_params_{split}"].shape[0]) - if mode in {"none", "off", "false"}: - return np.ones((n_samples,), dtype=np.float32) - if mode != "risk_region": - raise ValueError(f"Unknown sample_weight_mode={cfg.risk_weight.mode!r}") - - params = _raw_params_from_processed_split(data, split) - weight = np.ones((n_samples,), dtype=np.float32) - risk = (params["skin"] < -5.0) & (params["wellboreC"] > 0.1) - skin_extreme = params["skin"] < -8.0 - weight[risk] = np.maximum(weight[risk], float(cfg.risk_weight.risk_weight)) - weight[skin_extreme] = np.maximum(weight[skin_extreme], float(cfg.risk_weight.skin_lt_minus8_weight)) - return np.clip( - weight, - float(cfg.risk_weight.weight_min), - float(cfg.risk_weight.weight_max), - ).astype(np.float32) - - -def _summarize_sample_weight(sample_weight: np.ndarray) -> dict[str, Any]: - """统计样本权重的最小值、最大值和分位数,便于检查加权强度。""" - weight = np.asarray(sample_weight, dtype=np.float32).reshape(-1) - return { - "min": float(np.min(weight)), - "mean": float(np.mean(weight)), - "median": float(np.median(weight)), - "max": float(np.max(weight)), - "n_weight_gt_1": int(np.sum(weight > 1.0)), - "n_weight_lt_1": int(np.sum(weight < 1.0)), - } - - -def _load_processed_data(path: Path) -> dict: - """读取预处理数据并检查时间条件训练所需字段。""" - data = joblib.load(path) - required = ["X_time_train", "X_time_val", "X_time_test"] - missing = [key for key in required if key not in data] - if missing: - raise KeyError(f"processed dataset is missing time-conditioned fields: {missing}") - return data - - -def _make_point_dataset( - data: dict, - split: str, - curve_layout: dict, - sample_weight: np.ndarray | None = None, -) -> PointCurveDataset: - """构造某个数据划分对应的逐点数据集。""" - return PointCurveDataset( - PointCurveArrays( - params_x=data[f"X_params_{split}"], - schedule_x=data[f"X_schedule_{split}"], - time_x=data[f"X_time_{split}"], - curve_y=data[f"Y_curve_{split}"], - layout=curve_layout, - sample_weight=sample_weight, - ) - ) - - -def _build_dataloaders( - data: dict, - curve_layout: dict, - train_weight: np.ndarray, - cfg: TimeConditionedTrainConfig, -) -> DataBundle: - """构建训练、验证和测试 DataLoader。""" - train_ds = _make_point_dataset(data, "train", curve_layout, train_weight) - val_ds = _make_point_dataset(data, "val", curve_layout) - test_ds = _make_point_dataset(data, "test", curve_layout) - - generator = torch.Generator() - generator.manual_seed(int(cfg.runtime.seed)) - - train_loader = DataLoader( - train_ds, - batch_size=cfg.optim.batch_size, - shuffle=True, - generator=generator, - ) - val_loader = DataLoader(val_ds, batch_size=cfg.optim.batch_size, shuffle=False) - test_loader = DataLoader(test_ds, batch_size=cfg.optim.batch_size, shuffle=False) - - return DataBundle( - train_loader=train_loader, - val_loader=val_loader, - test_loader=test_loader, - param_dim=int(data["X_params_train"].shape[1]), - schedule_dim=int(data["X_schedule_train"].shape[1]), - time_dim=int(data["X_time_train"].shape[-1]), - ) - - -def _prepare_training_artifacts(cfg: TimeConditionedTrainConfig) -> TrainArtifacts: - """加载数据、推断布局、构造样本权重和 DataLoader。""" - data = _load_processed_data(cfg.processed_path) - curve_layout = infer_curve_layout(data) - train_weight = _build_sample_weight(data, cfg, split="train") - train_weight_summary = _summarize_sample_weight(train_weight) - bundle = _build_dataloaders(data, curve_layout, train_weight, cfg) - return TrainArtifacts( - data=data, - curve_layout=curve_layout, - train_weight_summary=train_weight_summary, - bundle=bundle, - ) - - -def _build_model(bundle: DataBundle, cfg: TimeConditionedTrainConfig) -> TimeConditionedSurrogate: - """兼容新版配置式模型和旧版关键字参数式模型。""" - model_cfg = TimeConditionedSurrogateConfig( - param_dim=bundle.param_dim, - schedule_dim=bundle.schedule_dim, - time_dim=bundle.time_dim, - hidden_dim=int(cfg.model.hidden_dim), - n_blocks=int(cfg.model.n_blocks), - dropout=float(cfg.model.dropout), - use_schedule=bool(cfg.model.use_schedule), - ) - return TimeConditionedSurrogate(model_cfg).to(cfg.runtime.device) - - -def _build_optimizer( - model: TimeConditionedSurrogate, - cfg: TimeConditionedTrainConfig, -) -> torch.optim.Optimizer: - """构建 AdamW 优化器。""" - return torch.optim.AdamW( - model.parameters(), - lr=float(cfg.optim.lr), - weight_decay=float(cfg.optim.weight_decay), - ) - - -def _print_training_config( - cfg: TimeConditionedTrainConfig, - artifacts: TrainArtifacts, -) -> None: - """打印时间条件训练配置摘要。""" - meta = artifacts.data.get("meta", {}) - print("Time-conditioned training config:") - print(f" processed={cfg.processed_path}") - print(f" output_dir={cfg.output_dir}") - print( - f" device={cfg.runtime.device}, batch_size={cfg.optim.batch_size}, " - f"epochs={cfg.optim.epochs}" - ) - print( - f" dims: param={artifacts.bundle.param_dim}, " - f"schedule={artifacts.bundle.schedule_dim}, time={artifacts.bundle.time_dim}" - ) - print(f" curve_time_source={meta.get('curve_time_source', 'unknown')}") - print( - f" sample_weight_mode={cfg.risk_weight.mode}, " - f"sample_weight={artifacts.train_weight_summary}" - ) - - -def _checkpoint_payload( - model: TimeConditionedSurrogate, - cfg: TimeConditionedTrainConfig, - artifacts: TrainArtifacts, -) -> dict[str, Any]: - """构造 checkpoint 保存内容。""" - return { - "model_state_dict": model.state_dict(), - "param_dim": artifacts.bundle.param_dim, - "schedule_dim": artifacts.bundle.schedule_dim, - "time_dim": artifacts.bundle.time_dim, - "hidden_dim": int(cfg.model.hidden_dim), - "n_blocks": int(cfg.model.n_blocks), - "dropout": float(cfg.model.dropout), - "use_schedule": bool(cfg.model.use_schedule), - "curve_layout": artifacts.curve_layout, - "processed_path": str(cfg.processed_path), - "seed": int(cfg.runtime.seed), - "sample_weight_mode": str(cfg.risk_weight.mode), - "sample_weight_summary": artifacts.train_weight_summary, - "model_config": asdict(cfg.model), - "loss_config": asdict(cfg.loss), - "risk_weight_config": asdict(cfg.risk_weight), - } - - -def _save_json(path: Path, payload: dict | list) -> None: - """写出 JSON 文件。""" - path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") - - -def _train_epochs( - model: TimeConditionedSurrogate, - cfg: TimeConditionedTrainConfig, - artifacts: TrainArtifacts, -) -> tuple[float, Path, list[dict]]: - """执行训练循环并保存最佳模型。""" - optimizer = _build_optimizer(model, cfg) - best_val = float("inf") - best_path = cfg.output_dir / "time_conditioned_surrogate_best.pt" - history: list[dict] = [] - - for epoch in range(1, int(cfg.optim.epochs) + 1): - train_loss = _run_loader(model, artifacts.bundle.train_loader, cfg, optimizer=optimizer) - val_loss = _evaluate(model, artifacts.bundle.val_loader, cfg) - history.append({"epoch": epoch, "train_loss": train_loss, "val_loss": val_loss}) - print(f"[Epoch {epoch:03d}] train={train_loss:.6f} val={val_loss:.6f}") - - if val_loss < best_val: - best_val = val_loss - torch.save(_checkpoint_payload(model, cfg, artifacts), best_path) - print(f" -> best model saved to: {best_path}") - - return best_val, best_path, history - - -def _write_final_outputs( - cfg: TimeConditionedTrainConfig, - best_val: float, - test_loss: float, - history: list[dict], - artifacts: TrainArtifacts, -) -> None: - """保存 history.json 和 metrics.json。""" - _save_json(cfg.output_dir / "history.json", history) - _save_json( - cfg.output_dir / "metrics.json", - { - "best_val_loss": best_val, - "test_loss": test_loss, - "sample_weight_mode": str(cfg.risk_weight.mode), - "sample_weight_summary": artifacts.train_weight_summary, - "model_config": asdict(cfg.model), - "loss_config": asdict(cfg.loss), - "risk_weight_config": asdict(cfg.risk_weight), - }, - ) - - -def train_time_conditioned(cfg: TimeConditionedTrainConfig) -> None: - """训练时间条件代理模型并保存训练产物。""" - cfg.output_dir.mkdir(parents=True, exist_ok=True) - set_global_seed(int(cfg.runtime.seed)) - - artifacts = _prepare_training_artifacts(cfg) - model = _build_model(artifacts.bundle, cfg) - - _print_training_config(cfg, artifacts) - best_val, best_path, history = _train_epochs(model, cfg, artifacts) - - checkpoint = torch.load(best_path, map_location=cfg.runtime.device) - model.load_state_dict(checkpoint["model_state_dict"]) - test_loss = _evaluate(model, artifacts.bundle.test_loader, cfg) - - _write_final_outputs(cfg, best_val, test_loss, history, artifacts) - print(f"[Final] test={test_loss:.6f}") diff --git a/Src/nmNum/nmCalculation/nmCalculationAutoFitPSO.cpp b/Src/nmNum/nmCalculation/nmCalculationAutoFitPSO.cpp index abe62a3..a0f47f6 100644 --- a/Src/nmNum/nmCalculation/nmCalculationAutoFitPSO.cpp +++ b/Src/nmNum/nmCalculation/nmCalculationAutoFitPSO.cpp @@ -1223,7 +1223,7 @@ QString nmCalculationAutoFitPSO::getSurrogateTag() const { // 当前 C++ 绑定的代理模型实验标识,对应 ML 侧训练输出目录/checkpoint 命名。 // 如果以后换模型,需要同步更新这里和 ML 脚本可识别的 tag。 - return "family_random_v2_q_50k_logparam"; + return "family_random_v2_q_50k_noslope"; } QString nmCalculationAutoFitPSO::getSurrogateStage() const @@ -6001,4 +6001,4 @@ void nmCalculationAutoFitPSO::finishSimulation() QString message = QString(tr("PSO optimization converged to stable solution. Best error: %1, Iterations: %2")) .arg(m_globalBestFitness, 0, 'e', 4).arg(m_simulationMaxIterations); emit fittingFinished(true, message); -} \ No newline at end of file +}