You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
nmWTAI-Platform/ML/nmWTAI-ML/scripts/validate_autofit_local_rank...

634 lines
26 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""验证正演代理模型是否保留自动拟合局部排序。
脚本围绕一个目标参数点采样局部候选,对每个候选同时计算真实数值求解器目标函数和
代理模型目标函数,并统计 Pearson/Spearman 相关性、Top-K 命中和保留比例表现。
它直接回答“代理模型能否用于 PSO 候选预筛选”这个问题。
"""
# pylint: disable=import-error,wrong-import-position,invalid-name,too-many-arguments,too-many-positional-arguments,too-many-locals,too-many-branches,too-many-statements,broad-exception-caught,no-member
from __future__ import annotations
import argparse
import csv
import json
import math
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.append(str(ROOT))
import joblib
import matplotlib.pyplot as plt
import numpy as np
import torch
from src.common.config import Config
from src.common.experiment_paths import config_for_stage, model_checkpoint_for_tag, normalize_tag, processed_path_for_tag
from src.data.curve_processing import clean_curve_for_dataset, is_valid_curve, resample_curve_to_features
from src.data.param_features import param_feature_transform_from_meta, transform_param_features
from src.data.params import Params, Schedule
from src.data.runner_client import CppRunner, read_result_bin
from src.data.schedule_features import build_schedule_model_vector
from src.evaluation.autofit_objective import dual_log_objective
from src.models.forward_surrogate import ForwardSurrogate
DEFAULT_CASE = {
"config": "configs/data_gen_family_random.yaml",
"tag": "family_random_mixed_50k",
"params": {
"k": 0.025,
"skin": 0.0,
"wellboreC": 0.01,
"phi": 0.0245,
"h": 9.144,
"Cf": 0.0004315,
},
}
def parse_args() -> argparse.Namespace:
"""解析单目标局部排序验证所需的扰动范围、候选数量、模型和输出 CSV。"""
parser = argparse.ArgumentParser(
description="Validate whether surrogate preserves local autofit objective ranking around a target case"
)
parser.add_argument("--config", type=str, default=DEFAULT_CASE["config"])
parser.add_argument("--stage", choices=["fixed_case", "case_neighborhood", "family_random", "family_random_v2_q"], default=None)
parser.add_argument("--processed", type=str, default=None)
parser.add_argument("--model", type=str, default=None)
parser.add_argument("--output-dir", type=str, default=None)
parser.add_argument("--tag", type=str, default=DEFAULT_CASE["tag"])
parser.add_argument("--well-index", type=int, default=0)
parser.add_argument("--solver-timeout", type=int, default=120)
parser.add_argument("--n-candidates", type=int, default=64)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--max-attempts-factor", type=int, default=4)
parser.add_argument(
"--span-frac",
type=float,
default=0.08,
help="Local box width as a fraction of each parameter range in transformed space",
)
parser.add_argument(
"--reuse-runner",
action="store_true",
help="Reuse one long-lived runner server across candidates; faster but less robust",
)
parser.add_argument("--k", type=float, default=DEFAULT_CASE["params"]["k"])
parser.add_argument("--skin", type=float, default=DEFAULT_CASE["params"]["skin"])
parser.add_argument("--wellboreC", type=float, default=DEFAULT_CASE["params"]["wellboreC"])
parser.add_argument("--phi", type=float, default=DEFAULT_CASE["params"]["phi"])
parser.add_argument("--h", type=float, default=DEFAULT_CASE["params"]["h"])
parser.add_argument("--Cf", type=float, default=DEFAULT_CASE["params"]["Cf"])
return parser.parse_args()
def infer_curve_layout(meta: dict, curve_dim: int) -> dict:
"""从元数据读取曲线分段布局;旧数据没有布局时按压力/导数/斜率三等分回退。"""
curve_layout = meta.get("curve_layout")
if curve_layout is not None:
return curve_layout
n_time_points = curve_dim // 3
return {
"n_time_points": int(n_time_points),
"parts": [
{"name": "log_pressure", "start": 0, "end": n_time_points},
{"name": "log_derivative", "start": n_time_points, "end": 2 * n_time_points},
{"name": "slope", "start": 2 * n_time_points, "end": 3 * n_time_points},
],
}
def resolve_case_schedule(cfg: Config) -> Schedule:
"""解析局部排序验证目标案例的流量制度,支持命令行覆盖配置默认值。"""
schedule_cfg = cfg.raw["schedule"]["case_schedule"]
time_q = list(map(float, schedule_cfg["timeQ"]))
q = list(map(float, schedule_cfg["q"]))
policy = cfg.raw["schedule"].get("section_policy", {}) or {}
mode = str(policy.get("mode", "fixed_last")).lower()
n_sections = len(time_q)
if mode == "fixed_last":
section_index = n_sections
elif mode == "fixed_value":
section_index = int(np.clip(int(policy.get("fixed_value", n_sections)), 1, n_sections))
else:
section_index = int(np.clip(int(schedule_cfg.get("default_section_index", n_sections)), 1, n_sections))
schedule = Schedule(sectionIndex=section_index, timeQ=time_q, q=q)
if not schedule.validate():
raise ValueError("Invalid case_schedule in config")
return schedule
def resolve_paths(args: argparse.Namespace) -> tuple[Config, Path, Path, Path]:
"""解析局部排序验证所需的配置、模型、预处理数据和输出目录。"""
tag = normalize_tag(args.tag)
config_path = args.config
if config_path is None:
config_path = str(config_for_stage(args.stage) or Path(DEFAULT_CASE["config"]))
processed_path = (Path(args.processed) if args.processed is not None else processed_path_for_tag(tag)).resolve()
model_path = (Path(args.model) if args.model is not None else model_checkpoint_for_tag(tag, use_schedule=True)).resolve()
if args.output_dir is not None:
output_dir = Path(args.output_dir).resolve()
else:
output_dir = (Path("results") / f"autofit_local_validation_{tag}").resolve()
return Config(config_path), processed_path, model_path, output_dir
def build_params_from_args(args: argparse.Namespace, schedule: Schedule) -> Params:
"""根据命令行参数和默认值构造目标 Params 对象。"""
return Params(
k=float(args.k),
skin=float(args.skin),
wellboreC=float(args.wellboreC),
phi=float(args.phi),
h=float(args.h),
Cf=float(args.Cf),
schedule=schedule,
)
def build_schedule_vector(cfg: Config, schedule: Schedule) -> np.ndarray:
"""把 Schedule 编码成正演代理模型可直接接收的流量制度特征向量。"""
return build_schedule_model_vector(cfg, schedule)
def load_model(checkpoint_path: Path) -> tuple[ForwardSurrogate, bool, torch.device]:
"""加载模型检查点,按保存的维度和超参数重建网络并切换到评估模式。"""
checkpoint = torch.load(checkpoint_path, map_location="cpu")
use_schedule = bool(checkpoint.get("use_schedule", True))
model = ForwardSurrogate(
param_dim=int(checkpoint["param_dim"]),
schedule_dim=int(checkpoint["schedule_dim"]),
curve_dim=int(checkpoint["curve_dim"]),
hidden_dim=int(checkpoint["hidden_dim"]),
dropout=float(checkpoint["dropout"]),
use_schedule=use_schedule,
)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
return model, use_schedule, device
def predict_surrogate_curve(
processed: dict,
model: ForwardSurrogate,
device: torch.device,
use_schedule: bool,
params: Params,
schedule: Schedule,
cfg: Config,
) -> np.ndarray:
"""使用正演代理模型预测单个参数和流量制度对应的完整曲线。"""
scaler_params = processed["scaler_params"]
scaler_schedule = processed["scaler_schedule"]
scaler_curve = processed["scaler_curve"]
param_transform = param_feature_transform_from_meta(processed.get("meta", {}))
params_vec = np.asarray(
[params.k, params.skin, params.wellboreC, params.phi, params.h, params.Cf],
dtype=np.float32,
).reshape(1, -1)
schedule_vec = build_schedule_vector(cfg, schedule).reshape(1, -1)
params_x = scaler_params.transform(transform_param_features(params_vec, param_transform)).astype(np.float32)
schedule_x = scaler_schedule.transform(schedule_vec).astype(np.float32)
with torch.no_grad():
params_t = torch.tensor(params_x, dtype=torch.float32, device=device)
schedule_t = torch.tensor(schedule_x, dtype=torch.float32, device=device) if use_schedule else None
pred_scaled = model(params_t, schedule_t).cpu().numpy()
return scaler_curve.inverse_transform(pred_scaled)[0].astype(np.float32)
def run_solver_and_extract_curve(
runner: CppRunner,
cfg: Config,
params: Params,
well_index: int,
timeout: int,
) -> tuple[np.ndarray, dict]:
"""调用 C++ 求解器运行一次正演,并把双对数输出重采样为模型曲线向量。"""
ok = runner.run_simulation(params, timeout=timeout, override_schedule=params.schedule, include_schedule=True)
result = read_result_bin(runner.result_bin) if runner.result_bin.exists() else None
if not ok and result is None:
raise RuntimeError("Solver failed and result.bin is unavailable")
if result is None or not result["loglog"]:
raise RuntimeError("Solver produced no loglog data")
if well_index < 0 or well_index >= len(result["loglog"]):
raise IndexError(f"well-index={well_index} out of range; n_wells={len(result['loglog'])}")
loglog = result["loglog"][well_index]
t = np.asarray(loglog["t"], dtype=np.float64)
p = np.asarray(loglog["p"], dtype=np.float64)
d = np.asarray(loglog["deriv"], dtype=np.float64)
cleaned = clean_curve_for_dataset(cfg, t, p, d)
if cleaned is None:
raise RuntimeError("Curve invalid after cleaning")
t_clean, p_clean, d_clean = cleaned
valid, reason = is_valid_curve(cfg, t_clean, p_clean, d_clean)
if not valid:
raise RuntimeError(f"Curve failed validation: {reason}")
curve_feat = resample_curve_to_features(cfg, t_clean, p_clean, d_clean)
raw = {
"t": t_clean.tolist(),
"p": p_clean.tolist(),
"d": d_clean.tolist(),
"n_steps": int(result["nSteps"]),
"n_wells": int(result["nWells"]),
}
return curve_feat, raw
def _corr_pearson(x: np.ndarray, y: np.ndarray) -> float:
"""计算 Pearson 相关系数,用于评估代理分数与真实分数的线性相关。"""
if x.size < 2:
return float("nan")
if np.std(x) < 1e-12 or np.std(y) < 1e-12:
return float("nan")
return float(np.corrcoef(x, y)[0, 1])
def _corr_spearman(x: np.ndarray, y: np.ndarray) -> float:
"""计算 Spearman 秩相关,用于评估候选排序是否一致。"""
if x.size < 2:
return float("nan")
xr = np.argsort(np.argsort(x))
yr = np.argsort(np.argsort(y))
return _corr_pearson(xr.astype(np.float64), yr.astype(np.float64))
def _rank_positions(values: np.ndarray) -> np.ndarray:
"""把目标函数值转换成排名位置,用于比较代理排序和真实排序。"""
order = np.argsort(values)
rank = np.empty_like(order)
rank[order] = np.arange(order.size)
return rank
def _fixed_param_value(cfg: Config, name: str) -> float | None:
"""读取固定参数值;优先使用命令行覆盖,其次使用配置默认值。"""
fixed_cfg = ((cfg.raw["params"].get("fixed_params", {}) or {}).get(name, {}) or {})
if bool(fixed_cfg.get("enabled", False)):
return float(fixed_cfg["value"])
return None
def _search_param_names(cfg: Config) -> list[str]:
"""确定局部邻域中需要扰动搜索的物理参数名称。"""
active_names = list(cfg.raw["params"].get("active_param_names", cfg.raw["params"]["all_physical_param_names"]))
names = [name for name in active_names if _fixed_param_value(cfg, name) is None]
if not names:
raise ValueError("No searchable parameters: active_param_names is empty after removing fixed_params")
return names
def _sample_local_candidates(cfg: Config, base: Params, n: int, seed: int, span_frac: float) -> list[Params]:
"""围绕目标参数按扰动比例随机生成局部候选参数。"""
rng = np.random.RandomState(seed)
all_names = list(cfg.raw["params"]["all_physical_param_names"])
names = _search_param_names(cfg)
log_params = set(cfg.raw["params"]["log_params"])
base_dict = {
"k": float(base.k),
"skin": float(base.skin),
"wellboreC": float(base.wellboreC),
"phi": float(base.phi),
"h": float(base.h),
"Cf": float(base.Cf),
}
samples: list[Params] = []
for _ in range(n):
cand = dict(base_dict)
for name in all_names:
fixed_value = _fixed_param_value(cfg, name)
if fixed_value is not None:
cand[name] = fixed_value
continue
lo, hi = map(float, cfg.raw["params"]["ranges"][name])
base_val = float(base_dict[name])
if name not in names:
cand[name] = base_val
continue
if name in log_params:
# 跨数量级参数在 log 空间扰动,保持相对扰动尺度一致。
lo_t = math.log10(max(lo, 1e-30))
hi_t = math.log10(max(hi, 1e-30))
base_t = math.log10(max(base_val, 1e-30))
span = span_frac * (hi_t - lo_t)
v_t = np.clip(rng.uniform(base_t - span, base_t + span), lo_t, hi_t)
cand[name] = float(10 ** v_t)
else:
# 线性参数直接在物理空间扰动,并裁剪回配置范围。
span = span_frac * (hi - lo)
v = np.clip(rng.uniform(base_val - span, base_val + span), lo, hi)
cand[name] = float(v)
samples.append(
Params(
k=cand["k"],
skin=cand["skin"],
wellboreC=cand["wellboreC"],
phi=cand["phi"],
h=cand["h"],
Cf=cand["Cf"],
schedule=base.schedule,
)
)
return samples
def _params_to_dict(params: Params) -> dict:
"""把 Params 对象转换为可写入 CSV 的普通字典。"""
return {
"k": float(params.k),
"skin": float(params.skin),
"wellboreC": float(params.wellboreC),
"phi": float(params.phi),
"h": float(params.h),
"Cf": float(params.Cf),
}
def _plot_scatter(csv_rows: list[dict], output_path: Path) -> None:
"""绘制局部候选的代理目标和真实目标散点图。"""
solver_obj = np.asarray([float(r["solver_objective"]) for r in csv_rows], dtype=np.float64)
surrogate_obj = np.asarray([float(r["surrogate_objective"]) for r in csv_rows], dtype=np.float64)
plt.figure(figsize=(7, 6))
plt.scatter(solver_obj, surrogate_obj, alpha=0.75)
if solver_obj.size >= 2:
lo = min(float(np.min(solver_obj)), float(np.min(surrogate_obj)))
hi = max(float(np.max(solver_obj)), float(np.max(surrogate_obj)))
plt.plot([lo, hi], [lo, hi], linestyle="--", linewidth=1)
plt.xlabel("Solver objective to target")
plt.ylabel("Surrogate objective to target")
plt.title("Local Candidate Ranking Validation")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close()
def main() -> None:
"""围绕一个目标样本生成候选参数,比较真实目标函数排序与代理排序。"""
args = parse_args()
cfg, processed_path, model_path, output_dir = resolve_paths(args)
output_dir.mkdir(parents=True, exist_ok=True)
if not processed_path.exists():
raise FileNotFoundError(f"Processed dataset not found: {processed_path}")
if not model_path.exists():
raise FileNotFoundError(f"Model checkpoint not found: {model_path}")
processed = joblib.load(processed_path)
curve_layout = infer_curve_layout(processed["meta"], int(processed["meta"]["curve_dim"]))
schedule = resolve_case_schedule(cfg)
target_params = build_params_from_args(args, schedule)
search_names = _search_param_names(cfg)
model, use_schedule, device = load_model(model_path)
shared_runner = None
def make_runner(tag: str) -> CppRunner:
"""创建 C++ 求解器客户端,并根据参数决定是否启用常驻服务。"""
temp_dir = output_dir / "_runner_temp" / tag
temp_dir.mkdir(parents=True, exist_ok=True)
return CppRunner(cfg=cfg, auto_init=False, temp_dir=temp_dir)
try:
if args.reuse_runner:
# 复用 runner 可以显著减少大量候选逐个启动 C++ 进程的开销。
shared_runner = make_runner("shared")
target_runner = shared_runner
else:
target_runner = make_runner("target")
print("Running target solver case...")
# 目标曲线只计算一次,后面所有候选都以它为拟合对象。
target_curve, target_raw = run_solver_and_extract_curve(
runner=target_runner,
cfg=cfg,
params=target_params,
well_index=args.well_index,
timeout=args.solver_timeout,
)
if not args.reuse_runner:
target_runner.close()
max_attempts = max(int(args.n_candidates), int(args.n_candidates) * int(args.max_attempts_factor))
candidates = _sample_local_candidates(
cfg=cfg,
base=target_params,
n=max_attempts,
seed=int(args.seed),
span_frac=float(args.span_frac),
)
rows: list[dict] = []
failed_rows: list[dict] = []
for attempt_idx, cand in enumerate(candidates):
if len(rows) >= int(args.n_candidates):
break
runner = shared_runner if shared_runner is not None else make_runner(f"cand_{attempt_idx:04d}")
try:
# 对每个候选同时拿到真实求解器曲线和代理预测曲线,比较二者目标函数排序。
solver_curve, _ = run_solver_and_extract_curve(
runner=runner,
cfg=cfg,
params=cand,
well_index=args.well_index,
timeout=args.solver_timeout,
)
pred_curve = predict_surrogate_curve(
processed=processed,
model=model,
device=device,
use_schedule=use_schedule,
params=cand,
schedule=schedule,
cfg=cfg,
)
solver_obj = dual_log_objective(target_curve, solver_curve, curve_layout)
surrogate_obj = dual_log_objective(target_curve, pred_curve, curve_layout)
rows.append(
{
"candidate_id": len(rows),
"attempt_id": attempt_idx,
"k": cand.k,
"skin": cand.skin,
"wellboreC": cand.wellboreC,
"phi": cand.phi,
"h": cand.h,
"Cf": cand.Cf,
"solver_objective": solver_obj["dual_log_objective"],
"solver_p_obj": solver_obj["log_pressure_objective"],
"solver_d_obj": solver_obj["log_derivative_objective"],
"surrogate_objective": surrogate_obj["dual_log_objective"],
"surrogate_p_obj": surrogate_obj["log_pressure_objective"],
"surrogate_d_obj": surrogate_obj["log_derivative_objective"],
"objective_gap": surrogate_obj["dual_log_objective"] - solver_obj["dual_log_objective"],
}
)
except Exception as exc:
failed_row = {"attempt_id": attempt_idx, "reason": str(exc)}
failed_row.update(_params_to_dict(cand))
failed_rows.append(failed_row)
print(
"[warn] candidate attempt "
f"{attempt_idx} skipped: {exc} | "
f"k={cand.k:.6g}, skin={cand.skin:.6g}, C={cand.wellboreC:.6g}, "
f"phi={cand.phi:.6g}, h={cand.h:.6g}, Cf={cand.Cf:.6g}"
)
finally:
if shared_runner is None:
runner.close()
if not rows:
raise RuntimeError("No valid candidate evaluations completed")
solver_obj = np.asarray([float(r["solver_objective"]) for r in rows], dtype=np.float64)
surrogate_obj = np.asarray([float(r["surrogate_objective"]) for r in rows], dtype=np.float64)
solver_rank = _rank_positions(solver_obj)
surrogate_rank = _rank_positions(surrogate_obj)
# rank_gap > 0 表示该候选在代理排序中比真实排序更靠后,可能被代理筛掉。
for i, row in enumerate(rows):
row["solver_rank"] = int(solver_rank[i])
row["surrogate_rank"] = int(surrogate_rank[i])
row["rank_gap"] = int(surrogate_rank[i] - solver_rank[i])
top5_solver = set(np.argsort(solver_obj)[: min(5, solver_obj.size)].tolist())
top5_sur = set(np.argsort(surrogate_obj)[: min(5, surrogate_obj.size)].tolist())
top10_solver = set(np.argsort(solver_obj)[: min(10, solver_obj.size)].tolist())
top10_sur = set(np.argsort(surrogate_obj)[: min(10, surrogate_obj.size)].tolist())
best_solver_idx = int(np.argmin(solver_obj))
best_surrogate_idx = int(np.argmin(surrogate_obj))
# summary 聚焦排序质量,而不是单纯曲线 RMSE因为自动拟合筛选更关心候选相对优劣。
summary = {
"config_path": str(cfg.path),
"processed_path": str(processed_path),
"model_path": str(model_path),
"target_params": {
"k": target_params.k,
"skin": target_params.skin,
"wellboreC": target_params.wellboreC,
"phi": target_params.phi,
"h": target_params.h,
"Cf": target_params.Cf,
},
"schedule": {
"sectionIndex": schedule.sectionIndex,
"timeQ": schedule.timeQ,
"q": schedule.q,
},
"n_requested": int(args.n_candidates),
"n_valid": int(len(rows)),
"span_frac": float(args.span_frac),
"search_param_names": search_names,
"pearson_objective": _corr_pearson(solver_obj, surrogate_obj),
"spearman_objective": _corr_spearman(solver_obj, surrogate_obj),
"mean_objective_gap": float(np.mean(surrogate_obj - solver_obj)),
"mae_objective_gap": float(np.mean(np.abs(surrogate_obj - solver_obj))),
"top5_overlap": int(len(top5_solver & top5_sur)),
"top10_overlap": int(len(top10_solver & top10_sur)),
"best_solver_candidate_id": best_solver_idx,
"best_solver_candidate_surrogate_rank": int(surrogate_rank[best_solver_idx]),
"best_surrogate_candidate_id": best_surrogate_idx,
"best_surrogate_candidate_solver_rank": int(solver_rank[best_surrogate_idx]),
"target_solver_raw_loglog": target_raw,
}
csv_path = output_dir / "candidate_objectives.csv"
with open(csv_path, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(
f,
fieldnames=[
"candidate_id",
"attempt_id",
"k",
"skin",
"wellboreC",
"phi",
"h",
"Cf",
"solver_objective",
"solver_p_obj",
"solver_d_obj",
"surrogate_objective",
"surrogate_p_obj",
"surrogate_d_obj",
"objective_gap",
"solver_rank",
"surrogate_rank",
"rank_gap",
],
)
writer.writeheader()
writer.writerows(rows)
failed_csv_path = output_dir / "failed_candidates.csv"
with open(failed_csv_path, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(
f,
fieldnames=["attempt_id", "reason", "k", "skin", "wellboreC", "phi", "h", "Cf"],
)
writer.writeheader()
writer.writerows(failed_rows)
with open(output_dir / "summary.json", "w", encoding="utf-8") as f:
json.dump(summary, f, ensure_ascii=False, indent=2)
_plot_scatter(rows, output_dir / "objective_scatter.png")
print("\nLocal autofit validation complete.")
print(f"Output dir: {output_dir}")
print(f"Valid candidates={len(rows)}/{args.n_candidates}")
print(
f"Objective Pearson={summary['pearson_objective']:.6f}, "
f"Spearman={summary['spearman_objective']:.6f}, "
f"Top5 overlap={summary['top5_overlap']}, Top10 overlap={summary['top10_overlap']}"
)
print(
f"Best solver candidate id={summary['best_solver_candidate_id']}, "
f"its surrogate rank={summary['best_solver_candidate_surrogate_rank']}"
)
print(
f"Best surrogate candidate id={summary['best_surrogate_candidate_id']}, "
f"its solver rank={summary['best_surrogate_candidate_solver_rank']}"
)
print(f"CSV saved: {csv_path}")
print(f"Failed candidates saved: {failed_csv_path}")
print(f"Summary saved: {output_dir / 'summary.json'}")
print(f"Plot saved: {output_dir / 'objective_scatter.png'}")
finally:
if shared_runner is not None:
shared_runner.close()
if __name__ == "__main__":
main()