|
|
|
|
@ -11,10 +11,13 @@ log_pressure 和 log_derivative。
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
# pylint: disable=import-error,duplicate-code
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
import random
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
from dataclasses import asdict, dataclass, field
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
import joblib
|
|
|
|
|
import numpy as np
|
|
|
|
|
@ -23,40 +26,45 @@ import torch.nn.functional as F
|
|
|
|
|
from torch.utils.data import DataLoader, Dataset
|
|
|
|
|
|
|
|
|
|
from src.data.param_features import inverse_transform_param_features
|
|
|
|
|
from src.models.time_conditioned_surrogate import TimeConditionedSurrogate
|
|
|
|
|
from src.models.time_conditioned_surrogate import (
|
|
|
|
|
TimeConditionedSurrogate,
|
|
|
|
|
TimeConditionedSurrogateConfig,
|
|
|
|
|
)
|
|
|
|
|
from src.training.train_forward import get_part_slices, infer_curve_layout
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
|
class PointCurveArrays:
|
|
|
|
|
"""逐时间点数据集所需数组。"""
|
|
|
|
|
|
|
|
|
|
params_x: np.ndarray
|
|
|
|
|
schedule_x: np.ndarray
|
|
|
|
|
time_x: np.ndarray
|
|
|
|
|
curve_y: np.ndarray
|
|
|
|
|
layout: dict
|
|
|
|
|
sample_weight: np.ndarray | None = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PointCurveDataset(Dataset):
|
|
|
|
|
"""把完整曲线展开为逐时间点训练样本。
|
|
|
|
|
|
|
|
|
|
原始数据形状是 N 条曲线,每条曲线 T 个时间点。本 Dataset 的长度为 N*T,
|
|
|
|
|
__getitem__ 会根据一维索引反推出 sample_idx 和 time_idx,返回该时间点对应的
|
|
|
|
|
参数特征、制度特征、时间特征、双通道目标值以及样本级权重。
|
|
|
|
|
"""
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
|
params_x: np.ndarray,
|
|
|
|
|
schedule_x: np.ndarray,
|
|
|
|
|
time_x: np.ndarray,
|
|
|
|
|
curve_y: np.ndarray,
|
|
|
|
|
layout: dict,
|
|
|
|
|
sample_weight: np.ndarray | None = None,
|
|
|
|
|
):
|
|
|
|
|
"""把完整曲线展开为逐时间点训练样本。"""
|
|
|
|
|
|
|
|
|
|
def __init__(self, arrays: PointCurveArrays):
|
|
|
|
|
"""保存逐点训练所需的参数、流量制度、时间特征、目标曲线和样本权重。"""
|
|
|
|
|
self.params_x = torch.tensor(params_x, dtype=torch.float32)
|
|
|
|
|
self.schedule_x = torch.tensor(schedule_x, dtype=torch.float32)
|
|
|
|
|
self.time_x = torch.tensor(time_x, dtype=torch.float32)
|
|
|
|
|
self.params_x = torch.tensor(arrays.params_x, dtype=torch.float32)
|
|
|
|
|
self.schedule_x = torch.tensor(arrays.schedule_x, dtype=torch.float32)
|
|
|
|
|
self.time_x = torch.tensor(arrays.time_x, dtype=torch.float32)
|
|
|
|
|
|
|
|
|
|
slices = get_part_slices(layout)
|
|
|
|
|
p = curve_y[:, slices["log_pressure"]]
|
|
|
|
|
d = curve_y[:, slices["log_derivative"]]
|
|
|
|
|
self.y = torch.tensor(np.stack([p, d], axis=-1), dtype=torch.float32)
|
|
|
|
|
slices = get_part_slices(arrays.layout)
|
|
|
|
|
pressure_y = arrays.curve_y[:, slices["log_pressure"]]
|
|
|
|
|
derivative_y = arrays.curve_y[:, slices["log_derivative"]]
|
|
|
|
|
self.y = torch.tensor(np.stack([pressure_y, derivative_y], axis=-1), dtype=torch.float32)
|
|
|
|
|
|
|
|
|
|
self.n_samples = int(self.params_x.shape[0])
|
|
|
|
|
self.n_time = int(self.time_x.shape[1])
|
|
|
|
|
sample_weight = arrays.sample_weight
|
|
|
|
|
if sample_weight is None:
|
|
|
|
|
sample_weight = np.ones((self.n_samples,), dtype=np.float32)
|
|
|
|
|
|
|
|
|
|
sample_weight = np.asarray(sample_weight, dtype=np.float32).reshape(-1)
|
|
|
|
|
if sample_weight.shape[0] != self.n_samples:
|
|
|
|
|
raise ValueError(f"sample_weight length mismatch: {sample_weight.shape[0]} != {self.n_samples}")
|
|
|
|
|
@ -80,30 +88,88 @@ class PointCurveDataset(Dataset):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
|
class TimeConditionedTrainConfig:
|
|
|
|
|
"""时间条件代理模型训练配置,包括批量大小、学习率、模型宽度和设备。"""
|
|
|
|
|
processed_path: Path
|
|
|
|
|
output_dir: Path
|
|
|
|
|
seed: int = 42
|
|
|
|
|
class TimeModelConfig:
|
|
|
|
|
"""时间条件代理模型结构配置。"""
|
|
|
|
|
|
|
|
|
|
hidden_dim: int = 256
|
|
|
|
|
n_blocks: int = 4
|
|
|
|
|
dropout: float = 0.05
|
|
|
|
|
use_schedule: bool = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
|
class TimeOptimConfig:
|
|
|
|
|
"""训练轮次和优化器配置。"""
|
|
|
|
|
|
|
|
|
|
batch_size: int = 4096
|
|
|
|
|
epochs: int = 120
|
|
|
|
|
lr: float = 1.0e-3
|
|
|
|
|
weight_decay: float = 1.0e-4
|
|
|
|
|
hidden_dim: int = 256
|
|
|
|
|
n_blocks: int = 4
|
|
|
|
|
dropout: float = 0.05
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
|
class TimeLossConfig:
|
|
|
|
|
"""时间条件模型点级损失配置。"""
|
|
|
|
|
|
|
|
|
|
w_pressure: float = 1.0
|
|
|
|
|
w_derivative: float = 2.0
|
|
|
|
|
huber_beta: float = 0.05
|
|
|
|
|
use_schedule: bool = True
|
|
|
|
|
sample_weight_mode: str = "none"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
|
class RiskWeightConfig:
|
|
|
|
|
"""风险区域样本加权配置。"""
|
|
|
|
|
|
|
|
|
|
mode: str = "none"
|
|
|
|
|
risk_weight: float = 2.5
|
|
|
|
|
skin_lt_minus8_weight: float = 3.5
|
|
|
|
|
sample_weight_min: float = 1.0
|
|
|
|
|
sample_weight_max: float = 4.0
|
|
|
|
|
weight_min: float = 1.0
|
|
|
|
|
weight_max: float = 4.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
|
class TimeRuntimeConfig:
|
|
|
|
|
"""训练运行时配置。"""
|
|
|
|
|
|
|
|
|
|
seed: int = 42
|
|
|
|
|
device: str = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
|
class TimeConditionedTrainConfig:
|
|
|
|
|
"""时间条件代理模型训练配置。"""
|
|
|
|
|
|
|
|
|
|
processed_path: Path
|
|
|
|
|
output_dir: Path
|
|
|
|
|
runtime: TimeRuntimeConfig = field(default_factory=TimeRuntimeConfig)
|
|
|
|
|
optim: TimeOptimConfig = field(default_factory=TimeOptimConfig)
|
|
|
|
|
model: TimeModelConfig = field(default_factory=TimeModelConfig)
|
|
|
|
|
loss: TimeLossConfig = field(default_factory=TimeLossConfig)
|
|
|
|
|
risk_weight: RiskWeightConfig = field(default_factory=RiskWeightConfig)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
|
class DataBundle:
|
|
|
|
|
"""训练、验证、测试数据加载器与输入维度。"""
|
|
|
|
|
|
|
|
|
|
train_loader: DataLoader
|
|
|
|
|
val_loader: DataLoader
|
|
|
|
|
test_loader: DataLoader
|
|
|
|
|
param_dim: int
|
|
|
|
|
schedule_dim: int
|
|
|
|
|
time_dim: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
|
class TrainArtifacts:
|
|
|
|
|
"""训练过程中需要跨函数传递的数据。"""
|
|
|
|
|
|
|
|
|
|
data: dict
|
|
|
|
|
curve_layout: dict
|
|
|
|
|
train_weight_summary: dict[str, Any]
|
|
|
|
|
bundle: DataBundle
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def set_global_seed(seed: int) -> None:
|
|
|
|
|
"""设置 Python、NumPy 和 PyTorch 随机种子,并在 CUDA 可用时同步设置 GPU 随机种子。"""
|
|
|
|
|
random.seed(seed)
|
|
|
|
|
@ -121,215 +187,366 @@ def _smooth_l1_vector(pred: torch.Tensor, target: torch.Tensor, beta: float) ->
|
|
|
|
|
def _loss(
|
|
|
|
|
pred: torch.Tensor,
|
|
|
|
|
target: torch.Tensor,
|
|
|
|
|
cfg: TimeConditionedTrainConfig,
|
|
|
|
|
loss_cfg: TimeLossConfig,
|
|
|
|
|
sample_weight: torch.Tensor | None = None,
|
|
|
|
|
) -> torch.Tensor:
|
|
|
|
|
"""计算时间条件模型的点级损失,并按样本权重求平均。"""
|
|
|
|
|
loss_p = _smooth_l1_vector(pred[:, 0], target[:, 0], beta=float(cfg.huber_beta))
|
|
|
|
|
loss_d = _smooth_l1_vector(pred[:, 1], target[:, 1], beta=float(cfg.huber_beta))
|
|
|
|
|
loss_vec = float(cfg.w_pressure) * loss_p + float(cfg.w_derivative) * loss_d
|
|
|
|
|
loss_p = _smooth_l1_vector(pred[:, 0], target[:, 0], beta=float(loss_cfg.huber_beta))
|
|
|
|
|
loss_d = _smooth_l1_vector(pred[:, 1], target[:, 1], beta=float(loss_cfg.huber_beta))
|
|
|
|
|
loss_vec = float(loss_cfg.w_pressure) * loss_p + float(loss_cfg.w_derivative) * loss_d
|
|
|
|
|
if sample_weight is None:
|
|
|
|
|
return loss_vec.mean()
|
|
|
|
|
w = sample_weight.to(loss_vec.device).reshape(-1).clamp_min(0.0)
|
|
|
|
|
return (loss_vec * w).sum() / torch.clamp(w.sum(), min=1.0e-12)
|
|
|
|
|
|
|
|
|
|
weight = sample_weight.to(loss_vec.device).reshape(-1).clamp_min(0.0)
|
|
|
|
|
return (loss_vec * weight).sum() / torch.clamp(weight.sum(), min=1.0e-12)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _move_batch_to_device(
|
|
|
|
|
batch: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
|
|
|
|
|
device: str,
|
|
|
|
|
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
|
|
|
|
"""把一个 batch 的所有张量移动到目标设备。"""
|
|
|
|
|
params_x, schedule_x, time_x, target_y, sample_weight = batch
|
|
|
|
|
return (
|
|
|
|
|
params_x.to(device),
|
|
|
|
|
schedule_x.to(device),
|
|
|
|
|
time_x.to(device),
|
|
|
|
|
target_y.to(device),
|
|
|
|
|
sample_weight.to(device),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def _evaluate(model: TimeConditionedSurrogate, loader: DataLoader, cfg: TimeConditionedTrainConfig) -> float:
|
|
|
|
|
"""在验证集上评估时间条件模型的平均损失。"""
|
|
|
|
|
model.eval()
|
|
|
|
|
|
|
|
|
|
def _forward_batch(
|
|
|
|
|
model: TimeConditionedSurrogate,
|
|
|
|
|
batch: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
|
|
|
|
|
cfg: TimeConditionedTrainConfig,
|
|
|
|
|
use_weight: bool,
|
|
|
|
|
) -> tuple[torch.Tensor, int]:
|
|
|
|
|
"""完成一个 batch 的前向计算并返回损失和样本数。"""
|
|
|
|
|
params_x, schedule_x, time_x, target_y, sample_weight = _move_batch_to_device(
|
|
|
|
|
batch,
|
|
|
|
|
cfg.runtime.device,
|
|
|
|
|
)
|
|
|
|
|
schedule_input = schedule_x if cfg.model.use_schedule else None
|
|
|
|
|
pred = model(params_x, time_x, schedule_input)
|
|
|
|
|
weight = sample_weight if use_weight else None
|
|
|
|
|
loss = _loss(pred, target_y, cfg.loss, sample_weight=weight)
|
|
|
|
|
return loss, int(target_y.shape[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _run_loader(
|
|
|
|
|
model: TimeConditionedSurrogate,
|
|
|
|
|
loader: DataLoader,
|
|
|
|
|
cfg: TimeConditionedTrainConfig,
|
|
|
|
|
optimizer: torch.optim.Optimizer | None = None,
|
|
|
|
|
) -> float:
|
|
|
|
|
"""执行一个训练或评估 epoch。"""
|
|
|
|
|
is_train = optimizer is not None
|
|
|
|
|
model.train(mode=is_train)
|
|
|
|
|
total = 0.0
|
|
|
|
|
total_n = 0
|
|
|
|
|
with torch.no_grad():
|
|
|
|
|
for params_x, schedule_x, time_x, y, _sample_weight in loader:
|
|
|
|
|
params_x = params_x.to(cfg.device)
|
|
|
|
|
schedule_x = schedule_x.to(cfg.device)
|
|
|
|
|
time_x = time_x.to(cfg.device)
|
|
|
|
|
y = y.to(cfg.device)
|
|
|
|
|
pred = model(params_x, time_x, schedule_x if cfg.use_schedule else None)
|
|
|
|
|
loss = _loss(pred, y, cfg)
|
|
|
|
|
bs = int(y.shape[0])
|
|
|
|
|
total += float(loss.detach().cpu()) * bs
|
|
|
|
|
total_n += bs
|
|
|
|
|
grad_context = torch.enable_grad() if is_train else torch.no_grad()
|
|
|
|
|
|
|
|
|
|
with grad_context:
|
|
|
|
|
for batch in loader:
|
|
|
|
|
if is_train:
|
|
|
|
|
optimizer.zero_grad()
|
|
|
|
|
|
|
|
|
|
loss, batch_size = _forward_batch(model, batch, cfg, use_weight=is_train)
|
|
|
|
|
|
|
|
|
|
if is_train:
|
|
|
|
|
loss.backward()
|
|
|
|
|
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
|
|
|
|
|
optimizer.step()
|
|
|
|
|
|
|
|
|
|
total += float(loss.detach().cpu()) * batch_size
|
|
|
|
|
total_n += batch_size
|
|
|
|
|
|
|
|
|
|
return total / max(total_n, 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _evaluate(
|
|
|
|
|
model: TimeConditionedSurrogate,
|
|
|
|
|
loader: DataLoader,
|
|
|
|
|
cfg: TimeConditionedTrainConfig,
|
|
|
|
|
) -> float:
|
|
|
|
|
"""在验证集或测试集上评估时间条件模型的平均损失。"""
|
|
|
|
|
return _run_loader(model, loader, cfg, optimizer=None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _raw_params_from_processed_split(data: dict, split: str) -> dict[str, np.ndarray]:
|
|
|
|
|
"""从预处理数据中读取某个划分的原始参数,用于构造样本权重。"""
|
|
|
|
|
key = f"X_params_{split}"
|
|
|
|
|
features = data["scaler_params"].inverse_transform(data[key])
|
|
|
|
|
raw = inverse_transform_param_features(features, data.get("meta", {}).get("param_feature_transform"))
|
|
|
|
|
names = list(data.get("meta", {}).get("param_names") or ["k", "skin", "wellboreC", "phi", "h", "Cf"])
|
|
|
|
|
return {name: raw[:, idx].astype(np.float64) for idx, name in enumerate(names[: raw.shape[1]])}
|
|
|
|
|
transform = data.get("meta", {}).get("param_feature_transform")
|
|
|
|
|
raw = inverse_transform_param_features(features, transform)
|
|
|
|
|
default_names = ["k", "skin", "wellboreC", "phi", "h", "Cf"]
|
|
|
|
|
names = list(data.get("meta", {}).get("param_names") or default_names)
|
|
|
|
|
return {
|
|
|
|
|
name: raw[:, idx].astype(np.float64)
|
|
|
|
|
for idx, name in enumerate(names[: raw.shape[1]])
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_sample_weight(data: dict, cfg: TimeConditionedTrainConfig, split: str = "train") -> np.ndarray:
|
|
|
|
|
def _build_sample_weight(
|
|
|
|
|
data: dict,
|
|
|
|
|
cfg: TimeConditionedTrainConfig,
|
|
|
|
|
split: str = "train",
|
|
|
|
|
) -> np.ndarray:
|
|
|
|
|
"""根据原始物理参数生成样本权重,使关键参数区域得到更多关注。"""
|
|
|
|
|
mode = str(cfg.sample_weight_mode or "none").lower()
|
|
|
|
|
n = int(data[f"X_params_{split}"].shape[0])
|
|
|
|
|
mode = str(cfg.risk_weight.mode or "none").lower()
|
|
|
|
|
n_samples = int(data[f"X_params_{split}"].shape[0])
|
|
|
|
|
if mode in {"none", "off", "false"}:
|
|
|
|
|
return np.ones((n,), dtype=np.float32)
|
|
|
|
|
return np.ones((n_samples,), dtype=np.float32)
|
|
|
|
|
if mode != "risk_region":
|
|
|
|
|
raise ValueError(f"Unknown sample_weight_mode={cfg.sample_weight_mode!r}")
|
|
|
|
|
raise ValueError(f"Unknown sample_weight_mode={cfg.risk_weight.mode!r}")
|
|
|
|
|
|
|
|
|
|
params = _raw_params_from_processed_split(data, split)
|
|
|
|
|
weight = np.ones((n,), dtype=np.float32)
|
|
|
|
|
weight = np.ones((n_samples,), dtype=np.float32)
|
|
|
|
|
risk = (params["skin"] < -5.0) & (params["wellboreC"] > 0.1)
|
|
|
|
|
skin_extreme = params["skin"] < -8.0
|
|
|
|
|
weight[risk] = np.maximum(weight[risk], float(cfg.risk_weight))
|
|
|
|
|
weight[skin_extreme] = np.maximum(weight[skin_extreme], float(cfg.skin_lt_minus8_weight))
|
|
|
|
|
weight[risk] = np.maximum(weight[risk], float(cfg.risk_weight.risk_weight))
|
|
|
|
|
weight[skin_extreme] = np.maximum(weight[skin_extreme], float(cfg.risk_weight.skin_lt_minus8_weight))
|
|
|
|
|
return np.clip(
|
|
|
|
|
weight,
|
|
|
|
|
float(cfg.risk_weight.weight_min),
|
|
|
|
|
float(cfg.risk_weight.weight_max),
|
|
|
|
|
).astype(np.float32)
|
|
|
|
|
|
|
|
|
|
weight = np.clip(weight, float(cfg.sample_weight_min), float(cfg.sample_weight_max))
|
|
|
|
|
return weight.astype(np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _summarize_sample_weight(sample_weight: np.ndarray) -> dict:
|
|
|
|
|
def _summarize_sample_weight(sample_weight: np.ndarray) -> dict[str, Any]:
|
|
|
|
|
"""统计样本权重的最小值、最大值和分位数,便于检查加权强度。"""
|
|
|
|
|
w = np.asarray(sample_weight, dtype=np.float32).reshape(-1)
|
|
|
|
|
weight = np.asarray(sample_weight, dtype=np.float32).reshape(-1)
|
|
|
|
|
return {
|
|
|
|
|
"min": float(np.min(w)),
|
|
|
|
|
"mean": float(np.mean(w)),
|
|
|
|
|
"median": float(np.median(w)),
|
|
|
|
|
"max": float(np.max(w)),
|
|
|
|
|
"n_weight_gt_1": int(np.sum(w > 1.0)),
|
|
|
|
|
"n_weight_lt_1": int(np.sum(w < 1.0)),
|
|
|
|
|
"min": float(np.min(weight)),
|
|
|
|
|
"mean": float(np.mean(weight)),
|
|
|
|
|
"median": float(np.median(weight)),
|
|
|
|
|
"max": float(np.max(weight)),
|
|
|
|
|
"n_weight_gt_1": int(np.sum(weight > 1.0)),
|
|
|
|
|
"n_weight_lt_1": int(np.sum(weight < 1.0)),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def train_time_conditioned(cfg: TimeConditionedTrainConfig) -> None:
|
|
|
|
|
"""训练时间条件代理模型并保存训练产物。
|
|
|
|
|
|
|
|
|
|
输入数据必须由新版预处理流程生成,包含 X_time_train/val/test。训练时只有训练集
|
|
|
|
|
打乱顺序,验证和测试保持固定顺序以便复现指标。最佳模型按验证损失保存,最终
|
|
|
|
|
写出 history.json 和 metrics.json,用于查看训练趋势和测试集性能。
|
|
|
|
|
"""
|
|
|
|
|
cfg.output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
set_global_seed(int(cfg.seed))
|
|
|
|
|
|
|
|
|
|
data = joblib.load(cfg.processed_path)
|
|
|
|
|
def _load_processed_data(path: Path) -> dict:
|
|
|
|
|
"""读取预处理数据并检查时间条件训练所需字段。"""
|
|
|
|
|
data = joblib.load(path)
|
|
|
|
|
required = ["X_time_train", "X_time_val", "X_time_test"]
|
|
|
|
|
missing = [key for key in required if key not in data]
|
|
|
|
|
if missing:
|
|
|
|
|
# 时间条件训练需要每个曲线点的时间特征;缺失时说明预处理版本不匹配。
|
|
|
|
|
raise KeyError(f"processed dataset is missing time-conditioned fields: {missing}")
|
|
|
|
|
|
|
|
|
|
curve_layout = infer_curve_layout(data)
|
|
|
|
|
train_weight = _build_sample_weight(data, cfg, split="train")
|
|
|
|
|
train_weight_summary = _summarize_sample_weight(train_weight)
|
|
|
|
|
# PointCurveDataset 会把 [N, T] 曲线展开成 N*T 个点级样本。
|
|
|
|
|
train_ds = PointCurveDataset(
|
|
|
|
|
data["X_params_train"],
|
|
|
|
|
data["X_schedule_train"],
|
|
|
|
|
data["X_time_train"],
|
|
|
|
|
data["Y_curve_train"],
|
|
|
|
|
curve_layout,
|
|
|
|
|
sample_weight=train_weight,
|
|
|
|
|
return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _make_point_dataset(
|
|
|
|
|
data: dict,
|
|
|
|
|
split: str,
|
|
|
|
|
curve_layout: dict,
|
|
|
|
|
sample_weight: np.ndarray | None = None,
|
|
|
|
|
) -> PointCurveDataset:
|
|
|
|
|
"""构造某个数据划分对应的逐点数据集。"""
|
|
|
|
|
return PointCurveDataset(
|
|
|
|
|
PointCurveArrays(
|
|
|
|
|
params_x=data[f"X_params_{split}"],
|
|
|
|
|
schedule_x=data[f"X_schedule_{split}"],
|
|
|
|
|
time_x=data[f"X_time_{split}"],
|
|
|
|
|
curve_y=data[f"Y_curve_{split}"],
|
|
|
|
|
layout=curve_layout,
|
|
|
|
|
sample_weight=sample_weight,
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
val_ds = PointCurveDataset(data["X_params_val"], data["X_schedule_val"], data["X_time_val"], data["Y_curve_val"], curve_layout)
|
|
|
|
|
test_ds = PointCurveDataset(data["X_params_test"], data["X_schedule_test"], data["X_time_test"], data["Y_curve_test"], curve_layout)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_dataloaders(
|
|
|
|
|
data: dict,
|
|
|
|
|
curve_layout: dict,
|
|
|
|
|
train_weight: np.ndarray,
|
|
|
|
|
cfg: TimeConditionedTrainConfig,
|
|
|
|
|
) -> DataBundle:
|
|
|
|
|
"""构建训练、验证和测试 DataLoader。"""
|
|
|
|
|
train_ds = _make_point_dataset(data, "train", curve_layout, train_weight)
|
|
|
|
|
val_ds = _make_point_dataset(data, "val", curve_layout)
|
|
|
|
|
test_ds = _make_point_dataset(data, "test", curve_layout)
|
|
|
|
|
|
|
|
|
|
generator = torch.Generator()
|
|
|
|
|
generator.manual_seed(int(cfg.seed))
|
|
|
|
|
# 只打乱训练集;验证/测试保持固定顺序方便复现指标。
|
|
|
|
|
train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, generator=generator)
|
|
|
|
|
val_loader = DataLoader(val_ds, batch_size=cfg.batch_size, shuffle=False)
|
|
|
|
|
test_loader = DataLoader(test_ds, batch_size=cfg.batch_size, shuffle=False)
|
|
|
|
|
generator.manual_seed(int(cfg.runtime.seed))
|
|
|
|
|
|
|
|
|
|
train_loader = DataLoader(
|
|
|
|
|
train_ds,
|
|
|
|
|
batch_size=cfg.optim.batch_size,
|
|
|
|
|
shuffle=True,
|
|
|
|
|
generator=generator,
|
|
|
|
|
)
|
|
|
|
|
val_loader = DataLoader(val_ds, batch_size=cfg.optim.batch_size, shuffle=False)
|
|
|
|
|
test_loader = DataLoader(test_ds, batch_size=cfg.optim.batch_size, shuffle=False)
|
|
|
|
|
|
|
|
|
|
model = TimeConditionedSurrogate(
|
|
|
|
|
return DataBundle(
|
|
|
|
|
train_loader=train_loader,
|
|
|
|
|
val_loader=val_loader,
|
|
|
|
|
test_loader=test_loader,
|
|
|
|
|
param_dim=int(data["X_params_train"].shape[1]),
|
|
|
|
|
schedule_dim=int(data["X_schedule_train"].shape[1]),
|
|
|
|
|
time_dim=int(data["X_time_train"].shape[-1]),
|
|
|
|
|
hidden_dim=int(cfg.hidden_dim),
|
|
|
|
|
n_blocks=int(cfg.n_blocks),
|
|
|
|
|
dropout=float(cfg.dropout),
|
|
|
|
|
use_schedule=bool(cfg.use_schedule),
|
|
|
|
|
).to(cfg.device)
|
|
|
|
|
|
|
|
|
|
# AdamW 的 weight_decay 与梯度更新解耦,适合这个全连接回归模型。
|
|
|
|
|
optimizer = torch.optim.AdamW(model.parameters(), lr=float(cfg.lr), weight_decay=float(cfg.weight_decay))
|
|
|
|
|
best_val = float("inf")
|
|
|
|
|
best_path = cfg.output_dir / "time_conditioned_surrogate_best.pt"
|
|
|
|
|
history: list[dict] = []
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _prepare_training_artifacts(cfg: TimeConditionedTrainConfig) -> TrainArtifacts:
|
|
|
|
|
"""加载数据、推断布局、构造样本权重和 DataLoader。"""
|
|
|
|
|
data = _load_processed_data(cfg.processed_path)
|
|
|
|
|
curve_layout = infer_curve_layout(data)
|
|
|
|
|
train_weight = _build_sample_weight(data, cfg, split="train")
|
|
|
|
|
train_weight_summary = _summarize_sample_weight(train_weight)
|
|
|
|
|
bundle = _build_dataloaders(data, curve_layout, train_weight, cfg)
|
|
|
|
|
return TrainArtifacts(
|
|
|
|
|
data=data,
|
|
|
|
|
curve_layout=curve_layout,
|
|
|
|
|
train_weight_summary=train_weight_summary,
|
|
|
|
|
bundle=bundle,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_model(bundle: DataBundle, cfg: TimeConditionedTrainConfig) -> TimeConditionedSurrogate:
|
|
|
|
|
"""兼容新版配置式模型和旧版关键字参数式模型。"""
|
|
|
|
|
model_cfg = TimeConditionedSurrogateConfig(
|
|
|
|
|
param_dim=bundle.param_dim,
|
|
|
|
|
schedule_dim=bundle.schedule_dim,
|
|
|
|
|
time_dim=bundle.time_dim,
|
|
|
|
|
hidden_dim=int(cfg.model.hidden_dim),
|
|
|
|
|
n_blocks=int(cfg.model.n_blocks),
|
|
|
|
|
dropout=float(cfg.model.dropout),
|
|
|
|
|
use_schedule=bool(cfg.model.use_schedule),
|
|
|
|
|
)
|
|
|
|
|
return TimeConditionedSurrogate(model_cfg).to(cfg.runtime.device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_optimizer(
|
|
|
|
|
model: TimeConditionedSurrogate,
|
|
|
|
|
cfg: TimeConditionedTrainConfig,
|
|
|
|
|
) -> torch.optim.Optimizer:
|
|
|
|
|
"""构建 AdamW 优化器。"""
|
|
|
|
|
return torch.optim.AdamW(
|
|
|
|
|
model.parameters(),
|
|
|
|
|
lr=float(cfg.optim.lr),
|
|
|
|
|
weight_decay=float(cfg.optim.weight_decay),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _print_training_config(
|
|
|
|
|
cfg: TimeConditionedTrainConfig,
|
|
|
|
|
artifacts: TrainArtifacts,
|
|
|
|
|
) -> None:
|
|
|
|
|
"""打印时间条件训练配置摘要。"""
|
|
|
|
|
meta = artifacts.data.get("meta", {})
|
|
|
|
|
print("Time-conditioned training config:")
|
|
|
|
|
print(f" processed={cfg.processed_path}")
|
|
|
|
|
print(f" output_dir={cfg.output_dir}")
|
|
|
|
|
print(f" device={cfg.device}, batch_size={cfg.batch_size}, epochs={cfg.epochs}")
|
|
|
|
|
print(
|
|
|
|
|
f" dims: param={data['X_params_train'].shape[1]}, "
|
|
|
|
|
f"schedule={data['X_schedule_train'].shape[1]}, time={data['X_time_train'].shape[-1]}"
|
|
|
|
|
f" device={cfg.runtime.device}, batch_size={cfg.optim.batch_size}, "
|
|
|
|
|
f"epochs={cfg.optim.epochs}"
|
|
|
|
|
)
|
|
|
|
|
print(
|
|
|
|
|
f" dims: param={artifacts.bundle.param_dim}, "
|
|
|
|
|
f"schedule={artifacts.bundle.schedule_dim}, time={artifacts.bundle.time_dim}"
|
|
|
|
|
)
|
|
|
|
|
print(f" curve_time_source={data.get('meta', {}).get('curve_time_source', 'unknown')}")
|
|
|
|
|
print(f" sample_weight_mode={cfg.sample_weight_mode}, sample_weight={train_weight_summary}")
|
|
|
|
|
|
|
|
|
|
for epoch in range(1, int(cfg.epochs) + 1):
|
|
|
|
|
model.train()
|
|
|
|
|
total = 0.0
|
|
|
|
|
total_n = 0
|
|
|
|
|
for params_x, schedule_x, time_x, y, sample_weight in train_loader:
|
|
|
|
|
params_x = params_x.to(cfg.device)
|
|
|
|
|
schedule_x = schedule_x.to(cfg.device)
|
|
|
|
|
time_x = time_x.to(cfg.device)
|
|
|
|
|
y = y.to(cfg.device)
|
|
|
|
|
sample_weight = sample_weight.to(cfg.device)
|
|
|
|
|
|
|
|
|
|
optimizer.zero_grad()
|
|
|
|
|
# 每个点的输入由“样本级参数/制度 + 点级时间特征”组成。
|
|
|
|
|
pred = model(params_x, time_x, schedule_x if cfg.use_schedule else None)
|
|
|
|
|
loss = _loss(pred, y, cfg, sample_weight=sample_weight)
|
|
|
|
|
loss.backward()
|
|
|
|
|
# 点级样本数量大,偶发高误差 batch 可能产生尖峰梯度,训练时做裁剪。
|
|
|
|
|
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
|
|
|
|
|
optimizer.step()
|
|
|
|
|
|
|
|
|
|
bs = int(y.shape[0])
|
|
|
|
|
total += float(loss.detach().cpu()) * bs
|
|
|
|
|
total_n += bs
|
|
|
|
|
|
|
|
|
|
train_loss = total / max(total_n, 1)
|
|
|
|
|
val_loss = _evaluate(model, val_loader, cfg)
|
|
|
|
|
print(f" curve_time_source={meta.get('curve_time_source', 'unknown')}")
|
|
|
|
|
print(
|
|
|
|
|
f" sample_weight_mode={cfg.risk_weight.mode}, "
|
|
|
|
|
f"sample_weight={artifacts.train_weight_summary}"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _checkpoint_payload(
|
|
|
|
|
model: TimeConditionedSurrogate,
|
|
|
|
|
cfg: TimeConditionedTrainConfig,
|
|
|
|
|
artifacts: TrainArtifacts,
|
|
|
|
|
) -> dict[str, Any]:
|
|
|
|
|
"""构造 checkpoint 保存内容。"""
|
|
|
|
|
return {
|
|
|
|
|
"model_state_dict": model.state_dict(),
|
|
|
|
|
"param_dim": artifacts.bundle.param_dim,
|
|
|
|
|
"schedule_dim": artifacts.bundle.schedule_dim,
|
|
|
|
|
"time_dim": artifacts.bundle.time_dim,
|
|
|
|
|
"hidden_dim": int(cfg.model.hidden_dim),
|
|
|
|
|
"n_blocks": int(cfg.model.n_blocks),
|
|
|
|
|
"dropout": float(cfg.model.dropout),
|
|
|
|
|
"use_schedule": bool(cfg.model.use_schedule),
|
|
|
|
|
"curve_layout": artifacts.curve_layout,
|
|
|
|
|
"processed_path": str(cfg.processed_path),
|
|
|
|
|
"seed": int(cfg.runtime.seed),
|
|
|
|
|
"sample_weight_mode": str(cfg.risk_weight.mode),
|
|
|
|
|
"sample_weight_summary": artifacts.train_weight_summary,
|
|
|
|
|
"model_config": asdict(cfg.model),
|
|
|
|
|
"loss_config": asdict(cfg.loss),
|
|
|
|
|
"risk_weight_config": asdict(cfg.risk_weight),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _save_json(path: Path, payload: dict | list) -> None:
|
|
|
|
|
"""写出 JSON 文件。"""
|
|
|
|
|
path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _train_epochs(
|
|
|
|
|
model: TimeConditionedSurrogate,
|
|
|
|
|
cfg: TimeConditionedTrainConfig,
|
|
|
|
|
artifacts: TrainArtifacts,
|
|
|
|
|
) -> tuple[float, Path, list[dict]]:
|
|
|
|
|
"""执行训练循环并保存最佳模型。"""
|
|
|
|
|
optimizer = _build_optimizer(model, cfg)
|
|
|
|
|
best_val = float("inf")
|
|
|
|
|
best_path = cfg.output_dir / "time_conditioned_surrogate_best.pt"
|
|
|
|
|
history: list[dict] = []
|
|
|
|
|
|
|
|
|
|
for epoch in range(1, int(cfg.optim.epochs) + 1):
|
|
|
|
|
train_loss = _run_loader(model, artifacts.bundle.train_loader, cfg, optimizer=optimizer)
|
|
|
|
|
val_loss = _evaluate(model, artifacts.bundle.val_loader, cfg)
|
|
|
|
|
history.append({"epoch": epoch, "train_loss": train_loss, "val_loss": val_loss})
|
|
|
|
|
print(f"[Epoch {epoch:03d}] train={train_loss:.6f} val={val_loss:.6f}")
|
|
|
|
|
|
|
|
|
|
if val_loss < best_val:
|
|
|
|
|
best_val = val_loss
|
|
|
|
|
# checkpoint 保存曲线布局和输入维度,评估脚本可据此重建同构模型。
|
|
|
|
|
torch.save(
|
|
|
|
|
{
|
|
|
|
|
"model_state_dict": model.state_dict(),
|
|
|
|
|
"param_dim": int(data["X_params_train"].shape[1]),
|
|
|
|
|
"schedule_dim": int(data["X_schedule_train"].shape[1]),
|
|
|
|
|
"time_dim": int(data["X_time_train"].shape[-1]),
|
|
|
|
|
"hidden_dim": int(cfg.hidden_dim),
|
|
|
|
|
"n_blocks": int(cfg.n_blocks),
|
|
|
|
|
"dropout": float(cfg.dropout),
|
|
|
|
|
"use_schedule": bool(cfg.use_schedule),
|
|
|
|
|
"curve_layout": curve_layout,
|
|
|
|
|
"processed_path": str(cfg.processed_path),
|
|
|
|
|
"seed": int(cfg.seed),
|
|
|
|
|
"sample_weight_mode": str(cfg.sample_weight_mode),
|
|
|
|
|
"sample_weight_summary": train_weight_summary,
|
|
|
|
|
},
|
|
|
|
|
best_path,
|
|
|
|
|
)
|
|
|
|
|
torch.save(_checkpoint_payload(model, cfg, artifacts), best_path)
|
|
|
|
|
print(f" -> best model saved to: {best_path}")
|
|
|
|
|
|
|
|
|
|
checkpoint = torch.load(best_path, map_location=cfg.device)
|
|
|
|
|
model.load_state_dict(checkpoint["model_state_dict"])
|
|
|
|
|
test_loss = _evaluate(model, test_loader, cfg)
|
|
|
|
|
|
|
|
|
|
# history 记录逐 epoch 走势;metrics 记录最终选择的最佳验证和测试损失。
|
|
|
|
|
(cfg.output_dir / "history.json").write_text(json.dumps(history, indent=2, ensure_ascii=False), encoding="utf-8")
|
|
|
|
|
(cfg.output_dir / "metrics.json").write_text(
|
|
|
|
|
json.dumps(
|
|
|
|
|
{
|
|
|
|
|
"best_val_loss": best_val,
|
|
|
|
|
"test_loss": test_loss,
|
|
|
|
|
"sample_weight_mode": str(cfg.sample_weight_mode),
|
|
|
|
|
"sample_weight_summary": train_weight_summary,
|
|
|
|
|
},
|
|
|
|
|
indent=2,
|
|
|
|
|
ensure_ascii=False,
|
|
|
|
|
),
|
|
|
|
|
encoding="utf-8",
|
|
|
|
|
return best_val, best_path, history
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _write_final_outputs(
|
|
|
|
|
cfg: TimeConditionedTrainConfig,
|
|
|
|
|
best_val: float,
|
|
|
|
|
test_loss: float,
|
|
|
|
|
history: list[dict],
|
|
|
|
|
artifacts: TrainArtifacts,
|
|
|
|
|
) -> None:
|
|
|
|
|
"""保存 history.json 和 metrics.json。"""
|
|
|
|
|
_save_json(cfg.output_dir / "history.json", history)
|
|
|
|
|
_save_json(
|
|
|
|
|
cfg.output_dir / "metrics.json",
|
|
|
|
|
{
|
|
|
|
|
"best_val_loss": best_val,
|
|
|
|
|
"test_loss": test_loss,
|
|
|
|
|
"sample_weight_mode": str(cfg.risk_weight.mode),
|
|
|
|
|
"sample_weight_summary": artifacts.train_weight_summary,
|
|
|
|
|
"model_config": asdict(cfg.model),
|
|
|
|
|
"loss_config": asdict(cfg.loss),
|
|
|
|
|
"risk_weight_config": asdict(cfg.risk_weight),
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def train_time_conditioned(cfg: TimeConditionedTrainConfig) -> None:
|
|
|
|
|
"""训练时间条件代理模型并保存训练产物。"""
|
|
|
|
|
cfg.output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
set_global_seed(int(cfg.runtime.seed))
|
|
|
|
|
|
|
|
|
|
artifacts = _prepare_training_artifacts(cfg)
|
|
|
|
|
model = _build_model(artifacts.bundle, cfg)
|
|
|
|
|
|
|
|
|
|
_print_training_config(cfg, artifacts)
|
|
|
|
|
best_val, best_path, history = _train_epochs(model, cfg, artifacts)
|
|
|
|
|
|
|
|
|
|
checkpoint = torch.load(best_path, map_location=cfg.runtime.device)
|
|
|
|
|
model.load_state_dict(checkpoint["model_state_dict"])
|
|
|
|
|
test_loss = _evaluate(model, artifacts.bundle.test_loader, cfg)
|
|
|
|
|
|
|
|
|
|
_write_final_outputs(cfg, best_val, test_loss, history, artifacts)
|
|
|
|
|
print(f"[Final] test={test_loss:.6f}")
|
|
|
|
|
|