# -*- coding: utf-8 -*-
"""Physical-parameter feature transforms.

The physical parameters used in numerical well testing differ widely in units
and order of magnitude: permeability, the wellbore storage coefficient and the
formation thickness may each span several orders of magnitude, while skin can
be negative. Feeding the raw parameters straight into a neural network makes
standardization and optimization harder.

This module maps raw physical parameters into a feature space that is easier
for a model to learn, and keeps the transform metadata needed to invert the
mapping. After training, ``inverse_transform_param_features`` recovers the
parameters on their physical scale.
"""

from __future__ import annotations

from typing import Any

import numpy as np

# Some physical parameters span several orders of magnitude, so they are
# feature-transformed before being handed to the model.
DEFAULT_PARAM_NAMES = ["k", "skin", "wellboreC", "phi", "h", "Cf"]
DEFAULT_LOG_PARAM_NAMES = {"k", "wellboreC", "h"}
DEFAULT_ASINH_PARAM_NAMES = {"skin"}
DEFAULT_COMPOSITE_FEATURES = [
    "log10_kh",
    "log10_phi_h",
    "log10_k_over_phi",
    "log10_wellboreC_over_h",
    "log10_wellboreC_over_phi_h",
]

# Lower clamp applied before every log10 (and to denominators) so that
# non-positive values produce a finite result instead of -inf / NaN.
_DEFAULT_LOG_EPS = 1.0e-30


def _decode_names(raw_names: Any) -> list[str] | None:
    """Decode a list of parameter names from HDF5 metadata into ``str`` items.

    Args:
        raw_names: Iterable of names, or ``None``. ``bytes`` / ``np.bytes_``
            items are decoded as UTF-8; anything else goes through ``str()``.

    Returns:
        The decoded names, or ``None`` when ``raw_names`` is ``None``.
    """
    if raw_names is None:
        return None
    return [
        item.decode("utf-8") if isinstance(item, (bytes, np.bytes_)) else str(item)
        for item in raw_names
    ]


def build_param_feature_transform(
    param_names: list[str] | None = None,
    log_param_names: set[str] | None = None,
    asinh_param_names: set[str] | None = None,
    enabled: bool = True,
    include_composite_features: bool = True,
) -> dict[str, Any]:
    """Build the forward/inverse transform configuration from parameter names.

    For example, permeability is mapped through log10 and skin through asinh.

    Args:
        param_names: Raw parameter names in column order. ``None`` or an empty
            list falls back to ``DEFAULT_PARAM_NAMES``.
        log_param_names: Names transformed with log10. ``None`` falls back to
            ``DEFAULT_LOG_PARAM_NAMES``.
        asinh_param_names: Names transformed with asinh. ``None`` falls back to
            ``DEFAULT_ASINH_PARAM_NAMES``.
        enabled: When false, every parameter is marked ``"identity"`` and no
            composite features are listed.
        include_composite_features: When true (and ``enabled``), the names in
            ``DEFAULT_COMPOSITE_FEATURES`` are appended to the feature list.

    Returns:
        A dict with the keys ``enabled``, ``param_names``, ``feature_names``,
        ``transforms``, ``log_param_names``, ``asinh_param_names``,
        ``composite_features`` and ``log_eps``.
    """
    names = list(param_names or DEFAULT_PARAM_NAMES)
    log_names = set(DEFAULT_LOG_PARAM_NAMES if log_param_names is None else log_param_names)
    asinh_names = set(
        DEFAULT_ASINH_PARAM_NAMES if asinh_param_names is None else asinh_param_names
    )

    feature_names: list[str] = []
    transforms: dict[str, str] = {}
    for name in names:
        # The per-parameter feature name records the transform that was used,
        # so each output column can be traced back to its physical meaning.
        # A name present in both sets is treated as log10 (checked first).
        if enabled and name in log_names:
            transforms[name] = "log10"
            feature_names.append(f"log10_{name}")
        elif enabled and name in asinh_names:
            transforms[name] = "asinh"
            feature_names.append(f"asinh_{name}")
        else:
            transforms[name] = "identity"
            feature_names.append(name)

    # Composite features encode common well-testing groups such as kh, k/phi
    # and wellbore-storage related ratios.
    if enabled and include_composite_features:
        composite_features = list(DEFAULT_COMPOSITE_FEATURES)
    else:
        composite_features = []
    feature_names.extend(composite_features)

    return {
        "enabled": bool(enabled),
        "param_names": names,
        "feature_names": feature_names,
        "transforms": transforms,
        "log_param_names": sorted(log_names),
        "asinh_param_names": sorted(asinh_names),
        "composite_features": composite_features,
        "log_eps": _DEFAULT_LOG_EPS,
    }


def param_feature_transform_from_meta(meta: dict[str, Any] | None) -> dict[str, Any] | None:
    """Recover the parameter feature transform configuration from metadata.

    Args:
        meta: Preprocessing metadata, or ``None``.

    Returns:
        A shallow copy of ``meta["param_feature_transform"]``, or ``None`` when
        ``meta`` is empty/``None`` or the key is missing or ``None``.
    """
    if not meta:
        return None
    transform = meta.get("param_feature_transform")
    if transform is None:
        return None
    return dict(transform)


def _composite_feature_columns(
    raw: np.ndarray,
    names: list[str],
    composite_features: list[str],
    log_eps: float,
) -> list[np.ndarray]:
    """Compute the requested composite feature columns from raw parameter values.

    Only the requested composites are evaluated. Names that are not recognised
    are skipped, so the result may be shorter than ``composite_features``.
    """
    name_to_col = {name: idx for idx, name in enumerate(names)}

    def value(name: str) -> np.ndarray:
        """Return the raw column for ``name``; NaN-filled when it is missing.

        NaN makes every composite that depends on a missing parameter visibly
        invalid instead of silently wrong.
        """
        idx = name_to_col.get(name)
        if idx is None:
            return np.full((raw.shape[0],), np.nan, dtype=np.float64)
        return raw[:, idx]

    def clamped_log10(values: np.ndarray) -> np.ndarray:
        """log10 with the argument clamped from below at ``log_eps``."""
        return np.log10(np.maximum(values, log_eps))

    # Composite features are always computed from the raw physical values and
    # then mapped through log10. Denominators are clamped as well to avoid
    # division by zero.
    builders = {
        "log10_kh": lambda: clamped_log10(value("k") * value("h")),
        "log10_phi_h": lambda: clamped_log10(value("phi") * value("h")),
        "log10_k_over_phi": lambda: clamped_log10(
            value("k") / np.maximum(value("phi"), log_eps)
        ),
        "log10_wellboreC_over_h": lambda: clamped_log10(
            value("wellboreC") / np.maximum(value("h"), log_eps)
        ),
        "log10_wellboreC_over_phi_h": lambda: clamped_log10(
            value("wellboreC") / np.maximum(value("phi") * value("h"), log_eps)
        ),
    }
    return [builders[name]() for name in composite_features if name in builders]


def transform_param_features(
    params: np.ndarray,
    transform: dict[str, Any] | None,
) -> np.ndarray:
    """Map raw physical parameters into the feature space used by the model.

    Args:
        params: 2D array-like of raw parameters, one column per entry of
            ``transform["param_names"]``.
        transform: Configuration as produced by
            ``build_param_feature_transform``. ``None`` or a configuration that
            is not enabled returns the input converted to ``float32``.

    Returns:
        A ``float32`` array holding the per-parameter features followed by the
        recognised composite features, in the configured order.

    Raises:
        ValueError: If ``params`` is not 2D, if its column count differs from
            the number of parameter names, or if a transform mode is unknown.
    """
    x = np.asarray(params, dtype=np.float32)
    if x.ndim != 2:
        raise ValueError(f"params must be a 2D array, got shape={x.shape}")
    if transform is None or not bool(transform.get("enabled", False)):
        return x.astype(np.float32, copy=False)

    names = list(transform.get("param_names") or DEFAULT_PARAM_NAMES)
    transforms = dict(transform.get("transforms") or {})
    if x.shape[1] != len(names):
        raise ValueError(
            f"param feature transform expects {len(names)} columns, got {x.shape[1]}"
        )

    # Work in float64; ``raw`` stays untouched so the composite features below
    # are computed from the original physical values, not the transformed ones.
    raw = x.astype(np.float64, copy=True)
    out = raw.copy()
    log_eps = float(transform.get("log_eps", _DEFAULT_LOG_EPS))

    for col, name in enumerate(names):
        mode = str(transforms.get(name, "identity")).lower()
        if mode == "log10":
            # log10 compresses parameters spanning several orders of magnitude,
            # easing the load on the scaler and the network.
            out[:, col] = np.log10(np.maximum(out[:, col], log_eps))
        elif mode == "asinh":
            # skin may be negative; asinh keeps the sign and behaves like a
            # logarithm for large magnitudes.
            out[:, col] = np.arcsinh(out[:, col])
        elif mode == "identity":
            continue
        else:
            raise ValueError(f"Unknown transform mode for {name}: {mode}")

    composite_features = list(transform.get("composite_features") or [])
    if composite_features:
        extras = _composite_feature_columns(raw, names, composite_features, log_eps)
        if extras:
            out = np.concatenate([out, np.stack(extras, axis=1)], axis=1)

    return out.astype(np.float32)


def inverse_transform_param_features(
    features: np.ndarray,
    transform: dict[str, Any] | None,
) -> np.ndarray:
    """Map model-space features back onto the raw physical parameter scale.

    Only the leading ``len(param_names)`` columns are inverted; any trailing
    (composite) columns are dropped from the result.

    Args:
        features: 2D array-like of features with at least one column per
            parameter name.
        transform: Configuration as produced by
            ``build_param_feature_transform``. ``None`` or a configuration that
            is not enabled returns the input converted to ``float32``.

    Returns:
        A ``float32`` array with one column per parameter name.

    Raises:
        ValueError: If ``features`` is not 2D, if it has fewer columns than
            parameter names, or if a transform mode is unknown.
    """
    x = np.asarray(features, dtype=np.float32)
    if x.ndim != 2:
        raise ValueError(f"features must be a 2D array, got shape={x.shape}")
    if transform is None or not bool(transform.get("enabled", False)):
        return x.astype(np.float32, copy=False)

    names = list(transform.get("param_names") or DEFAULT_PARAM_NAMES)
    transforms = dict(transform.get("transforms") or {})
    if x.shape[1] < len(names):
        raise ValueError(
            f"param inverse transform expects at least {len(names)} columns, got {x.shape[1]}"
        )

    out = x[:, : len(names)].astype(np.float64, copy=True)
    for col, name in enumerate(names):
        mode = str(transforms.get(name, "identity")).lower()
        if mode == "log10":
            out[:, col] = 10.0 ** out[:, col]
        elif mode == "asinh":
            out[:, col] = np.sinh(out[:, col])
        elif mode == "identity":
            continue
        else:
            raise ValueError(f"Unknown transform mode for {name}: {mode}")

    return out.astype(np.float32)