Source code for watertap.flowsheets.METAB.surrogate_model_generator

#################################################################################
# WaterTAP Copyright (c) 2020-2025, The Regents of the University of California,
# through Lawrence Berkeley National Laboratory, Oak Ridge National Laboratory,
# National Renewable Energy Laboratory, and National Energy Technology
# Laboratory (subject to receipt of any required approvals from the U.S. Dept.
# of Energy). All rights reserved.
#
# Please see the files COPYRIGHT.md and LICENSE.md for full copyright and license
# information, respectively. These files are also available online at the URL
# "https://github.com/watertap-org/watertap/"
#################################################################################
import os
import pandas as pd

# Import IDAES libraries
from idaes.core.surrogate.plotting.sm_plotter import (
    surrogate_parity,
)

from idaes.core.surrogate.pysmo_surrogate import (
    PysmoPolyTrainer,
    PysmoRBFTrainer,
    PysmoKrigingTrainer,
    PysmoSurrogate,
)
from idaes.core.surrogate.alamopy import AlamoTrainer, AlamoSurrogate

__author__ = "Marcus Holly"

local_path = os.path.dirname(os.path.abspath(__file__))


def get_data(
    input_data_file=os.path.join(local_path, "input_data.csv"),
    output_data_file=os.path.join(local_path, "output_data.csv"),
):
    """
    Read the input and output CSV files and assemble the training DataFrame.

    Both files are read with their first row as the header. The first column
    of the output file is discarded before the outputs are used.

    Args:
        input_data_file: Path to the input variables CSV file.
        output_data_file: Path to the output variables CSV file.

    Returns:
        tuple:
            - ``feed_data`` : Inputs and outputs concatenated column-wise.
            - ``input_data`` : Input variables only.
            - ``output_data`` : Output variables only (first column dropped).
    """
    inputs = pd.read_csv(input_data_file, header=0)

    raw_outputs = pd.read_csv(output_data_file, header=0)
    # Skip the leading column of the output file.
    outputs = raw_outputs.iloc[:, 1:]

    combined = pd.concat([inputs, outputs], axis=1)
    return combined, inputs, outputs
def outputs_selections(output_data):
    """
    Filter and reorder output data columns to the standard ADM1 component set.

    Column labels are normalised first (surrounding whitespace stripped and
    all spaces removed) so that headers such as ``" S_su "`` match. The
    normalisation is applied to a relabelled copy; the DataFrame passed in by
    the caller is left untouched.

    The selected outputs are (in order):
    ``S_su``, ``S_aa``, ``S_fa``, ``S_va``, ``S_bu``, ``S_pro``, ``S_ac``,
    ``S_h2``, ``S_ch4``, ``S_IC``, ``S_IN``, ``S_I``, ``X_c``, ``X_ch``,
    ``X_pr``, ``X_li``, ``X_su``, ``X_aa``, ``X_fa``, ``X_c4``, ``X_pro``,
    ``X_ac``, ``X_h2``, ``X_I``, ``VolumetricFlowrate``.

    Args:
        output_data (pd.DataFrame): Raw output DataFrame as loaded from CSV.

    Returns:
        pd.DataFrame: Output DataFrame restricted to the standard ADM1
        component columns in the order listed above.

    Raises:
        KeyError: If any of the required columns is absent after the column
            labels have been normalised.
    """
    outputs_list = [
        "S_su",
        "S_aa",
        "S_fa",
        "S_va",
        "S_bu",
        "S_pro",
        "S_ac",
        "S_h2",
        "S_ch4",
        "S_IC",
        "S_IN",
        "S_I",
        "X_c",
        "X_ch",
        "X_pr",
        "X_li",
        "X_su",
        "X_aa",
        "X_fa",
        "X_c4",
        "X_pro",
        "X_ac",
        "X_h2",
        "X_I",
        "VolumetricFlowrate",
    ]

    # Literal (non-regex) removal of spaces from the header labels.
    cleaned_columns = output_data.columns.str.strip().str.replace(
        " ", "", regex=False
    )
    print(cleaned_columns.tolist())

    # Fail early with the exact list of absent columns rather than relying on
    # the generic pandas indexing error.
    available = set(cleaned_columns)
    missing = [name for name in outputs_list if name not in available]
    if missing:
        raise KeyError(f"Missing required output columns: {missing}")

    # set_axis returns a relabelled object, so the caller's DataFrame keeps
    # its original column labels.
    relabelled = output_data.set_axis(cleaned_columns, axis=1)
    return relabelled[outputs_list]
def _train_pysmo_surrogate(
    method, feed_data, input_labels, output_labels, input_bounds
):
    """Train a PySMO surrogate ('poly', 'kri' or 'rbf') and wrap it as a PysmoSurrogate."""
    trainer_classes = {
        "poly": PysmoPolyTrainer,
        "kri": PysmoKrigingTrainer,
        "rbf": PysmoRBFTrainer,
    }
    trainer = trainer_classes[method](
        input_labels=input_labels,
        output_labels=output_labels,
        training_dataframe=feed_data,
    )

    # Method-specific PySMO options
    if method == "poly":
        trainer.config.maximum_polynomial_order = 6
        trainer.config.multinomials = True
        trainer.config.training_split = 0.8
        trainer.config.number_of_crossvalidations = 3
    elif method == "kri":
        trainer.config.numerical_gradients = True
        trainer.config.regularization = True
    else:  # "rbf"
        trainer.config.basis_function = "cubic"

    # Train surrogate (calls PySMO through IDAES Python wrapper)
    trained = trainer.train_surrogate()
    return PysmoSurrogate(trained, input_labels, output_labels, input_bounds)


def _train_alamo_surrogate(feed_data, input_labels, output_labels, input_bounds):
    """Train an ALAMO surrogate and wrap it as an AlamoSurrogate."""
    trainer = AlamoTrainer(
        input_labels=input_labels,
        output_labels=output_labels,
        training_dataframe=feed_data,
    )
    trainer.config.constant = 1
    trainer.config.linfcns = 1
    trainer.config.expfcns = 1
    trainer.config.logfcns = 1
    trainer.config.sinfcns = 1
    trainer.config.cosfcns = 1
    trainer.config.monomialpower = [2, 3, 4]
    trainer.config.multi2power = [1, 2, 3]
    trainer.config.multi3power = [1, 2, 3]

    # NOTE(review): the values returned by train_surrogate() are not
    # inspected here; consider checking the reported status before
    # continuing - confirm the return contract against the IDAES docs.
    trainer.train_surrogate()

    # The surrogate is rebuilt from the trained expressions so that it is
    # created with the input bounds derived from the input data.
    surrogate_expressions = trainer._results["Model"]
    return AlamoSurrogate(
        surrogate_expressions, input_labels, output_labels, input_bounds
    )


def gen_surrogate_model(
    tool="idaes", method="poly", feed_data=None, input_data=None, output_data=None
):
    """
    Train an IDAES surrogate model and save it to disk alongside a parity plot.

    Method-specific configuration:

    - ``"poly"`` : PySMO polynomial up to order 6, multinomials enabled,
      80/20 train/test split, 3-fold cross-validation.
    - ``"kri"`` : PySMO Kriging with numerical gradients and regularization.
    - ``"rbf"`` : PySMO RBF with cubic basis function.
    - ``"alamo"`` : ALAMO with constant, linear, exponential, log, sin, cos
      basis functions, monomial powers 2, 3 and 4, and ``multi2power`` /
      ``multi3power`` set to 1, 2 and 3.

    Args:
        tool (str, optional): Surrogate toolbox identifier. Currently not
            used by this function; kept for interface compatibility.
        method (str, optional): Surrogate modelling method; one of
            ``"poly"``, ``"kri"``, ``"rbf"`` or ``"alamo"``.
        feed_data (pd.DataFrame, optional): Combined input and output
            DataFrame used for training. When omitted it is built by
            concatenating ``input_data`` and ``output_data`` column-wise.
        input_data (pd.DataFrame): Input variables DataFrame. Used to derive
            input labels and bounds (column-wise min/max). Required.
        output_data (pd.DataFrame): Output variables DataFrame. Used to
            derive output labels. Required.

    Returns:
        None. Writes the following files to the module directory:

        - ``<method>_surrogate.json`` : serialised surrogate model.
        - ``<method>_parity.pdf`` : parity plot for all output variables.

    Raises:
        ValueError: If ``method`` is not supported, or if ``input_data`` or
            ``output_data`` is not provided.
    """
    if method not in ("poly", "kri", "rbf", "alamo"):
        raise ValueError(
            f"Unsupported method: {method}. Choose from 'poly', 'kri', 'rbf', or 'alamo'."
        )

    # Labels and bounds are always derived from these two frames, so they are
    # required even when feed_data is supplied explicitly.
    if input_data is None or output_data is None:
        raise ValueError(
            "Both input_data and output_data must be provided to derive "
            "the surrogate labels and input bounds."
        )

    if feed_data is None:
        feed_data = pd.concat([input_data, output_data], axis=1)

    input_labels = list(input_data.columns)
    output_labels = list(output_data.columns)

    # .tolist() converts the bounds to plain Python numbers.
    lower = input_data.min().tolist()
    upper = input_data.max().tolist()
    input_bounds = dict(zip(input_labels, zip(lower, upper)))

    if method == "alamo":
        surrogate = _train_alamo_surrogate(
            feed_data, input_labels, output_labels, input_bounds
        )
    else:
        surrogate = _train_pysmo_surrogate(
            method, feed_data, input_labels, output_labels, input_bounds
        )

    # Persist the model and its parity plot next to this module.
    surrogate.save_to_file(
        os.path.join(local_path, f"{method}_surrogate.json"), overwrite=True
    )
    surrogate_parity(
        surrogate,
        feed_data,
        filename=os.path.join(local_path, f"{method}_parity.pdf"),
    )
if __name__ == "__main__":
    # Load the CSV data. The combined frame returned by get_data() is not
    # used: it is rebuilt below after the outputs have been narrowed down.
    _, input_data, raw_output_data = get_data()
    output_data = outputs_selections(raw_output_data)
    feed_data = pd.concat([input_data, output_data], axis=1)

    # Supported methods: "kri", "poly", "rbf", "alamo"
    gen_surrogate_model(
        tool="idaes",
        method="poly",
        feed_data=feed_data,
        input_data=input_data,
        output_data=output_data,
    )