The SCE (Stepwise Clustered Ensemble) package provides implementations of the Stepwise Clustered Ensemble (SCE) and Stepwise Cluster Analysis (SCA) methods for multivariate data analysis. These methods are particularly useful for handling complex, high-dimensional datasets and building robust predictive models.
You can install the development version of SCE from GitHub:
# Install devtools first if it is not already available:
# install.packages("devtools")
devtools::install_github("loong2020/Stepwise-Clustered-Ensemble")
- `SCE`: Main function for building a Stepwise Clustered Ensemble model
- `SCA`: Stepwise Cluster Analysis (ensemble member of SCE)
- `Model_simulation`: Perform SCE model prediction
- `SCA_tree_predict`: Perform SCA model prediction
- `SCA_Model_evaluation`: Evaluate model performance for SCA
- `SCE_Model_evaluation`: Evaluate model performance for SCE
- `RFE_SCE`: Recursive Feature Elimination for SCE
- `Wilks_importance`: Calculate variable importance for SCE using Wilks’ lambda
- `SCA_importance`: Calculate variable importance for a single SCA tree

First, load the required packages and data:
# Load required packages
library(SCE)
library(parallel)

# Load the example datasets
data(Streamflow_training_10var)
data(Streamflow_testing_10var)

# Define predictors and predictants
Predictors <- c(
  "Prcp", "SRad", "Tmax", "Tmin", "VP",
  "smlt", "swvl1", "swvl2", "swvl3", "swvl4"
)
Predictants <- c("Flow")

# Perform SCA (seed fixed so the example is reproducible)
set.seed(123)
model <- SCA(alpha = 0.05,
             Training_data = Streamflow_training_10var,
             X = Predictors,
             Y = Predictants,
             Nmin = 5,
             resolution = 100)

# Calculate variable importance
importance <- SCA_importance(model)
print(importance)

# Make predictions
prediction <- SCA_tree_predict(Testing_data = Streamflow_testing_10var,
                               model = model)

# Evaluate performance
performance <- SCA_Model_evaluation(Testing_data = Streamflow_testing_10var,
                                    Simulations = prediction,
                                    Predictant = Predictants)
print(performance)
# Sort predictors by descending relative importance, then draw a bar chart.
# NOTE(review): the title says "SCE" although `importance` was produced by
# SCA_importance() in the SCA example -- confirm the intended title.
Importance_ranking_sorted <- importance[order(-importance$Relative_Importance), ]
barplot(
  Importance_ranking_sorted$Relative_Importance,
  names.arg = Importance_ranking_sorted$Predictor,
  las = 2, # vertical labels
  col = "skyblue",
  main = "Variable Importance (SCE)",
  ylab = "Importance",
  xlab = "Predictor"
)
# Build SCE model (seed fixed so the example is reproducible)
set.seed(123)
Ensemble <- SCE(Training_data = Streamflow_training_10var,
                X = Predictors,
                Y = Predictants,
                mfeature = round(0.5 * length(Predictors)),
                Nmin = 5,
                Ntree = 40,
                alpha = 0.05,
                resolution = 100)

# Make predictions
Simulations <- Model_simulation(Testing_data = Streamflow_testing_10var,
                                model = Ensemble)

# Evaluate model performance
Evaluation <- SCE_Model_evaluation(Testing_data = Streamflow_testing_10var,
                                   Training_data = Streamflow_training_10var,
                                   Simulations = Simulations,
                                   Predictant = Predictants,
                                   digits = 2)

# Calculate variable importance
importance <- Wilks_importance(Ensemble)
print(Evaluation)
# Sort predictors by descending relative importance, then draw a bar chart.
Importance_ranking_sorted <- importance[order(-importance$Relative_Importance), ]
barplot(
  Importance_ranking_sorted$Relative_Importance,
  names.arg = Importance_ranking_sorted$Predictor,
  las = 2, # vertical labels
  col = "skyblue",
  main = "Variable Importance (SCE)",
  ylab = "Importance",
  xlab = "Predictor"
)
# Load the example datasets
data(Air_quality_training)
data(Air_quality_testing)

# Define predictors and multiple predictants
Predictors <- c("SO2", "NO2", "CO", "O3", "TEMP", "PRES", "DEWP", "RAIN", "WSPM")
Predictants <- c("PM2.5", "PM10")

# Build and evaluate model (seed fixed so the example is reproducible)
set.seed(123)
Ensemble <- SCE(Training_data = Air_quality_training,
                X = Predictors,
                Y = Predictants,
                mfeature = round(0.5 * length(Predictors)),
                Nmin = 5,
                Ntree = 40,
                alpha = 0.05,
                resolution = 100)

Simulations <- Model_simulation(Testing_data = Air_quality_testing,
                                model = Ensemble)

Evaluation <- SCE_Model_evaluation(Testing_data = Air_quality_testing,
                                   Training_data = Air_quality_training,
                                   Simulations = Simulations,
                                   Predictant = Predictants)
print(Evaluation)

# Calculate variable importance. The evaluation was already printed above,
# so print the importance table here instead of repeating print(Evaluation).
importance <- Wilks_importance(Ensemble)
print(importance)
# Sort predictors by descending relative importance, then draw a bar chart.
Importance_ranking_sorted <- importance[order(-importance$Relative_Importance), ]
barplot(
  Importance_ranking_sorted$Relative_Importance,
  names.arg = Importance_ranking_sorted$Predictor,
  las = 2, # vertical labels
  col = "skyblue",
  main = "Variable Importance (SCE)",
  ylab = "Importance",
  xlab = "Predictor"
)
# Load the example datasets
data(Streamflow_training_22var)
data(Streamflow_testing_22var)

# Define predictors and predictants
Predictors <- c(
  "Precipitation", "Radiation", "Tmax", "Tmin", "VP",
  "Precipitation_2Mon", "Radiation_2Mon", "Tmax_2Mon", "Tmin_2Mon", "VP_2Mon",
  "PNA", "Nino3.4", "IPO", "PDO",
  "PNA_lag1", "Nino3.4_lag1", "IPO_lag1", "PDO_lag1",
  "PNA_lag2", "Nino3.4_lag2", "IPO_lag2", "PDO_lag2"
)
Predictants <- c("Flow")

# Perform RFE (seed fixed so the example is reproducible)
set.seed(123)
result <- RFE_SCE(
  Training_data = Streamflow_training_22var,
  Testing_data = Streamflow_testing_22var,
  Predictors = Predictors,
  Predictant = Predictants,
  Nmin = 5,
  Ntree = 48,
  alpha = 0.05,
  resolution = 1000,
  step = 3 # Number of predictors to remove at each iteration
)
# Plot Validation and Testing R² results
library(ggplot2)

# Extract Validation and Testing R² values from each RFE iteration
validation_r2 <- sapply(result[["performances"]], function(x) x["R2", "Validation"])
testing_r2 <- sapply(result[["performances"]], function(x) x["R2", "Testing"])
n_predictors <- result[["summary"]][["n_predictors"]]

# Create base R plot
plot(n_predictors, validation_r2,
     type = "b", # both points and lines
     col = "blue",
     pch = 16, # filled circle point type
     xlim = rev(range(n_predictors)), # reverse x-axis
     ylim = c(min(c(validation_r2, testing_r2)),
              max(c(validation_r2, testing_r2))), # explicit y-axis limits
     xlab = "Number of Predictors",
     ylab = "R²",
     main = "Validation and Testing R² vs Number of Predictors")

# Add testing data
lines(n_predictors, testing_r2, type = "b", col = "red", pch = 16)

# Add legend
legend("bottomleft",
       legend = c("Validation", "Testing"),
       col = c("blue", "red"),
       pch = 16,
       lty = 1)
Full documentation is available through the R help system:
# Core functions
?SCE
?SCA
?Model_simulation
?SCA_tree_predict

# Evaluation functions
?SCA_Model_evaluation
?SCE_Model_evaluation

# Feature selection and importance
# (one lookup per line: two `?` calls on one line parse as a single expression)
?RFE_SCE
?Wilks_importance
?SCA_importance
This package is licensed under the GPL-3 License.