This vignette demonstrates supervised learning capabilities in tidylearn. All methods shown here wrap established R packages: the algorithms are unchanged; tidylearn simply provides a consistent interface and tidy output.
Wrapped packages include stats (lm(), glm()) for linear and logistic regression, among others. Access raw model objects via model$fit for package-specific functionality.
Let’s create a binary classification problem from the iris dataset:
# Create binary classification dataset
iris_binary <- iris %>%
  filter(Species %in% c("setosa", "versicolor")) %>%
  mutate(Species = droplevels(Species))
# Split data
split <- tl_split(iris_binary, prop = 0.7, stratify = "Species", seed = 123)
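
As a quick sanity check (tl_split() returns a list with $train and $test components, as used throughout this vignette), confirm that stratification preserved the class balance:

# Verify the stratified split kept both classes in proportion
table(split$train$Species)
table(split$test$Species)

# Train logistic regression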
model_logistic <- tl_model(split$train, Species ~ ., method = "logistic")
#> Warning: glm.fit: algorithm did not converge
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

These warnings are expected here: setosa and versicolor are perfectly separable in iris, so the maximum likelihood estimates diverge and the fitted probabilities are pushed to 0 and 1 (complete separation).
print(model_logistic)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: logistic
#> Task: Classification
#> Formula: Species ~ .
#>
#> Training observations: 70
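
As noted at the top of this vignette, the raw fit is exposed via model$fit. Because the logistic method wraps glm(), the usual stats extractors should apply (a sketch; check class(model_logistic$fit) to confirm):

# Standard extractors on the wrapped glm object
summary(model_logistic$fit)
coef(model_logistic$fit)

# Train decision tree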
model_tree <- tl_model(split$train, Species ~ ., method = "tree")
print(model_tree)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: tree
#> Task: Classification
#> Formula: Species ~ .
#>
#> Training observations: 70
# Predictions
preds_tree <- predict(model_tree, new_data = split$test)
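
Class predictions come back in a .pred column (the same column used for evaluation later in this vignette), so test accuracy is a one-liner:

# Tree accuracy on the held-out test set
mean(preds_tree$.pred == split$test$Species)

# Split full iris dataset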
split_multi <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)

# Train random forest
model_forest <- tl_model(split_multi$train, Species ~ ., method = "forest")
print(model_forest)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: forest
#> Task: Classification
#> Formula: Species ~ .
#>
#> Training observations: 105
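
If the forest method wraps randomForest (an assumption; inspect the class of the stored fit to confirm), variable importance is available from the raw object:

# Guarded so this only runs if the underlying fit is a randomForest object
if (inherits(model_forest$fit, "randomForest")) {
  randomForest::importance(model_forest$fit)
}

# Split mtcars data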
split_reg <- tl_split(mtcars, prop = 0.7, seed = 123)
# Train linear model
model_lm <- tl_model(split_reg$train, mpg ~ wt + hp + disp, method = "linear")
print(model_lm)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: linear
#> Task: Regression
#> Formula: mpg ~ wt + hp + disp
#>
#> Training observations: 22

# Polynomial regression for non-linear relationships
model_poly <- tl_model(split_reg$train, mpg ~ wt, method = "polynomial", degree = 2)
print(model_poly)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: polynomial
#> Task: Regression
#> Formula: mpg ~ wt
#>
#> Training observations: 22

Regularization helps prevent overfitting by adding penalties to model complexity: ridge regression shrinks coefficients with an L2 penalty, while the lasso's L1 penalty can set some coefficients exactly to zero.
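
No regularized tidylearn call is shown in this section, so here is a sketch of the idea using glmnet directly (glmnet is named here as an illustration, not as the package tidylearn wraps):

# Minimal ridge/lasso sketch with glmnet
library(glmnet)
x <- as.matrix(split_reg$train[, c("wt", "hp", "disp")])
y <- split_reg$train$mpg
fit_ridge <- glmnet(x, y, alpha = 0)    # alpha = 0: ridge (L2 penalty)
fit_lasso <- glmnet(x, y, alpha = 1)    # alpha = 1: lasso (L1 penalty)
cv_lasso <- cv.glmnet(x, y, alpha = 1)  # cross-validate the penalty strength
coef(cv_lasso, s = "lambda.min")        # coefficients at the best lambda
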
# Compare multiple models
models <- list(
  linear = tl_model(split_reg$train, mpg ~ ., method = "linear"),
  tree = tl_model(split_reg$train, mpg ~ ., method = "tree"),
  forest = tl_model(split_reg$train, mpg ~ ., method = "forest")
)

# Calculate RMSE for each model
results <- data.frame(
  Model = character(),
  RMSE = numeric(),
  stringsAsFactors = FALSE
)
for (model_name in names(models)) {
  preds <- predict(models[[model_name]], new_data = split_reg$test)
  rmse <- sqrt(mean((preds$.pred - split_reg$test$mpg)^2))
  results <- rbind(results, data.frame(
    Model = model_name,
    RMSE = rmse
  ))
}
results <- results %>% arrange(RMSE)
print(results)
#>    Model     RMSE
#> 1 forest 2.046967
#> 2 linear 2.281450
#> 3   tree 4.095888

# Interaction terms
model_interact <- tl_model(split_reg$train, mpg ~ wt * hp, method = "linear")
# Polynomial terms using I()
model_poly_manual <- tl_model(split_reg$train, mpg ~ wt + I(wt^2), method = "linear")
# Subset of predictors
model_subset <- tl_model(split_reg$train, mpg ~ wt + hp + disp, method = "linear")
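
Because the linear method wraps lm(), the standard extractors work on the stored fit; the interaction model's coefficients include the wt:hp term:

# Inspect the fitted interaction via the wrapped lm object
coef(model_interact$fit)

# Create dataset with categorical variables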
mtcars_cat <- mtcars %>%
  mutate(
    cyl = as.factor(cyl),
    gear = as.factor(gear),
    am = as.factor(am)
  )
split_cat <- tl_split(mtcars_cat, prop = 0.7, seed = 123)
# Model with categorical predictors
model_cat <- tl_model(split_cat$train, mpg ~ ., method = "forest")
print(model_cat)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: forest
#> Task: Regression
#> Formula: mpg ~ .
#>
#> Training observations: 22
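
Tree ensembles handle factors natively, while formula-based models expand them into dummy variables; you can preview that expansion with base R:

# How factor predictors enter a linear design matrix
head(model.matrix(~ cyl + am, data = mtcars_cat))

# Create data with missing values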
mtcars_missing <- mtcars
mtcars_missing[sample(1:nrow(mtcars_missing), 5), "hp"] <- NA
mtcars_missing[sample(1:nrow(mtcars_missing), 3), "wt"] <- NA
# Preprocess to handle missing values
processed_missing <- tl_prepare_data(
mtcars_missing,
mpg ~ .,
impute_method = "mean",
scale_method = "standardize"
)
#> Imputing missing values using method: mean
#> Scaling numeric features using method: standardize
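
A quick check that imputation worked (processed_missing$data is the preprocessed data frame used below):

# No NAs should remain after mean imputation
colSums(is.na(processed_missing$data))
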
# Train model
model_imputed <- tl_model(processed_missing$data, mpg ~ ., method = "linear")

tidylearn provides a unified interface for supervised learning: one consistent function, tl_model(), for all methods.

# Complete workflow example
final_split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 42)
final_prep <- tl_prepare_data(final_split$train, Species ~ ., scale_method = "standardize")
#> Scaling numeric features using method: standardize
final_model <- tl_model(final_prep$data, Species ~ ., method = "forest")
final_preds <- predict(final_model, new_data = final_split$test)
# Evaluate
accuracy <- mean(final_preds$.pred == final_split$test$Species)
cat("Test Accuracy:", round(accuracy * 100, 1), "%\n")
#> Test Accuracy: 33.3 %
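
The low accuracy is a preprocessing mismatch rather than a modeling failure: the model was trained on standardized features, but the test set was supplied on the raw scale, so the predictions appear to collapse to a single class (33.3% is exactly the per-class base rate in a stratified iris split). New data must receive the same transformation as the training data. A sketch of doing so by hand, assuming tl_prepare_data() standardized each numeric column with its training-set mean and standard deviation:

# Re-apply the training-set standardization to the test set
# (assumption: standardize means (x - mean) / sd with training parameters)
num_cols <- names(iris)[sapply(iris, is.numeric)]
train_means <- sapply(final_split$train[num_cols], mean)
train_sds <- sapply(final_split$train[num_cols], sd)
test_scaled <- final_split$test
test_scaled[num_cols] <- scale(test_scaled[num_cols],
                               center = train_means, scale = train_sds)
final_preds_scaled <- predict(final_model, new_data = test_scaled)
mean(final_preds_scaled$.pred == final_split$test$Species)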