基于R 4.2.2版本演示
一、写在前面
有不少大佬问做机器学习分类能不能用R语言,不想学Python咯。
答曰:可!用GPT或者Kimi转一下就得了呗。
加上最近也没啥内容写了,就帮各位搬运一下吧。
二、R代码实现Catboost分类
(1)导入数据
我习惯用RStudio自带的导入功能:
(2)建立Catboost模型(默认参数)
# Load necessary libraries
library(caret)
library(pROC)
library(ggplot2)
library(catboost)
# ---- Baseline CatBoost model: split, pools, fit, predict ----
# A data frame called `data` must already be loaded; its outcome column is `X`
# and every other column is used as a feature.

# Fix the RNG seed so the partition below is reproducible
set.seed(123)

# 80 % of the rows go to training, the remaining 20 % to validation
trainIndex <- createDataPartition(data$X, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
validData <- data[-trainIndex, ]

# Logical mask selecting every column except the outcome
featureCols <- names(data) != "X"

# Wrap features + labels into CatBoost pools
trainPool <- catboost.load_pool(
  data = trainData[, featureCols],
  label = trainData$X
)
validPool <- catboost.load_pool(
  data = validData[, featureCols],
  label = validData$X
)

# Hyper-parameters passed to catboost.train()
params <- list(
  iterations = 250,
  depth = 6,
  learning_rate = 0.1,
  l2_leaf_reg = 10,
  loss_function = "Logloss",
  eval_metric = "AUC"
)

# Fit on the training pool only (no evaluation pool is supplied here)
model <- catboost.train(learn_pool = trainPool, params = params)

# Predicted probabilities for both sets
trainPredict <- catboost.predict(
  model, trainPool,
  prediction_type = "Probability"
)
validPredict <- catboost.predict(
  model, validPool,
  prediction_type = "Probability"
)

# Hard 0/1 class labels: strictly above 0.5 counts as class 1
trainPredictBinary <- ifelse(trainPredict > 0.5, 1, 0)
validPredictBinary <- ifelse(validPredict > 0.5, 1, 0)
# ---- ROC curves for the training and validation sets ----
# Build pROC objects from the predicted probabilities.
# NOTE(review): `as.numeric(X) - 1` gives 0/1 only when X is a two-level
# factor; if X is already numeric 0/1 it gives -1/0. pROC only needs two
# distinct response values, so the AUC is unaffected -- confirm X's type.
trainRoc <- roc(response = as.numeric(trainData$X) - 1, predictor = trainPredict)
validRoc <- roc(response = as.numeric(validData$X) - 1, predictor = validPredict)

# Build one ROC plot from a pROC `roc` object.
#
# Args:
#   roc_obj:     object returned by pROC::roc().
#   set_name:    "Training" / "Validation"; used in the title and AUC label.
#   curve_color: colour of the curve, the shaded area and the AUC label.
#   label_y:     y position of the AUC annotation.
# Returns: a ggplot object (not printed).
build_roc_plot <- function(roc_obj, set_name, curve_color, label_y) {
  curve <- data.frame(
    fpr = 1 - roc_obj$specificities,
    tpr = roc_obj$sensitivities
  )
  # Order the points along the curve (FPR, then TPR) and draw them with
  # geom_path(). geom_line() re-sorts by x only, so the vertical steps of a
  # ROC curve (tied FPR values) would be joined in the wrong order and show
  # up as a saw-tooth.
  curve <- curve[order(curve$fpr, curve$tpr), ]
  auc_label <- paste(set_name, "AUC =", round(auc(roc_obj), 2))

  ggplot(data = curve, aes(x = fpr, y = tpr)) +
    geom_path(color = curve_color) +
    geom_area(fill = curve_color, alpha = 0.2) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
    ggtitle(paste(set_name, "ROC Curve")) +
    xlab("False Positive Rate") +
    ylab("True Positive Rate") +
    annotate("text", x = 0.5, y = label_y, label = auc_label, hjust = 0.5, color = curve_color)
}

trainRocPlot <- build_roc_plot(trainRoc, "Training", "blue", 0.1)
validRocPlot <- build_roc_plot(validRoc, "Validation", "red", 0.2)

# Render both plots
print(trainRocPlot)
print(validRocPlot)
# Confusion matrices at a 0.5 probability cut-off: rows are the observed
# classes, columns the predicted class (FALSE / TRUE).
# The prediction is wrapped in a factor with both levels so the table always
# has two columns, even when every case is predicted as the same class;
# a plain logical vector would collapse to one column and make the [ , 2]
# look-ups used for the metrics fail with "subscript out of bounds".
confMatTrain <- table(trainData$X, factor(trainPredict >= 0.5, levels = c(FALSE, TRUE)))
confMatValid <- table(validData$X, factor(validPredict >= 0.5, levels = c(FALSE, TRUE)))
# Draw a confusion-matrix heat map for one data set.
#
# Args:
#   pred:         numeric vector of predicted probabilities; values >= 0.5
#                 are counted as the positive prediction (TRUE).
#   actual:       vector of observed class labels, same length as `pred`.
#   dataset_name: text inserted into the plot title ("Training", ...).
# Side effect: prints the plot. Returns the value of print(p) invisibly.
plot_confusion_matrix <- function(pred, actual, dataset_name) {
  # Keep both prediction levels so a one-sided model still yields a 2-column
  # table (the empty cells are shown with a count of 0).
  predicted_class <- factor(pred >= 0.5, levels = c(FALSE, TRUE))
  # Dimension order and names must agree: the first dimension is the observed
  # class, the second the predicted class. (Building the table as
  # Predicted/Actual and then renaming the columns to Actual/Predicted swaps
  # the axes relative to their labels.)
  conf_mat <- table(Actual = actual, Predicted = predicted_class)
  # as.data.frame() on a table takes the column names from the dimnames:
  # Actual, Predicted, Freq
  conf_mat_df <- as.data.frame(conf_mat)
  p <- ggplot(data = conf_mat_df, aes(x = Predicted, y = Actual, fill = Freq)) +
    geom_tile(color = "white") +
    geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    labs(
      title = paste("Confusion Matrix -", dataset_name, "Set"),
      x = "Predicted Class",
      y = "Actual Class"
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      plot.title = element_text(hjust = 0.5)
    )
  print(p)
}
# ---- Confusion-matrix plots and performance metrics ----
# Heat maps for both sets (the probabilities are thresholded inside the
# plotting function)
plot_confusion_matrix(trainPredict, trainData$X, "Training")
plot_confusion_matrix(validPredict, validData$X, "Validation")

# Unpack the 2 x 2 tables (rows = observed class, columns = predicted class):
#   a = [1, 1], b = [1, 2], c = [2, 1], d = [2, 2]
# Column 2 is the TRUE (>= 0.5) prediction; the formulas below treat row 2 as
# the positive class, i.e. a = TN, b = FP, c = FN, d = TP.
# table() stores integer counts. They are converted to double because the
# four-factor product under the MCC square root exceeds R's 32-bit integer
# range (-> NA plus a warning) once the marginal totals reach a couple of
# hundred cases each.
a_train <- as.numeric(confMatTrain[1, 1])
b_train <- as.numeric(confMatTrain[1, 2])
c_train <- as.numeric(confMatTrain[2, 1])
d_train <- as.numeric(confMatTrain[2, 2])
a_valid <- as.numeric(confMatValid[1, 1])
b_valid <- as.numeric(confMatValid[1, 2])
c_valid <- as.numeric(confMatValid[2, 1])
d_valid <- as.numeric(confMatValid[2, 2])

# Training-set metrics. Ratios whose denominator is 0 (e.g. precision when
# nothing is predicted positive) evaluate to NaN.
acc_train <- (a_train + d_train) / sum(confMatTrain)
error_rate_train <- 1 - acc_train
sen_train <- d_train / (d_train + c_train)        # sensitivity / recall
sep_train <- a_train / (a_train + b_train)        # specificity
precision_train <- d_train / (b_train + d_train)
F1_train <- (2 * precision_train * sen_train) / (precision_train + sen_train)
MCC_train <- (d_train * a_train - b_train * c_train) /
  sqrt((d_train + b_train) * (d_train + c_train) * (a_train + b_train) * (a_train + c_train))
auc_train <- roc(response = trainData$X, predictor = trainPredict)$auc

# Validation-set metrics (same formulas)
acc_valid <- (a_valid + d_valid) / sum(confMatValid)
error_rate_valid <- 1 - acc_valid
sen_valid <- d_valid / (d_valid + c_valid)
sep_valid <- a_valid / (a_valid + b_valid)
precision_valid <- d_valid / (b_valid + d_valid)
F1_valid <- (2 * precision_valid * sen_valid) / (precision_valid + sen_valid)
MCC_valid <- (d_valid * a_valid - b_valid * c_valid) /
  sqrt((d_valid + b_valid) * (d_valid + c_valid) * (a_valid + b_valid) * (a_valid + c_valid))
auc_valid <- roc(response = validData$X, predictor = validPredict)$auc

# Report
cat("Training Metrics\n")
cat("Accuracy:", acc_train, "\n")
cat("Error Rate:", error_rate_train, "\n")
cat("Sensitivity:", sen_train, "\n")
cat("Specificity:", sep_train, "\n")
cat("Precision:", precision_train, "\n")
cat("F1 Score:", F1_train, "\n")
cat("MCC:", MCC_train, "\n")
cat("AUC:", auc_train, "\n\n")
cat("Validation Metrics\n")
cat("Accuracy:", acc_valid, "\n")
cat("Error Rate:", error_rate_valid, "\n")
cat("Sensitivity:", sen_valid, "\n")
cat("Specificity:", sep_valid, "\n")
cat("Precision:", precision_valid, "\n")
cat("F1 Score:", F1_valid, "\n")
cat("MCC:", MCC_valid, "\n")
cat("AUC:", auc_valid, "\n")
在R语言中,Catboost模型得单独安装,下面是一些可以调整的关键参数:
①学习率 (learning_rate):控制每步模型更新的幅度。较小的学习率可以提高模型的训练稳定性和准确性,但可能需要更多的时间和更多的树来收敛。
②树的深度 (depth):决定了每棵树的最大深度。较深的树可以更好地捕捉数据中的复杂关系,但也可能导致过拟合。
③树的数量 (iterations):模型中树的总数。更多的树可以增加模型的复杂度和能力,但同样可能导致过拟合。
④L2 正则化系数 (l2_leaf_reg):在模型的损失函数中增加一个正则项,以减少模型复杂度和过拟合风险。
⑤边界计数 (border_count):用于数值特征分箱的边界数量,影响模型在连续特征上的决策边界。
⑥类别特征 (cat_features):CatBoost 优化了对类别特征的处理,可以指定数据中哪些列作为类别特征参与建模。
⑦子采样 (subsample):指定每棵树训练时从训练数据集中随机抽取的比例,有助于防止模型过拟合。
⑧列采样 (rsm,别名 colsample_bylevel):控制每次选择分裂时随机使用的特征比例,可以增加模型的多样性,降低过拟合风险。(colsample_bytree 是 XGBoost/LightGBM 中的参数名,CatBoost 对应的是 rsm。)
⑨最小数据在叶节点 (min_data_in_leaf):叶节点必需的最小样本数量,增加这个参数的值可以防止模型学习过于具体的模式,从而降低过拟合风险。
⑩评估指标 (eval_metric):用于训练过程中模型评估的性能指标。
结果输出(随便挑的):
从AUC来看,Catboost随便一跑,就跑出过拟合了,跟Xgboost差不多。
三、Catboost调参
随便设置了一下,效果不明显,给各位自行嗨皮:
# Load necessary libraries
library(caret)
library(pROC)
library(ggplot2)
library(catboost)
# ---- Data preparation for the tuning run ----
# A data frame called `data` with outcome column `X` must already be loaded.

# Fix the RNG seed so the partition below is reproducible
set.seed(123)

# Recode the outcome as numbers starting at 0: go through a factor so any
# label coding is mapped to its level index, then shift by one
# (a two-level outcome becomes 0/1).
data$X <- as.numeric(as.factor(data$X)) - 1

# 80 % of the rows go to training, the remaining 20 % to validation
trainIndex <- createDataPartition(data$X, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
validData <- data[-trainIndex, ]

# Logical mask selecting every column except the outcome
featureCols <- names(data) != "X"

# Wrap features + labels into CatBoost pools
trainPool <- catboost.load_pool(
  data = trainData[, featureCols],
  label = trainData$X
)
validPool <- catboost.load_pool(
  data = validData[, featureCols],
  label = validData$X
)
# ---- Hyper-parameter grid search (selected by validation AUC) ----
# Candidate values
depths <- c(2, 4, 6)
l2_leaf_regs <- c(1, 3, 5, 10, 20, 25)
iterations <- c(500, 1000)
learning_rates <- c(0.05, 0.1)
# NOTE(review): `subsample` is defined but never passed to catboost.train();
# 1.0 would mean "use every row", not 80 %. Add it to `params` (together with
# a compatible bootstrap_type) if row subsampling is actually wanted.
subsample <- 1.0

# Start below any attainable AUC so the first fit always becomes the incumbent
best_auc <- -Inf
best_params <- list()

# All combinations in one table. expand.grid() varies its first argument
# fastest, so listing the factors innermost-first keeps the evaluation order
# (and therefore tie-breaking) of the equivalent nested loops:
# depth -> l2_leaf_reg -> iterations -> learning_rate.
param_grid <- expand.grid(
  learning_rate = learning_rates,
  iterations = iterations,
  l2_leaf_reg = l2_leaf_regs,
  depth = depths
)

for (i in seq_len(nrow(param_grid))) {
  params <- list(
    iterations = param_grid$iterations[i],
    depth = param_grid$depth[i],
    learning_rate = param_grid$learning_rate[i],
    l2_leaf_reg = param_grid$l2_leaf_reg[i],
    loss_function = 'Logloss',
    eval_metric = 'AUC'
  )

  model <- catboost.train(learn_pool = trainPool, test_pool = validPool, params = params)

  # Ask for probabilities explicitly: without `prediction_type`,
  # catboost.predict() returns raw scores rather than probabilities.
  validPredict <- catboost.predict(model, validPool, prediction_type = "Probability")

  # AUC must be computed on the continuous scores; feeding 0/1 predictions to
  # roc() reduces the curve to a single operating point. Class order and
  # direction are fixed (0 = control, 1 = case, higher score = case) so a
  # worse-than-chance model is not silently flipped to AUC > 0.5.
  validRoc <- roc(
    response = validData$X,
    predictor = validPredict,
    levels = c(0, 1),
    direction = "<"
  )
  auc_score <- as.numeric(auc(validRoc))

  # Strict ">" keeps the first configuration on ties
  if (auc_score > best_auc) {
    best_auc <- auc_score
    best_params <- params
  }
}

# The winner is picked on the validation pool, so validation metrics reported
# for it are optimistic.
print(paste("Best AUC:", best_auc))
print("Best Parameters:")
print(best_params)
# ---- Refit with the selected parameters and draw ROC curves ----
# Train on the training pool only, using the best grid-search configuration
model <- catboost.train(learn_pool = trainPool, params = best_params)

# Predicted probabilities for both sets
trainPredict <- catboost.predict(model, trainPool, prediction_type = "Probability")
validPredict <- catboost.predict(model, validPool, prediction_type = "Probability")

# Hard 0/1 class labels: strictly above 0.5 counts as class 1
trainPredictBinary <- ifelse(trainPredict > 0.5, 1, 0)
validPredictBinary <- ifelse(validPredict > 0.5, 1, 0)

# ROC objects. X was already recoded to numeric 0/1 earlier in this script,
# so it is passed as-is (subtracting 1 again would relabel it -1/0).
trainRoc <- roc(response = trainData$X, predictor = trainPredict)
validRoc <- roc(response = validData$X, predictor = validPredict)

# Build one ROC plot from a pROC `roc` object.
#
# Args:
#   roc_obj:     object returned by pROC::roc().
#   set_name:    "Training" / "Validation"; used in the title and AUC label.
#   curve_color: colour of the curve, the shaded area and the AUC label.
#   label_y:     y position of the AUC annotation.
# Returns: a ggplot object (not printed).
build_roc_plot <- function(roc_obj, set_name, curve_color, label_y) {
  curve <- data.frame(
    fpr = 1 - roc_obj$specificities,
    tpr = roc_obj$sensitivities
  )
  # Order the points along the curve (FPR, then TPR) and draw them with
  # geom_path(). geom_line() re-sorts by x only, so the vertical steps of a
  # ROC curve (tied FPR values) would be joined in the wrong order and show
  # up as a saw-tooth.
  curve <- curve[order(curve$fpr, curve$tpr), ]
  auc_label <- paste(set_name, "AUC =", round(auc(roc_obj), 2))

  ggplot(data = curve, aes(x = fpr, y = tpr)) +
    geom_path(color = curve_color) +
    geom_area(fill = curve_color, alpha = 0.2) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
    ggtitle(paste(set_name, "ROC Curve")) +
    xlab("False Positive Rate") +
    ylab("True Positive Rate") +
    annotate("text", x = 0.5, y = label_y, label = auc_label, hjust = 0.5, color = curve_color)
}

trainRocPlot <- build_roc_plot(trainRoc, "Training", "blue", 0.1)
validRocPlot <- build_roc_plot(validRoc, "Validation", "red", 0.2)

# Render both plots
print(trainRocPlot)
print(validRocPlot)
# Confusion matrices at a 0.5 probability cut-off: rows are the observed
# classes, columns the predicted class (FALSE / TRUE).
# The prediction is wrapped in a factor with both levels so the table always
# has two columns, even when every case is predicted as the same class;
# a plain logical vector would collapse to one column and make the [ , 2]
# look-ups used for the metrics fail with "subscript out of bounds".
confMatTrain <- table(trainData$X, factor(trainPredict >= 0.5, levels = c(FALSE, TRUE)))
confMatValid <- table(validData$X, factor(validPredict >= 0.5, levels = c(FALSE, TRUE)))
# Render a confusion matrix as a ggplot2 heat map.
#
# Args:
#   conf_mat:     two-way contingency table; the first dimension is shown as
#                 the actual class, the second as the predicted class.
#   dataset_name: text inserted into the plot title ("Training", ...).
# Side effect: prints the plot. Returns the value of print() invisibly.
plot_confusion_matrix <- function(conf_mat, dataset_name) {
  # Long format: one row per cell, with fixed column names for the aesthetics
  cell_counts <- as.data.frame(as.table(conf_mat))
  names(cell_counts) <- c("Actual", "Predicted", "Freq")

  heatmap_title <- paste("Confusion Matrix -", dataset_name, "Set")
  text_theme <- theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )

  heatmap <- ggplot(data = cell_counts, aes(x = Predicted, y = Actual, fill = Freq)) +
    geom_tile(color = "white") +
    geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    labs(title = heatmap_title, x = "Predicted Class", y = "Actual Class") +
    theme_minimal() +
    text_theme

  print(heatmap)
}
# ---- Confusion-matrix plots and performance metrics ----
# Heat maps for both sets
plot_confusion_matrix(confMatTrain, "Training")
plot_confusion_matrix(confMatValid, "Validation")

# Unpack the 2 x 2 tables (rows = observed class, columns = predicted class):
#   a = [1, 1], b = [1, 2], c = [2, 1], d = [2, 2]
# Column 2 is the TRUE (>= 0.5) prediction; the formulas below treat row 2 as
# the positive class, i.e. a = TN, b = FP, c = FN, d = TP.
# table() stores integer counts. They are converted to double because the
# four-factor product under the MCC square root exceeds R's 32-bit integer
# range (-> NA plus a warning) once the marginal totals reach a couple of
# hundred cases each.
a_train <- as.numeric(confMatTrain[1, 1])
b_train <- as.numeric(confMatTrain[1, 2])
c_train <- as.numeric(confMatTrain[2, 1])
d_train <- as.numeric(confMatTrain[2, 2])
a_valid <- as.numeric(confMatValid[1, 1])
b_valid <- as.numeric(confMatValid[1, 2])
c_valid <- as.numeric(confMatValid[2, 1])
d_valid <- as.numeric(confMatValid[2, 2])

# Training-set metrics. Ratios whose denominator is 0 (e.g. precision when
# nothing is predicted positive) evaluate to NaN.
acc_train <- (a_train + d_train) / sum(confMatTrain)
error_rate_train <- 1 - acc_train
sen_train <- d_train / (d_train + c_train)        # sensitivity / recall
sep_train <- a_train / (a_train + b_train)        # specificity
precision_train <- d_train / (b_train + d_train)
F1_train <- (2 * precision_train * sen_train) / (precision_train + sen_train)
MCC_train <- (d_train * a_train - b_train * c_train) /
  sqrt((d_train + b_train) * (d_train + c_train) * (a_train + b_train) * (a_train + c_train))
auc_train <- roc(response = trainData$X, predictor = trainPredict)$auc

# Validation-set metrics (same formulas)
acc_valid <- (a_valid + d_valid) / sum(confMatValid)
error_rate_valid <- 1 - acc_valid
sen_valid <- d_valid / (d_valid + c_valid)
sep_valid <- a_valid / (a_valid + b_valid)
precision_valid <- d_valid / (b_valid + d_valid)
F1_valid <- (2 * precision_valid * sen_valid) / (precision_valid + sen_valid)
MCC_valid <- (d_valid * a_valid - b_valid * c_valid) /
  sqrt((d_valid + b_valid) * (d_valid + c_valid) * (a_valid + b_valid) * (a_valid + c_valid))
auc_valid <- roc(response = validData$X, predictor = validPredict)$auc

# Report
cat("Training Metrics\n")
cat("Accuracy:", acc_train, "\n")
cat("Error Rate:", error_rate_train, "\n")
cat("Sensitivity:", sen_train, "\n")
cat("Specificity:", sep_train, "\n")
cat("Precision:", precision_train, "\n")
cat("F1 Score:", F1_train, "\n")
cat("MCC:", MCC_train, "\n")
cat("AUC:", auc_train, "\n\n")
cat("Validation Metrics\n")
cat("Accuracy:", acc_valid, "\n")
cat("Error Rate:", error_rate_valid, "\n")
cat("Sensitivity:", sen_valid, "\n")
cat("Specificity:", sep_valid, "\n")
cat("Precision:", precision_valid, "\n")
cat("F1 Score:", F1_valid, "\n")
cat("MCC:", MCC_valid, "\n")
cat("AUC:", auc_valid, "\n")
结果输出:
提供个样本代码吧,我不调了。
四、最后
至于怎么安装,自学了哈。
数据嘛:
链接:https://pan.baidu.com/s/1rEf6JZyzA1ia5exoq5OF7g?pwd=x8xm
提取码:x8xm