第100+17步 ChatGPT学习：R实现Catboost分类

基于R 4.2.2版本演示

一、写在前面

有不少大佬问做机器学习分类能不能用R语言，不想学Python咯。

答曰：可！用GPT或者Kimi转一下就得了呗。

加上最近也没啥内容写了，就帮各位搬运一下吧。

二、R代码实现Catboost分类

（1）导入数据

我习惯用RStudio自带的导入功能：

[图片：RStudio 导入数据界面截图（原图缺失）]

（2）建立Catboost模型（手动设定一组基础参数，未调参）

# Load necessary libraries
library(caret)
library(pROC)
library(ggplot2)
library(catboost)

# Assume 'data' is your data frame: column "X" holds the binary outcome and
# every other column is a predictor.
stopifnot(is.data.frame(data), "X" %in% names(data))

# Set seed to ensure reproducibility
set.seed(123)

# Split data into training and validation sets (80% training, 20% validation).
# createDataPartition() stratifies by class only when the outcome is a factor;
# a numeric outcome is binned by quantiles instead, which for 0/1 data does not
# preserve the class ratio. Wrapping in factor() keeps the split stratified.
trainIndex <- createDataPartition(factor(data$X), p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
validData <- data[-trainIndex, ]

# Prepare pools for CatBoost.
# Predictors are selected by name: `-which(names(df) == "X")` silently selects
# zero columns when nothing matches, and drop = FALSE keeps a data frame even
# when only one predictor is left.
featureNames <- setdiff(names(data), "X")
trainPool <- catboost.load_pool(data = trainData[, featureNames, drop = FALSE], label = trainData$X)
validPool <- catboost.load_pool(data = validData[, featureNames, drop = FALSE], label = validData$X)

# Define parameters for CatBoost
params <- list(
  iterations = 250,
  depth = 6,
  learning_rate = 0.1,
  l2_leaf_reg = 10,
  loss_function = "Logloss",
  eval_metric = "AUC"
)

# Train the CatBoost model on the training pool only
model <- catboost.train(learn_pool = trainPool, params = params)

# Predict positive-class probabilities on the training and validation sets
trainPredict <- catboost.predict(model, trainPool, prediction_type = "Probability")
validPredict <- catboost.predict(model, validPool, prediction_type = "Probability")

# Convert probabilities to 0/1 labels.
# The cutoff is inclusive (>= 0.5) so these labels agree with the confusion
# matrices computed later in this script, which also use `>= 0.5`.
trainPredictBinary <- as.numeric(trainPredict >= 0.5)
validPredictBinary <- as.numeric(validPredict >= 0.5)

# Build the ROC objects from the predicted probabilities.
# roc() only needs two distinct response values, so the exact coding left by
# the `- 1` shift does not affect the curve or the AUC.
trainRoc <- roc(response = as.numeric(trainData$X) - 1, predictor = trainPredict)
validRoc <- roc(response = as.numeric(validData$X) - 1, predictor = validPredict)

# Draw one ROC curve with a shaded area, the chance diagonal and an AUC label.
#
# roc_obj:   object returned by pROC::roc()
# title:     plot title
# auc_label: text placed in front of the rounded AUC value
# colour:    colour used for the curve, the shading and the label
# label_y:   vertical position of the AUC label
#
# Returns the ggplot object (not printed).
build_roc_plot <- function(roc_obj, title, auc_label, colour, label_y) {
  curve <- data.frame(
    fpr = 1 - roc_obj$specificities,
    tpr = roc_obj$sensitivities
  )
  # Sort by (fpr, tpr) so the curve is drawn monotonically from (0, 0) to
  # (1, 1). geom_line() orders by x only, which leaves points that share the
  # same FPR in whatever order roc() returned them and can draw spikes on the
  # vertical segments; geom_path() follows the row order established here.
  curve <- curve[order(curve$fpr, curve$tpr), ]

  ggplot(data = curve, aes(x = fpr, y = tpr)) +
    geom_path(color = colour) +
    geom_area(fill = colour, alpha = 0.2) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
    ggtitle(title) +
    xlab("False Positive Rate") +
    ylab("True Positive Rate") +
    annotate("text", x = 0.5, y = label_y, label = paste(auc_label, round(auc(roc_obj), 2)), hjust = 0.5, color = colour)
}

# Plot the ROC curves with ggplot
trainRocPlot <- build_roc_plot(trainRoc, "Training ROC Curve", "Training AUC =", "blue", 0.1)
validRocPlot <- build_roc_plot(validRoc, "Validation ROC Curve", "Validation AUC =", "red", 0.2)

# Display the plots
print(trainRocPlot)
print(validRocPlot)


# Calculate confusion matrices (rows = actual class, columns = predicted class)
# using a 0.5 probability cutoff.
# The prediction is wrapped in a factor with both levels so the table always
# has a FALSE and a TRUE column, even when every prediction falls on the same
# side of the cutoff; otherwise the [1, 2] / [2, 2] lookups later in the script
# fail with "subscript out of bounds".
confMatTrain <- table(trainData$X, factor(trainPredict >= 0.5, levels = c(FALSE, TRUE)))
confMatValid <- table(validData$X, factor(validPredict >= 0.5, levels = c(FALSE, TRUE)))

# Plot and display confusion matrices
# Plot a confusion-matrix heat map for one data set.
#
# pred:         numeric vector of predicted positive-class probabilities;
#               values >= 0.5 count as a positive prediction
# actual:       vector of observed class labels, same length as `pred`
# dataset_name: text inserted into the plot title (e.g. "Training")
#
# Prints the plot as a side effect.
plot_confusion_matrix <- function(pred, actual, dataset_name) {
  # Keep both prediction levels so the grid is complete even when one class is
  # never predicted.
  predicted <- factor(pred >= 0.5, levels = c(FALSE, TRUE))

  # Name the table dimensions explicitly and let as.data.frame() carry those
  # names over. Renaming the columns by position afterwards is what previously
  # swapped the "Actual" and "Predicted" axes.
  conf_mat <- table(Actual = actual, Predicted = predicted)
  conf_mat_df <- as.data.frame(conf_mat)

  p <- ggplot(data = conf_mat_df, aes(x = Predicted, y = Actual, fill = Freq)) +
    geom_tile(color = "white") +
    geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    labs(title = paste("Confusion Matrix -", dataset_name, "Set"), x = "Predicted Class", y = "Actual Class") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5))

  print(p)
}

# Call the function to plot confusion matrices for both training and validation sets
plot_confusion_matrix(trainPredict, trainData$X, "Training")
plot_confusion_matrix(validPredict, validData$X, "Validation")

# Extract the four cells of each confusion matrix (rows = actual, columns =
# predicted). Assuming the first class of X is the negative class:
#   a = true negatives, b = false positives,
#   c = false negatives, d = true positives.
# table() stores counts as integers. Convert them to double so the products
# below (notably the four-way product in the MCC denominator) cannot overflow
# R's 32-bit integers and turn the result into NA.
a_train <- as.numeric(confMatTrain[1, 1])
b_train <- as.numeric(confMatTrain[1, 2])
c_train <- as.numeric(confMatTrain[2, 1])
d_train <- as.numeric(confMatTrain[2, 2])

a_valid <- as.numeric(confMatValid[1, 1])
b_valid <- as.numeric(confMatValid[1, 2])
c_valid <- as.numeric(confMatValid[2, 1])
d_valid <- as.numeric(confMatValid[2, 2])

# Training Set Metrics
acc_train <- (a_train + d_train) / sum(confMatTrain)
error_rate_train <- 1 - acc_train
sen_train <- d_train / (d_train + c_train)        # sensitivity (recall)
sep_train <- a_train / (a_train + b_train)        # specificity
precision_train <- d_train / (b_train + d_train)
F1_train <- (2 * precision_train * sen_train) / (precision_train + sen_train)
MCC_train <- (d_train * a_train - b_train * c_train) / sqrt((d_train + b_train) * (d_train + c_train) * (a_train + b_train) * (a_train + c_train))
auc_train <- roc(response = trainData$X, predictor = trainPredict)$auc

# Validation Set Metrics
acc_valid <- (a_valid + d_valid) / sum(confMatValid)
error_rate_valid <- 1 - acc_valid
sen_valid <- d_valid / (d_valid + c_valid)        # sensitivity (recall)
sep_valid <- a_valid / (a_valid + b_valid)        # specificity
precision_valid <- d_valid / (b_valid + d_valid)
F1_valid <- (2 * precision_valid * sen_valid) / (precision_valid + sen_valid)
MCC_valid <- (d_valid * a_valid - b_valid * c_valid) / sqrt((d_valid + b_valid) * (d_valid + c_valid) * (a_valid + b_valid) * (a_valid + c_valid))
auc_valid <- roc(response = validData$X, predictor = validPredict)$auc

# Print Metrics
# Each list entry pairs the exact label text with its value. cat() joins the
# pieces with a single space; the last line of a section uses the supplied
# terminator, so the training section is followed by a blank line.
invisible(local({
  print_section <- function(header, metrics, last_terminator) {
    cat(header)
    labels <- names(metrics)
    for (idx in seq_along(metrics)) {
      terminator <- if (idx < length(metrics)) "\n" else last_terminator
      cat(labels[[idx]], metrics[[idx]], terminator)
    }
  }

  print_section(
    "Training Metrics\n",
    list(
      "Accuracy:" = acc_train,
      "Error Rate:" = error_rate_train,
      "Sensitivity:" = sen_train,
      "Specificity:" = sep_train,
      "Precision:" = precision_train,
      "F1 Score:" = F1_train,
      "MCC:" = MCC_train,
      "AUC:" = auc_train
    ),
    "\n\n"
  )

  print_section(
    "Validation Metrics\n",
    list(
      "Accuracy:" = acc_valid,
      "Error Rate:" = error_rate_valid,
      "Sensitivity:" = sen_valid,
      "Specificity:" = sep_valid,
      "Precision:" = precision_valid,
      "F1 Score:" = F1_valid,
      "MCC:" = MCC_valid,
      "AUC:" = auc_valid
    ),
    "\n"
  )
}))

在R语言中，Catboost模型得单独安装，下面是一些可以调整的关键参数：

①学习率 (learning_rate)：控制每步模型更新的幅度。较小的学习率可以提高模型的训练稳定性和准确性，但可能需要更多的时间和更多的树来收敛。

②树的深度 (depth)：决定了每棵树的最大深度。较深的树可以更好地捕捉数据中的复杂关系，但也可能导致过拟合。

③树的数量 (iterations)：模型中树的总数。更多的树可以增加模型的复杂度和能力，但同样可能导致过拟合。

④L2 正则化系数 (l2_leaf_reg)：在模型的损失函数中增加一个正则项，以减少模型复杂度和过拟合风险。

⑤边界计数 (border_count)：用于数值特征分箱的边界数量，影响模型在连续特征上的决策边界。

⑥类别特征 (cat_features)：指定哪些特征列是类别型特征。CatBoost 内置了对类别特征的编码处理，无需手动做独热编码；在 R 中，数据框里的因子（factor）列传给 catboost.load_pool 后会被当作类别特征。

⑦子采样 (subsample)：指定每棵树训练时从训练数据集中随机抽取的比例，有助于防止模型过拟合。

⑧列采样 (rsm，别名 colsample_bylevel)：控制每次选择分裂时随机使用的特征比例，可以增加模型的多样性，降低过拟合风险。（CatBoost 没有 colsample_bytree 参数。）

⑨最小数据在叶节点 (min_data_in_leaf)：叶节点必需的最小样本数量，增加这个参数的值可以防止模型学习过于具体的模式，从而降低过拟合风险。

⑩评估指标 (eval_metric)：用于训练过程中模型评估的性能指标。

结果输出（随便挑的）：

[图片：上述模型的结果输出截图（原图缺失）]

从AUC来看，Catboost随便一跑，就跑出过拟合了，跟Xgboost差不多。

三、Catboost调参

随便设置了一下，效果不明显，给各位自行嗨皮：

# Load necessary libraries
library(caret)
library(pROC)
library(ggplot2)
library(catboost)

# Assume 'data' is your data frame: column "X" holds the binary outcome and
# every other column is a predictor.
stopifnot(is.data.frame(data), "X" %in% names(data))

# Set seed to ensure reproducibility
set.seed(123)

# Recode the target as numeric 0/1: going through a factor maps the first
# class to 0 and the second to 1 regardless of how X was originally coded.
data$X <- as.numeric(as.factor(data$X)) - 1

# Split data into training and validation sets (80% training, 20% validation).
# createDataPartition() stratifies by class only when the outcome is a factor;
# the numeric 0/1 target would be binned by quantiles instead, which does not
# preserve the class ratio. Wrapping in factor() keeps the split stratified.
trainIndex <- createDataPartition(factor(data$X), p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
validData <- data[-trainIndex, ]

# Prepare CatBoost pools.
# Predictors are selected by name: `-which(names(df) == "X")` silently selects
# zero columns when nothing matches, and drop = FALSE keeps a data frame even
# when only one predictor is left.
featureNames <- setdiff(names(data), "X")
trainPool <- catboost.load_pool(data = trainData[, featureNames, drop = FALSE], label = trainData$X)
validPool <- catboost.load_pool(data = validData[, featureNames, drop = FALSE], label = validData$X)

# Define parameter grid
depths <- c(2, 4, 6)  # Reduced maximum depth
l2_leaf_regs <- c(1, 3, 5, 10, 20, 25)  # Increased maximum regularization
iterations <- c(500, 1000)  # Added higher iteration count for lower learning rates
learning_rates <- c(0.05, 0.1)  # Lower maximum learning rate
# Row subsampling is not tuned here, so no subsample value is defined.

# expand.grid() varies its first argument fastest, so the combinations are
# visited with depth as the outermost and learning_rate as the innermost
# dimension.
param_grid <- expand.grid(
  learning_rate = learning_rates,
  iterations = iterations,
  l2_leaf_reg = l2_leaf_regs,
  depth = depths
)

best_auc <- 0
best_params <- list()

# Evaluate every parameter combination on the validation set.
# NOTE(review): validPool is used both as CatBoost's test_pool during training
# and for picking the winner, while the refit after this search trains without
# a test_pool. best_auc is therefore an optimistic estimate and may not match
# the refitted model exactly; confirm whether a separate hold-out set is wanted.
for (i in seq_len(nrow(param_grid))) {
  # Set parameters for this iteration
  params <- list(
    iterations = param_grid$iterations[i],
    depth = param_grid$depth[i],
    learning_rate = param_grid$learning_rate[i],
    l2_leaf_reg = param_grid$l2_leaf_reg[i],
    loss_function = 'Logloss',
    eval_metric = 'AUC'
  )

  # Train the model
  model <- catboost.train(learn_pool = trainPool, test_pool = validPool, params = params)

  # Predict positive-class probabilities. Without prediction_type,
  # catboost.predict() returns raw scores, for which a 0.5 cutoff has no
  # meaning.
  validPredict <- catboost.predict(model, validPool, prediction_type = "Probability")

  # Compute the AUC from the continuous probabilities (not from thresholded
  # 0/1 labels, which would reduce the ROC curve to a single point). X is
  # already coded 0/1 at this stage; fixing levels and direction stops roc()
  # from flipping the comparison for a worse-than-random model.
  validRoc <- roc(
    response = validData$X,
    predictor = validPredict,
    levels = c(0, 1),
    direction = "<",
    quiet = TRUE
  )
  auc_score <- as.numeric(auc(validRoc))

  # Update best model if current AUC is better
  if (auc_score > best_auc) {
    best_auc <- auc_score
    best_params <- params
  }
}

# Print the best AUC and corresponding parameters
print(paste("Best AUC:", best_auc))
print("Best Parameters:")
print(best_params)

# After parameter tuning, train the model with the best parameters.
# best_params is still an empty list if no combination beat the initial AUC of
# 0; stop here rather than silently training with an empty parameter list.
stopifnot(length(best_params) > 0)
model <- catboost.train(learn_pool = trainPool, params = best_params)

# Predict positive-class probabilities on the training and validation sets
trainPredict <- catboost.predict(model, trainPool, prediction_type = "Probability")
validPredict <- catboost.predict(model, validPool, prediction_type = "Probability")

# Convert probabilities to 0/1 labels.
# The cutoff is inclusive (>= 0.5) so these labels agree with the confusion
# matrices computed later in this script, which also use `>= 0.5`.
trainPredictBinary <- as.numeric(trainPredict >= 0.5)
validPredictBinary <- as.numeric(validPredict >= 0.5)

# Build the ROC objects from the predicted probabilities.
# X was recoded to numeric 0/1 earlier in this script, so it is passed as-is;
# subtracting 1 again would only relabel the classes as -1/0.
trainRoc <- roc(response = trainData$X, predictor = trainPredict)
validRoc <- roc(response = validData$X, predictor = validPredict)

# Draw one ROC curve with a shaded area, the chance diagonal and an AUC label.
#
# roc_obj:   object returned by pROC::roc()
# title:     plot title
# auc_label: text placed in front of the rounded AUC value
# colour:    colour used for the curve, the shading and the label
# label_y:   vertical position of the AUC label
#
# Returns the ggplot object (not printed).
build_roc_plot <- function(roc_obj, title, auc_label, colour, label_y) {
  curve <- data.frame(
    fpr = 1 - roc_obj$specificities,
    tpr = roc_obj$sensitivities
  )
  # Sort by (fpr, tpr) so the curve is drawn monotonically from (0, 0) to
  # (1, 1). geom_line() orders by x only, which leaves points that share the
  # same FPR in whatever order roc() returned them and can draw spikes on the
  # vertical segments; geom_path() follows the row order established here.
  curve <- curve[order(curve$fpr, curve$tpr), ]

  ggplot(data = curve, aes(x = fpr, y = tpr)) +
    geom_path(color = colour) +
    geom_area(fill = colour, alpha = 0.2) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
    ggtitle(title) +
    xlab("False Positive Rate") +
    ylab("True Positive Rate") +
    annotate("text", x = 0.5, y = label_y, label = paste(auc_label, round(auc(roc_obj), 2)), hjust = 0.5, color = colour)
}

# Plot the ROC curves with ggplot
trainRocPlot <- build_roc_plot(trainRoc, "Training ROC Curve", "Training AUC =", "blue", 0.1)
validRocPlot <- build_roc_plot(validRoc, "Validation ROC Curve", "Validation AUC =", "red", 0.2)

# Display the plots
print(trainRocPlot)
print(validRocPlot)


# Calculate confusion matrices (rows = actual class, columns = predicted class)
# using a 0.5 probability cutoff.
# The prediction is wrapped in a factor with both levels so the table always
# has a FALSE and a TRUE column, even when every prediction falls on the same
# side of the cutoff; otherwise the [1, 2] / [2, 2] lookups later in the script
# fail with "subscript out of bounds".
confMatTrain <- table(trainData$X, factor(trainPredict >= 0.5, levels = c(FALSE, TRUE)))
confMatValid <- table(validData$X, factor(validPredict >= 0.5, levels = c(FALSE, TRUE)))

# Render a confusion table as a ggplot2 heat map and print it.
#
# conf_mat:     two-way contingency table; its first dimension is labelled
#               "Actual" and its second "Predicted" in the plot
# dataset_name: text inserted into the plot title (e.g. "Training")
plot_confusion_matrix <- function(conf_mat, dataset_name) {
  # Long format: one row per table cell, columns renamed by position.
  cells <- setNames(
    as.data.frame(as.table(conf_mat)),
    c("Actual", "Predicted", "Freq")
  )
  plot_title <- paste("Confusion Matrix -", dataset_name, "Set")

  # Tiles coloured by count, with the count written on each tile.
  heatmap <- ggplot(data = cells, aes(x = Predicted, y = Actual, fill = Freq))
  heatmap <- heatmap +
    geom_tile(color = "white") +
    geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
    scale_fill_gradient(low = "white", high = "steelblue")

  # Titles and theme.
  heatmap <- heatmap +
    labs(title = plot_title, x = "Predicted Class", y = "Actual Class") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5))

  print(heatmap)
}

# Now call the function to plot and display the confusion matrices
plot_confusion_matrix(confMatTrain, "Training")
plot_confusion_matrix(confMatValid, "Validation")

# Extract the four cells of each confusion matrix (rows = actual, columns =
# predicted). With X coded 0/1, row 1 is the negative class:
#   a = true negatives, b = false positives,
#   c = false negatives, d = true positives.
# table() stores counts as integers. Convert them to double so the products
# below (notably the four-way product in the MCC denominator) cannot overflow
# R's 32-bit integers and turn the result into NA.
a_train <- as.numeric(confMatTrain[1, 1])
b_train <- as.numeric(confMatTrain[1, 2])
c_train <- as.numeric(confMatTrain[2, 1])
d_train <- as.numeric(confMatTrain[2, 2])

a_valid <- as.numeric(confMatValid[1, 1])
b_valid <- as.numeric(confMatValid[1, 2])
c_valid <- as.numeric(confMatValid[2, 1])
d_valid <- as.numeric(confMatValid[2, 2])

# Training Set Metrics
acc_train <- (a_train + d_train) / sum(confMatTrain)
error_rate_train <- 1 - acc_train
sen_train <- d_train / (d_train + c_train)        # sensitivity (recall)
sep_train <- a_train / (a_train + b_train)        # specificity
precision_train <- d_train / (b_train + d_train)
F1_train <- (2 * precision_train * sen_train) / (precision_train + sen_train)
MCC_train <- (d_train * a_train - b_train * c_train) / sqrt((d_train + b_train) * (d_train + c_train) * (a_train + b_train) * (a_train + c_train))
auc_train <- roc(response = trainData$X, predictor = trainPredict)$auc

# Validation Set Metrics
acc_valid <- (a_valid + d_valid) / sum(confMatValid)
error_rate_valid <- 1 - acc_valid
sen_valid <- d_valid / (d_valid + c_valid)        # sensitivity (recall)
sep_valid <- a_valid / (a_valid + b_valid)        # specificity
precision_valid <- d_valid / (b_valid + d_valid)
F1_valid <- (2 * precision_valid * sen_valid) / (precision_valid + sen_valid)
MCC_valid <- (d_valid * a_valid - b_valid * c_valid) / sqrt((d_valid + b_valid) * (d_valid + c_valid) * (a_valid + b_valid) * (a_valid + c_valid))
auc_valid <- roc(response = validData$X, predictor = validPredict)$auc

# Print Metrics
# Each list entry pairs the exact label text with its value. cat() joins the
# pieces with a single space; the last line of a section uses the supplied
# terminator, so the training section is followed by a blank line.
invisible(local({
  print_section <- function(header, metrics, last_terminator) {
    cat(header)
    labels <- names(metrics)
    for (idx in seq_along(metrics)) {
      terminator <- if (idx < length(metrics)) "\n" else last_terminator
      cat(labels[[idx]], metrics[[idx]], terminator)
    }
  }

  print_section(
    "Training Metrics\n",
    list(
      "Accuracy:" = acc_train,
      "Error Rate:" = error_rate_train,
      "Sensitivity:" = sen_train,
      "Specificity:" = sep_train,
      "Precision:" = precision_train,
      "F1 Score:" = F1_train,
      "MCC:" = MCC_train,
      "AUC:" = auc_train
    ),
    "\n\n"
  )

  print_section(
    "Validation Metrics\n",
    list(
      "Accuracy:" = acc_valid,
      "Error Rate:" = error_rate_valid,
      "Sensitivity:" = sen_valid,
      "Specificity:" = sep_valid,
      "Precision:" = precision_valid,
      "F1 Score:" = F1_valid,
      "MCC:" = MCC_valid,
      "AUC:" = auc_valid
    ),
    "\n"
  )
}))

结果输出：

[图片：调参后模型的结果输出截图（原图缺失）]

提供个样本代码吧，我不调了。

四、最后

至于怎么安装，自学了哈。

数据嘛：

链接：https://pan.baidu.com/s/1rEf6JZyzA1ia5exoq5OF7g?pwd=x8xm

提取码：x8xm

Jet4505

第100+17步 ChatGPT学习：R实现Catboost分类