我正在尝试使用 Java Weka API 构建分类模型。我的训练数据集存在类别不平衡的问题,因此我想使用 SMOTE 等处理类别不平衡的技术来缓解这一问题。
源代码如下:
package classification;
import java.util.Random;
import weka.classifiers.Classifier;
import weka.classifiers.bayes.NaiveBayesMultinomial;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;
/**
 * Evaluates a {@link NaiveBayesMultinomial} classifier with stratified
 * 10-fold cross-validation and prints precision, recall and F-measure.
 *
 * <p>The dataset is loaded from an ARFF file and passed through a
 * {@link StringToWordVector} filter (IDF transform enabled); attribute 0 of
 * the filtered data is used as the class attribute. The reported metrics are
 * those of class index 1, i.e. class 1 is treated as the positive class.
 */
public class questStackoverflow {

    /**
     * Runs the cross-validated evaluation and prints the metrics to stdout.
     *
     * @param agrs command-line arguments (unused)
     * @throws Exception if the dataset cannot be read, or if filtering,
     *                   training or classification fails
     */
    public static void main(String agrs[]) throws Exception {
        String fileRootPath = "../file.arff"; // Dataset
        Instances strdata = DataSource.read(fileRootPath); // Load dataset

        StringToWordVector filter = new StringToWordVector(10000);
        String[] options = { "-W", "10000", "-L", "-M", "1",
            "-stemmer", "weka.core.stemmers.IteratedLovinsStemmer",
            "-stopwords-handler", "weka.core.stopwords.Rainbow",
            "-tokenizer", "weka.core.tokenizers.AlphabeticTokenizer"
        };
        filter.setOptions(options);
        filter.setIDFTransform(true);
        // Weka requires setInputFormat to be the LAST call before the filter
        // is applied, i.e. after every option has been configured.
        filter.setInputFormat(strdata);
        Instances data = Filter.useFilter(strdata, filter); // Apply filter
        data.setClassIndex(0); // set class index

        Classifier classifier = new NaiveBayesMultinomial(); // classifier
        int folds = 10;
        Random random = new Random(1);
        data.randomize(random);
        data.stratify(folds);

        // Confusion-matrix counts with class index 1 as the positive class,
        // accumulated over all folds.
        int truePositives = 0;  // predicted 1, actual 1
        int falsePositives = 0; // predicted 1, actual 0
        int falseNegatives = 0; // predicted 0, actual 1

        for (int i = 0; i < folds; i++) {
            Instances trains = data.trainCV(folds, i, random); // training dataset
            Instances tests = data.testCV(folds, i); // testing dataset
            classifier.buildClassifier(trains); // build classifier

            for (int j = 0; j < tests.numInstances(); j++) {
                Instance instance = tests.instance(j);
                double actual = instance.classValue();
                double predicted = classifier.classifyInstance(instance);
                // Exact double comparison is kept on purpose: any value that
                // is neither 0.0 nor 1.0 is simply not counted.
                if (predicted == 1.0 && actual == 1.0) {
                    truePositives++;
                } else if (predicted == 1.0 && actual == 0.0) {
                    falsePositives++;
                } else if (predicted == 0.0 && actual == 1.0) {
                    falseNegatives++;
                }
            }
        }

        // Each metric stays 0.0 when its denominator would be zero.
        double precision = 0.0;
        double recall = 0.0;
        double fmeasure = 0.0;
        if (truePositives + falsePositives > 0) {
            precision = (double) truePositives / (truePositives + falsePositives);
        }
        if (truePositives + falseNegatives > 0) {
            recall = (double) truePositives / (truePositives + falseNegatives);
        }
        if (precision + recall > 0) {
            fmeasure = 2 * precision * recall / (precision + recall);
        }

        System.out.println("Precision: " + precision);
        System.out.println("Recall: " + recall);
        System.out.println("Fmeasure: " + fmeasure);
    }
}
在不使用类别不平衡处理技术的情况下,我的代码可以正常运行。但我需要使用这类技术来缓解类别不平衡问题,却不知道如何在 Java Weka API 中使用它。
最佳答案
您可以在代码中添加以下代码行:
// Goes with the other imports at the top of the file.
import weka.filters.supervised.instance.SMOTE;
// Goes inside the cross-validation loop, right after `trains` is created:
// oversample the training fold only, then train on the result.
SMOTE smote = new SMOTE();
smote.setInputFormat(trains);
Instances Trains_smote = Filter.useFilter(trains, smote);
修改后的完整代码如下。
package classification;
import java.util.Random;
import weka.classifiers.Classifier;
import weka.classifiers.bayes.NaiveBayesMultinomial;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.supervised.instance.SMOTE;
import weka.filters.unsupervised.attribute.StringToWordVector;
/**
 * Evaluates a {@link NaiveBayesMultinomial} classifier with stratified
 * 10-fold cross-validation, applying the SMOTE filter to each training fold,
 * and prints precision, recall and F-measure.
 *
 * <p>The dataset is loaded from an ARFF file and passed through a
 * {@link StringToWordVector} filter (IDF transform enabled); attribute 0 of
 * the filtered data is used as the class attribute. The reported metrics are
 * those of class index 1, i.e. class 1 is treated as the positive class.
 */
public class questStackoverflow {

    /**
     * Runs the cross-validated evaluation and prints the metrics to stdout.
     *
     * @param agrs command-line arguments (unused)
     * @throws Exception if the dataset cannot be read, or if filtering,
     *                   resampling, training or classification fails
     */
    public static void main(String agrs[]) throws Exception {
        String fileRootPath = "../file.arff"; // Dataset
        Instances strdata = DataSource.read(fileRootPath); // Load dataset

        StringToWordVector filter = new StringToWordVector(10000);
        String[] options = { "-W", "10000", "-L", "-M", "1",
            "-stemmer", "weka.core.stemmers.IteratedLovinsStemmer",
            "-stopwords-handler", "weka.core.stopwords.Rainbow",
            "-tokenizer", "weka.core.tokenizers.AlphabeticTokenizer"
        };
        filter.setOptions(options);
        filter.setIDFTransform(true);
        // Weka requires setInputFormat to be the LAST call before the filter
        // is applied, i.e. after every option has been configured.
        filter.setInputFormat(strdata);
        Instances data = Filter.useFilter(strdata, filter); // Apply filter
        data.setClassIndex(0); // set class index

        Classifier classifier = new NaiveBayesMultinomial(); // classifier
        int folds = 10;
        Random random = new Random(1);
        data.randomize(random);
        data.stratify(folds);

        // Confusion-matrix counts with class index 1 as the positive class,
        // accumulated over all folds.
        int truePositives = 0;  // predicted 1, actual 1
        int falsePositives = 0; // predicted 1, actual 0
        int falseNegatives = 0; // predicted 0, actual 1

        for (int i = 0; i < folds; i++) {
            Instances trains = data.trainCV(folds, i, random); // training dataset
            Instances tests = data.testCV(folds, i); // testing dataset

            // SMOTE is applied to the training fold only; the test fold is
            // evaluated exactly as it came out of the split.
            SMOTE smote = new SMOTE();
            smote.setInputFormat(trains);
            Instances balancedTrain = Filter.useFilter(trains, smote);
            classifier.buildClassifier(balancedTrain); // build classifier

            for (int j = 0; j < tests.numInstances(); j++) {
                Instance instance = tests.instance(j);
                double actual = instance.classValue();
                double predicted = classifier.classifyInstance(instance);
                // Exact double comparison is kept on purpose: any value that
                // is neither 0.0 nor 1.0 is simply not counted.
                if (predicted == 1.0 && actual == 1.0) {
                    truePositives++;
                } else if (predicted == 1.0 && actual == 0.0) {
                    falsePositives++;
                } else if (predicted == 0.0 && actual == 1.0) {
                    falseNegatives++;
                }
            }
        }

        // Each metric stays 0.0 when its denominator would be zero.
        double precision = 0.0;
        double recall = 0.0;
        double fmeasure = 0.0;
        if (truePositives + falsePositives > 0) {
            precision = (double) truePositives / (truePositives + falsePositives);
        }
        if (truePositives + falseNegatives > 0) {
            recall = (double) truePositives / (truePositives + falseNegatives);
        }
        if (precision + recall > 0) {
            fmeasure = 2 * precision * recall / (precision + recall);
        }

        System.out.println("Precision: " + precision);
        System.out.println("Recall: " + recall);
        System.out.println("Fmeasure: " + fmeasure);
    }
}