我是机器学习的新手,正在尝试了解SequentialFeatureSelector
sklearn的概念。我正在使用Anaconda和Jupyter笔记本进行POC。我已经进口了

from mlxtend.feature_selection import SequentialFeatureSelector as SFS


包。默认情况下,mlxtend软件包不是Anaconda的一部分,那么我已经通过pip install mlxtend命令进行了安装。

我为此数据使用了sklearn Boston住房数据集,并在以下代码中进行了操作。在安装sfs时,出现错误。

如何解决这个错误?

import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.metrics import roc_curve, roc_auc_score
%matplotlib inline
data = load_boston()
print(data.keys())
X = pd.DataFrame(data.data)
X.columns = data.feature_names
y = data.target
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
sfs1=sfs(RandomForestRegressor(n_jobs=1),
    k_features=7,
    forward=True,
    floating=False,
    verbose=3,
    scoring='roc_auc',
    cv=3
   )
sfs1=sfs1.fit(X_train,y_train)


错误

ValueError                                Traceback (most recent call last)
<ipython-input-77-96b29660189d> in <module>
      1 #sfs1.fit(X_train,y_train)
      2 X_train.shape
----> 3 sfs2=sfs1.fit(X_train,y_train)

C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in fit(self, X, y, custom_feature_names, **fit_params)
    371                         X=X_,
    372                         y=y,
--> 373                         **fit_params
    374                     )
    375                 else:

C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in _inclusion(self, orig_set, subset, X, y, ignore_feature, **fit_params)
    528                              tuple(subset | {feature}),
    529                              **fit_params)
--> 530                             for feature in remaining
    531                             if feature != ignore_feature)
    532

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
    915             # remaining jobs.
    916             self._iterating = False
--> 917             if self.dispatch_one_batch(iterator):
    918                 self._iterating = self._original_iterator is not None
    919

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550
    551     def get(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in _calc_score(selector, X, y, indices, **fit_params)
     32                                  n_jobs=1,
     33                                  pre_dispatch=selector.pre_dispatch,
---> 34                                  fit_params=fit_params)
     35     else:
     36         selector.est_.fit(X[:, indices], y, **fit_params)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
    400                                 fit_params=fit_params,
    401                                 pre_dispatch=pre_dispatch,
--> 402                                 error_score=error_score)
    403     return cv_results['test_score']
    404

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
    238             return_times=True, return_estimator=return_estimator,
    239             error_score=error_score)
--> 240         for train, test in cv.split(X, y, groups))
    241
    242     zipped_scores = list(zip(*scores))

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
    915             # remaining jobs.
    916             self._iterating = False
--> 917             if self.dispatch_one_batch(iterator):
    918                 self._iterating = self._original_iterator is not None
    919

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550
    551     def get(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    566         fit_time = time.time() - start_time
    567         # _score will return dict if is_multimetric is True
--> 568         test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
    569         score_time = time.time() - start_time - fit_time
    570         if return_train_score:

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
    603     """
    604     if is_multimetric:
--> 605         return _multimetric_score(estimator, X_test, y_test, scorer)
    606     else:
    607         if y_test is None:

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
    633             score = scorer(estimator, X_test)
    634         else:
--> 635             score = scorer(estimator, X_test, y_test)
    636
    637         if hasattr(score, 'item'):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, clf, X, y, sample_weight)
    174         y_type = type_of_target(y)
    175         if y_type not in ("binary", "multilabel-indicator"):
--> 176             raise ValueError("{0} format is not supported".format(y_type))
    177
    178         if is_regressor(clf):

ValueError: continuous format is not supported

最佳答案

仔细观察跟踪,您会发现错误不是由mlxtend引发的,而是由scikit-learn的scorer.py模块引发的,这是因为您使用的roc_auc_score适合分类问题只要;对于诸如您在这里这样的回归问题,这是毫无意义的。

docs中(添加了重点):


  sklearn.metrics.roc_auc_score(y_true,y_score,平均值=“宏”,sample_weight =无,max_fpr =无)
  
  根据预测分数计算接收器工作特性曲线(ROC AUC)下的面积。
  
  注意:此实现仅限于标签指示符格式的二进制分类任务或多标签分类任务。


另请参见每种问题的scikit-learn list of metrics,您可以在其中确认roc_auc不适合回归。

因此,在您的sfs定义中将其更改为类似

scoring='neg_mean_squared_error'


就像SequentialFeatureSelector的文档example中一样,或者适用于其他任何适合回归的指标,都可以。

08-24 14:12