使用sklearn的TSNE和mahalanobis度量,我得到以下错误
from sklearn.manifold import TSNE
tsne = TSNE( verbose=1, perplexity=40, n_iter=250,learning_rate=50, random_state=0,metric='mahalanobis')
pt=data.sample(frac=0.1).values
tsne_results = tsne.fit_transform(pt)
ValueError: Must provide either V or VI for Mahalanobis distance
如何提供马氏距离的方法参数?
最佳答案
实际上,在其他情况下,没有定义度量参数的选项。例如,其他基于成对距离的类提供一个metric_params
参数来将其他参数传递给距离函数。就像
KNeighborsClassifier
NearestNeighbors
有这个:
metric_params : dict, optional (default = None)
Additional keyword arguments for the metric function.
这个answer here显示了如何使用这个参数。
但TSNE无法发送附加参数。所以现在,您需要扩展类并遍历
__init__()
来发送参数,然后_fit()
method来实际使用它们。我们可以做到:
from time import time
import numpy as np
import scipy.sparse as sp
from sklearn.manifold import TSNE
from sklearn.externals.six import string_types
from sklearn.utils import check_array, check_random_state
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.manifold.t_sne import _joint_probabilities, _joint_probabilities_nn
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
class MyTSNE(TSNE):
def __init__(self, n_components=2, perplexity=30.0,
early_exaggeration=12.0, learning_rate=200.0, n_iter=1000,
n_iter_without_progress=300, min_grad_norm=1e-7,
metric="euclidean", metric_params=None, #<=ADDED
init="random", verbose=0,
random_state=None, method='barnes_hut', angle=0.5):
self.n_components = n_components
self.perplexity = perplexity
self.early_exaggeration = early_exaggeration
self.learning_rate = learning_rate
self.n_iter = n_iter
self.n_iter_without_progress = n_iter_without_progress
self.min_grad_norm = min_grad_norm
self.metric = metric
self.metric_params = metric_params #<=ADDED
self.init = init
self.verbose = verbose
self.random_state = random_state
self.method = method
self.angle = angle
def _fit(self, X, skip_num_points=0):
if self.method not in ['barnes_hut', 'exact']:
raise ValueError("'method' must be 'barnes_hut' or 'exact'")
if self.angle < 0.0 or self.angle > 1.0:
raise ValueError("'angle' must be between 0.0 - 1.0")
if self.metric == "precomputed":
if isinstance(self.init, string_types) and self.init == 'pca':
raise ValueError("The parameter init=\"pca\" cannot be "
"used with metric=\"precomputed\".")
if X.shape[0] != X.shape[1]:
raise ValueError("X should be a square distance matrix")
if np.any(X < 0):
raise ValueError("All distances should be positive, the "
"precomputed distances given as X is not "
"correct")
if self.method == 'barnes_hut' and sp.issparse(X):
raise TypeError('A sparse matrix was passed, but dense '
'data is required for method="barnes_hut". Use '
'X.toarray() to convert to a dense numpy array if '
'the array is small enough for it to fit in '
'memory. Otherwise consider dimensionality '
'reduction techniques (e.g. TruncatedSVD)')
else:
X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
dtype=[np.float32, np.float64])
if self.method == 'barnes_hut' and self.n_components > 3:
raise ValueError("'n_components' should be inferior to 4 for the "
"barnes_hut algorithm as it relies on "
"quad-tree or oct-tree.")
random_state = check_random_state(self.random_state)
if self.early_exaggeration < 1.0:
raise ValueError("early_exaggeration must be at least 1, but is {}"
.format(self.early_exaggeration))
if self.n_iter < 250:
raise ValueError("n_iter should be at least 250")
n_samples = X.shape[0]
neighbors_nn = None
if self.method == "exact":
if self.metric == "precomputed":
distances = X
else:
if self.verbose:
print("[t-SNE] Computing pairwise distances...")
if self.metric == "euclidean":
distances = pairwise_distances(X, metric=self.metric,
squared=True,
**self.metric_params) #<=ADDED
else:
distances = pairwise_distances(X, metric=self.metric,
**self.metric_params) #<=ADDED
if np.any(distances < 0):
raise ValueError("All distances should be positive, the "
"metric given is not correct")
P = _joint_probabilities(distances, self.perplexity, self.verbose)
assert np.all(np.isfinite(P)), "All probabilities should be finite"
assert np.all(P >= 0), "All probabilities should be non-negative"
assert np.all(P <= 1), ("All probabilities should be less "
"or then equal to one")
else:
k = min(n_samples - 1, int(3. * self.perplexity + 1))
if self.verbose:
print("[t-SNE] Computing {} nearest neighbors...".format(k))
knn = NearestNeighbors(algorithm='auto', n_neighbors=k,
metric=self.metric,
metric_params = self.metric_params) #<=ADDED
t0 = time()
knn.fit(X)
duration = time() - t0
if self.verbose:
print("[t-SNE] Indexed {} samples in {:.3f}s...".format(
n_samples, duration))
t0 = time()
distances_nn, neighbors_nn = knn.kneighbors(
None, n_neighbors=k)
duration = time() - t0
if self.verbose:
print("[t-SNE] Computed neighbors for {} samples in {:.3f}s..."
.format(n_samples, duration))
del knn
if self.metric == "euclidean":
distances_nn **= 2
P = _joint_probabilities_nn(distances_nn, neighbors_nn,
self.perplexity, self.verbose)
if isinstance(self.init, np.ndarray):
X_embedded = self.init
elif self.init == 'pca':
pca = PCA(n_components=self.n_components, svd_solver='randomized',
random_state=random_state)
X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
elif self.init == 'random':
X_embedded = 1e-4 * random_state.randn(
n_samples, self.n_components).astype(np.float32)
else:
raise ValueError("'init' must be 'pca', 'random', or "
"a numpy array")
degrees_of_freedom = max(self.n_components - 1.0, 1)
return self._tsne(P, degrees_of_freedom, n_samples,
X_embedded=X_embedded,
neighbors=neighbors_nn,
skip_num_points=skip_num_points)
我已在更改处标记(#
tsne = MyTSNE(verbose=1,perplexity=40,n_iter=250,learning_rate=50, random_state=0,
metric='mahalanobis', metric_params={'V': np.cov(X)})
pt=data.sample(frac=0.1).values
tsne_results = tsne.fit_transform(pt)
注:
我在顶部提到的其他类检查
metric_params
中是否有有效的参数,但我没有这样做,所以请确保在其中传递正确的参数,否则它将给出错误。你应该在scikit-learn issues page on github