kNN: the k-Nearest Neighbors Algorithm
Suppose we have a scatter plot of data in which the blue points are malignant tumors and the red points are benign tumors, and a new data point arrives; we want to decide whether it is benign or malignant. The idea of the k-nearest neighbors algorithm is to look at the k points closest to the new point and classify it according to which color is in the majority among those neighbors. Suppose k = 3.
Let the new point be green. Of the three points closest to it, two are red and one is blue, so we classify the new point as red, i.e. a benign tumor.
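As a minimal sketch of this voting step (the neighbor labels below are made up to mirror the 2-red / 1-blue situation described above):

from collections import Counter

# Hypothetical labels of the 3 nearest neighbors in the example above
nearest_labels = ['red', 'red', 'blue']
votes = Counter(nearest_labels)
# The label with the most votes wins: here 'red', i.e. benign
print(votes.most_common(1)[0][0])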
Since the algorithm needs to measure distances, we use the Euclidean distance (essentially the Pythagorean theorem).
The Euclidean distance formulas for two-dimensional, three-dimensional, and higher-dimensional space all follow the same pattern: for each dimension, subtract the corresponding coordinates of the two points, square the difference, sum the squares over all dimensions, and take the square root.
In general, for two points a = (a_1, ..., a_n) and b = (b_1, ..., b_n) in n-dimensional space:

d(a, b) = sqrt((a_1 - b_1)^2 + (a_2 - b_2)^2 + ... + (a_n - b_n)^2)
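A quick sketch of this formula in NumPy (the two points below are arbitrary example values):

import numpy as np
from math import sqrt

a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 6.0, 3.0])

# Subtract dimension by dimension, square, sum, then take the square root
d = sqrt(np.sum((a - b) ** 2))
print(d)  # 5.0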
Now let's implement the k-nearest neighbors algorithm ourselves.
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from collections import Counter

if __name__ == "__main__":
    # All tumor samples, two features each
    raw_data_x = [[3.393533211, 2.331273381],
                  [3.110073483, 1.781539638],
                  [1.343808831, 3.368360954],
                  [3.582294042, 4.679179110],
                  [2.280362439, 2.866990263],
                  [7.423436942, 4.696522875],
                  [5.745051997, 3.533989803],
                  [9.172168622, 2.511101045],
                  [7.792783481, 3.424088941],
                  [7.939820817, 0.791637231]]
    # Tumor labels: 0 = benign, 1 = malignant
    raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    X_train = np.array(raw_data_x)
    Y_train = np.array(raw_data_y)
    plt.scatter(X_train[Y_train == 0, 0], X_train[Y_train == 0, 1], color='g')
    plt.scatter(X_train[Y_train == 1, 0], X_train[Y_train == 1, 1], color='r')
    x = np.array([8.093607318, 3.365731514])
    plt.scatter(x[0], x[1], color='b')
    # Distance from the new sample to every training sample
    distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]
    print(distances)
    # Sort the distances; the result holds the indices of the training samples
    nearest = np.argsort(distances)
    print(nearest)
    # Use the 6 nearest neighbors
    k = 6
    # Labels of those 6 nearest samples
    topK_y = [Y_train[i] for i in nearest[:k]]
    print(topK_y)
    # Let the 6 neighbors vote; the majority label is the predicted tumor type
    votes = Counter(topK_y)
    # Print the predicted label for the new sample
    print(votes.most_common(1)[0][0])
    plt.show()
Output:
[4.812566907609877, 5.229270827235305, 6.749798999160064, 4.6986266144110695, 5.83460014556857, 1.4900114024329525, 2.354574897431513, 1.3761132675144652, 0.3064319992975, 2.5786840957478887]
[8 7 5 6 9 3 0 1 4 2]
[1, 1, 1, 1, 1, 0]
1
From this result, the new tumor sample is classified as malignant.
The tumor data are plotted below.
Here the green points are benign tumors, the red points are malignant tumors, and the blue point is the new sample.
Now let's wrap the algorithm in a function.
import numpy as np
from math import sqrt
from collections import Counter

def kNN_classify(k, X_train, Y_train, x):
    assert 1 <= k <= X_train.shape[0], "k must be valid"
    assert X_train.shape[0] == Y_train.shape[0], \
        "the size of X_train must equal the size of Y_train"
    assert X_train.shape[1] == x.shape[0], \
        "the feature number of x must equal that of X_train"
    # Distance from the new sample to every training sample
    distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]
    # Sort the distances; the result holds the indices of the training samples
    nearest = np.argsort(distances)
    # Labels of the k nearest samples
    topK_y = [Y_train[i] for i in nearest[:k]]
    # Let the k neighbors vote; the majority label is the predicted tumor type
    votes = Counter(topK_y)
    return votes.most_common(1)[0][0]
import numpy as np
import matplotlib.pyplot as plt
from playLA.KNN import kNN_classify

if __name__ == "__main__":
    # All tumor samples, two features each
    raw_data_x = [[3.393533211, 2.331273381],
                  [3.110073483, 1.781539638],
                  [1.343808831, 3.368360954],
                  [3.582294042, 4.679179110],
                  [2.280362439, 2.866990263],
                  [7.423436942, 4.696522875],
                  [5.745051997, 3.533989803],
                  [9.172168622, 2.511101045],
                  [7.792783481, 3.424088941],
                  [7.939820817, 0.791637231]]
    # Tumor labels: 0 = benign, 1 = malignant
    raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    X_train = np.array(raw_data_x)
    Y_train = np.array(raw_data_y)
    plt.scatter(X_train[Y_train == 0, 0], X_train[Y_train == 0, 1], color='g')
    plt.scatter(X_train[Y_train == 1, 0], X_train[Y_train == 1, 1], color='r')
    x = np.array([8.093607318, 3.365731514])
    plt.scatter(x[0], x[1], color='b')
    res = kNN_classify(6, X_train, Y_train, x)
    print(res)
    plt.show()
Output:
1
Let's look at the typical machine-learning workflow: the training set is fed to a learning algorithm (fit), which produces a model; new samples are then fed to the model, which outputs predictions (predict).
In kNN we never obtain an explicit model; kNN is arguably the only common algorithm with no real training step. The new sample is compared directly against the training set, the k nearest points are found, and they vote on the result. kNN is special in that it can be viewed as a model-free algorithm; to keep the terminology consistent with other algorithms, the training set itself can be regarded as the model.
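A minimal sketch of this idea (the class and method names below are illustrative only, not from any library): "fitting" does nothing but remember the data, and all of the work is deferred to prediction time.

import numpy as np

class LazyKNN:
    def __init__(self, k):
        self.k = k

    def fit(self, X, y):
        # Nothing is learned here; the stored training data is the "model"
        self.X, self.y = np.asarray(X), np.asarray(y)
        return self

    def predict_one(self, x):
        # All computation happens at prediction time
        distances = np.sqrt(np.sum((self.X - x) ** 2, axis=1))
        k_labels = self.y[np.argsort(distances)[:self.k]]
        # Assumes non-negative integer labels (e.g. 0 / 1)
        return np.bincount(k_labels).argmax()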
Now let's redo this process with the kNN classifier from the scikit-learn machine-learning library.
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":
    # All tumor samples, two features each
    raw_data_x = [[3.393533211, 2.331273381],
                  [3.110073483, 1.781539638],
                  [1.343808831, 3.368360954],
                  [3.582294042, 4.679179110],
                  [2.280362439, 2.866990263],
                  [7.423436942, 4.696522875],
                  [5.745051997, 3.533989803],
                  [9.172168622, 2.511101045],
                  [7.792783481, 3.424088941],
                  [7.939820817, 0.791637231]]
    # Tumor labels: 0 = benign, 1 = malignant
    raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    X_train = np.array(raw_data_x)
    Y_train = np.array(raw_data_y)
    plt.scatter(X_train[Y_train == 0, 0], X_train[Y_train == 0, 1], color='g')
    plt.scatter(X_train[Y_train == 1, 0], X_train[Y_train == 1, 1], color='r')
    x = np.array([8.093607318, 3.365731514])
    plt.scatter(x[0], x[1], color='b')
    kNN_classifier = KNeighborsClassifier(n_neighbors=6)
    # Fit the classifier on the training set
    kNN_classifier.fit(X_train, Y_train)
    X_predict = x.reshape(1, -1)
    # Predict the labels of the samples in X_predict
    predict = kNN_classifier.predict(X_predict)
    # Print the predicted label of the first (and only) sample
    print(predict[0])
    plt.show()
Output:
1
Of course, we can also write our own class with the same fit/predict interface as scikit-learn.
import numpy as np
from math import sqrt
from collections import Counter

class KNNClassifier:

    def __init__(self, k):
        # Initialize the kNN classifier
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None
        self._Y_train = None

    def fit(self, X_train, Y_train):
        # Train the kNN classifier with the training sets X_train and Y_train
        assert X_train.shape[0] == Y_train.shape[0], \
            "the size of X_train must equal the size of Y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k"
        self._X_train = X_train
        self._Y_train = Y_train
        return self

    def predict(self, X_predict):
        # Given a set of samples X_predict, return a vector of predicted labels
        assert self._X_train is not None and self._Y_train is not None, \
            "must fit before predict"
        assert X_predict.shape[1] == self._X_train.shape[1], \
            "the feature number of X_predict must equal that of X_train"
        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        # Given a single sample x, return its predicted label
        assert x.shape[0] == self._X_train.shape[1], \
            "the feature number of x must equal that of X_train"
        # Distance from the new sample to every training sample
        distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in self._X_train]
        # Sort the distances; the result holds the indices of the training samples
        nearest = np.argsort(distances)
        # Labels of the k nearest samples
        topK_y = [self._Y_train[i] for i in nearest[:self.k]]
        # Let the k neighbors vote; the majority label is the predicted tumor type
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]
from playLA.KNNClassifier import KNNClassifier
import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":
    # All tumor samples, two features each
    raw_data_x = [[3.393533211, 2.331273381],
                  [3.110073483, 1.781539638],
                  [1.343808831, 3.368360954],
                  [3.582294042, 4.679179110],
                  [2.280362439, 2.866990263],
                  [7.423436942, 4.696522875],
                  [5.745051997, 3.533989803],
                  [9.172168622, 2.511101045],
                  [7.792783481, 3.424088941],
                  [7.939820817, 0.791637231]]
    # Tumor labels: 0 = benign, 1 = malignant
    raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    X_train = np.array(raw_data_x)
    Y_train = np.array(raw_data_y)
    plt.scatter(X_train[Y_train == 0, 0], X_train[Y_train == 0, 1], color='g')
    plt.scatter(X_train[Y_train == 1, 0], X_train[Y_train == 1, 1], color='r')
    x = np.array([8.093607318, 3.365731514])
    plt.scatter(x[0], x[1], color='b')
    kNN_classifier = KNNClassifier(k=6)
    # Fit the classifier on the training set
    kNN_classifier.fit(X_train, Y_train)
    X_predict = x.reshape(1, -1)
    # Predict the labels of the samples in X_predict
    predict = kNN_classifier.predict(X_predict)
    # Print the predicted label of the first (and only) sample
    print(predict[0])
    plt.show()
Output:
1