KNN=K-Nearest Neighbour
原理:我们取前K个相似的数据(排序过的)中概率最大的种类,作为预测的种类。通常,K不会大于20。
下边是一个简单的实例,具体的含义在注释中:
import numpy as np
import operator
import os def createDataSet():
group = np.array([[1.0, 1.1],[1.0, 1.0],[0, 0],[0, 0.1]])
labels = ['A', 'A', 'B', 'B']
return group, labels def classify(inX, dataSet, labels, k):
dataSetSize = dataSet.shape[0]#lines num; samples num
diffMat = np.tile(inX, (dataSetSize,1)) - dataSet#dataSize*(1*inX)
sqDiffMat = diffMat**2
sqDistances = sqDiffMat.sum(axis=1)#add as the first dim
distances = sqDistances**0.5
#return indicies array from min to max
#this is an array
sortedDistanceIndices = distances.argsort()
#classCount={}
classCount=dict() #define a dictionary
for i in range(k):
voteIlabel = labels[sortedDistanceIndices[i]]
classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1#get(key,default=none)
#return a list like [('C',4),('B',3),('A',2)], not a dict
#itemgetter(0) is the 1st element
#default: from min to max
sortedClassCount = sorted(classCount.iteritems(),
key=operator.itemgetter(1), reverse=True)
return sortedClassCount[0][0]