kNN1
# -*- coding: utf-8 -*-
"""
kNN : 최근접 이웃
"""
import numpy as np  # multi-dimensional arrays, linear-algebra operations
import matplotlib.pyplot as plt
# 1. Scatter plot of the two known groups (x, y)
plt.scatter(1.2, 1.1)  # group A
plt.scatter(1.0, 1.0)
plt.scatter(1.8, 0.8)  # group B
plt.scatter(2, 0.9)
plt.scatter(1.6, 0.85, color='r')  # point to classify (unknown group)
plt.show()
# 2. Create the data and define a data-set function
p1 = [1.2, 1.1]  # group A
p2 = [1.0, 1.0]
p3 = [1.8, 0.8]  # group B
p4 = [2, 0.9]
category = ['A', 'A', 'B', 'B']  # known class labels (Y variable)
p5 = [1.6, 0.85]  # point to classify
# data-creation function
def data_set():
    # convert to numpy arrays for linear-algebra operations
    know_group = np.array([p1, p2, p3, p4])  # known group
    not_know_group = np.array(p5)            # unknown group
    class_category = np.array(category)      # answers (class labels)
    return know_group, not_know_group, class_category
know_group, not_know_group, class_category = data_set()
print('known group')
"""
[[1.2 1.1]
[1. 1. ]
[1.8 0.8]
[2. 0.9]]
"""
print(know_group)
print('unknown group')
print(not_know_group)  # [1.6 0.85]
print('labels')
print(class_category)  # ['A' 'A' 'B' 'B']
# distance: difference (-) -> square (**) -> sum -> square root (sqrt)
diff = know_group - not_know_group  # (4, 2) array minus (2,) array, via broadcasting
print('diff=\n', diff)
"""
차=
[[-0.4 0.25]
[-0.6 0.15]
[ 0.2 -0.05]
[ 0.4 0.05]]
"""
sq_diff = diff ** 2
sq_sum = sq_diff.sum(axis=1)  # row-wise sum
print(sq_sum)  # [0.2225 0.3825 0.0425 0.1625]
distance = np.sqrt(sq_sum)
print(distance)  # [0.47169906 0.61846584 0.20615528 0.40311289]
# distance ranks per point: [3 4 1 2]; with k=3 the nearest are p3(B), p4(B), p1(A) -> B(2) > A(1)
print(class_category)  # ['A' 'A' 'B' 'B']
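# The step-by-step calculation above can be collapsed into one call;
# a minimal sketch using np.linalg.norm along axis=1:
alt_distance = np.linalg.norm(know_group - not_know_group, axis=1)
print(alt_distance)  # same values as distance above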
def classify(know, not_know, cate, k):
    # 1. Euclidean distance calculation
    diff = know - not_know
    sq_diff = diff ** 2
    sq_sum = sq_diff.sum(axis=1)
    distance = np.sqrt(sq_sum)
    # 2. sort distances ascending -> indices of the nearest points
    sortDist = distance.argsort()  # sort -> index
    # print(sortDist)  # [2 3 0 1]
    # 3. count classes among the k nearest neighbors
    class_result = {}  # empty dict
    for i in range(k):  # 0..k-1
        key = cate[sortDist[i]]  # i=0 -> 'B'
        class_result[key] = class_result.get(key, 0) + 1
    return class_result
# call the function
class_result = classify(know_group, not_know_group, class_category, 3)
print(class_result)  # {'B': 2, 'A': 1}
# vote function
def class_vote(class_result):
    return max(class_result, key=class_result.get)
vote_result = class_vote(class_result)
print("classification result =", vote_result)  # B
kNN Class
# -*- coding: utf-8 -*-
"""
class 구현
"""
import numpy as np
from Step01_kNN import data_set
know_group, not_know_group, class_category = data_set()
# class = the classify and vote functions bundled together
class kNNclassify:
    # 1. find the k nearest neighbors
    def classify(self, know, not_know, cate, k):
        # Euclidean distance calculation
        diff = know - not_know
        sq_diff = diff ** 2
        sq_sum = sq_diff.sum(axis=1)
        distance = np.sqrt(sq_sum)
        # 2. sort distances ascending -> indices of the nearest points
        sortDist = distance.argsort()  # sort -> index
        # print(sortDist)  # [2 3 0 1]
        # 3. count classes among the k nearest neighbors (k=3)
        self.class_result = {}  # empty dict
        for i in range(k):  # 0..k-1
            key = cate[sortDist[i]]  # i=0 -> 'B'
            self.class_result[key] = self.class_result.get(key, 0) + 1
    # vote function
    def class_vote(self):
        return max(self.class_result, key=self.class_result.get)
# create a class object
obj = kNNclassify()  # constructor
# object member: self.class_result
obj.classify(know_group, not_know_group, class_category, 3)
vote_result = obj.class_vote()
print('kNN classification result =', vote_result)  # B
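# For comparison, a minimal sketch with scikit-learn's KNeighborsClassifier;
# with n_neighbors=3 and the default Euclidean metric it performs the same
# 3-nearest-neighbor majority vote on this toy data.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(know_group, class_category)
print(knn.predict(not_know_group.reshape(1, -1)))  # expected: ['B']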
NB
# -*- coding: utf-8 -*-
"""
통계적 분류기 - NB
"""
import pandas as pd
from sklearn import model_selection  # train/test split
from sklearn.naive_bayes import GaussianNB
iris=pd.read_csv("../data/iris.csv")
print(iris.head())
"""
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
"""
# 2. select the X and y variables
cols = list(iris.columns)
x_cols = cols[:4]  # X: columns 1-4 (continuous)
y_cols = cols[-1]  # y: column 5 (categorical)
# 3. train/test split
iris_df = iris
print(iris_df.shape)  # (150, 5)
train_iris, test_iris = model_selection.train_test_split(iris_df, test_size=0.3, random_state=123)
print(train_iris.shape)  # (105, 5)
print(test_iris.shape)  # (45, 5)
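# Optional (a sketch, not used by the model below): passing stratify keeps
# the class proportions equal in both splits; strat_train/strat_test are
# new names so the results above are unchanged.
strat_train, strat_test = model_selection.train_test_split(
    iris_df, test_size=0.3, random_state=123, stratify=iris_df[y_cols])
print(strat_train[y_cols].value_counts())  # 35 per class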
# 4. build the model on the train set
obj = GaussianNB()  # model object
model = obj.fit(train_iris[x_cols], train_iris[y_cols])
# 5. evaluate the model
pred = model.predict(test_iris[x_cols])  # predicted Y
Y = test_iris[y_cols]  # true labels
# confusion matrix
matrix = pd.crosstab(pred, Y)
print(matrix)
"""
Species setosa versicolor virginica
row_0
setosa 18 0 0
versicolor 0 10 2
virginica 0 0 15
"""
acc = (matrix.iloc[0,0] + matrix.iloc[1,1] + matrix.iloc[2,2]) / len(Y)  # .ix is removed in recent pandas; use .iloc
print('accuracy =', acc)  # accuracy = 0.9555555555555556
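# Cross-check (a sketch): sklearn.metrics.accuracy_score computes the same
# ratio directly from the label vectors.
from sklearn.metrics import accuracy_score
print('accuracy_score =', accuracy_score(Y, pred))  # 0.9555...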
SVM
# -*- coding: utf-8 -*-
"""
SVM Model
"""
import pandas as pd
from sklearn import model_selection  # train/test split
from sklearn import svm  # model
iris=pd.read_csv("../data/iris.csv")
print(iris.head())
"""
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
"""
# 2. select the X and y variables
cols = list(iris.columns)
x_cols = cols[:4]  # X: columns 1-4 (continuous)
y_cols = cols[-1]  # y: column 5 (categorical)
# 3. train/test split
iris_df = iris
print(iris_df.shape)  # (150, 5)
train_iris, test_iris = model_selection.train_test_split(iris_df, test_size=0.3, random_state=123)
print(train_iris.shape)  # (105, 5)
print(test_iris.shape)  # (45, 5)
# 4. model - SVM
obj = svm.SVC()
model = obj.fit(train_iris[x_cols], train_iris[y_cols])
# 5. evaluate the model
pred = model.predict(test_iris[x_cols])
Y = test_iris[y_cols]
# confusion matrix
matrix = pd.crosstab(pred, Y)
print(matrix)
"""
Species setosa versicolor virginica
row_0
setosa 18 0 0
versicolor 0 10 1
virginica 0 0 16
"""
acc = (matrix.iloc[0,0] + matrix.iloc[1,1] + matrix.iloc[2,2]) / len(Y)  # .iloc replaces the removed .ix
print('accuracy =', acc)  # accuracy = 0.9777777777777777
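# Sketch: sklearn.metrics.classification_report summarizes per-class
# precision/recall/f1 in one call, a handy complement to the crosstab.
from sklearn.metrics import classification_report
print(classification_report(Y, pred))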
spam_train_test
# -*- coding: utf-8 -*-
"""
NB vs SWM
-data set :sparse matrix 이용
-file name:../data/spam_tran_test.npy
"""
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
import numpy as np
import pandas as pd
# 1. file loading (allow_pickle=True is required by newer numpy to load object arrays)
X_train, X_test, y_train, y_test = np.load("../data/spam_tran_test.npy", allow_pickle=True)
print(X_train.shape)  # (3901, 4000)
print(X_test.shape)   # (1673, 4000)
print(type(y_train))  # <class 'list'>
# list -> numpy array, for linear-algebra operations
y_train = np.array(y_train)
y_test = np.array(y_test)
print(type(y_train))  # <class 'numpy.ndarray'>
# 2. build the NB model
obj = GaussianNB()
nb_model = obj.fit(X_train, y_train)
pred = nb_model.predict(X_test)
Y = y_test
matrix = pd.crosstab(pred, Y)
print("nb matrix\n", matrix)
"""
col_0 0(ham) 1(spam)
row_0
0 1264 28
1 167 214
"""
# 1) accuracy
acc = (matrix.iloc[0,0] + matrix.iloc[1,1]) / len(Y)
print("NB acc=", acc)  # NB acc= 0.8834429169157203
# 2) precision: of the cases predicted yes, how many are actually yes
precision = matrix.iloc[1,1] / (matrix.iloc[1,0] + matrix.iloc[1,1])
print("precision=", precision)  # precision= 0.5616797900262467
# 3) recall: of the cases that are actually yes, how many were predicted yes
recall = matrix.iloc[1,1] / (matrix.iloc[0,1] + matrix.iloc[1,1])
print("recall=", recall)  # recall= 0.8842975206611571
# 4) f1 score: harmonic mean of precision and recall
f1_score = 2 * (precision * recall) / (precision + recall)
print('f1_score=', f1_score)  # f1_score= 0.6869983948635634
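# Cross-check (a sketch) with sklearn.metrics; assumes the labels are coded
# 0=ham, 1=spam as the matrix above suggests, so the default pos_label=1
# targets the spam class, matching the manual formulas.
from sklearn import metrics
print(metrics.precision_score(Y, pred))  # ~0.5617
print(metrics.recall_score(Y, pred))     # ~0.8843
print(metrics.f1_score(Y, pred))         # ~0.6870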
# 3. SVM model
svm_obj = svm.SVC(kernel='linear')  # kernel function
svm_model = svm_obj.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
svm_Y = y_test
svm_matrix = pd.crosstab(svm_pred, svm_Y)
print("svm matrix\n", svm_matrix)
"""
svm matrix
col_0 0 1
row_0
0 1428 36
1 3 206
"""
svm_acc = (svm_matrix.iloc[0,0] + svm_matrix.iloc[1,1]) / len(svm_Y)
print("svm acc=", svm_acc)  # svm acc= 0.976688583383144
sms_spam_data
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 23 15:52:23 2019
@author: 502-03
"""
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
import numpy as np
import pandas as pd
# 1. file loading (allow_pickle=True is required by newer numpy to load object arrays)
X_train, X_test, y_train, y_test = np.load("../data/sms_spam_data.npy", allow_pickle=True)
print(X_train.shape)  # (4446, 6000)
print(X_test.shape)   # (1112, 6000)
print(type(y_train))  # <class 'pandas.core.series.Series'>
# 2. build the NB model
obj = GaussianNB()
nb_model = obj.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)
nb_Y = y_test
nb_tab = pd.crosstab(nb_pred, nb_Y)
print("nb_tab=\n", nb_tab)
"""
nb_tab=
type ham spam
row_0
ham 812 10
spam 156 134
"""
nb_acc = (nb_tab.iloc[0,0] + nb_tab.iloc[1,1]) / len(nb_Y)
print("nb_acc=", nb_acc)  # nb_acc= 0.8507194244604317
# 3. SVM model
obj = svm.SVC(kernel='linear')
svc_model = obj.fit(X_train, y_train)
svc_pred = svc_model.predict(X_test)
svc_Y = y_test
svc_tab = pd.crosstab(svc_pred, svc_Y)
print("svc_tab=\n", svc_tab)
"""
svc_tab=
type ham spam
row_0
ham 964 20
spam 4 124
"""
svc_acc = (svc_tab.iloc[0,0] + svc_tab.iloc[1,1]) / len(svc_Y)
print("svc_acc=", svc_acc)  # svc_acc= 0.9784172661870504
precision = svc_tab.iloc[1,1] / (svc_tab.iloc[1,0] + svc_tab.iloc[1,1])
print("precision", precision)  # 0.96875
recall = svc_tab.iloc[1,1] / (svc_tab.iloc[0,1] + svc_tab.iloc[1,1])
print("recall", recall)  # 0.8611111111111112
f1_score = 2 * (precision * recall) / (precision + recall)
print("f1_score", f1_score)  # f1_score 0.911764705882353