我有以下数据:
head(df.num1)
## num_critic_for_reviews duration director_facebook_likes
## 1 723 178 0
## 2 302 169 563
## 3 602 148 0
## 4 813 164 22000
## 5 388 100 131
## 6 462 132 475
## actor_3_facebook_likes actor_1_facebook_likes gross num_voted_users
## 1 855 1000 760505847 886204
## 2 1000 40000 309404152 471220
## 3 161 11000 200074175 275868
## 4 23000 27000 448130642 1144337
## 5 365 131 46975183 8
## 6 530 640 73058679 212204
## cast_total_facebook_likes facenumber_in_poster num_user_for_reviews
## 1 4834 0 3054
## 2 48350 0 1238
## 3 11700 1 994
## 4 106759 0 2701
## 5 143 0 450
## 6 1873 1 738
## budget title_year actor_2_facebook_likes imdb_score aspect_ratio
## 1 2.4e+08 2009 936 7.9 1.8
## 2 3.0e+08 2007 5000 7.1 2.4
## 3 2.4e+08 2015 393 6.8 2.4
## 4 2.5e+08 2012 23000 8.5 2.4
## 5 1.0e+07 2015 12 7.1 2.4
## 6 2.6e+08 2012 632 6.6 2.4
## movie_facebook_likes
## 1 33000
## 2 0
## 3 85000
## 4 164000
## 5 0
## 6 24000
然后我运行 kmeans 如下:
set.seed(111)
km_out <- kmeans(df.num1,centers=3) #perform kmeans cluster with k=3
我们现在计算对象和聚类中心之间的距离以确定异常值并识别说 5 个最大的距离是异常值(任意识别)。
centers <- km_out$centers[km_out$cluster, ] # "centers" is a data frame of 3 centers but the length of dataset so we can calculate distance difference easily.
distances <- sqrt(rowSums((df.num1 - centers)^2))
(outliers <- order(distances, decreasing=T)[1:5])# these rows are 5 top outliers
## [1] 3860 3006 2324 2335 3424
让我们获取附加了距离的数据框:
df.num1$distance<-distances
df.num1$cluster<-km_out$cluster
打印有关异常值的详细信息(最大的五个距离值)
(df.num1[outliers,])
## num_critic_for_reviews duration director_facebook_likes
## 3860 202 112 0
## 3006 73 134 45
## 2324 174 134 6000
## 2335 105 103 78
## 3424 150 124 78
## actor_3_facebook_likes actor_1_facebook_likes gross num_voted_users
## 3860 38 717 211667 53508
## 3006 0 9 195888 5603
## 2324 745 893 2298191 221552
## 2335 101 488 410388 13727
## 3424 4 6 439162 106160
## cast_total_facebook_likes facenumber_in_poster num_user_for_reviews
## 3860 907 0 131
## 3006 11 0 45
## 2324 2710 0 570
## 2335 991 1 79
## 3424 28 0 430
## budget title_year actor_2_facebook_likes imdb_score aspect_ratio
## 3860 4.2e+09 2005 126 7.7 2.4
## 3006 2.5e+09 2005 2 7.1 2.4
## 2324 2.4e+09 1997 851 8.4 1.8
## 2335 2.1e+09 2004 336 6.9 1.8
## 3424 1.1e+09 1988 5 8.1 1.8
## movie_facebook_likes distance cluster
## 3860 4000 4.1e+09 2
## 3006 607 2.4e+09 2
## 2324 11000 2.3e+09 2
## 2335 973 2.0e+09 2
## 3424 0 9.8e+08 2
但这些只是根据距聚类中心的最大距离选择的数据点.....
我喜欢的是基于统计测量的东西,例如基于 z 分数的极值(比如> 2sd 定义为异常值),而不是只取几个最大距离值 obs(行)........
最终输出的想法,例如:
或者更好的是这样的:
如果有一些帮助/指针来获得如上所示的结果,我将不得不......
问候
最佳答案
编辑以包含全局异常值
所以我的理解是,您希望通过使用 z-score 而不仅仅是绝对值比较来检查每个元素的距离与其簇的距离。
我用 iris
数据集复制了你的代码。尽管代码相当困惑,但您仍然可以看到每个集群中的每个元素是否是异常值。
df.num1 = iris[,-5]
set.seed(111)
km_out = kmeans(df.num1, 3)
km_out_global = kmeans(df.num1, 1)
cluster_centers = km_out$centers[km_out$cluster,]
cluster_distances = sqrt(rowSums(df.num1 - cluster_centers)^2)
global_distances = sqrt(rowSums(df.num1 - km_out_global$centers)^2)
df.num1_v1 = data.frame(df.num1, cluster = km_out$cluster, c_dist = cluster_distances)
CM = ave(df.num1_v1$c_dist, df.num1_v1$cluster, FUN = function(x) mean(x, na.rm=TRUE))
CSd = ave(df.num1_v1$c_dist, df.num1_v1$cluster, FUN = function(x) sd(x, na.rm=TRUE))
GM = mean(df.num1_v1$c_dist)
GSd = sd(df.num1_v1$c_dist)
cluster_z_score = (cluster_distances - CM)/CSd
global_z_score = (global_distances - GM)/GSd
df.num1_v2 = data.frame(df.num1_v1, CM, CSd, cluster_z_score,
cluster_outlier = ifelse(cluster_z_score > 2 | cluster_z_score < -2, T , F))
df.num1_v3 = data.frame(df.num1,
cluster_outlier = ifelse(cluster_z_score > 2 | cluster_z_score < -2, T , F),
global_outlier = ifelse(global_z_score > 2 | global_z_score < -2, T , F)
)
#You can modify your threshold at ifelse
table(df.num1_v2$cluster_outlier)
FALSE TRUE
141 9
df.num1_v2[11:16,]
Sepal.Length Sepal.Width Petal.Length Petal.Width cluster distances CM CSd z_score Outlier
11 5.4 3.7 1.5 0.2 3 0.658 0.63232 0.4551378 0.05642247 FALSE
12 4.8 3.4 1.6 0.2 3 0.142 0.63232 0.4551378 -1.07730008 FALSE
13 4.8 3.0 1.4 0.1 3 0.842 0.63232 0.4551378 0.46069563 FALSE
14 4.3 3.0 1.1 0.1 3 1.642 0.63232 0.4551378 2.21840501 TRUE
15 5.8 4.0 1.2 0.2 3 1.058 0.63232 0.4551378 0.93527716 FALSE
16 5.7 4.4 1.5 0.4 3 1.858 0.63232 0.4551378 2.69298655 TRUE
全局异常值
正如我所评论的,一个数据点 而不是 集群中的异常值可能会成为全局异常值。然而,这不是错误也不是错误,而只是统计数据。
table(df.num1_v3$global_outlier)
FALSE TRUE
31 119
请注意集群异常值如何成为全局异常值,反之亦然。
df.num1_v3[11:16,]
Sepal.Length Sepal.Width Petal.Length Petal.Width cluster_outlier global_outlier
11 5.4 3.7 1.5 0.2 FALSE TRUE
12 4.8 3.4 1.6 0.2 FALSE FALSE
13 4.8 3.0 1.4 0.1 FALSE TRUE
14 4.3 3.0 1.1 0.1 TRUE FALSE
15 5.8 4.0 1.2 0.2 FALSE TRUE
16 5.7 4.4 1.5 0.4 TRUE TRUE
关于r - 如何使用 Kmeans 聚类统计识别异常值,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/40797396/