参考博客:https://blog.csdn.net/keseliugeizuori/article/details/53162171
给定数据对象集合X={x(1,1),x(1.2,1.2),x(0.8,1.2),x(0.9,0.7),x(1.3,0.9),x(1,1.4),x(3,3),x(3.1,2.8),x(3.2,3.4),x(2.7,3.3),x(2.6,2.9)},
类别数k=2
采用k-means算法进行聚类。
用JAVA语言编写程序实现聚类
代码:
Demo.java:
package com.k_means; import java.util.ArrayList; public class Demo { public static void main(String[] args) { //初始化一个Kmean对象,将k置为2 int num; System.out.println("begin:"); num=2; k_means k=new k_means(num); ArrayList<float[]> dataSet=new ArrayList<float[]>(); ReadData rd=new ReadData(); dataSet=rd.read(); //设置原始数据集 k.setDataSet(dataSet); //执行算法 k.kmeans(); //得到聚类结果 ArrayList<ArrayList<float[]>> cluster=k.getCluster(); //查看结果 for(int i=0;i<cluster.size();i++) { k.printDataArray(cluster.get(i), "cluster["+i+"]"); } } }
k_means.java:
package com.k_means; import java.util.ArrayList; import java.util.Random; public class k_means { private int k;// 分成多少簇 private int m;// 迭代次数 private int dataSetLength;// 数据集元素个数,即数据集的长度 private ArrayList<float[]> dataSet;// 数据集链表 private ArrayList<float[]> center;// 中心链表 private ArrayList<ArrayList<float[]>> cluster; // 簇 private ArrayList<Float> jc;// 误差平方和,k越接近dataSetLength,误差越小 private Random random; public void setDataSet(ArrayList<float[]> dataSet) { //设置需分组的原始数据集 this.dataSet = dataSet; } public ArrayList<ArrayList<float[]>> getCluster() { return cluster; } public k_means(int k) { //传入需要分成的簇数量 if (k <= 0) { k = 1; } this.k = k; } private void init() { //初始化 m = 0; random = new Random(); if (dataSet == null || dataSet.size() == 0) { System.out.println("数据为空,请输入数据!!!!"); } else{ dataSetLength = dataSet.size(); if (k > dataSetLength) { k = dataSetLength; } center = initCenters(); cluster = initCluster(); jc = new ArrayList<Float>(); } } private ArrayList<float[]> initCenters() { //初始化中心数据链表,分成多少簇就有多少个中心点 ArrayList<float[]> center = new ArrayList<float[]>(); int[] randoms = new int[k]; boolean flag; int temp = random.nextInt(dataSetLength); randoms[0] = temp; for (int i = 1; i < k; i++) { flag = true; while (flag) { temp = random.nextInt(dataSetLength); int j = 0; while (j < i) { if (temp == randoms[j]) { break; } j++; } if (j == i) { flag = false; } } randoms[i] = temp; } for (int i = 0; i < k; i++) { center.add(dataSet.get(randoms[i]));// 生成初始化中心链表 } return center; } private ArrayList<ArrayList<float[]>> initCluster() { //初始化簇集合 ArrayList<ArrayList<float[]>> cluster = new ArrayList<ArrayList<float[]>>(); for (int i = 0; i < k; i++) { cluster.add(new ArrayList<float[]>()); } return cluster; } private float distance(float[] element, float[] center) { //计算两个点之间的距离 float distance = 0.0f; float x = element[0] - center[0]; float y = element[1] - center[1]; float z = x * x + y * y; distance = (float) Math.sqrt(z); return distance; } private int minDistance(float[] 
distance) { //获取距离集合中最小距离的位置 float minDistance = distance[0]; int minLocation = 0; for (int i = 1; i < distance.length; i++) { if (distance[i] < minDistance) { minDistance = distance[i]; minLocation = i; } else if (distance[i] == minDistance) // 如果相等,随机返回一个位置 { if (random.nextInt(10) < 5) { minLocation = i; } } } return minLocation; } private void clusterSet() { //将当前元素放到最小距离中心相关的簇中 float[] distance = new float[k]; for (int i = 0; i < dataSetLength; i++) { for (int j = 0; j < k; j++) { distance[j] = distance(dataSet.get(i), center.get(j)); } int minLocation = minDistance(distance); cluster.get(minLocation).add(dataSet.get(i)); } } private float errorSquare(float[] element, float[] center) { //求两点误差平方的方法 float x = element[0] - center[0]; float y = element[1] - center[1]; float errSquare = x * x + y * y; return errSquare; } private void countRule() { //计算误差平方和准则函数方法 float jcF = 0; for (int i = 0; i < cluster.size(); i++) { for (int j = 0; j < cluster.get(i).size(); j++) { jcF += errorSquare(cluster.get(i).get(j), center.get(i)); } } jc.add(jcF); } private void setNewCenter() { //设置新的簇中心方法 for (int i = 0; i < k; i++) { int n = cluster.get(i).size(); if (n != 0) { float[] newCenter = { 0, 0 }; for (int j = 0; j < n; j++) { newCenter[0] += cluster.get(i).get(j)[0]; newCenter[1] += cluster.get(i).get(j)[1]; } // 设置一个平均值 newCenter[0] = newCenter[0] / n; newCenter[1] = newCenter[1] / n; center.set(i, newCenter); } } } public void printDataArray(ArrayList<float[]> dataArray, String dataArrayName) { //打印数据 for (int i = 0; i < dataArray.size(); i++) { System.out.println("print:(" + dataArray.get(i)[0] + "," + dataArray.get(i)[1]+")"); } System.out.println("==================================="); } void kmeans() { init(); // 循环分组,直到误差不变为止 while (true) { clusterSet(); countRule(); // 误差不变了,分组完成 if (m != 0) { if (jc.get(m) - jc.get(m - 1) == 0) { break; } } setNewCenter(); m++; cluster.clear(); cluster = initCluster(); } } }
ReadData.java:
package com.k_means; import java.util.ArrayList; public class ReadData { // 从文件中读取数据 public ArrayList<float[]> read() { ArrayList<float[]> arr = new ArrayList<float[]>(); float[][] point1 = new float[11][20]; point1[0][0] = 1; point1[0][1] = 1; point1[1][0] = (float) 1.2; point1[1][1] = (float) 1.2; point1[2][0] = (float) 0.8; point1[2][1] = (float) 1.2; point1[3][0] = (float) 0.9; point1[3][1] = (float) 0.7; point1[4][0] = (float) 1.3; point1[4][1] = (float) 0.9; point1[5][0] = (float) 1; point1[5][1] = (float) 1.4; point1[6][0] = (float) 3; point1[6][1] = (float) 3; point1[7][0] = (float) 3.1; point1[7][1] = (float) 2.8; point1[8][0] = (float) 3.2; point1[8][1] = (float) 3.4; point1[9][0] = (float) 2.7; point1[9][1] = (float) 3.3; point1[10][0] = (float) 2.6; point1[10][1] = (float) 2.9; arr.add(point1[0]); arr.add(point1[1]); arr.add(point1[2]); arr.add(point1[3]); arr.add(point1[4]); arr.add(point1[5]); arr.add(point1[6]); arr.add(point1[7]); arr.add(point1[8]); arr.add(point1[9]); arr.add(point1[10]); return arr; } }