文本挖掘之文本聚类（OPTICS）

简介

　　鉴于DBSCAN算法对输入参数（邻域半径E和阈值M）比较敏感，参数调优比较麻烦，因此本文对另一种基于密度的聚类算法OPTICS（Ordering Points To Identify the Clustering Structure）展开研究。该算法是DBSCAN的改进算法，与DBSCAN相比，它对输入参数不敏感。此外，OPTICS算法并不显式地生成数据聚类，而只是对数据对象集合中的对象进行排序，得到一个有序的对象列表，其中包含了足够的信息用来提取聚类。在实际应用中，可利用该有序的对象序列，对数据的分布以及数据之间的关联进行分析。

基本概念

　　由于OPTICS是对DBSCAN算法的一种改进，因此许多概念是共用的，如核心对象、（直接）密度可达、密度相连等，具体内容参考DBSCAN。在上述内容的基础上，本文再引入两个核心概念。

　　(1) 核心距离

　　在数据集合D中，对于给定的参数E和M，称使得p成为核心对象的最小邻域半径为p的核心距离。具体数学表达式如下所示：

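$$
\text{core-dist}_{E,M}(p)=
\begin{cases}
\text{UNDEFINED}, & |N_E(p)| < M \\
p\ \text{到}\ N_E(p)\ \text{中各对象的距离里第}\ M\ \text{小的值}, & |N_E(p)| \ge M
\end{cases}
$$

　　其中 $N_E(p)$ 表示对象p的E邻域内的对象集合。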

　　通俗地说，在给定的参数E和M下，p的核心距离为p到其E邻域内各对象的距离值中第M小的值（若采用相似度来度量，则为第M大的值）；距离的度量方式可以采用欧氏距离、余弦相似度或基于Word2Vec的相似度等。

　　(2) 可达距离

　　在数据集合D中，对于给定的参数E和M，称对象p的核心距离与p、o之间的距离二者中的较大值为o关于p的可达距离；若p不是核心对象，则该可达距离未定义。具体数学表达式如下所示：

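$$
\text{reach-dist}_{E,M}(o,p)=
\begin{cases}
\text{UNDEFINED}, & |N_E(p)| < M \\
\max\bigl(\text{core-dist}_{E,M}(p),\ \text{dist}(p,o)\bigr), & |N_E(p)| \ge M
\end{cases}
$$

　　其中 $N_E(p)$ 表示对象p的E邻域内的对象集合，$\text{dist}(p,o)$ 表示p与o之间的距离。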

　　程序伪代码(参考维基百科):

 // Main loop: visits every point of DB once and emits the points as an ordered list.
 // DB is the point set, eps the neighbourhood radius, MinPts the density threshold.
 OPTICS(DB, eps, MinPts)

     // Start with every reachability-distance unset.
     for each point p of DB

        p.reachability-distance = UNDEFINED

     for each unprocessed point p of DB

        N = getNeighbors(p, eps)

        mark p as processed

        output p to the ordered list

        // Only a point whose core-distance is defined starts an expansion.
        if (core-distance(p, eps, Minpts) != UNDEFINED)

           Seeds = empty priority queue

           update(N, p, Seeds, eps, Minpts)

           // Seeds can receive new entries while it is being consumed,
           // because update() is called again inside this loop.
           for each next q in Seeds

              N' = getNeighbors(q, eps)

              mark q as processed

              output q to the ordered list

              if (core-distance(q, eps, Minpts) != UNDEFINED)

                 update(N', q, Seeds, eps, Minpts)

 // Refreshes the reachability-distance of each unprocessed neighbour o in N as seen from p,
 // inserting o into Seeds or re-prioritising it when the new value is smaller.
 update(N, p, Seeds, eps, Minpts)

     coredist = core-distance(p, eps, MinPts)

     for each o in N

        if (o is not processed)

           // Reachability of o from p is never smaller than p's core-distance.
           new-reach-dist = max(coredist, dist(p,o))

           if (o.reachability-distance == UNDEFINED) // o is not in Seeds

               o.reachability-distance = new-reach-dist

               Seeds.insert(o, new-reach-dist)

           else               // o in Seeds, check for improvement

               if (new-reach-dist < o.reachability-distance)

                  o.reachability-distance = new-reach-dist

                  Seeds.move-up(o, new-reach-dist)

　　程序源代码：

 import java.util.List;

 import com.gta.cosine.ElementDict;

 /**
  * One clustering input item: a list of {@link ElementDict} terms plus the mutable
  * bookkeeping values (distances and visited flag) that a clustering pass attaches to it.
  *
  * <p>All three distance fields start at {@code -1}, which stands for "not assigned yet".
  */
 public class DataPoint {

     /** Initial value of every distance field, meaning "not assigned yet". */
     private static final double UNDEFINED = -1;

     /** Terms of this item; the reference is never reassigned after construction. */
     private final List<ElementDict> terms;

     private double initDistance;
     private double coreDistance;
     private double reachableDistance;
     private boolean isVisited;

     /**
      * Creates an unvisited point whose distances are all unassigned ({@code -1}).
      *
      * @param terms the terms of this item; stored by reference, not copied
      */
     public DataPoint(List<ElementDict> terms) {
         this.terms = terms;
         this.initDistance = UNDEFINED;
         this.coreDistance = UNDEFINED;
         this.reachableDistance = UNDEFINED;
         this.isVisited = false;
     }

     /** @return the stored core distance, or {@code -1} if none was set */
     public double getCoreDistance() {
         return coreDistance;
     }

     /** @param coreDistance the core distance to store */
     public void setCoreDistance(double coreDistance) {
         this.coreDistance = coreDistance;
     }

     /** @return the stored reachability distance, or {@code -1} if none was set */
     public double getReachableDistance() {
         return reachableDistance;
     }

     /** @param reachableDistance the reachability distance to store */
     public void setReachableDistance(double reachableDistance) {
         this.reachableDistance = reachableDistance;
     }

     /** @return {@code true} once this point has been marked as visited */
     public boolean getIsVisitLabel() {
         return isVisited;
     }

     /** @param isVisited the new visited flag */
     public void setIsVisitLabel(boolean isVisited) {
         this.isVisited = isVisited;
     }

     /** @return the stored initial distance, or {@code -1} if none was set */
     public double getInitDistance() {
         return initDistance;
     }

     /** @param initDistance the initial distance to store */
     public void setInitDistance(double initDistance) {
         this.initDistance = initDistance;
     }

     /** @return the term list passed to the constructor (same reference, not a copy) */
     public List<ElementDict> getAllElements() {
         return terms;
     }

     /**
      * @param index position in the term list
      * @return the term at {@code index}
      */
     public ElementDict getElement(int index) {
         return terms.get(index);
     }

     /**
      * Compares two points term by term, in order.
      *
      * <p>This is an overload, not an override, of {@link Object#equals(Object)}:
      * collection operations such as {@code List.remove(Object)} or
      * {@code PriorityQueue.remove(Object)} do not call it and keep comparing
      * {@code DataPoint}s by identity. The distance fields and the visited flag are
      * not part of the comparison.
      *
      * @param dp the point to compare with; {@code null} yields {@code false}
      * @return {@code true} if both term lists have the same length and equal terms
      *         at every position
      */
     public boolean equals(DataPoint dp) {
         if (dp == null) {
             return false;
         }

         List<ElementDict> ed1 = getAllElements();
         List<ElementDict> ed2 = dp.getAllElements();

         int len = ed1.size();
         if (len != ed2.size()) {
             return false;
         }

         // Explicit element-wise loop: keeps calling equals on the ElementDict static type.
         for (int i = 0; i < len; i++) {
             if (!ed1.get(i).equals(ed2.get(i))) {
                 return false;
             }
         }
         return true;
     }

 }

 import java.util.Comparator;

 import java.util.List;

 import java.util.ArrayList;

 import java.util.Collections;

 import java.util.Queue;

 import java.util.PriorityQueue;

 import com.gta.cosine.ElementDict;

 import com.gta.cosine.TextCosine;

 /**
  * OPTICS-style ordering of text items.
  *
  * <p>Strings are tokenized through {@link TextCosine#tokenizer}, wrapped in
  * {@link DataPoint}s, and then arranged into an ordered list in which every point
  * carries a core value and a reachability value. Pairwise scores come from
  * {@link TextCosine#analysisText}.
  *
  * <p>NOTE(review): {@link #getNeighbors} keeps points whose score is {@code >= eps},
  * and both the neighbour sort and the seed queue put the largest score first, which
  * suggests "larger means closer". {@link #update} nevertheless takes the maximum of
  * core value and pair score and only replaces a stored value by a smaller one, as the
  * distance-based textbook formulation does. The orientation is left exactly as it was;
  * confirm which one is intended.
  *
  * <p>Instances hold mutable state and are not designed for concurrent use.
  */
 public class OPTICS {

     /** Marker for "no value"; equals the initial value of DataPoint's distance fields. */
     private static final double UNDEFINED = -1;

     /**
      * Largest {@code initDistance} first. Uses {@link Double#compare} so that equal
      * values compare as 0, as the {@link Comparator} contract requires.
      */
     private static final Comparator<DataPoint> BY_INIT_DISTANCE_DESC = new Comparator<DataPoint>() {
         @Override
         public int compare(DataPoint dp1, DataPoint dp2) {
             return Double.compare(dp2.getInitDistance(), dp1.getInitDistance());
         }
     };

     /**
      * Largest {@code reachableDistance} first. Uses {@link Double#compare} so that equal
      * values compare as 0, as the {@link Comparator} contract requires.
      */
     private static final Comparator<DataPoint> BY_REACHABLE_DISTANCE_DESC = new Comparator<DataPoint>() {
         @Override
         public int compare(DataPoint dp1, DataPoint dp2) {
             return Double.compare(dp2.getReachableDistance(), dp1.getReachableDistance());
         }
     };

     private double            eps;
     private int               minPts;
     private TextCosine        cosine;
     private List<DataPoint>   dataPoints;
     private List<DataPoint>   orderList;

     /**
      * @param eps    threshold a pair score must reach ({@code >=}) to count as a neighbour
      * @param minPts number of neighbours required for a point to get a core value
      * @throws IllegalArgumentException if {@code minPts} is smaller than 1
      */
     public OPTICS(double eps, int minPts) {
         // coreDistance() reads neighbors.get(minPts - 1); anything below 1 can never work.
         if (minPts < 1) {
             throw new IllegalArgumentException("minPts must be >= 1, got " + minPts);
         }
         this.eps = eps;
         this.minPts = minPts;
         this.cosine = new TextCosine();
         this.dataPoints = new ArrayList<DataPoint>();
         this.orderList = new ArrayList<DataPoint>();
     }

     /**
      * Tokenizes {@code s} and appends the resulting point to the internal data set.
      *
      * @param s raw text of one item
      */
     public void addPoint(String s) {
         List<ElementDict> ed = cosine.tokenizer(s);
         dataPoints.add(new DataPoint(ed));
     }

     /**
      * Returns the {@code minPts}-th largest {@code initDistance} among {@code neighbors}.
      *
      * <p>Side effect: {@code neighbors} is sorted in place, largest
      * {@code initDistance} first.
      *
      * @param neighbors neighbour list whose {@code initDistance} values are current
      * @return the core value, or {@code -1} if there are fewer than {@code minPts} neighbours
      */
     public double coreDistance(List<DataPoint> neighbors) {
         if (neighbors.size() < minPts) {
             return UNDEFINED;
         }
         Collections.sort(neighbors, BY_INIT_DISTANCE_DESC);
         return neighbors.get(minPts - 1).getInitDistance();
     }

     /**
      * Scores a pair of points with {@link TextCosine#analysisText}.
      *
      * @return the score reported by {@code TextCosine} (presumably a cosine
      *         similarity; verify against {@code TextCosine})
      */
     public double cosineDistance(DataPoint p, DataPoint q) {
         List<ElementDict> vec1 = p.getAllElements();
         List<ElementDict> vec2 = q.getAllElements();
         return cosine.analysisText(vec1, vec2);
     }

     /**
      * Collects every point of {@code points} whose score against {@code p} is at
      * least {@code eps}.
      *
      * <p>Side effect: each returned point has its {@code initDistance} overwritten
      * with its score against {@code p}, so values from an earlier call are lost.
      *
      * @param p      the reference point
      * @param points candidate points ({@code p} itself is not excluded)
      * @return a new list holding the neighbours of {@code p}
      */
     public List<DataPoint> getNeighbors(DataPoint p, List<DataPoint> points) {
         List<DataPoint> neighbors = new ArrayList<DataPoint>();
         for (DataPoint q : points) {
             double countDistance = cosineDistance(p, q);
             if (countDistance >= eps) {
                 q.setInitDistance(countDistance);
                 neighbors.add(q);
             }
         }
         return neighbors;
     }

     /**
      * Builds the ordering: every unvisited point is emitted, and if it has a core
      * value its neighbourhood is expanded through a priority queue of seeds.
      *
      * @param points the points to order; their bookkeeping fields are modified
      */
     public void cluster(List<DataPoint> points) {
         for (DataPoint point : points) {
             if (point.getIsVisitLabel()) {
                 continue;
             }

             List<DataPoint> neighbors = getNeighbors(point, points);
             point.setIsVisitLabel(true);
             orderList.add(point);

             double cd = coreDistance(neighbors);
             if (cd == UNDEFINED) {
                 continue;
             }
             point.setCoreDistance(cd);

             Queue<DataPoint> seeds = new PriorityQueue<DataPoint>(16, BY_REACHABLE_DISTANCE_DESC);
             update(point, neighbors, seeds, orderList);

             while (!seeds.isEmpty()) {
                 DataPoint q = seeds.poll();
                 List<DataPoint> newNeighbors = getNeighbors(q, points);
                 q.setIsVisitLabel(true);
                 orderList.add(q);

                 double qCore = coreDistance(newNeighbors);
                 if (qCore != UNDEFINED) {
                     // Record the core value for expanded points too, so showCluster()
                     // does not report -1 for them.
                     q.setCoreDistance(qCore);
                     update(q, newNeighbors, seeds, orderList);
                 }
             }
         }
     }

     /**
      * Assigns or refreshes the reachability value of every neighbour of {@code p}
      * and keeps {@code seeds} in step.
      *
      * <ul>
      *   <li>Unvisited neighbour without a value: value set, neighbour queued.</li>
      *   <li>Unvisited neighbour with a larger stored value: re-queued with the new value.</li>
      *   <li>Visited neighbour without a value: value set; if it can be removed from
      *       {@code seqList} it is queued again. This departs from the textbook
      *       algorithm, which ignores processed points.</li>
      * </ul>
      *
      * @param p         the point being expanded
      * @param neighbors neighbours of {@code p}; sorted in place by {@link #coreDistance}
      * @param seeds     priority queue of pending points
      * @param seqList   the ordered output list
      */
     public void update(DataPoint p, List<DataPoint> neighbors, Queue<DataPoint> seeds, List<DataPoint> seqList) {
         double coreDistance = coreDistance(neighbors);

         for (DataPoint point : neighbors) {
             double cosineDistance = cosineDistance(p, point);
             double reachableDistance = Math.max(coreDistance, cosineDistance);

             if (!point.getIsVisitLabel()) {
                 if (point.getReachableDistance() == UNDEFINED) {
                     point.setReachableDistance(reachableDistance);
                     seeds.add(point);
                 } else if (point.getReachableDistance() > reachableDistance) {
                     // Remove before changing the key: a PriorityQueue does not
                     // re-order an element whose priority is changed while queued.
                     if (seeds.remove(point)) {
                         point.setReachableDistance(reachableDistance);
                         seeds.add(point);
                     }
                 }
             } else if (point.getReachableDistance() == UNDEFINED) {
                 point.setReachableDistance(reachableDistance);
                 if (seqList.remove(point)) {
                     seeds.add(point);
                 }
             }
         }
     }

     /** Prints every ordered point's terms, core value and reachability value to standard output. */
     public void showCluster() {
         for (DataPoint point : orderList) {
             List<ElementDict> ed = point.getAllElements();
             for (ElementDict e : ed) {
                 System.out.print(e.getTerm() + "  ");
             }
             System.out.println();
             System.out.println("core:  " + point.getCoreDistance());
             System.out.println("reach: " + point.getReachableDistance());
             System.out.println("***************************************");
         }
     }

     /** Orders all points added through {@link #addPoint} and prints the result. */
     public void analysis() {
         cluster(dataPoints);
         showCluster();
     }

     /**
      * Finds the iteration position of the first element of {@code points} that
      * {@code o.equals(DataPoint)} accepts.
      *
      * @param o      the point to look for
      * @param points the queue to scan, in its iteration order
      * @return the zero-based position of the first match, or {@code points.size()}
      *         when nothing matches
      */
     public int IndexOfList(DataPoint o, Queue<DataPoint> points) {
         int index = 0;
         for (DataPoint p : points) {
             if (o.equals(p)) {
                 break;
             }
             index++;
         }
         return index;
     }

 }

　　本文计算距离时采用余弦相似度，具体内容参考本系列之文本挖掘之文本相似度判定。此外，本文经过分析发现，某些对象（例如某个边界对象）在先前已被访问过，其核心距离仍保持为初始值；若严格按照伪代码处理，所得结果与DBSCAN的结果有些出入。因此本文作者对OPTICS做了一点修改，使这类对象的可达距离仍能被修改，并将其添加至列表中，从而在整体上使其处理结果与DBSCAN算法的处理结果保持一致。本文作者认为这样做是有效的，而且存在一定的必要性，若有更好的解决方案，请联系我。

　　由于OPTICS算法所获取的是对象的有序列表，对后续数据分析、挖掘，具有较高的应用价值，因此，该算法可以作为数据预处理的前奏部分。但是，该算法由于需要维护优先级队列，因而在效率上有点影响。

　　作者：志青云集
　　出处：http://www.cnblogs.com/lyssym
　　如果，您认为阅读这篇博客让您有些收获，不妨点击一下右下角的【推荐】。
　　如果，您希望更容易地发现我的新博客，不妨点击一下左下角的【关注我】。
　　如果，您对我的博客所讲述的内容有兴趣，请继续关注我的后续博客，我是【志青云集】。
　　本文版权归作者和博客园共有，欢迎转载，但未经作者同意必须保留此段声明，且在文章页面明显位置给出原文连接。