Java实现LSH（Locality Sensitive Hashing，局部敏感哈希）

　　在对大批量数据进行图像处理的时候，比如说我提取SIFT特征，数据集为10W张图片，一个SIFT特征点是128维，一张图片提取出500个特征点，这样我们在处理的时候就是对5000万个128维的数据进行处理，这样处理所需要的耗时太长了，不符合实际生产的需要。我们需要用一种方法降低运算量，比如说降维。

　　看了一些论文，提到较多的方法是LSH（Locality Sensitive Hashing），即局部敏感哈希。我们利用LSH方法在5000万个特征点中筛选出极少量我们需要的特征点，再对这些极少量的数据进行计算，就可以得到我们想要的结果啦。

 package com.demo.lsh;

 import com.demo.config.Constant;

 import com.demo.dao.FeatureDao;

 import com.demo.dao.FeatureTableDao;

 import com.demo.dao.HashTableDao;

 import com.demo.entity.HashTable;

 import com.demo.utils.MD5Util;

 import com.demo.utils.MathUtil;

 import org.opencv.core.Mat;

 import org.springframework.util.StringUtils;

 import java.io.*;

 import java.security.MessageDigest;

 import java.security.NoSuchAlgorithmException;

 import java.util.*;

 /**
  * Bit-sampling Locality Sensitive Hashing over integer feature vectors.
  *
  * <p>Each vector is viewed as a unary-encoded bit string of {@code dimention * max} bits
  * (a component with value {@code v} contributes {@code v} ones followed by zeros).
  * Every hash table samples {@code bitCount} fixed bit positions of that string; the sampled
  * bits are then compressed with MD5 to form the bucket key.
  *
  * <p>The sampled positions are generated randomly on first use and persisted to
  * {@link #HASH_FAMILY_FILE}, because keys are only comparable across runs when the same
  * positions are used.
  */
 public class LSH {

     /** File in which the randomly sampled bit positions are persisted between runs. */
     private static final String HASH_FAMILY_FILE = "/home/fanxuan/data/1.txt";

     /** Number of components per vector, e.g. 128 for SIFT descriptors. */
     private final int dimention;

     /** Upper bound of a single vector component, e.g. 255 for RGB values. */
     private final int max;

     /** Number of hash tables. */
     private final int hashCount;

     /** Number of bit positions sampled per hash table. */
     private final int bitCount;

     /** Length of the unary-encoded bit string: {@code dimention * max}. */
     private final int size;

     /** Hash family: for every hash table, the indexes of the sampled bit positions. */
     private final int[][] hashFamily;

     private final HashTableDao hashTableDao;

     /**
      * Creates an LSH instance configured from {@link Constant} and loads (or creates and
      * persists) the hash family.
      *
      * @param hashTableDao DAO used to read and write hash buckets
      * @throws IllegalStateException if a persisted hash family exists but cannot be read or
      *         does not fit the configured parameters
      */
     public LSH(HashTableDao hashTableDao) {
         this.hashTableDao = hashTableDao;
         this.dimention = Constant.DIMENTION;
         this.max = Constant.MAX;
         this.hashCount = Constant.HASHCOUNT;
         this.bitCount = Constant.BITCOUNT;
         this.size = dimention * max;
         this.hashFamily = generataHashFamily();
     }

     /**
      * Returns the hash family: the persisted one when {@link #HASH_FAMILY_FILE} exists,
      * otherwise a freshly generated random one, which is then persisted.
      */
     private int[][] generataHashFamily() {
         File file = new File(HASH_FAMILY_FILE);
         if (file.exists()) {
             return loadHashFamily(file);
         }
         int[][] family = createRandomHashFamily();
         storeHashFamily(file, family);
         return family;
     }

     /**
      * Reads the persisted hash family and checks that it is usable with the current parameters.
      * Failing here is deliberate: continuing with an empty or different family would silently
      * produce keys that do not match the buckets already stored.
      */
     private int[][] loadHashFamily(File file) {
         Object stored;
         // NOTE(review): Java-native deserialization; the file is assumed to be written only
         // by this class. Do not point HASH_FAMILY_FILE at untrusted data.
         try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(file))) {
             stored = in.readObject();
         } catch (IOException | ClassNotFoundException e) {
             throw new IllegalStateException("Cannot read LSH hash family from " + file, e);
         }
         if (!(stored instanceof int[][])) {
             throw new IllegalStateException("Unexpected content in LSH hash family file " + file);
         }
         int[][] family = (int[][]) stored;
         if (family.length < hashCount) {
             throw new IllegalStateException("LSH hash family in " + file + " has "
                     + family.length + " tables, expected at least " + hashCount);
         }
         for (int i = 0; i < hashCount; i++) {
             int[] row = family[i];
             if (row == null || row.length < bitCount) {
                 throw new IllegalStateException("LSH hash family in " + file
                         + " has too few sampled positions in table " + i);
             }
             for (int j = 0; j < bitCount; j++) {
                 if (row[j] < 0 || row[j] >= size) {
                     throw new IllegalStateException("LSH hash family in " + file
                             + " contains position " + row[j] + " outside [0, " + size + ")");
                 }
             }
         }
         return family;
     }

     /** Draws {@code bitCount} random bit positions in {@code [0, size)} for every hash table. */
     private int[][] createRandomHashFamily() {
         Random random = new Random();
         int[][] family = new int[hashCount][bitCount];
         for (int[] row : family) {
             for (int j = 0; j < row.length; j++) {
                 row[j] = random.nextInt(size);
             }
         }
         return family;
     }

     /**
      * Persists the hash family. A failure is reported but not fatal (as before): the family
      * stays usable in memory for the lifetime of this instance.
      */
     private void storeHashFamily(File file, int[][] family) {
         try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(file))) {
             out.writeObject(family);
         } catch (IOException e) {
             e.printStackTrace();
             // Do not leave a truncated file behind; it would be unreadable on the next start.
             if (file.exists() && !file.delete()) {
                 System.err.println("Could not remove incomplete LSH hash family file " + file);
             }
         }
     }

     /**
      * Returns bit {@code index} of the unary encoding of {@code data} without materialising
      * the whole {@code size}-element encoding.
      *
      * <p>Component {@code d} occupies positions {@code [d * max, (d + 1) * max)}, of which the
      * first {@code data[d]} are 1. Missing components and negative values yield 0; values
      * above {@code max} behave like {@code max}.
      */
     private int unaryBit(int[] data, int index) {
         int component = index / max;
         int offset = index % max;
         return (component < data.length && offset < data[component]) ? 1 : 0;
     }

     /**
      * Maps a vector to its bucket key in hash table {@code hashNum}.
      *
      * @param list    feature vector
      * @param hashNum index of the hash table, in {@code [0, hashCount)}
      * @return hex form (via {@code MD5Util.bufferToHex}) of the MD5 digest of the sampled bits
      */
     private String generateHashKey(int[] list, int hashNum) {
         Objects.requireNonNull(list, "list");
         int[] sampledPositions = hashFamily[hashNum];

         // Sample the configured positions of the unary bit string.
         StringBuilder bits = new StringBuilder(bitCount);
         for (int i = 0; i < bitCount; i++) {
             bits.append(unaryBit(list, sampledPositions[i]));
         }

         // Compress the sampled bits with a conventional hash function.
         MessageDigest md5;
         try {
             md5 = MessageDigest.getInstance("MD5");
         } catch (NoSuchAlgorithmException e) {
             throw new IllegalStateException("MD5 digest is not available", e);
         }
         // The string holds only '0' and '1'; an explicit charset keeps keys platform independent.
         byte[] digest = md5.digest(bits.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
         return MD5Util.bufferToHex(digest);
     }

     /**
      * Registers a feature vector in every hash table: the feature id is appended to the
      * comma-separated id list of the matching bucket, or a new bucket is inserted.
      *
      * <p>NOTE(review): the lookup followed by update/insert is not atomic; concurrent callers
      * hitting the same bucket may lose ids unless the DAO or database serialises them.
      *
      * @param id        not used by this method
      * @param vercotr   feature vector to index
      * @param featureId id stored in the buckets
      */
     public void generateHashMap(String id, int[] vercotr, int featureId) {
         for (int table = 0; table < hashCount; table++) {
             String key = generateHashKey(vercotr, table);
             HashTable existing = hashTableDao.findHashTableByBucketId(key);

             HashTable bucket = new HashTable();
             bucket.setBucketId(key);
             if (existing == null) {
                 bucket.setFeatureId(String.valueOf(featureId));
                 hashTableDao.insertHashTable(bucket);
                 continue;
             }

             // Avoid producing "null,<id>" or ",<id>" when the stored list is empty.
             String storedIds = existing.getFeatureId();
             bucket.setFeatureId(StringUtils.hasLength(storedIds)
                     ? storedIds + "," + featureId
                     : String.valueOf(featureId));
             hashTableDao.updateHashTableFeatureId(bucket);
         }
     }

     /**
      * Collects the candidates that share a bucket with {@code data} in any hash table.
      *
      * <p>For every hash table the returned list contains the bucket key itself, followed by
      * the feature ids stored in that bucket (none if the bucket does not exist).
      *
      * @param data query vector
      * @return bucket keys interleaved with the feature ids found; never {@code null}
      */
     public List<String> queryList(int[] data) {
         List<String> result = new ArrayList<>();
         for (int table = 0; table < hashCount; table++) {
             String key = generateHashKey(data, table);
             result.add(key);

             HashTable bucket = hashTableDao.findHashTableByBucketId(key);
             if (bucket == null || !StringUtils.hasLength(bucket.getFeatureId())) {
                 continue; // unknown or empty bucket: nothing to add for this table
             }
             result.addAll(Arrays.asList(bucket.getFeatureId().split(",")));
         }
         return result;
     }

 }

 package com.demo.config;

 /**
  * Tuning parameters of the LSH index. Constants holder; not meant to be instantiated.
  */
 public final class Constant {

     /** Number of components per feature vector, e.g. 128 for SIFT descriptors. */
     public static final int DIMENTION = 128;

     /** Upper bound of a single vector component, e.g. 255 for RGB values. */
     public static final int MAX = 255;

     /** Number of hash tables; more tables further reduce false positives. */
     public static final int HASHCOUNT = 12;

     /**
      * Number of bit positions sampled per hash table. Smaller values make the lookup more
      * tolerant (approximate) but raise the false-positive rate; sampling every bit of the
      * encoded vector would turn the lookup into an exact match.
      */
     public static final int BITCOUNT = 32;

     private Constant() {
         // constants only
     }

 }

　　简单地介绍下代码：构造函数LSH（）用来建立LSH对象，hashTableDao为数据表操作对象，不多说；因为局部敏感哈希依赖于一套随机数，每次产生的结果都不一致，所以我们需要在程序第一次运行的时候将随机数生成并固定下来，我采用的方法是存放在本地磁盘中，也可以存放在数据库中。generateHashMap（）方法为数据训练函数，int[] vercotr为特征向量，其他两个参数为我需要的标志位。queryList（）方法是筛选方法。

　　感谢http://grunt1223.iteye.com/blog/944894的文章。