hrust工程化学习(六)----最近邻搜索

0. 简介

上一章我们主要讲了噪声滤除这种比较高级的点云处理算法，下面这一章我们将来看一下最近邻搜索。最近邻搜索（Nearest Neighbor Search）是一种广泛应用于机器学习、计算机视觉和自然语言处理等领域的算法。它的主要目的是找到一个给定数据集中与查询数据最相似的数据点。

1. CUDA与Thrust

在传统的CPU计算中，最近邻搜索通常需要对每个数据点与查询点进行一次比较。这样的计算复杂度在数据集较大时会变得非常高，导致查询时间非常慢。而使用CUDA和Thrust方法可以加速这个过程，大大提高最近邻搜索的效率。

CUDA是一种并行计算架构，它可以利用GPU的强大计算能力来加速各种计算任务。在最近邻搜索中，可以使用CUDA实现并行计算，将每个数据点与查询点之间的比较分配给GPU处理。这种方法可以大大缩短查询时间，并且可以处理大量的数据集。Thrust是一个基于CUDA的C++模板库，提供了一些高效的并行算法和数据结构。它可以轻松地将串行代码转换为并行代码，从而使代码更加高效。在最近邻搜索中，可以使用Thrust实现并行排序和并行查找算法，从而提高搜索效率。

2. CUDA代码完成加速

下面这段代码是基于哈希表的最近邻搜索算法的CUDA实现。该算法主要通过将三维点云分割成一个个小的栅格，并将点云放入栅格中，以此实现快速查找每个点云的最近邻。具体实现方法如下：

首先，将点云分为两部分，分别为d_first_point_cloud和d_second_point_cloud，其中d_second_point_cloud为需要查找最近邻的点云。然后，通过调用CUDA核函数kernel_nearestNeighborSearch，对每个d_second_point_cloud中的点进行查找。

在核函数中，**首先根据点的位置确定该点所在栅格的索引。如果该点不在栅格内，则将最近邻索引设为-1。**接下来，遍历与该点所在栅格相邻的栅格，寻找距离该点最近的点，并返回该点的索引。

在遍历相邻的栅格时，考虑两种情况：如果栅格与该点所在栅格相同，则在栅格中查找距离该点最近的点，并将最近邻的索引存储在nn_index中；否则，在该栅格中找到一定数量的最近邻点，将它们与nn_index中的点进行比较，并更新nn_index中的最近邻点。

最后，将每个点的最近邻点的索引存储在d_nearest_neighbour_indexes数组中，以供后续使用。

/nearest neighbourhood search
/*
最近邻搜索核函数
param:
d_first_point_cloud:第一个点云
number_of_points_first_point_cloud:第一个点云的点数
d_second_point_cloud:第二个点云
number_of_points_second_point_cloud:第二个点云的点数
d_hashTable:hash表
d_buckets:栅格
rgd_params:网格参数
search_radius:搜索半径
max_number_considered_in_INNER_bucket:在内桶中考虑的最大点数
max_number_considered_in_OUTER_bucket:在外桶中考虑的最大点数
d_nearest_neighbour_indexes:最近邻索引
*/
__global__ void kernel_nearestNeighborSearch(
	pcl::PointXYZ *d_first_point_cloud,
	int number_of_points_first_point_cloud,
	pcl::PointXYZ *d_second_point_cloud,
	int number_of_points_second_point_cloud,
	hashElement *d_hashTable,
	bucket *d_buckets,
	gridParameters rgd_params,
	float search_radius,
	int max_number_considered_in_INNER_bucket,
	int max_number_considered_in_OUTER_bucket,
	int *d_nearest_neighbour_indexes)
{
	int index_of_point_second_point_cloud = blockIdx.x * blockDim.x + threadIdx.x; // 第二个点云的点索引

	if (index_of_point_second_point_cloud < number_of_points_second_point_cloud) // 如果索引小于第二个点云的点数
	{
		bool isok = false;

		float x = d_second_point_cloud[index_of_point_second_point_cloud].x;
		float y = d_second_point_cloud[index_of_point_second_point_cloud].y;
		float z = d_second_point_cloud[index_of_point_second_point_cloud].z;

		if (x < rgd_params.bounding_box_min_X || x > rgd_params.bounding_box_max_X) // 如果x坐标不在网格的范围内
		{
			d_nearest_neighbour_indexes[index_of_point_second_point_cloud] = -1; // 将最近邻索引设为-1
			return;
		}
		if (y < rgd_params.bounding_box_min_Y || y > rgd_params.bounding_box_max_Y)
		{
			d_nearest_neighbour_indexes[index_of_point_second_point_cloud] = -1;
			return;
		}
		if (z < rgd_params.bounding_box_min_Z || z > rgd_params.bounding_box_max_Z)
		{
			d_nearest_neighbour_indexes[index_of_point_second_point_cloud] = -1;
			return;
		}

		int ix = (x - rgd_params.bounding_box_min_X) / rgd_params.resolution_X; // 计算栅格的索引
		int iy = (y - rgd_params.bounding_box_min_Y) / rgd_params.resolution_Y;
		int iz = (z - rgd_params.bounding_box_min_Z) / rgd_params.resolution_Z;

		int index_bucket = ix * rgd_params.number_of_buckets_Y *
							   rgd_params.number_of_buckets_Z +
						   iy * rgd_params.number_of_buckets_Z + iz; // 计算栅格在整个队列中的索引
		int nn_index = -1;

		if (index_bucket >= 0 && index_bucket < rgd_params.number_of_buckets) //
		{
			int sx, sy, sz, stx, sty, stz;
			if (ix == 0)
				sx = 0;
			else
				sx = -1;
			if (iy == 0)
				sy = 0;
			else
				sy = -1;
			if (iz == 0)
				sz = 0;
			else
				sz = -1;

			if (ix == rgd_params.number_of_buckets_X - 1)
				stx = 1;
			else
				stx = 2;
			if (iy == rgd_params.number_of_buckets_Y - 1)
				sty = 1;
			else
				sty = 2;
			if (iz == rgd_params.number_of_buckets_Z - 1)
				stz = 1;
			else
				stz = 2;

			float _distance = 100000000.0f;
			int index_next_bucket;
			int iter;
			int number_of_points_in_bucket;
			int l_begin;
			int l_end;

			for (int i = sx; i < stx; i++)
			{
				for (int j = sy; j < sty; j++)
				{
					for (int k = sz; k < stz; k++)
					{
						index_next_bucket = index_bucket +
											i * rgd_params.number_of_buckets_Y * rgd_params.number_of_buckets_Z +
											j * rgd_params.number_of_buckets_Z + k;
						if (index_next_bucket >= 0 && index_next_bucket < rgd_params.number_of_buckets)
						{
							number_of_points_in_bucket = d_buckets[index_next_bucket].number_of_points;
							if (number_of_points_in_bucket <= 0)
								continue;

							int max_number_considered_in_bucket;
							if (index_next_bucket == index_bucket)
							{
								max_number_considered_in_bucket = max_number_considered_in_INNER_bucket;
							}
							else
							{
								max_number_considered_in_bucket = max_number_considered_in_OUTER_bucket;
							}
							if (max_number_considered_in_bucket <= 0)
								continue;

							if (max_number_considered_in_bucket >= number_of_points_in_bucket)
							{
								iter = 1;
							}
							else
							{
								iter = number_of_points_in_bucket / max_number_considered_in_bucket;
								if (iter <= 0)
									iter = 1;
							}

							l_begin = d_buckets[index_next_bucket].index_begin;
							l_end = d_buckets[index_next_bucket].index_end;

							for (int l = l_begin; l < l_end; l += iter)
							{
								if (l >= 0 && l < number_of_points_first_point_cloud)
								{
									int hashed_index_of_point = d_hashTable[l].index_of_point;
									// inA[hashed_index_of_point].var = 1;

									float nn_x = d_first_point_cloud[hashed_index_of_point].x;
									float nn_y = d_first_point_cloud[hashed_index_of_point].y;
									float nn_z = d_first_point_cloud[hashed_index_of_point].z;

									float dist = (x - nn_x) * (x - nn_x) +
												 (y - nn_y) * (y - nn_y) +
												 (z - nn_z) * (z - nn_z);

									if (dist <= search_radius * search_radius)
									{
										if (dist < _distance)
										{
											isok = true;
											nn_index = hashed_index_of_point;
											_distance = dist;
										}
									}
								}
							}
						}
					}
				}
			}
		}

		if (isok)
		{
			d_nearest_neighbour_indexes[index_of_point_second_point_cloud] = nn_index;
		}
		else
		{
			d_nearest_neighbour_indexes[index_of_point_second_point_cloud] = -1;
		}
	}
}

然后我们下面的函数就是调用上面的函数一个CUDA并行的最近邻搜索算法。通过传入采用了两个点云，并将其中一个点云的每个点与第二个点云中最近的点进行匹配，即找到第二个点云中距离该点最近的点的索引。

cudaError_t cudaNearestNeighborSearch(
	int threads,
	pcl::PointXYZ *d_first_point_cloud,
	int number_of_points_first_point_cloud,
	pcl::PointXYZ *d_second_point_cloud,
	int number_of_points_second_point_cloud,
	hashElement *d_hashTable,
	bucket *d_buckets,
	gridParameters rgd_params,
	float search_radius,
	int max_number_considered_in_INNER_bucket,
	int max_number_considered_in_OUTER_bucket,
	int *d_nearest_neighbour_indexes)
{
	cudaError_t err = cudaGetLastError();

	int blocks = number_of_points_second_point_cloud / threads + 1;

	kernel_nearestNeighborSearch<<<blocks, threads>>>(
		d_first_point_cloud,
		number_of_points_first_point_cloud,
		d_second_point_cloud,
		number_of_points_second_point_cloud,
		d_hashTable,
		d_buckets,
		rgd_params,
		search_radius,
		max_number_considered_in_INNER_bucket,
		max_number_considered_in_OUTER_bucket,
		d_nearest_neighbour_indexes);

	err = cudaDeviceSynchronize();
	return err;
}

该函数为在CUDA平台上实现最近邻搜索（nearest neighbor search）算法。其输入参数为两个点云（pcl::PointCloudpcl::PointXYZ &first_point_cloud，pcl::PointCloudpcl::PointXYZ &second_point_cloud），搜索半径search_radius，Bounding box extension参数，以及max_number_considered_in_INNER_bucket和max_number_considered_in_OUTER_bucket，这两个参数是用于限制每个bucket中内部和外部考虑的最大点数。最后一个参数nearest_neighbour_indexes为最近邻索引结果的输出。

函数的大致流程为：

检查最近邻索引的大小是否等于第二个点云的大小，如果不相等，返回false。
设置GPU设备，并且打印当前可用的设备线程数。
为第一个点云和第二个点云在GPU中分配内存，并将它们的数据从主机内存复制到GPU内存中。
计算网格参数，并在GPU中分配内存来存储哈希表和bucket。
在GPU中计算网格并计算最近邻索引。
将最近邻索引从GPU内存复制回主机内存中，存储在nearest_neighbour_indexes中。
使用了OpenGL来显示点云数据。点云数据由两个不同的点云组成：first_point_cloud和second_point_cloud。在循环中，对于第二个点云中的每个点，找到最近邻的点（在第一个点云中），并将这两个点作为一条线绘制出来。这里使用了OpenGL的glVertex3f函数来定义每个点的坐标。整个循环遍历second_point_cloud中的所有点，从而绘制所有的最近邻点对的连线。如果nearest_neighbour_indexes中的索引是-1，则说明该点没有找到最近邻点，不会被绘制出来。

函数中涉及到一些额外的函数，如cudaCalculateGridParams、cudaCalculateGrid和cudaNearestNeighborSearch，这些函数都是在CUDA平台上执行的，并负责在GPU上执行相应的操作。

bool nearestNeighbourhoodSearch(
	pcl::PointCloud<pcl::PointXYZ> &first_point_cloud,
	pcl::PointCloud<pcl::PointXYZ> &second_point_cloud,
	float search_radius,
	float bounding_box_extension,
	int max_number_considered_in_INNER_bucket,
	int max_number_considered_in_OUTER_bucket,
	std::vector<int> &nearest_neighbour_indexes)
{
	if (nearest_neighbour_indexes.size() != second_point_cloud.size()) // 如果最近邻索引的大小不等于第二个点云的大小
		return false;												   // 返回false

	cudaError_t err = ::cudaSuccess;
	err = cudaSetDevice(0); // 设置设备
	if (err != ::cudaSuccess)
		return false;

	std::cout << "Before cudaMalloc" << std::endl;
	coutMemoryStatus();

	gridParameters rgd_params;
	pcl::PointXYZ *d_first_point_cloud = NULL;
	pcl::PointXYZ *d_second_point_cloud = NULL;
	int *d_nearest_neighbour_indexes = NULL;
	hashElement *d_hashTable = NULL;
	bucket *d_buckets = NULL;

	int threads = getNumberOfAvailableThreads();
	std::cout << "CUDA code will use " << threads << " device threads" << std::endl;
	if (threads == 0)
		return false;

	err = cudaMalloc((void **)&d_first_point_cloud, first_point_cloud.points.size() * sizeof(pcl::PointXYZ)); // 为第一个点云分配内存
	if (err != ::cudaSuccess)
		return false;
	err = cudaMemcpy(d_first_point_cloud, first_point_cloud.points.data(), first_point_cloud.points.size() * sizeof(pcl::PointXYZ), cudaMemcpyHostToDevice); // 将第一个点云的数据复制到GPU中
	if (err != ::cudaSuccess)
		return false;

	err = cudaMalloc((void **)&d_second_point_cloud, second_point_cloud.points.size() * sizeof(pcl::PointXYZ)); // 为第二个点云分配内存
	if (err != ::cudaSuccess)
		return false;
	err = cudaMemcpy(d_second_point_cloud, second_point_cloud.points.data(), second_point_cloud.points.size() * sizeof(pcl::PointXYZ), cudaMemcpyHostToDevice); // 将第二个点云的数据复制到GPU中
	if (err != ::cudaSuccess)
		return false;

	err = cudaCalculateGridParams(d_first_point_cloud, first_point_cloud.points.size(),
								  search_radius, search_radius, search_radius, bounding_box_extension, rgd_params); // 计算网格参数
	if (err != ::cudaSuccess)
		return false;
	std::cout << "regular grid parameters:" << std::endl;
	std::cout << "bounding_box_min_X: " << rgd_params.bounding_box_min_X << std::endl;
	std::cout << "bounding_box_min_Y: " << rgd_params.bounding_box_min_Y << std::endl;
	std::cout << "bounding_box_min_Z: " << rgd_params.bounding_box_min_Z << std::endl;
	std::cout << "bounding_box_max_X: " << rgd_params.bounding_box_max_X << std::endl;
	std::cout << "bounding_box_max_Y: " << rgd_params.bounding_box_max_Y << std::endl;
	std::cout << "bounding_box_max_Z: " << rgd_params.bounding_box_max_Z << std::endl;
	std::cout << "number_of_buckets_X: " << rgd_params.number_of_buckets_X << std::endl;
	std::cout << "number_of_buckets_Y: " << rgd_params.number_of_buckets_Y << std::endl;
	std::cout << "number_of_buckets_Z: " << rgd_params.number_of_buckets_Z << std::endl;
	std::cout << "resolution_X: " << rgd_params.resolution_X << std::endl;
	std::cout << "resolution_Y: " << rgd_params.resolution_Y << std::endl;
	std::cout << "resolution_Z: " << rgd_params.resolution_Z << std::endl;

	err = cudaMalloc((void **)&d_hashTable, first_point_cloud.points.size() * sizeof(hashElement));
	if (err != ::cudaSuccess)
		return false;

	err = cudaMalloc((void **)&d_buckets, rgd_params.number_of_buckets * sizeof(bucket));
	if (err != ::cudaSuccess)
		return false;

	err = cudaMalloc((void **)&d_nearest_neighbour_indexes, second_point_cloud.points.size() * sizeof(int));
	if (err != ::cudaSuccess)
		return false;

	std::cout << "After cudaMalloc" << std::endl;
	coutMemoryStatus();

	err = cudaCalculateGrid(threads, d_first_point_cloud, d_buckets, d_hashTable, first_point_cloud.points.size(), rgd_params); // 计算网格
	if (err != ::cudaSuccess)
		return false;

	err = cudaNearestNeighborSearch(
		threads,
		d_first_point_cloud,
		first_point_cloud.points.size(),
		d_second_point_cloud,
		second_point_cloud.points.size(),
		d_hashTable,
		d_buckets,
		rgd_params,
		search_radius,
		max_number_considered_in_INNER_bucket,
		max_number_considered_in_OUTER_bucket,
		d_nearest_neighbour_indexes); // 计算最近邻索引
	if (err != ::cudaSuccess)
		return false;

	err = cudaMemcpy(nearest_neighbour_indexes.data(), d_nearest_neighbour_indexes, second_point_cloud.points.size() * sizeof(int), cudaMemcpyDeviceToHost); // 将最近邻索引复制到CPU中
	if (err != ::cudaSuccess)
	{
		return false;
	}

	err = cudaFree(d_first_point_cloud);
	d_first_point_cloud = NULL;
	if (err != ::cudaSuccess)
		return false;

	err = cudaFree(d_second_point_cloud);
	d_second_point_cloud = NULL;
	if (err != ::cudaSuccess)
		return false;

	err = cudaFree(d_hashTable);
	d_hashTable = NULL;
	if (err != ::cudaSuccess)
		return false;

	err = cudaFree(d_buckets);
	d_buckets = NULL;
	if (err != ::cudaSuccess)
		return false;

	err = cudaFree(d_nearest_neighbour_indexes);
	d_nearest_neighbour_indexes = NULL;
	if (err != ::cudaSuccess)
		return false;

	std::cout << "After cudaFree" << std::endl;
	coutMemoryStatus();

	return true;
}

//然后可以尝试着opengl显示
for (size_t i = 0; i < second_point_cloud.size(); i++)
	{
		int index_nn = nearest_neighbour_indexes[i];
		if (index_nn != -1 && index_nn >= 0 && index_nn < first_point_cloud.size())
		{
			glVertex3f(second_point_cloud[i].x, second_point_cloud[i].y, second_point_cloud[i].z);
			glVertex3f(first_point_cloud[index_nn].x, first_point_cloud[index_nn].y, first_point_cloud[index_nn].z);
		}
	}

3. Thrust代码完成加速

这段代码是使用Thrust库实现的最近邻搜索的函数。该函数的输入是两个点云first_point_cloud和second_point_cloud，以及一个搜索半径search_radius。输出是一个向量nearest_neighbour_indexes，其中包含了first_point_cloud中每个点的最近邻在second_point_cloud中的索引。此外，在is_nearest_neighbor函数中，使用了一个for循环来遍历second_point_cloud中的每个点，计算该点和查询点之间的距离，并找到最小距离的点。如果找到了最近邻，则将最近邻的索引赋值给nearest_neighbour_indexes向量中对应的位置。这里使用了一个bool变量found_nearest_neighbor来标记是否找到了最近邻，函数返回该变量的值。

敢敢のwings