Purpose

OpenGL is old, but if you count OpenGL ES it is probably the most widely deployed GPU API. Doing computation with compute shaders frees you, to some extent, from being tied to NVIDIA hardware, and also from platform restrictions across Windows, Linux, macOS and so on.
A compute shader probably does not squeeze every last drop of performance out of the hardware, but it still delivers considerable parallelism.

The compute shader concurrency model

A compute shader splits a parallel task into a three-dimensional grid of work groups, i.e. a dispatch can be indexed in three dimensions, which as I understand it is mainly for convenient indexing. Each work group in turn contains multiple work items (invocations), also arranged in three dimensions. The difference between the two: work items within the same work group can share variables (declared with the `shared` keyword) and cooperate with each other, much like threads, so together they can accomplish fairly complex work. Work groups, by contrast, behave more like processes: they are more independent and cannot conveniently share data.
A shader implements the logic of a single work item. The number of work items per work group (the local size) is specified in the shader and apparently cannot be set dynamically, while the number of work groups is specified through the API (glDispatchCompute) and can be set dynamically.
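
To make the work-group/work-item distinction concrete, here is a minimal sketch of a compute shader (the buffer names, bindings and the 64-wide group size are illustrative choices of mine, not from the article's code) in which the 64 work items of each group cooperate through a shared array to sum their part of the input:

#version 460 core
// One-dimensional work groups of 64 work items; dispatch with
// glDispatchCompute(numGroups, 1, 1), input length = numGroups * 64 floats.
layout(local_size_x = 64) in;

layout(std430, binding = 0) readonly buffer Input { float data[]; } src;
layout(std430, binding = 1) writeonly buffer Output { float data[]; } dst;

// Visible to every work item of the same work group, but not across groups.
shared float partial[64];

void main() {
    uint lid = gl_LocalInvocationID.x;

    // Each work item loads one element into the group's shared array.
    partial[lid] = src.data[gl_GlobalInvocationID.x];
    barrier(); // wait until every work item of the group has written its slot

    // Tree reduction within the work group.
    for (uint stride = 32u; stride > 0u; stride >>= 1u) {
        if (lid < stride) {
            partial[lid] += partial[lid + stride];
        }
        barrier();
    }

    // Work item 0 writes one partial sum per work group.
    if (lid == 0u) {
        dst.data[gl_WorkGroupID.x] = partial[0];
    }
}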

The built-in variables used for indexing inside a compute shader:
Suppose a dispatch has (10,5,3) work groups and each work group has (2,3,4) work items (the sketch after this list shows how they relate):

  • gl_NumWorkGroups: the number of work groups in the dispatch. A uvec3, here (10,5,3)
  • gl_WorkGroupSize: the number of work items in each work group (the local size declared in the shader). A uvec3 compile-time constant, here (2,3,4)
  • gl_WorkGroupID: the ID of the current work group. A uvec3 with range ([0-9],[0-4],[0-2])
  • gl_LocalInvocationID: the ID of the current work item within its work group. A uvec3 with range ([0-1],[0-2],[0-3])
  • gl_GlobalInvocationID: the global ID of the current work item. A uvec3, equal to gl_WorkGroupID and gl_LocalInvocationID combined: gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID:
    • e.g. work item (1,2,1) of work group (5,2,1) has gl_GlobalInvocationID = (5*2+1, 2*3+2, 1*4+1) = (11,8,5)
  • gl_LocalInvocationIndex: the 1D index of the current work item within its work group, i.e. gl_LocalInvocationID flattened into a single number (here 0 to 23). Note that it is local to the work group, not a globally unique index.
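
A quick way to check these relationships is a throwaway shader (a sketch; the Debug buffer and its binding are hypothetical) that uses the (10,5,3) x (2,3,4) configuration above and makes each work item write its own flattened global index:

#version 460 core
// Matches the example above: dispatch with glDispatchCompute(10, 5, 3);
// the Debug buffer needs 20 * 15 * 12 = 3600 uints.
layout(local_size_x = 2, local_size_y = 3, local_size_z = 4) in;

layout(std430, binding = 0) writeonly buffer Debug {
    uint data[];
} dbg;

void main() {
    // gl_GlobalInvocationID == gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID
    uvec3 gid = gl_GlobalInvocationID;
    // Total work items per axis = gl_NumWorkGroups * gl_WorkGroupSize = (20, 15, 12).
    uvec3 total = gl_NumWorkGroups * gl_WorkGroupSize;
    // Flatten the 3D global ID into one unique index per work item.
    uint flat = gid.z * total.x * total.y + gid.y * total.x + gid.x;
    dbg.data[flat] = flat;
}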

Performance test

This compares single-core CPU performance against the integrated GPU. My PC specs are:
Windows 11 Pro
CPU: 13th Gen Intel® Core™ i5-1340P, 1.90 GHz
GPU: Intel® Iris® Xe Graphics
The test multiplies a 1000×2000 matrix by a 2000×1000 matrix; this GPU is fairly weak.
Code

#include <iostream>
#include <string>
#include <vector>
#include <random>
#include <chrono>
#include "glad/glad.h"
#include "GLFW/glfw3.h"
#include "img_util.h"
#include <glm/glm.hpp>
#include <glm/gtc/matrix_transform.hpp>
#include <glm/gtc/type_ptr.hpp>

std::string loadShaderSource(const std::string& shaderPath) {
	FILE* file = fopen(shaderPath.c_str(), "r");
	std::vector<char> res;
	if (file == NULL) {
		std::cout << "open shader file error:" << shaderPath << std::endl;
		return "";
	}
	// determine the file size
	fseek(file, 0, SEEK_END);
	long fileSize = ftell(file);
	fseek(file, 0, SEEK_SET);

	if (fileSize > 0) {
		res.resize(fileSize);
		// fopen in text mode may return fewer bytes than fileSize (CRLF translation),
		// so shrink the buffer to what was actually read instead of treating it as an error.
		size_t readSize = fread(res.data(), sizeof(char), fileSize, file);
		if (readSize == 0) {
			std::cout << "read shader file error:" << shaderPath
				<< " fileSize: " << fileSize << std::endl;
			fclose(file);
			return "";
		}
		res.resize(readSize);
	}
	fclose(file);

	return std::string(res.begin(), res.end());
}

void randomFloatVector(std::vector<float>& vec, float min_value = -1.0f, float max_value= 1.0f) {
	size_t size = vec.size();
	// create a random number generator
	std::random_device rd;
	std::mt19937 gen(rd());
	std::uniform_real_distribution<float> dis(min_value, max_value);

	// randomly initialize every element
	for (size_t i = 0; i < size; ++i) {
		vec[i] = dis(gen);
	}
}

GLFWwindow* initWindow() {
	glfwInit();
	glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
	glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 6);
	glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);

	GLFWwindow* window = glfwCreateWindow(800, 600, "LearnOpenGL", NULL, NULL);
	if (window == NULL)
	{
		std::cout << "Failed to create GLFW window" << std::endl;
		glfwTerminate();
		return NULL;
	}
	glfwMakeContextCurrent(window);


	if (!gladLoadGLLoader((GLADloadproc)glfwGetProcAddress))
	{
		std::cout << "Failed to initialize GLAD" << std::endl;
		return NULL;
	}

	glfwSetInputMode(window, GLFW_CURSOR, GLFW_CURSOR_NORMAL);

	return window;
}

int main(int argc, char** argv) {
	GLFWwindow* window = initWindow();
	if (!window) {
		std::cout << "init window failed" << std::endl;
		return -1;
	}

	int m = 1000;
	int n = 2000;
	std::string compteShaderPath = "D:\\projects\\cmake_proj\\shaders\\compute_shaders\\matmul.comp";

	std::cout << "init data begin" << std::endl;
	std::vector<float> xData(m * n); 
	randomFloatVector(xData);
	std::vector<float> wData(m * n);
	randomFloatVector(wData);
	std::vector<float> outDataGpu(m * m);
	std::vector<float> outDataCpu(m * m);

	GLuint xBuffer, wBuffer, outBuffer;

	std::string computeShaderSource = loadShaderSource(compteShaderPath);

	// create the buffers
	glGenBuffers(1, &xBuffer);
	glGenBuffers(1, &wBuffer);
	glGenBuffers(1, &outBuffer);

	// bind and initialize the x buffer
	glBindBuffer(GL_SHADER_STORAGE_BUFFER, xBuffer);
	glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(float) * m * n, xData.data(), GL_STATIC_DRAW);

	// bind and initialize the w buffer
	glBindBuffer(GL_SHADER_STORAGE_BUFFER, wBuffer);
	glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(float) *  m * n, wData.data(), GL_STATIC_DRAW);

	// bind and initialize the out buffer
	glBindBuffer(GL_SHADER_STORAGE_BUFFER, outBuffer);
	glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(float) * m * m, nullptr, GL_STATIC_DRAW);

	// create the compute shader
	std::cout << "load shader begin" << std::endl;
	GLuint computeShaderProgram = glCreateProgram();
	GLuint computeShader = glCreateShader(GL_COMPUTE_SHADER);
	const char* source = computeShaderSource.c_str();
	glShaderSource(computeShader, 1, &source, nullptr);
	glCompileShader(computeShader);

	// check for compile errors
	GLint success;
	glGetShaderiv(computeShader, GL_COMPILE_STATUS, &success);
	if (!success) {
		char infoLog[512];
		glGetShaderInfoLog(computeShader, 512, nullptr, infoLog);
		std::cerr << "Compute shader compilation failed: " << infoLog << std::endl;
	}

	// link the shader program
	glAttachShader(computeShaderProgram, computeShader);
	glLinkProgram(computeShaderProgram);

	// check for link errors
	glGetProgramiv(computeShaderProgram, GL_LINK_STATUS, &success);
	if (!success) {
		char infoLog[512];
		glGetProgramInfoLog(computeShaderProgram, 512, nullptr, infoLog);
		std::cerr << "Compute shader program linking failed: " << infoLog << std::endl;
	}

	// timing
	std::cout << "gpu compute begin" << std::endl;
	auto clk = std::chrono::high_resolution_clock();
	auto bg = clk.now();
	glUseProgram(computeShaderProgram);

	// set the uniforms n and m
	glUniform1i(glGetUniformLocation(computeShaderProgram, "n"), n);
	glUniform1i(glGetUniformLocation(computeShaderProgram, "m"), m);
	
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, xBuffer);
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, wBuffer);
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, outBuffer);

	// dispatch m x m work groups (the local size is 1x1, so one work item per output element)
	glDispatchCompute(m, m, 1);

	// make sure the shader storage writes are visible to the buffer read-back below
	glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);

	// read back the result
	glBindBuffer(GL_SHADER_STORAGE_BUFFER, outBuffer);
	glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(float) * m * m, outDataGpu.data());

	auto ed = clk.now();
	auto gpuTime = std::chrono::duration_cast<std::chrono::microseconds>(ed - bg).count() / 1000.0;
	std::cout << "gpu time:" << gpuTime << "ms" << std::endl;

	bg = clk.now();
	for (int i = 0;i < m; ++i) {
		for (int j = 0;j < m; ++j) {
			float val = 0.0;
			for (int k = 0;k < n;++k) {
				val += xData[i * n + k] * wData[j + k * m];
			}
			outDataCpu[i * m + j] = val;
		}
	}
	ed = clk.now();
	auto cpuTime = std::chrono::duration_cast<std::chrono::microseconds>(ed - bg).count() / 1000.0;

	std::cout << "cpu time:" << cpuTime << "ms" << std::endl;

	float diff = 0.0;
	for (int i = 0;i < m*m; ++i) {
		diff += fabs(outDataGpu[i] - outDataCpu[i]);
	}

	std::cout << "diff: " << diff << ", avg:" << diff / (m * m) <<", cpu / gpu: " << cpuTime / gpuTime << std::endl;
	// release resources
	glDeleteBuffers(1, &xBuffer);
	glDeleteBuffers(1, &wBuffer);
	glDeleteBuffers(1, &outBuffer);
	glDeleteShader(computeShader);
	glDeleteProgram(computeShaderProgram);

	return 0;
}

Shader

#version 460 core
uniform int n;
uniform int m;

layout(local_size_x = 1, local_size_y = 1) in;
layout(std430, binding = 0) readonly buffer Input0 {
    float data[];
} x;
layout(std430, binding = 1) readonly buffer Input1 {
    float data[];
} w;

layout(std430, binding = 2) writeonly buffer Output0 {
    float data[];
} xout;

void main() {
    int i = int(gl_GlobalInvocationID.x); // row i of x
    int j = int(gl_GlobalInvocationID.y); // column j of w
    float val = 0.0;
    for (int k=0; k<n; ++k) {
        val += x.data[i*n + k] * w.data[j + k *m ]; 
    }
    xout.data[i*m + j] = val;
}

Here the work-group size is set to 1 in every dimension (local_size_x = 1, local_size_y = 1), i.e. each work group contains a single work item, so gl_GlobalInvocationID is identical to gl_WorkGroupID.
Running the test, the GPU version is roughly 8-10x faster than the single-threaded CPU version.
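
With the local size fixed at 1x1, each work group contains a single work item, so the cooperation machinery inside a group goes unused. A variant worth trying (a sketch only, not benchmarked here) gives every work group 16x16 work items and dispatches ceil(m/16) groups per axis from the C++ side, e.g. glDispatchCompute((m + 15) / 16, (m + 15) / 16, 1); the bounds check handles m not being a multiple of 16:

#version 460 core
uniform int n;
uniform int m;

// 16x16 work items per work group instead of 1x1.
layout(local_size_x = 16, local_size_y = 16) in;

layout(std430, binding = 0) readonly buffer Input0 { float data[]; } x;
layout(std430, binding = 1) readonly buffer Input1 { float data[]; } w;
layout(std430, binding = 2) writeonly buffer Output0 { float data[]; } xout;

void main() {
    int i = int(gl_GlobalInvocationID.x); // row i of x
    int j = int(gl_GlobalInvocationID.y); // column j of w
    // Skip the padding work items when m is not a multiple of 16.
    if (i >= m || j >= m) {
        return;
    }
    float val = 0.0;
    for (int k = 0; k < n; ++k) {
        val += x.data[i * n + k] * w.data[j + k * m];
    }
    xout.data[i * m + j] = val;
}

Whether this actually helps on a given GPU is something to measure; the point is only that the local size is the main knob the 1x1 version leaves untouched.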

References

https://github.com/SingingRivulet/transformer.gl.git
https://github.com/cgoxopx/llama2.gl
https://zhuanlan.zhihu.com/p/673144065
https://blog.csdn.net/qq_26328385/article/details/105526000
