TensorRT C++ ONNX Inference
CUDA 12.4.1 + TensorRT 10.1.0, C++ inference
The workflow: first run the ONNX model with onnxruntime, dumping random inputs and the corresponding reference outputs to .bin files; then build a TensorRT engine from the same model, feed it the same inputs, and compare the two sets of outputs by mean squared error (MSE).
1. Generating reference inputs and outputs with onnxruntime
import onnxruntime as ort
import numpy as np
import os
import sys

# Create the ONNX Runtime session
model_path = sys.argv[1]
session = ort.InferenceSession(model_path)
model_name = os.path.splitext(model_path)[0]

# Query input and output metadata
inputs_info = session.get_inputs()
outputs_info = session.get_outputs()

# Generate random input data
input_data = {}
for input_info in inputs_info:
    input_name = input_info.name
    input_shape = input_info.shape
    input_type = input_info.type
    print("input:", input_name, input_shape, input_type)
    # Replace symbolic/dynamic dimensions with 1 so random data can be generated
    input_shape = [d if isinstance(d, int) else 1 for d in input_shape]
    # Generate random data according to the input shape and type
    if input_type == 'tensor(int64)':
        input_data[input_name] = np.random.randint(0, 10, size=input_shape).astype(np.int64)
    elif input_type == 'tensor(float)':
        input_data[input_name] = np.random.rand(*input_shape).astype(np.float32)
    elif input_type == 'tensor(bool)':
        input_data[input_name] = np.random.choice([True, False], size=input_shape).astype(np.bool_)
    else:
        print(f"Unsupported input type for {input_name}: {input_type}")
        continue
    # Dump the raw input bytes so the TensorRT program can reuse them
    with open(f'{model_name}-{input_name}-input.bin', 'wb') as f:
        f.write(input_data[input_name].tobytes())

# Run inference
outputs = session.run(None, input_data)

# Dump the reference outputs
for i, output_info in enumerate(outputs_info):
    output_name = output_info.name
    print(f'Output {output_name} with shape {output_info.shape}')
    with open(f'{model_name}-{output_name}-output.bin', 'wb') as f:
        f.write(outputs[i].tobytes())
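Run the script with the ONNX model path as its only argument, e.g. python gen_io.py model.onnx (the script filename here is arbitrary). For every tensor it writes the raw bytes to <model>-<tensor name>-input.bin or <model>-<tensor name>-output.bin next to the model. The dumps carry no shape or dtype header, so to inspect one you have to supply both yourself. A minimal sketch, assuming a float32 input tensor named x of shape (1, 3, 224, 224), both hypothetical:

import numpy as np
# Reload a dumped tensor; shape and dtype must match what the dump script printed.
x = np.fromfile("model-x-input.bin", dtype=np.float32).reshape(1, 3, 224, 224)
print(x.shape, x.dtype, x.mean())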
2. TensorRT inference and MSE comparison
#include <iostream>
#include <fstream>
#include <cassert>
#include <cstdlib>
#include <stdexcept>
#include <string>
#include <vector>
#include <map>
#include <functional>
#include <cuda_runtime.h>
#include <NvInfer.h>
#include <NvOnnxParser.h>
/**
 * @brief Custom logger that receives TensorRT log messages
 */
class Logger : public nvinfer1::ILogger {
public:
    /**
     * @brief Log callback
     * @param severity message severity
     * @param msg message text
     */
    void log(Severity severity, const char* msg) noexcept override {
        // Only print warnings and more severe messages
        if (severity <= Severity::kWARNING) {
            std::cout << msg << std::endl;
        }
    }
};

// Global logger instance
static Logger gLogger;
/**
 * @brief Hash a memory buffer; used to name the cached engine file
 * @param data pointer to the data
 * @param size data size in bytes
 * @return size_t hash value
 */
size_t computeHash(const void* data, std::size_t size) {
    if (data == nullptr || size == 0) {
        throw std::invalid_argument("data pointer is null or size is zero");
    }
    return std::hash<std::string>()(std::string(static_cast<const char*>(data), size));
}

/**
 * @brief Get the number of bytes per element for a tensor data type
 * @param dtype data type
 * @return int bytes per element
 */
int getTensorBytesPerComponent(nvinfer1::DataType dtype) {
    switch (dtype) {
        case nvinfer1::DataType::kINT64:
            return 8;
        case nvinfer1::DataType::kFLOAT:
        case nvinfer1::DataType::kINT32:
            return 4;
        case nvinfer1::DataType::kHALF:
        case nvinfer1::DataType::kBF16:
            return 2;
        case nvinfer1::DataType::kINT8:
        case nvinfer1::DataType::kFP8:
        case nvinfer1::DataType::kUINT8:
        case nvinfer1::DataType::kBOOL:
            return 1;
        default:
            throw std::invalid_argument("unknown data type");
    }
}
/**
 * @brief Metadata for one engine I/O tensor
 */
struct IOTensorInfo {
    /**
     * @brief Constructor, fills in the tensor metadata
     * @param engine TensorRT engine pointer
     * @param index I/O tensor index
     */
    IOTensorInfo(nvinfer1::ICudaEngine* engine, int index) {
        if (engine == nullptr) {
            throw std::invalid_argument("engine pointer is null");
        }
        if (index < 0 || index >= engine->getNbIOTensors()) {
            throw std::out_of_range("tensor index out of range");
        }
        tensorIndex = index;
        tensorName = engine->getIOTensorName(tensorIndex);
        tensorDims = engine->getTensorShape(tensorName.c_str());
        ioMode = engine->getTensorIOMode(tensorName.c_str());
        format = engine->getTensorFormat(tensorName.c_str());
        dataType = engine->getTensorDataType(tensorName.c_str());
        bytesPerComponent = getTensorBytesPerComponent(dataType);
        // Element count assumes static shapes (no -1 dimensions)
        elementCount = 1;
        for (int j = 0; j < tensorDims.nbDims; ++j) {
            elementCount *= tensorDims.d[j];
        }
    }
    /**
     * @brief Stream output operator for printing the tensor metadata
     * @param os output stream
     * @param obj IOTensorInfo object
     * @return std::ostream& output stream
     */
    friend std::ostream& operator<<(std::ostream& os, const IOTensorInfo& obj) {
        os << (obj.ioMode == nvinfer1::TensorIOMode::kINPUT ? "Input" : "Output") << " " << obj.tensorIndex
           << ": Name: " << obj.tensorName
           << ", Format: " << static_cast<int32_t>(obj.format)
           << ", DataType: " << static_cast<int32_t>(obj.dataType)
           << ", BytesPerComponent: " << obj.bytesPerComponent
           << ", ElementCount: " << obj.elementCount
           << ", Dimensions: (";
        for (int j = 0; j < obj.tensorDims.nbDims; ++j) {
            os << obj.tensorDims.d[j];
            if (j < obj.tensorDims.nbDims - 1) {
                os << ", ";
            }
        }
        os << ")";
        return os;
    }
    int tensorIndex;                 ///< tensor index
    std::string tensorName;          ///< tensor name
    nvinfer1::TensorIOMode ioMode;   ///< input or output
    nvinfer1::Dims tensorDims;       ///< tensor dimensions
    nvinfer1::TensorFormat format;   ///< tensor format
    nvinfer1::DataType dataType;     ///< data type
    int32_t bytesPerComponent;       ///< bytes per element
    int32_t elementCount;            ///< total element count
};
/**
 * @brief Save a TensorRT IHostMemory blob to a file
 *        (not used in this example; note that it prepends a size header,
 *         whereas the engine cache below is written as raw bytes)
 * @param hostMemory TensorRT host memory pointer
 * @param filename output file name
 * @return bool true on success
 */
bool saveIHostMemoryToFile(nvinfer1::IHostMemory* hostMemory, const std::string& filename) {
    if (hostMemory == nullptr || filename.empty()) {
        std::cerr << "invalid host memory pointer or empty file name" << std::endl;
        return false;
    }
    std::ofstream file(filename, std::ios::binary);
    if (!file) {
        std::cerr << "failed to open file for writing: " << filename << std::endl;
        return false;
    }
    size_t size = hostMemory->size();
    if (size == 0) {
        std::cerr << "host memory size is zero" << std::endl;
        return false;
    }
    file.write(reinterpret_cast<const char*>(&size), sizeof(size_t)); // write the size header
    file.write(static_cast<const char*>(hostMemory->data()), size);   // write the data
    file.close();
    return true;
}

/**
 * @brief Check whether a file exists
 * @param filename file name
 * @return bool true if the file exists
 */
bool fileExists(const std::string& filename) {
    if (filename.empty()) {
        return false;
    }
    std::ifstream file(filename);
    return file.good();
}

/**
 * @brief Read an entire binary file
 * @param filename file name
 * @return std::vector<char> file contents (empty on failure)
 */
std::vector<char> readBinaryFile(const std::string& filename) {
    if (filename.empty()) {
        std::cerr << "file name is empty" << std::endl;
        return {};
    }
    std::ifstream file(filename, std::ios::binary);
    if (!file) {
        std::cerr << "failed to open file: " << filename << std::endl;
        return {};
    }
    file.seekg(0, std::ios::end);
    std::streamsize size = file.tellg();
    if (size <= 0) {
        std::cerr << "invalid file size: " << filename << std::endl;
        return {};
    }
    file.seekg(0, std::ios::beg);
    std::vector<char> buffer(size);
    if (!file.read(buffer.data(), size)) {
        std::cerr << "failed to read file: " << filename << std::endl;
        return {};
    }
    file.close();
    return buffer;
}
/**
 * @brief TensorRT inference wrapper
 */
class TRTInferenceEngine {
public:
    /**
     * @brief Constructor, builds or loads the engine
     * @param serializedOnnxModel pointer to the ONNX model bytes
     * @param modelSize model size in bytes
     */
    TRTInferenceEngine(const void* serializedOnnxModel, size_t modelSize) {
        if (serializedOnnxModel == nullptr || modelSize == 0) {
            throw std::invalid_argument("model data pointer is null or size is zero");
        }
        runtime_ = nvinfer1::createInferRuntime(gLogger);
        if (runtime_ == nullptr) {
            throw std::runtime_error("failed to create TensorRT runtime");
        }
        // Cache the built engine under a name derived from the model's hash
        size_t hashValue = computeHash(serializedOnnxModel, modelSize);
        cachePath_ = ".trt_cachemodel_" + std::to_string(hashValue) + ".engine";
        if (!fileExists(cachePath_)) {
            buildEngineFromONNX(serializedOnnxModel, modelSize);
        } else {
            loadEngineFromCache();
        }
        context_ = engine_->createExecutionContext();
        if (context_ == nullptr) {
            throw std::runtime_error("failed to create execution context");
        }
        int numBindings = engine_->getNbIOTensors();
        for (int i = 0; i < numBindings; ++i) {
            ioTensorInfo_.emplace_back(engine_, i);
        }
        for (const auto& info : ioTensorInfo_) {
            std::cout << info << std::endl;
        }
    }
    /**
     * @brief Destructor, releases TensorRT objects
     */
    ~TRTInferenceEngine() {
        if (context_) delete context_;
        if (engine_) delete engine_;
        if (runtime_) delete runtime_;
    }
    /**
     * @brief Number of I/O tensors
     * @return int tensor count
     */
    int getTensorCount() const {
        return static_cast<int>(ioTensorInfo_.size());
    }
    /**
     * @brief Size in bytes of the tensor at the given index
     * @param index tensor index
     * @return int tensor size in bytes
     */
    int getTensorSize(int index) const {
        validateTensorIndex(index);
        return ioTensorInfo_[index].bytesPerComponent * ioTensorInfo_[index].elementCount;
    }
    /**
     * @brief Element count of the tensor at the given index
     * @param index tensor index
     * @return int element count
     */
    int getTensorElementCount(int index) const {
        validateTensorIndex(index);
        return ioTensorInfo_[index].elementCount;
    }
    /**
     * @brief Whether the tensor at the given index is an input
     * @param index tensor index
     * @return bool true for input tensors
     */
    bool isInputTensor(int index) const {
        validateTensorIndex(index);
        return ioTensorInfo_[index].ioMode == nvinfer1::TensorIOMode::kINPUT;
    }
    /**
     * @brief Run inference
     * @param buffers device pointers for all inputs and outputs, in engine I/O order
     * @param bufferCount number of buffers
     * @param stream CUDA stream
     * @return int 0 on success, -1 on failure
     */
    int infer(void** buffers, int bufferCount, cudaStream_t stream) {
        if (buffers == nullptr || bufferCount != getTensorCount()) {
            std::cerr << "buffer pointer is null or buffer count does not match" << std::endl;
            return -1;
        }
        if (stream == nullptr) {
            std::cerr << "CUDA stream is null" << std::endl;
            return -1;
        }
        for (int i = 0; i < bufferCount; ++i) {
            context_->setTensorAddress(ioTensorInfo_[i].tensorName.c_str(), buffers[i]);
        }
        bool status = context_->enqueueV3(stream);
        if (!status) {
            std::cerr << "inference execution failed" << std::endl;
            return -1;
        }
        return 0;
    }
private:
    /**
     * @brief Build the TensorRT engine from an ONNX model
     * @param serializedOnnxModel pointer to the ONNX model bytes
     * @param modelSize model size in bytes
     */
    void buildEngineFromONNX(const void* serializedOnnxModel, size_t modelSize) {
        std::cout << "building TensorRT engine..." << std::endl;
        nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
        if (builder == nullptr) {
            throw std::runtime_error("failed to create builder");
        }
        nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0);
        if (network == nullptr) {
            delete builder;
            throw std::runtime_error("failed to create network");
        }
        nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, gLogger);
        if (parser == nullptr) {
            delete network;
            delete builder;
            throw std::runtime_error("failed to create parser");
        }
        if (!parser->parse(serializedOnnxModel, modelSize)) {
            delete parser;
            delete network;
            delete builder;
            throw std::runtime_error("failed to parse the ONNX model");
        }
        nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
        if (config == nullptr) {
            delete parser;
            delete network;
            delete builder;
            throw std::runtime_error("failed to create builder config");
        }
        // Uncomment the next line to build with FP16 precision
        // config->setFlag(nvinfer1::BuilderFlag::kFP16);
        nvinfer1::IHostMemory* serializedModel = builder->buildSerializedNetwork(*network, *config);
        if (serializedModel == nullptr) {
            delete config;
            delete parser;
            delete network;
            delete builder;
            throw std::runtime_error("failed to build the serialized network");
        }
        engine_ = runtime_->deserializeCudaEngine(serializedModel->data(), serializedModel->size());
        if (engine_ == nullptr) {
            delete serializedModel;
            delete config;
            delete parser;
            delete network;
            delete builder;
            throw std::runtime_error("failed to deserialize the engine");
        }
        // Save the serialized engine to the cache file
        std::ofstream ofile(cachePath_, std::ios::binary);
        if (!ofile) {
            std::cerr << "failed to open file for writing: " << cachePath_ << std::endl;
        } else {
            ofile.write(static_cast<const char*>(serializedModel->data()), serializedModel->size());
            std::cout << "engine cached to: " << cachePath_ << std::endl;
        }
        // Release builder resources
        delete serializedModel;
        delete config;
        delete parser;
        delete network;
        delete builder;
    }
    /**
     * @brief Load the TensorRT engine from the cache file
     */
    void loadEngineFromCache() {
        std::cout << "loading cached engine: " << cachePath_ << std::endl;
        std::vector<char> engineData = readBinaryFile(cachePath_);
        if (engineData.empty()) {
            throw std::runtime_error("failed to read the cached engine");
        }
        engine_ = runtime_->deserializeCudaEngine(engineData.data(), engineData.size());
        if (engine_ == nullptr) {
            throw std::runtime_error("failed to deserialize the engine");
        }
    }
    /**
     * @brief Validate a tensor index
     * @param index tensor index
     */
    void validateTensorIndex(int index) const {
        if (index < 0 || index >= static_cast<int>(ioTensorInfo_.size())) {
            throw std::out_of_range("tensor index out of range");
        }
    }
    nvinfer1::IRuntime* runtime_ = nullptr;          ///< TensorRT runtime
    nvinfer1::ICudaEngine* engine_ = nullptr;        ///< TensorRT engine
    nvinfer1::IExecutionContext* context_ = nullptr; ///< execution context
    std::vector<IOTensorInfo> ioTensorInfo_;         ///< I/O tensor metadata
    std::string cachePath_;                          ///< engine cache path
};
/**
 * @brief Compute the mean squared error (MSE)
 * @param actual reference values
 * @param predicted predicted values
 * @return float mean squared error
 */
float calculateMSE(const std::vector<float>& actual, const std::vector<float>& predicted) {
    if (actual.size() != predicted.size() || actual.empty()) {
        throw std::invalid_argument("actual and predicted sizes differ or are empty");
    }
    float mse = 0.0f;
    size_t n = actual.size();
    std::cout << "computing MSE over " << n << " elements" << std::endl;
    for (size_t i = 0; i < n; ++i) {
        float error = actual[i] - predicted[i];
        if (i < 4) {
            std::cout << "diff: " << error << " actual: " << actual[i] << " predicted: " << predicted[i] << std::endl;
        }
        mse += error * error;
    }
    mse /= n;
    return mse;
}
/**
 * @brief Program entry point
 * @param argc argument count
 * @param argv argument list
 * @return int exit code
 */
int main(int argc, char* argv[]) {
    try {
        if (argc < 3) {
            std::cout << "usage: " << argv[0] << " <model.onnx> <input .bin files...> <output .bin files...>"
                      << " (one file per engine I/O tensor, in engine I/O order)" << std::endl;
            return -1;
        }
        const char* onnxPath = argv[1];
        int inputOutputCount = argc - 2;
        if (!fileExists(onnxPath)) {
            std::cerr << "model file does not exist: " << onnxPath << std::endl;
            return -1;
        }
        std::vector<char> onnxData = readBinaryFile(onnxPath);
        if (onnxData.empty()) {
            std::cerr << "failed to read the ONNX model" << std::endl;
            return -1;
        }
        TRTInferenceEngine inferEngine(onnxData.data(), onnxData.size());
        // Create a CUDA stream
        cudaStream_t stream;
        cudaError_t cudaStatus = cudaStreamCreate(&stream);
        if (cudaStatus != cudaSuccess) {
            std::cerr << "failed to create CUDA stream: " << cudaGetErrorString(cudaStatus) << std::endl;
            return -1;
        }
        int tensorCount = inferEngine.getTensorCount();
        if (tensorCount != inputOutputCount) {
            std::cerr << "I/O file count does not match the engine: " << tensorCount << " vs " << inputOutputCount << std::endl;
            cudaStreamDestroy(stream);
            return -1;
        }
        std::vector<void*> deviceBuffers(tensorCount, nullptr);
        std::map<int, void*> hostMemoryMap;
        // Allocate device memory for every tensor; upload inputs, prepare host buffers for outputs
        for (int i = 0; i < tensorCount; ++i) {
            int tensorSize = inferEngine.getTensorSize(i);
            std::cout << "tensor index: " << i << " size: " << tensorSize << " bytes" << std::endl;
            void* deviceBuffer = nullptr;
            cudaStatus = cudaMalloc(&deviceBuffer, tensorSize);
            if (cudaStatus != cudaSuccess) {
                std::cerr << "failed to allocate device memory: " << cudaGetErrorString(cudaStatus) << std::endl;
                cudaStreamDestroy(stream);
                return -1;
            }
            deviceBuffers[i] = deviceBuffer;
            if (inferEngine.isInputTensor(i)) {
                std::vector<char> inputData = readBinaryFile(argv[2 + i]);
                if (inputData.size() != static_cast<size_t>(tensorSize)) {
                    std::cerr << "input file size does not match the tensor size: " << tensorSize << " vs " << inputData.size() << std::endl;
                    cudaStreamDestroy(stream);
                    return -1;
                }
                cudaStatus = cudaMemcpy(deviceBuffer, inputData.data(), inputData.size(), cudaMemcpyHostToDevice);
                if (cudaStatus != cudaSuccess) {
                    std::cerr << "failed to copy input data to the device: " << cudaGetErrorString(cudaStatus) << std::endl;
                    cudaStreamDestroy(stream);
                    return -1;
                }
            } else {
                void* hostBuffer = malloc(tensorSize);
                if (hostBuffer == nullptr) {
                    std::cerr << "failed to allocate host memory" << std::endl;
                    cudaStreamDestroy(stream);
                    return -1;
                }
                hostMemoryMap[i] = hostBuffer;
            }
        }
        // Run inference
        if (inferEngine.infer(deviceBuffers.data(), tensorCount, stream) != 0) {
            cudaStreamDestroy(stream);
            return -1;
        }
        // Synchronize the stream to make sure inference has finished
        cudaStatus = cudaStreamSynchronize(stream);
        if (cudaStatus != cudaSuccess) {
            std::cerr << "CUDA stream synchronization failed: " << cudaGetErrorString(cudaStatus) << std::endl;
            cudaStreamDestroy(stream);
            return -1;
        }
        // Copy the outputs back and compute the MSE against the onnxruntime references
        // (the comparison assumes float32 output tensors)
        for (const auto& pair : hostMemoryMap) {
            int index = pair.first;
            void* hostBuffer = pair.second;
            int tensorSize = inferEngine.getTensorSize(index);
            int elementCount = inferEngine.getTensorElementCount(index);
            cudaStatus = cudaMemcpy(hostBuffer, deviceBuffers[index], tensorSize, cudaMemcpyDeviceToHost);
            if (cudaStatus != cudaSuccess) {
                std::cerr << "failed to copy output data to the host: " << cudaGetErrorString(cudaStatus) << std::endl;
                free(hostBuffer);
                cudaStreamDestroy(stream);
                return -1;
            }
            std::vector<char> gtData = readBinaryFile(argv[2 + index]);
            if (gtData.size() != static_cast<size_t>(tensorSize)) {
                std::cerr << "reference file size does not match the tensor size: " << tensorSize << " vs " << gtData.size() << std::endl;
                free(hostBuffer);
                cudaStreamDestroy(stream);
                return -1;
            }
            float* gtPtr = reinterpret_cast<float*>(gtData.data());
            std::vector<float> predicted(reinterpret_cast<float*>(hostBuffer), reinterpret_cast<float*>(hostBuffer) + elementCount);
            std::vector<float> actual(gtPtr, gtPtr + elementCount);
            float mse = calculateMSE(actual, predicted);
            std::cout << "mean squared error (MSE): " << mse << std::endl;
            free(hostBuffer);
        }
        // Free device memory
        for (auto buffer : deviceBuffers) {
            if (buffer != nullptr) {
                cudaFree(buffer);
            }
        }
        // Destroy the CUDA stream
        cudaStreamDestroy(stream);
    } catch (const std::exception& ex) {
        std::cerr << "exception: " << ex.what() << std::endl;
        return -1;
    }
    return 0;
}
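To run the end-to-end comparison (a sketch, not a fixed recipe; the binary name trt_infer and file names are assumptions): compile the program against the CUDA and TensorRT headers and link it with something like -lnvinfer -lnvonnxparser -lcudart, then invoke it with the ONNX model followed by the .bin files from step 1, one file per engine I/O tensor in the order printed at startup (typically inputs first, then outputs), for example ./trt_infer model.onnx model-x-input.bin model-y-output.bin with hypothetical tensor names x and y. On the first run the program builds the engine and caches it in the current directory as .trt_cachemodel_<hash>.engine; later runs deserialize the cache directly and print one MSE per output tensor.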