● Code: build and train the convolutional network in TensorFlow, then save the trained parameters as a .npz file for TensorRT
# Model construction and training are the same as in the previous post
tfArg = {}
for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):   # walk the global-variable collection and gather everything into tfArg
    tfArg[i.name] = sess.run(i)
tfArg['testX'] = mnist[2].images                             # append the MNIST test data
tfArg['testY'] = mnist[2].labels

np.savez(pbFilePath + 'tfArg.npz', **tfArg)                  # save tfArg as a .npz file
sess.close()
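Before moving on to TensorRT, it is worth a quick check that the .npz round-trips cleanly. A minimal sketch (the path follows from pbFilePath = "tempFile/" in the script below; the expected (5, 5, 1, 32) shape for w1:0 is TF's (H, W, Cin, Cout) kernel layout for a 5x5 conv with 32 filters):

import numpy as np

para = np.load("tempFile/tfArg.npz")
for k in para.files:                 # one entry per TF variable, plus testX / testY
    print(k, para[k].shape, para[k].dtype)
# 'w1:0' should print as (5, 5, 1, 32): TF stores conv kernels as (H, W, Cin, Cout)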
● Code: rebuild the exact same network with TensorRT and load the trained parameters for inference
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
from datetime import datetime as dt

pbFilePath = "tempFile/"

# Basic infrastructure
iGpu = 0
print("GPU in use:", cuda.Device(iGpu).name())
cuda.Device(iGpu).make_context()
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network()
builder.max_batch_size = 64
builder.max_workspace_size = 200 << 20                       # the original had 0 << 20, i.e. zero bytes of workspace

# Load the parameters
para = np.load(pbFilePath + 'tfArg.npz')
w1 = para['w1:0'].transpose((3, 2, 0, 1)).reshape(-1)        # TF (H, W, Cin, Cout) -> TRT (Cout, Cin, H, W); every weight must be flattened to 1-D with reshape(-1)
b1 = para['b1:0']
w2 = para['w2:0'].transpose((3, 2, 0, 1)).reshape(-1)
b2 = para['b2:0']
w3 = para['w3:0'].reshape(7, 7, 64, 1024).transpose((3, 2, 0, 1)).reshape(-1)
b3 = para['b3:0']
w4 = para['w4:0'].reshape(1024, 10).transpose((1, 0)).reshape(-1)
b4 = para['b4:0']
testX = para['testX']                                        # test data
testY = para['testY']

# Build the network
batchSize = 64
data = network.add_input("data", trt.DataType.FLOAT, (batchSize, 1, 28, 28))  # input layer: batchSize images of 1 channel, 28 rows, 28 columns

h1 = network.add_convolution(data, 32, (5, 5), w1, b1)       # conv 1: output feature count, kernel height/width, weights (implicitly converted to trt.Weights)
h1.stride = (1, 1)                                           # stride and padding are set on the layer afterwards
h1.padding = (2, 2)
h1Act = network.add_activation(h1.get_output(0), trt.ActivationType.RELU)  # activation layer with its activation type

h1Pool = network.add_pooling(h1Act.get_output(0), trt.PoolingType.MAX, (2, 2))  # pooling layer: pooling type, window height/width
h1Pool.stride = (2, 2)
h1Pool.padding = (0, 0)

h2 = network.add_convolution(h1Pool.get_output(0), 64, (5, 5), w2, b2)     # conv 2
h2.stride = (1, 1)
h2.padding = (2, 2)
h2Act = network.add_activation(h2.get_output(0), trt.ActivationType.RELU)

h2Pool = network.add_pooling(h2Act.get_output(0), trt.PoolingType.MAX, (2, 2))  # pooling 2 (the original mistakenly pooled h1Act again)
h2Pool.stride = (2, 2)
h2Pool.padding = (0, 0)

h3 = network.add_fully_connected(h2Pool.get_output(0), 1024, w3, b3)       # fully connected layer: output feature count, weights
h3Act = network.add_activation(h3.get_output(0), trt.ActivationType.RELU)

h4 = network.add_fully_connected(h3Act.get_output(0), 10, w4, b4)          # fully connected layer 2
y = network.add_softmax(h4.get_output(0))                                  # softmax layer

network.mark_output(y.get_output(0))                         # mark the output layer
engine = builder.build_cuda_engine(network)                  # build the engine

# Allocate memory
h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)
h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)

# Stream and execution context
stream = cuda.Stream()
context = engine.create_execution_context()

# Test
print("%s, start!" % (dt.now()))
acc = 0
nTest = len(testX)
for i in range(nTest // batchSize):                          # floor division: the tail batch may go untested
    # copy into the page-locked buffer instead of rebinding h_input, so the async copy stays valid
    np.copyto(h_input, testX[i * batchSize:(i + 1) * batchSize].reshape(-1))

    cuda.memcpy_htod_async(d_input, h_input, stream)         # copy data to the device

    context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)  # launch the kernels

    cuda.memcpy_dtoh_async(h_output, d_output, stream)

    stream.synchronize()                                     # synchronize, otherwise yy comes back all zeros

    yy = np.argmax(h_output.reshape(engine.get_binding_shape(1)), 1).reshape(-1)
    label = np.argmax(testY[i * batchSize:(i + 1) * batchSize], 1)
    acc += np.sum(yy == label)

cuda.Context.pop()                                           # pop the device context
print("%s, acc = %f" % (dt.now(), acc / (nTest // batchSize * batchSize)))  # divide by the number of samples actually tested
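The weight transposes above are the part most worth double-checking. The sketch below, independent of the script, verifies on a dummy kernel that transpose((3, 2, 0, 1)) followed by reshape(-1) really does move a TF (H, W, Cin, Cout) element to the (Cout, Cin, H, W) position TensorRT reads:

import numpy as np

# TF conv kernel: (H, W, Cin, Cout); TensorRT wants (Cout, Cin, H, W), flattened to 1-D
w_tf = np.arange(5 * 5 * 1 * 32, dtype=np.float32).reshape(5, 5, 1, 32)
w_trt = w_tf.transpose((3, 2, 0, 1)).reshape(-1)

# Element (h, w, cin, cout) of the TF kernel must land at
# index ((cout * Cin + cin) * H + h) * W + w in the flat TRT buffer
h, w_, cin, cout = 2, 4, 0, 7
flat = ((cout * 1 + cin) * 5 + h) * 5 + w_
assert w_trt[flat] == w_tf[h, w_, cin, cout]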
▶ Summary of the key TensorRT steps (including reading and writing the engine, to avoid wasting time rebuilding it on every run)
import os
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

iGpu = 0
cuda.Device(iGpu).make_context()                             # device context
logger = trt.Logger(trt.Logger.WARNING)                      # create the logger
batchSize = 64
DEBUG = False                                                # set True to force a rebuild

trtFilePath = "./densenetEngine.trt"                         # load a serialized engine file if one exists, otherwise build an engine on the spot and serialize it to a file
if os.path.isfile(trtFilePath) and not DEBUG:
    with open(trtFilePath, 'rb') as f:
        engineStr = f.read()
else:
    builder = trt.Builder(logger)                            # create the builder
    builder.max_batch_size = 64
    builder.max_workspace_size = 200 << 20
    builder.fp16_mode = True                                 # whether to use float16

    network = builder.create_network()                       # create the network

    h0 = network.add_input("h0", ...)                        # start building the network

    ...

    y = network.add_...

    network.mark_output(y.get_output(0))                     # mark the output node

    engine = builder.build_cuda_engine(network)              # build the engine -- the step most likely to fail

    if engine is None:
        print("build engine failed!")
        exit()

    engineStr = engine.serialize()                           # serialize the engine and write it to a file for direct reuse next time
    with open(trtFilePath, 'wb') as f:
        f.write(engineStr)

runtime = trt.Runtime(logger)                                # use a runtime to deserialize the engine (skip this if the engine was just built in place)
engine = runtime.deserialize_cuda_engine(engineStr)
context = engine.create_execution_context()                  # create the execution context (distinct from the device context)
stream = cuda.Stream()                                       # create a stream (optional)

hIn = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)   # uninitialized page-locked memory, sized with trt.volume; plain memory from np.empty etc. also works
hOut = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)  # get_binding_shape(0) and (1) are the network's input and output node shapes
dIn = cuda.mem_alloc(hIn.nbytes)                             # allocate device memory of the same size as the host memory
dOut = cuda.mem_alloc(hOut.nbytes)

cuda.memcpy_htod_async(dIn, hIn, stream)                     # asynchronous copy
# cuda.memcpy_htod(dIn, hIn)                                 # synchronous copy
context.execute_async(batchSize, bindings=[int(dIn), int(dOut)], stream_handle=stream.handle)  # asynchronous kernel launch
# context.execute(batchSize, bindings=[int(dIn), int(dOut)]) # synchronous kernel launch

stream.synchronize()                                         # synchronize

context = None                                               # release the execution context and the engine
engine = None
cuda.Context.pop()                                           # pop the device context
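The load-or-build logic above can be wrapped in a small helper so that inference code never cares where the engine came from. A sketch under the same TRT 5/6-era implicit-batch API used in this post; getEngine and the buildNetwork callback are names invented here, not part of TensorRT:

import os
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)

def getEngine(trtFilePath, buildNetwork, maxBatchSize=64, workspace=200 << 20):
    # Deserialize an engine from trtFilePath if present; otherwise build,
    # serialize and cache it. buildNetwork(network) must populate the network
    # and mark its outputs (hypothetical callback, not from the original post).
    if os.path.isfile(trtFilePath):
        with open(trtFilePath, 'rb') as f:
            engineStr = f.read()
    else:
        builder = trt.Builder(logger)
        builder.max_batch_size = maxBatchSize
        builder.max_workspace_size = workspace
        network = builder.create_network()
        buildNetwork(network)
        engine = builder.build_cuda_engine(network)
        if engine is None:
            return None                                      # build failed
        engineStr = engine.serialize()
        with open(trtFilePath, 'wb') as f:
            f.write(engineStr)
    runtime = trt.Runtime(logger)
    return runtime.deserialize_cuda_engine(engineStr)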
▶ TODO: use convert_to_uff.py to convert the saved .pb model into a .uff model, so TensorRT can load and use it directly instead of having to rebuild the network inside TensorRT. Ran into some problems along the way; not yet successful.
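For the record, the conversion itself is normally a one-liner via the uff package that ships with TensorRT; whether it accepts this particular graph is exactly the open problem above. The frozen-graph path and the output node name below are placeholders, not values from this post:

import uff

# "tempFile/model.pb" and the output node "y" are placeholders here
uff.from_tensorflow_frozen_model(
    frozen_file="tempFile/model.pb",
    output_nodes=["y"],
    output_filename="tempFile/model.uff")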