● Code: build and train the convolutional network in TensorFlow, then save the trained parameters as a .npz file for TensorRT
# Model construction and training are the same as in the previous post
tfArg = {}
for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):   # walk the global-variable collection and gather everything into tfArg
    tfArg[i.name] = sess.run(i)
tfArg['testX'] = mnist[2].images                             # append the MNIST test data
tfArg['testY'] = mnist[2].labels

np.savez(pbFilePath + 'tfArg.npz', **tfArg)                  # save tfArg as a .npz file
sess.close()
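Before moving on to TensorRT, it is worth a quick check that the .npz round-trips cleanly. A minimal sketch (the path follows from pbFilePath = "tempFile/" in the script below; the expected (5, 5, 1, 32) shape for w1:0 is TF's (H, W, Cin, Cout) kernel layout for a 5x5 conv with 32 filters):

import numpy as np

para = np.load("tempFile/tfArg.npz")
for k in para.files:                 # one entry per TF variable, plus testX / testY
    print(k, para[k].shape, para[k].dtype)
# 'w1:0' should print as (5, 5, 1, 32): TF stores conv kernels as (H, W, Cin, Cout)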
● Code: rebuild the exact same network with TensorRT and load the trained parameters for inference
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
from datetime import datetime as dt

pbFilePath = "tempFile/"

# Basic infrastructure
iGpu = 0
print("GPU in use:", cuda.Device(iGpu).name())
cuda.Device(iGpu).make_context()
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network()
builder.max_batch_size = 64
builder.max_workspace_size = 200 << 20                       # the original had 0 << 20, i.e. zero bytes of workspace

# Load the parameters
para = np.load(pbFilePath + 'tfArg.npz')
w1 = para['w1:0'].transpose((3, 2, 0, 1)).reshape(-1)        # TF (H, W, Cin, Cout) -> TRT (Cout, Cin, H, W); every weight must be flattened to 1-D with reshape(-1)
b1 = para['b1:0']
w2 = para['w2:0'].transpose((3, 2, 0, 1)).reshape(-1)
b2 = para['b2:0']
w3 = para['w3:0'].reshape(7, 7, 64, 1024).transpose((3, 2, 0, 1)).reshape(-1)
b3 = para['b3:0']
w4 = para['w4:0'].reshape(1024, 10).transpose((1, 0)).reshape(-1)
b4 = para['b4:0']
testX = para['testX']                                        # test data
testY = para['testY']

# Build the network
batchSize = 64
data = network.add_input("data", trt.DataType.FLOAT, (batchSize, 1, 28, 28))  # input layer: batchSize images of 1 channel, 28 rows, 28 columns

h1 = network.add_convolution(data, 32, (5, 5), w1, b1)       # conv 1: output feature count, kernel height/width, weights (implicitly converted to trt.Weights)
h1.stride = (1, 1)                                           # stride and padding are set on the layer afterwards
h1.padding = (2, 2)
h1Act = network.add_activation(h1.get_output(0), trt.ActivationType.RELU)  # activation layer with its activation type

h1Pool = network.add_pooling(h1Act.get_output(0), trt.PoolingType.MAX, (2, 2))  # pooling layer: pooling type, window height/width
h1Pool.stride = (2, 2)
h1Pool.padding = (0, 0)

h2 = network.add_convolution(h1Pool.get_output(0), 64, (5, 5), w2, b2)     # conv 2
h2.stride = (1, 1)
h2.padding = (2, 2)
h2Act = network.add_activation(h2.get_output(0), trt.ActivationType.RELU)

h2Pool = network.add_pooling(h2Act.get_output(0), trt.PoolingType.MAX, (2, 2))  # pooling 2 (the original mistakenly pooled h1Act again)
h2Pool.stride = (2, 2)
h2Pool.padding = (0, 0)

h3 = network.add_fully_connected(h2Pool.get_output(0), 1024, w3, b3)       # fully connected layer: output feature count, weights
h3Act = network.add_activation(h3.get_output(0), trt.ActivationType.RELU)

h4 = network.add_fully_connected(h3Act.get_output(0), 10, w4, b4)          # fully connected layer 2
y = network.add_softmax(h4.get_output(0))                                  # softmax layer

network.mark_output(y.get_output(0))                         # mark the output layer
engine = builder.build_cuda_engine(network)                  # build the engine

# Allocate memory
h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)
h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)

# Stream and execution context
stream = cuda.Stream()
context = engine.create_execution_context()

# Test
print("%s, start!" % (dt.now()))
acc = 0
nTest = len(testX)
for i in range(nTest // batchSize):                          # floor division: the tail batch may go untested
    # copy into the page-locked buffer instead of rebinding h_input, so the async copy stays valid
    np.copyto(h_input, testX[i * batchSize:(i + 1) * batchSize].reshape(-1))

    cuda.memcpy_htod_async(d_input, h_input, stream)         # copy data to the device

    context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)  # launch the kernels

    cuda.memcpy_dtoh_async(h_output, d_output, stream)

    stream.synchronize()                                     # synchronize, otherwise yy comes back all zeros

    yy = np.argmax(h_output.reshape(engine.get_binding_shape(1)), 1).reshape(-1)
    label = np.argmax(testY[i * batchSize:(i + 1) * batchSize], 1)
    acc += np.sum(yy == label)

cuda.Context.pop()                                           # pop the device context
print("%s, acc = %f" % (dt.now(), acc / (nTest // batchSize * batchSize)))  # divide by the number of samples actually tested
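The weight transposes above are the part most worth double-checking. The sketch below, independent of the script, verifies on a dummy kernel that transpose((3, 2, 0, 1)) followed by reshape(-1) really does move a TF (H, W, Cin, Cout) element to the (Cout, Cin, H, W) position TensorRT reads:

import numpy as np

# TF conv kernel: (H, W, Cin, Cout); TensorRT wants (Cout, Cin, H, W), flattened to 1-D
w_tf = np.arange(5 * 5 * 1 * 32, dtype=np.float32).reshape(5, 5, 1, 32)
w_trt = w_tf.transpose((3, 2, 0, 1)).reshape(-1)

# Element (h, w, cin, cout) of the TF kernel must land at
# index ((cout * Cin + cin) * H + h) * W + w in the flat TRT buffer
h, w_, cin, cout = 2, 4, 0, 7
flat = ((cout * 1 + cin) * 5 + h) * 5 + w_
assert w_trt[flat] == w_tf[h, w_, cin, cout]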
▶ Summary of the key TensorRT steps (including reading and writing the engine, to avoid wasting time rebuilding it on every run)
import os
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

iGpu = 0
cuda.Device(iGpu).make_context()                             # device context
logger = trt.Logger(trt.Logger.WARNING)                      # create the logger
batchSize = 64
DEBUG = False                                                # set True to force a rebuild

trtFilePath = "./densenetEngine.trt"                         # load a serialized engine file if one exists, otherwise build an engine on the spot and serialize it to a file
if os.path.isfile(trtFilePath) and not DEBUG:
    with open(trtFilePath, 'rb') as f:
        engineStr = f.read()
else:
    builder = trt.Builder(logger)                            # create the builder
    builder.max_batch_size = 64
    builder.max_workspace_size = 200 << 20
    builder.fp16_mode = True                                 # whether to use float16

    network = builder.create_network()                       # create the network

    h0 = network.add_input("h0", ...)                        # start building the network

    ...

    y = network.add_...

    network.mark_output(y.get_output(0))                     # mark the output node

    engine = builder.build_cuda_engine(network)              # build the engine -- the step most likely to fail

    if engine is None:
        print("build engine failed!")
        exit()

    engineStr = engine.serialize()                           # serialize the engine and write it to a file for direct reuse next time
    with open(trtFilePath, 'wb') as f:
        f.write(engineStr)

runtime = trt.Runtime(logger)                                # use a runtime to deserialize the engine (skip this if the engine was just built in place)
engine = runtime.deserialize_cuda_engine(engineStr)
context = engine.create_execution_context()                  # create the execution context (distinct from the device context)
stream = cuda.Stream()                                       # create a stream (optional)

hIn = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)   # uninitialized page-locked memory, sized with trt.volume; plain memory from np.empty etc. also works
hOut = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)  # get_binding_shape(0) and (1) are the network's input and output node shapes
dIn = cuda.mem_alloc(hIn.nbytes)                             # allocate device memory of the same size as the host memory
dOut = cuda.mem_alloc(hOut.nbytes)

cuda.memcpy_htod_async(dIn, hIn, stream)                     # asynchronous copy
# cuda.memcpy_htod(dIn, hIn)                                 # synchronous copy
context.execute_async(batchSize, bindings=[int(dIn), int(dOut)], stream_handle=stream.handle)  # asynchronous kernel launch
# context.execute(batchSize, bindings=[int(dIn), int(dOut)]) # synchronous kernel launch

stream.synchronize()                                         # synchronize

context = None                                               # release the execution context and the engine
engine = None
cuda.Context.pop()                                           # pop the device context
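The load-or-build logic above can be wrapped in a small helper so that inference code never cares where the engine came from. A sketch under the same TRT 5/6-era implicit-batch API used in this post; getEngine and the buildNetwork callback are names invented here, not part of TensorRT:

import os
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)

def getEngine(trtFilePath, buildNetwork, maxBatchSize=64, workspace=200 << 20):
    # Deserialize an engine from trtFilePath if present; otherwise build,
    # serialize and cache it. buildNetwork(network) must populate the network
    # and mark its outputs (hypothetical callback, not from the original post).
    if os.path.isfile(trtFilePath):
        with open(trtFilePath, 'rb') as f:
            engineStr = f.read()
    else:
        builder = trt.Builder(logger)
        builder.max_batch_size = maxBatchSize
        builder.max_workspace_size = workspace
        network = builder.create_network()
        buildNetwork(network)
        engine = builder.build_cuda_engine(network)
        if engine is None:
            return None                                      # build failed
        engineStr = engine.serialize()
        with open(trtFilePath, 'wb') as f:
            f.write(engineStr)
    runtime = trt.Runtime(logger)
    return runtime.deserialize_cuda_engine(engineStr)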
▶ TODO: use convert_to_uff.py to convert the saved .pb model into a .uff model, so TensorRT can load and use it directly instead of having to rebuild the network inside TensorRT. Ran into some problems along the way; not yet successful.
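For the record, the conversion itself is normally a one-liner via the uff package that ships with TensorRT; whether it accepts this particular graph is exactly the open problem above. The frozen-graph path and the output node name below are placeholders, not values from this post:

import uff

# "tempFile/model.pb" and the output node "y" are placeholders here
uff.from_tensorflow_frozen_model(
    frozen_file="tempFile/model.pb",
    output_nodes=["y"],
    output_filename="tempFile/model.uff")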