一文总结TensorRT下两种量化方式QAT和PTQ的部署

本文以代码为主，不涉及QAT训练loss不收敛如何解决等训练技巧。PTQ和QAT两种概念如果读者还不了解，建议先了解后再来看本篇文章。本文所用版本为TensorRT 8.0：TensorRT 7.2开始支持可用于QAT的set dynamic range API，8.0开始支持解析onnx中的Q/DQ算子，本文接下来会对这两种方式做说明。为了方便演示，本文全部代码主要使用python，由于API基本一致，C++版本请自行移植。

一、TensorRT自带PTQ

以量化resnet50为例

import torch
import os
import sys
import torch.nn.functional as F
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import tqdm
import tensorrt as trt
import os
import pycuda.driver as cuda
import pycuda.autoinit
from PIL import Image
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import os
import os.path as osp
def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print the average loss and accuracy.

    The model is put in eval mode for the run; its previous train/eval state
    is restored before the summary line is printed. Nothing is returned.

    Args:
        model: Module called as ``model(batch)`` to produce class scores.
        device: Device that every batch and target is moved to.
        test_loader: Iterable of ``(data, target)`` batches exposing ``.dataset``.
    """
    was_training = model.training
    model.eval()
    # Summed (not averaged) loss, so dividing by the dataset size gives the mean.
    criterion = torch.nn.CrossEntropyLoss(reduction='sum')
    total_loss = 0
    num_correct = 0
    for images, labels in tqdm.tqdm(test_loader):
        images = images.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            logits = model(images)
        total_loss += criterion(logits, labels).item()
        top1 = logits.argmax(dim=1, keepdim=True)
        num_correct += top1.eq(labels.view_as(top1)).sum().item()
    total_loss /= len(test_loader.dataset)
    model.train(was_training)
    accuracy = 100. * num_correct / len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {:.3f}%\n'.format(
        total_loss, accuracy
    ))

准备数据

# change valdir to your imagenet dataset validation directory
valdir = '~/MyAICode/dataset/ImageNet/val'
# Per-channel mean/std normalization applied after the image is converted to a tensor.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
# Validation loader: resize to 256, center-crop to 224, convert to tensor, normalize;
# batches of 128 in fixed (unshuffled) order.
val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),batch_size=128, shuffle=False, num_workers=24, pin_memory=True)

测试原始模型精度并导出onnx

# Load torchvision's pretrained ResNet-50 and measure its accuracy on the GPU.
resnet50 = models.resnet50(pretrained=True)
device = torch.device('cuda')
resnet50.to(device)
# consistent with reported at https://pytorch.org/vision/stable/models.html
test(resnet50, device, val_loader)
# Move the model back to the CPU and export it in eval mode to 'resnet50.onnx'.
# Dimension 0 of both 'input' and 'output' is declared dynamic (named 'batch').
resnet50 = resnet50.to('cpu')
torch.onnx.export(resnet50.cpu().eval(), torch.rand(1,3,224,224), 'resnet50.onnx', input_names=['input'],output_names=['output'],
                 dynamic_axes={'input':{0:'batch'}, 'output':{0:'batch'}}, do_constant_folding=True)

准备校正器

class ImageNetEntropyCalibrator(trt.IInt8EntropyCalibrator2):
    """INT8 calibrator that feeds preprocessed images from an image folder to TensorRT.

    Images are read from ``val_data`` with ``datasets.ImageFolder`` and served
    in consecutive, fixed-size batches through one reusable device buffer.
    A trailing partial batch is never served. Calibration results are cached
    in ``cache_file``.
    """

    def __init__(self, val_data, cache_file, batch_size=32):
        # Whenever you specify a custom constructor for a TensorRT class,
        # you MUST call the constructor of the parent explicitly.
        trt.IInt8EntropyCalibrator2.__init__(self)
        self.cache_file = cache_file
        preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        self.data = datasets.ImageFolder(val_data, preprocess)
        self.dataset_length = len(self.data)
        self.batch_size = batch_size
        # Index of the first sample of the next batch handed out by get_batch.
        self.current_index = 0
        # One device buffer for a whole batch: 4 bytes x 3 x 224 x 224 per image.
        self.device_input = cuda.mem_alloc(4 * 3 * 224 * 224 * self.batch_size)

    def get_batch_size(self):
        """Return the number of images served per calibration batch."""
        return self.batch_size

    # TensorRT passes along the names of the engine bindings to the get_batch function.
    # You don't necessarily have to use them, but they can be useful to understand the order of
    # the inputs. The bindings list is expected to have the same ordering as 'names'.
    def get_batch(self, names):
        """Copy the next batch to the device and return its buffer in a one-element list.

        Returns ``None`` once fewer than ``batch_size`` samples remain.
        """
        first = self.current_index
        last = first + self.batch_size
        if last > self.dataset_length:
            return None
        batch_number = int(first / self.batch_size)
        if batch_number % 10 == 0:
            print("Calibrating batch {:}, containing {:} images".format(batch_number, self.batch_size))
        images = [self.data[idx][0] for idx in range(first, last)]
        # Concatenating along dim 0 and flattening lays the images out back to back.
        flat = torch.cat(images, dim = 0).numpy().ravel()
        batch = np.ascontiguousarray(flat)
        cuda.memcpy_htod(self.device_input, batch)
        self.current_index = last
        return [self.device_input]

    def read_calibration_cache(self):
        """Return the cached calibration bytes, or ``None`` when no cache file exists."""
        # If there is a cache, use it instead of calibrating again.
        if not os.path.exists(self.cache_file):
            return None
        with open(self.cache_file, "rb") as f:
            return f.read()

    def write_calibration_cache(self, cache):
        """Write the calibration ``cache`` to ``cache_file``."""
        with open(self.cache_file, "wb") as f:
            f.write(cache)

    def get_algorithm(self):
        """Report min-max calibration as the algorithm."""
        # NOTE(review): this class derives from the entropy calibrator but reports
        # MINMAX_CALIBRATION here - confirm TensorRT honors this override; otherwise
        # deriving from trt.IInt8MinMaxCalibrator may be what was intended.
        return trt.CalibrationAlgoType.MINMAX_CALIBRATION

class ModelData:
    """Constants describing the exported ONNX model used throughout this file."""
    # The original model is a float32 one.
    DTYPE = trt.float32
    # Name given to the output tensor.
    OUTPUT_NAME = "output"
    # Path of the ONNX file.
    MODEL_PATH = "resnet50.onnx"

# Module-level TensorRT logger handed to the builder and ONNX parser in build_int8_engine.
TRT_LOGGER = trt.Logger()
def GiB(val):
    """Return ``val`` gibibytes expressed as an integer number of bytes.

    The original expression ``val * 1 << 30`` parses as ``(val * 1) << 30``,
    which is correct for integers but raises ``TypeError`` for fractional
    sizes such as ``GiB(0.5)``. Multiplying by ``1 << 30`` first supports both.

    Args:
        val: Size in GiB (int or float).

    Returns:
        int: The size in bytes, truncated toward zero for fractional results.
    """
    return int(val * (1 << 30))

准备构建推理engine

# This function builds an engine from a onnx model.
def build_int8_engine(onnx_filepath, calib, max_batch_size=32):
    """Build an INT8 TensorRT engine (FP16 also enabled) from an ONNX file.

    The engine is calibrated with ``calib``, serialized to ``int8.engine`` in
    the current directory, and returned.

    Args:
        onnx_filepath: Path of the ONNX model to parse.
        calib: INT8 calibrator assigned to ``config.int8_calibrator``.
        max_batch_size: Upper batch bound of the optimization profile; the
            profile's optimum batch is ``min(32, max_batch_size)``.

    Returns:
        The built engine.

    Raises:
        ValueError: If the ONNX file cannot be parsed.
        RuntimeError: If TensorRT fails to build the engine.
    """
    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(explicit_batch) as network, \
            builder.create_builder_config() as config, trt.OnnxParser(network, TRT_LOGGER) as parser:
        # We set the builder batch size to be the same as the calibrator's, as we use the same batches
        # during inference. Note that this is not required in general, and inference batch size is
        # independent of calibration batch size.
        builder.max_batch_size = max_batch_size
        config.max_workspace_size = GiB(1)  # 1 GiB of builder workspace
        config.set_flag(trt.BuilderFlag.INT8)
        config.set_flag(trt.BuilderFlag.FP16)
        config.int8_calibrator = calib
        # Parse model file
        with open(onnx_filepath, 'rb') as model:
            if not parser.parse(model.read()):
                for error in range(parser.num_errors):
                    # str(): log() is given a message string, not the parser error object.
                    TRT_LOGGER.log(TRT_LOGGER.ERROR, str(parser.get_error(error)))
                raise ValueError('Failed to parse the ONNX file.')
        TRT_LOGGER.log(TRT_LOGGER.INFO, f'input number: {network.num_inputs}')
        TRT_LOGGER.log(TRT_LOGGER.INFO, f'output number: {network.num_outputs}')
        # set optimization profile
        profile = builder.create_optimization_profile()
        input_name = network.get_input(0).name
        profile.set_shape(input_name, min=(1, 3, 224, 224), opt=(min(32,max_batch_size), 3, 224, 224), max=(max_batch_size, 3, 224, 224))
        config.add_optimization_profile(profile)
        # Build engine and do int8 calibration.
        # Alternative: builder.build_serialized_network(network, config) returns a
        # serialized plan directly, which a trt.Runtime can deserialize into an engine.
        engine = builder.build_engine(network, config)
        if engine is None:
            # Fail with a clear message instead of an AttributeError on None.serialize().
            raise RuntimeError('Failed to build the INT8 engine.')
        with open('int8.engine', "wb") as f:
            f.write(engine.serialize())
        return engine

# Image folder used for calibration and the file the calibration cache is stored in.
val_data = '~/MyAICode/dataset/ImageNet/val'
calibration_cache = "imagenet_calibration.cache"
calib = ImageNetEntropyCalibrator(val_data, cache_file=calibration_cache, batch_size = 64)

# Inference batch size can be different from calibration batch size.
batch_size = 256
onnx_file = ModelData.MODEL_PATH
# Build the INT8 engine; batch_size becomes the maximum batch of the engine's profile.
engine = build_int8_engine(onnx_file, calib, batch_size)

准备input/output显存申请和cuda stream相关

def load_test_case(pagelocked_buffer, img):
    """Copy the flattened contents of ``img`` into the front of ``pagelocked_buffer``.

    Elements of the buffer beyond ``img``'s element count are left untouched.
    """
    flat = img.ravel()
    np.copyto(pagelocked_buffer[:int(flat.size)], flat)
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem:
    """Pairs a host buffer (``host``) with its matching device buffer (``device``)."""

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        pieces = ("Host:\n", str(self.host), "\nDevice:\n", str(self.device))
        return "".join(pieces)

    def __repr__(self):
        # The repr is deliberately the same text as str().
        return str(self)
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    """Allocate a page-locked host buffer and a device buffer for every binding.

    Each buffer holds ``engine.max_batch_size`` times the volume of the
    binding's shape without its first dimension.

    Returns:
        tuple: ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` lists the
        device addresses as ints in binding order, and ``stream`` is a new
        CUDA stream.
    """
    stream = cuda.Stream()
    inputs, outputs, bindings = [], [], []
    print('max_batch_size', engine.max_batch_size)
    for name in engine:
        shape = engine.get_binding_shape(name)
        binding_dtype = engine.get_binding_dtype(name)
        print('binding', name, shape, binding_dtype)
        # Element count for a full batch: per-sample volume times the maximum batch size.
        size = trt.volume(shape[1:]) * engine.max_batch_size
        print(size)
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, trt.nptype(binding_dtype))
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        target = inputs if engine.binding_is_input(name) else outputs
        target.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# Module-level buffers and CUDA stream; test_tensorrt and test_tensorrt_for_test read these globals.
inputs, outputs, bindings, stream = allocate_buffers(engine)

推理函数

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one asynchronous inference pass and return the host output buffers.

    Args:
        context: TensorRT execution context whose binding shapes are already set.
        bindings: Device addresses (ints) of all bindings, in binding order.
        inputs: ``HostDeviceMem`` objects whose host buffers hold the input data.
        outputs: ``HostDeviceMem`` objects that receive the results.
        stream: CUDA stream used for the copies and the execution.
        batch_size: Kept for backward compatibility and not used. The engines in
            this file are built with an explicit batch dimension, for which the
            batch comes from the binding shape set on the context.

    Returns:
        list: The host buffer of every output, valid after the stream is synchronized.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference. execute_async_v2 is the call for explicit-batch engines;
    # the implicit-batch execute_async ignores its batch_size for such engines.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

使用trt做推理，并测试模型精度

def test_tensorrt(engine, test_loader):
    """Evaluate a TensorRT ``engine`` on ``test_loader`` and print loss and accuracy.

    Uses the module-level ``inputs``, ``outputs``, ``bindings`` and ``stream``.
    The input binding's first dimension is set to each batch's actual size, and
    the output buffer is viewed as rows of 1000 scores.
    """
    criterion = torch.nn.CrossEntropyLoss(reduction='sum')
    total_loss = 0
    num_correct = 0
    with engine.create_execution_context() as context:
        context.set_optimization_profile_async(0, stream.handle)
        for images, target in test_loader:
            images = images.numpy()
            # The last batch may be smaller, so the shape is set for every batch.
            shape = engine.get_binding_shape(0)
            shape[0] = images.shape[0]
            context.set_binding_shape(0, shape)
            if not context.all_binding_shapes_specified:
                raise RuntimeError("Not all input dimensions are specified for the exeuction context")
            load_test_case(inputs[0].host, images)
            # do_inference returns a list of host outputs - there is only one here.
            host_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=images.shape[0])
            # Keep only the rows that belong to this batch.
            logits = torch.as_tensor(host_outputs[0]).view(-1, 1000)[:images.shape[0]]
            total_loss += criterion(logits, target).item()
            top1 = logits.argmax(dim=1, keepdim=True)
            num_correct += top1.eq(target.view_as(top1)).sum().item()
        # del context if not reuse
        del context
    total_loss /= len(test_loader.dataset)
    accuracy = 100. * num_correct / len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {:.3f}%\n'.format(
        total_loss, accuracy
    ))

测速函数

import time
def test_tensorrt_for_test(engine):
    """Benchmark ``engine`` at its maximum batch size and print the mean latency.

    Runs 10 warm-up inferences on random input, then times 100 inferences.
    Each timed call covers the host-to-device copy, execution, device-to-host
    copy and stream synchronization. The value printed as ``total_time_span``
    is the average seconds per call, not a total.

    Uses the module-level ``inputs``, ``outputs``, ``bindings`` and ``stream``.
    """
    warmup_runs = 10
    timed_runs = 100
    total_time_span = 0
    with engine.create_execution_context() as context:
        context.set_optimization_profile_async(0, stream.handle)
        input_shape = engine.get_binding_shape(0)
        input_shape[0] = engine.max_batch_size
        context.set_binding_shape(0,input_shape)
        if not context.all_binding_shapes_specified:
            raise RuntimeError("Not all input dimensions are specified for the exeuction context")
        print('input_shape', input_shape)
        # The same random batch is reused for every run; only latency matters here.
        data = np.random.rand(*input_shape).astype(np.float32)
        load_test_case(inputs[0].host, data)
        # warm up
        for _ in range(warmup_runs):
            do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=engine.max_batch_size)
        for _ in range(timed_runs):
            # perf_counter is a monotonic, high-resolution clock meant for measuring intervals.
            start_time = time.perf_counter()
            do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=engine.max_batch_size)
            total_time_span += time.perf_counter() - start_time
        total_time_span /= timed_runs
        print('total_time_span', total_time_span)
        # del context if not reuse
        del context

INT8 推理

# change valdir to your imagenet dataset validation directory
valdir = '~/MyAICode/dataset/ImageNet/val'
# Per-channel mean/std normalization applied after the image is converted to a tensor.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
# Evaluation loader with batches of 256, matching the maximum batch the engine was built with.
test_loader = torch.utils.data.DataLoader(
    datasets.ImageFolder(valdir, transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
])),batch_size=256, shuffle=False, num_workers=24, pin_memory=True)

test_tensorrt(engine, test_loader)
# Result: Test set: Average loss: 0.9614, Accuracy: 76.090%
test_tensorrt_for_test(engine)
# Result: total_time_span 0.03211389064788819 on an RTX 2080 Ti

FP16

def build_engine(onnx_filepath, enable_fp16 = False, trt_logger = None, max_batch_size = 256):
    """Build an FP32 or FP16 TensorRT engine from an ONNX file and save it.

    The serialized engine is written to ``fp16.engine`` when ``enable_fp16``
    is true and to ``fp32.engine`` otherwise.

    Args:
        onnx_filepath: Path of the ONNX model to parse.
        enable_fp16: Set the FP16 builder flag when true.
        trt_logger: Logger to use; a VERBOSE logger is created when omitted.
        max_batch_size: Optimum and maximum batch of the optimization profile.

    Returns:
        The built engine.

    Raises:
        ValueError: If the ONNX file cannot be parsed.
        RuntimeError: If TensorRT fails to build the engine.
    """
    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    trt_logger = trt_logger or trt.Logger(trt.Logger.VERBOSE)
    with trt.Builder(trt_logger) as builder, builder.create_network(explicit_batch) as network, \
            builder.create_builder_config() as config, trt.OnnxParser(network, trt_logger) as parser:
        config.max_workspace_size = GiB(1)
        if enable_fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        # Parse model file
        with open(onnx_filepath, 'rb') as model:
            if not parser.parse(model.read()):
                for error in range(parser.num_errors):
                    # str(): log() is given a message string, not the parser error object.
                    trt_logger.log(trt_logger.ERROR, str(parser.get_error(error)))
                raise ValueError('Failed to parse the ONNX file.')
        trt_logger.log(trt_logger.INFO, f'input number: {network.num_inputs}')
        trt_logger.log(trt_logger.INFO, f'output number: {network.num_outputs}')
        input_name = network.get_input(0).name
        # set optimization profile
        profile = builder.create_optimization_profile()
        profile.set_shape(input_name, min=(1, 3, 224, 224), opt=(max_batch_size, 3, 224, 224), max=(max_batch_size, 3, 224, 224))
        config.add_optimization_profile(profile)
        builder.max_batch_size = max_batch_size
        engine = builder.build_engine(network, config)
        if engine is None:
            # Fail with a clear message instead of an AttributeError on None.serialize().
            raise RuntimeError('Failed to build the engine.')
        # The file name records the precision the engine was built with.
        engine_path = 'fp16.engine' if enable_fp16 else 'fp32.engine'
        with open(engine_path, "wb") as f:
            f.write(engine.serialize())
        return engine

# Build the FP16 engine, then re-allocate the module-level buffers for it.
batch_size = 256
onnx_file = ModelData.MODEL_PATH
engine = build_engine(onnx_file, enable_fp16=True, max_batch_size=batch_size)

inputs, outputs, bindings, stream = allocate_buffers(engine)

test_tensorrt(engine, test_loader)
# Test set: Average loss: 0.9619, Accuracy: 76.106%
test_tensorrt_for_test(engine)
# total_time_span 0.05484504699707031

FP32

# Build the FP32 engine (FP16 flag off), then re-allocate the module-level buffers for it.
batch_size = 256
onnx_file = ModelData.MODEL_PATH
engine = build_engine(onnx_file, enable_fp16=False, max_batch_size=batch_size)
inputs, outputs, bindings, stream = allocate_buffers(engine)

test_tensorrt(engine, test_loader)
# Test set: Average loss: 0.9618, Accuracy: 76.130%
test_tensorrt_for_test(engine)
# total_time_span 0.18927677154541014

总结

通过上面int8、fp16、fp32的测速实验可以看到int8的加速效果还是非常明显的，并且本小节给出了使用自带PTQ做int8推理的基本流程。

二、使用设置dynamic_range的api做QAT

本章使用MQBench模型量化框架对模型做PTQ和QAT，这里QAT的结果无法演示，不过过程代码与优化模型没有区别，所以读者在迁移到自己的项目中时，可以自行填充，也可以去MQBench官方git复制。首先，导入相关包

import torch
import os
import sys
# 我这里没有安装，所以采用这种方式加入库
sys.path.insert(0, os.path.abspath('./MQBench'))
from mqbench.convert_deploy import convert_deploy
from mqbench.prepare_by_platform import prepare_by_platform, BackendType
from mqbench.utils.state import enable_calibration, enable_quantization
import mqbench
from mqbench.convert_deploy import convert_deploy
import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import tqdm
import tensorrt as trt
import os
import pycuda.driver as cuda
import pycuda.autoinit
from PIL import Image
import numpy as np

导出onnx步骤与上一章节没有差别，所以这里省略。

MQBench 量化过程

# Prepare the float model for the TensorRT backend with MQBench.
# NOTE(review): prepare_by_platform presumably inserts the quantization nodes - confirm in MQBench docs.
model = resnet50.cpu().train()
model_mqbench = prepare_by_platform(model, BackendType.Tensorrt)
# NOTE(review): enable_calibration presumably switches the model to statistics collection - confirm.
enable_calibration(model_mqbench)
model_mqbench.to(device)
model_mqbench.eval()
# calibration loop PTQ process
# One forward pass over val_loader without gradients; the targets are not used.
for data, target in tqdm.tqdm(val_loader):
        data, target = data.to(device), target.to(device)
        with torch.no_grad():
            model_mqbench(data)
# Define your own QAT steps below.
enable_quantization(model_mqbench)
model_mqbench.train()
# QAT loop ...

测试量化模型精度，test函数与上一章相同

# Measure the accuracy of the MQBench-prepared model with the same test() helper as before.
test(model_mqbench, device, val_loader)

导出onnx

# Export the model in eval mode for the TensorRT backend with a fixed input shape of
# [256, 3, 224, 224]; the exported artifacts are named after "resnet50-mqbench".
input_shape_dict={'input': [256, 3, 224, 224]}
convert_deploy(model_mqbench.eval(), BackendType.Tensorrt, input_shape_dict, model_name="resnet50-mqbench")

由于这里导出的onnx是静态输入，为了支持动态batch，这里需要修改一下输入的batch维度

import onnx
def change_input_dim(model):
    """Make the first dimension of every graph input symbolic, in place.

    The first dimension of each entry of ``model.graph.input`` is given the
    symbolic name ``"N"``. Inputs without any dimension are skipped instead
    of raising ``IndexError``.

    Args:
        model: ONNX model whose ``graph.input`` entries are modified.

    Returns:
        None. The model is changed in place.
    """
    # Use some symbolic name not used for any other dimension
    sym_batch_dim = "N"
    # The following code changes the first dimension of every input to be batch-dim
    # Modify as appropriate ... note that this requires all inputs to
    # have the same batch_dim
    for graph_input in model.graph.input:
        dims = graph_input.type.tensor_type.shape.dim
        if len(dims) == 0:
            # Nothing to rename on an input that has no dimensions.
            continue
        # update dim to be a symbolic value
        # (to pin an actual value instead, assign dims[0].dim_value)
        dims[0].dim_param = sym_batch_dim
def apply(transform, infile, outfile):
    """Load the ONNX model at ``infile``, run ``transform`` on it, and save it to ``outfile``.

    ``transform`` is called with the loaded model and is expected to modify it
    in place; its return value is ignored.
    """
    loaded = onnx.load(infile)
    transform(loaded)
    onnx.save(loaded, outfile)
# Rewrite the exported model in place (same input and output path) with a symbolic batch dimension.
apply(change_input_dim, "resnet50-mqbench_deploy_model.onnx", "resnet50-mqbench_deploy_model.onnx")

量化后导出的onnx转engine

import onnx
import json
def onnx2trt(onnx_model,
             trt_path,
             dataset_path,
             max_batch_size=256,
             batch_size=1,
             cali_batch=10,
             log_level=trt.Logger.ERROR,
             max_workspace_size=1 << 30,
             device_id=0,
             mode='fp32',
             dynamic_range_file=None):
    """Convert an ONNX model to a TensorRT engine and write it to ``trt_path``.

    In ``'int8'`` mode the INT8 and FP16 flags are set. If
    ``dynamic_range_file`` is given, per-tensor ranges are read from its
    ``['tensorrt']['blob_range']`` mapping and applied symmetrically as
    ``(-amax, amax)``; otherwise a calibrator is built from the images under
    ``<dataset_path>/cali``.

    Args:
        onnx_model: Path to an ONNX file, or an already loaded ONNX model.
        trt_path: Output path of the engine; an existing file is removed first.
        dataset_path: Root directory holding the ``cali`` calibration folder.
        max_batch_size: Optimum and maximum batch of the optimization profile.
        batch_size: Calibration loader batch size.
        cali_batch: Number of calibration batches to use at most.
        log_level: Severity passed to ``trt.Logger``.
        max_workspace_size: Builder workspace size in bytes.
        device_id: CUDA device index the engine is built on.
        mode: ``'int8'`` enables quantization; other values build without it.
        dynamic_range_file: Optional JSON file with the tensor ranges.

    Returns:
        The built engine.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed or the build fails.
    """
    if os.path.exists(trt_path):
        print(f'The "{trt_path}" exists. Remove it and continue.')
        os.remove(trt_path)
    device = torch.device('cuda:{}'.format(device_id))
    # create builder and network
    logger = trt.Logger(log_level)
    builder = trt.Builder(logger)
    EXPLICIT_BATCH = 1 << int(
        trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(EXPLICIT_BATCH)
    builder.max_batch_size = max_batch_size
    # parse onnx
    parser = trt.OnnxParser(network, logger)
    if isinstance(onnx_model, str):
        onnx_model = onnx.load(onnx_model)
    if not parser.parse(onnx_model.SerializeToString()):
        error_msgs = ''.join(
            f'{parser.get_error(error)}\n' for error in range(parser.num_errors))
        raise RuntimeError(f'parse onnx failed:\n{error_msgs}')
    config = builder.create_builder_config()
    config.max_workspace_size = max_workspace_size
    if mode == 'int8':
        config.set_flag(trt.BuilderFlag.INT8)
        config.set_flag(trt.BuilderFlag.FP16)
        if dynamic_range_file:
            with open(dynamic_range_file, 'r') as f:
                dynamic_range = json.load(f)['tensorrt']['blob_range']
            for input_index in range(network.num_inputs):
                input_tensor = network.get_input(input_index)
                if input_tensor.name in dynamic_range:
                    amax = dynamic_range[input_tensor.name]
                    input_tensor.dynamic_range = (-amax, amax)
                    print(f'Set dynamic range of {input_tensor.name} as [{-amax}, {amax}]')
            for layer_index in range(network.num_layers):
                layer = network[layer_index]
                # Visit every output of the layer, not just output 0, so that
                # multi-output layers also receive their ranges.
                for output_index in range(layer.num_outputs):
                    output_tensor = layer.get_output(output_index)
                    if output_tensor is None or output_tensor.name not in dynamic_range:
                        continue
                    amax = dynamic_range[output_tensor.name]
                    output_tensor.dynamic_range = (-amax, amax)
                    print(f'Set dynamic range of {output_tensor.name} as [{-amax}, {amax}]')
        else:
            # Project-local module; only needed on the calibration path.
            from calibrator import ImagenetCalibrator
            calidir = os.path.join(dataset_path, 'cali')
            dataset = datasets.ImageFolder(calidir, transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])]))
            # Use at most cali_batch batches, or the whole folder if it is smaller.
            cali_num = min(len(dataset), batch_size * cali_batch)
            cali_dataset = torch.utils.data.Subset(dataset, indices=torch.arange(cali_num))
            cali_loader = torch.utils.data.DataLoader(cali_dataset, batch_size=batch_size, shuffle=False,
                                                      num_workers=1, pin_memory=False)
            calibrator = ImagenetCalibrator(cali_loader, cache_file='imagenet.cache')
            config.int8_calibrator = calibrator
            print('Calibration Set!')
    # create engine
    with torch.cuda.device(device):
        # set optimization profile
        input_name = network.get_input(0).name
        profile = builder.create_optimization_profile()
        profile.set_shape(input_name, min=(1, 3, 224, 224), opt=(max_batch_size, 3, 224, 224), max=(max_batch_size, 3, 224, 224))
        config.add_optimization_profile(profile)
        engine = builder.build_engine(network, config)
    if engine is None:
        # Fail with a clear message instead of an AttributeError on None.serialize().
        raise RuntimeError('Failed to build the TensorRT engine.')
    with open(trt_path, mode='wb') as f:
        f.write(bytearray(engine.serialize()))
    return engine

# Build the INT8 engine from the MQBench export, taking tensor ranges from the
# clip-ranges JSON file rather than running calibration (dataset_path is left empty).
engine = onnx2trt('resnet50-mqbench_deploy_model.onnx','resnet50-mqbench-int8.engine','',dynamic_range_file='./resnet50-mqbench_clip_ranges.json', mode="int8")

接下来步骤与上一章没什么区别

def load_test_case(pagelocked_buffer, img):
    """Copy the flattened contents of ``img`` into the front of ``pagelocked_buffer``.

    Elements of the buffer beyond ``img``'s element count are left untouched.
    """
    flat = img.ravel()
    np.copyto(pagelocked_buffer[:int(flat.size)], flat)
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem:
    """Pairs a host buffer (``host``) with its matching device buffer (``device``)."""

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        pieces = ("Host:\n", str(self.host), "\nDevice:\n", str(self.device))
        return "".join(pieces)

    def __repr__(self):
        # The repr is deliberately the same text as str().
        return str(self)
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    """Allocate a page-locked host buffer and a device buffer for every binding.

    Each buffer holds ``engine.max_batch_size`` times the volume of the
    binding's shape without its first dimension.

    Returns:
        tuple: ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` lists the
        device addresses as ints in binding order, and ``stream`` is a new
        CUDA stream.
    """
    stream = cuda.Stream()
    inputs, outputs, bindings = [], [], []
    print('max_batch_size', engine.max_batch_size)
    for name in engine:
        shape = engine.get_binding_shape(name)
        binding_dtype = engine.get_binding_dtype(name)
        print('binding', name, shape, binding_dtype)
        # Element count for a full batch: per-sample volume times the maximum batch size.
        size = trt.volume(shape[1:]) * engine.max_batch_size
        print(size)
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, trt.nptype(binding_dtype))
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        target = inputs if engine.binding_is_input(name) else outputs
        target.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
# Module-level buffers and CUDA stream; test_tensorrt and test_tensorrt_for_test read these globals.
inputs, outputs, bindings, stream = allocate_buffers(engine)
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one asynchronous inference pass and return the host output buffers.

    Args:
        context: TensorRT execution context whose binding shapes are already set.
        bindings: Device addresses (ints) of all bindings, in binding order.
        inputs: ``HostDeviceMem`` objects whose host buffers hold the input data.
        outputs: ``HostDeviceMem`` objects that receive the results.
        stream: CUDA stream used for the copies and the execution.
        batch_size: Kept for backward compatibility and not used. The engines in
            this file are built with an explicit batch dimension, for which the
            batch comes from the binding shape set on the context.

    Returns:
        list: The host buffer of every output, valid after the stream is synchronized.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference. execute_async_v2 is the call for explicit-batch engines;
    # the implicit-batch execute_async ignores its batch_size for such engines.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
def test_tensorrt(engine, test_loader):
    """Evaluate a TensorRT ``engine`` on ``test_loader`` and print loss and accuracy.

    Uses the module-level ``inputs``, ``outputs``, ``bindings`` and ``stream``.
    The input binding's first dimension is set to each batch's actual size, and
    the output buffer is viewed as rows of 1000 scores.
    """
    criterion = torch.nn.CrossEntropyLoss(reduction='sum')
    total_loss = 0
    num_correct = 0
    with engine.create_execution_context() as context:
        context.set_optimization_profile_async(0, stream.handle)
        for images, target in test_loader:
            images = images.numpy()
            # The last batch may be smaller, so the shape is set for every batch.
            shape = engine.get_binding_shape(0)
            shape[0] = images.shape[0]
            context.set_binding_shape(0, shape)
            if not context.all_binding_shapes_specified:
                raise RuntimeError("Not all input dimensions are specified for the exeuction context")
            load_test_case(inputs[0].host, images)
            # do_inference returns a list of host outputs - there is only one here.
            host_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=images.shape[0])
            # Keep only the rows that belong to this batch.
            logits = torch.as_tensor(host_outputs[0]).view(-1, 1000)[:images.shape[0]]
            total_loss += criterion(logits, target).item()
            top1 = logits.argmax(dim=1, keepdim=True)
            num_correct += top1.eq(target.view_as(top1)).sum().item()
        # del context if not reuse
        del context
    total_loss /= len(test_loader.dataset)
    accuracy = 100. * num_correct / len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {:.3f}%\n'.format(
        total_loss, accuracy
    ))
import time
def test_tensorrt_for_test(engine):
    """Benchmark ``engine`` at its maximum batch size and print the mean latency.

    Runs 10 warm-up inferences on random input, then times 100 inferences.
    Each timed call covers the host-to-device copy, execution, device-to-host
    copy and stream synchronization. The value printed as ``total_time_span``
    is the average seconds per call, not a total.

    Uses the module-level ``inputs``, ``outputs``, ``bindings`` and ``stream``.
    """
    warmup_runs = 10
    timed_runs = 100
    total_time_span = 0
    with engine.create_execution_context() as context:
        context.set_optimization_profile_async(0, stream.handle)
        input_shape = engine.get_binding_shape(0)
        input_shape[0] = engine.max_batch_size
        context.set_binding_shape(0,input_shape)
        if not context.all_binding_shapes_specified:
            raise RuntimeError("Not all input dimensions are specified for the exeuction context")
        print('input_shape', input_shape)
        # The same random batch is reused for every run; only latency matters here.
        data = np.random.rand(*input_shape).astype(np.float32)
        load_test_case(inputs[0].host, data)
        # warm up
        for _ in range(warmup_runs):
            do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=engine.max_batch_size)
        for _ in range(timed_runs):
            # perf_counter is a monotonic, high-resolution clock meant for measuring intervals.
            start_time = time.perf_counter()
            do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=engine.max_batch_size)
            total_time_span += time.perf_counter() - start_time
        total_time_span /= timed_runs
        print('total_time_span', total_time_span)
        # del context if not reuse
        del context

测试精度和速度

# change valdir to your imagenet dataset validation directory
valdir = '~/MyAICode/dataset/ImageNet/val'
# Per-channel mean/std normalization applied after the image is converted to a tensor.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
# Evaluation loader with batches of 256 in fixed (unshuffled) order.
test_loader = torch.utils.data.DataLoader(
    datasets.ImageFolder(valdir, transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
])),batch_size=256, shuffle=False, num_workers=24, pin_memory=True)

# Accuracy, then latency, of the engine built from the MQBench export.
test_tensorrt(engine, test_loader)
# Test set: Average loss: 0.9699, Accuracy: 75.804%
test_tensorrt_for_test(engine)
# total_time_span 0.032126998901367186

总结

从上面实验可以看出经过MQBench的PTQ所得精度要稍逊TRT自带的，这里为了一致，我都是采用相同的算法，原因暂时未知，可能是TRT内部多跑了一些epoch，不过差的不多。MQBench主攻QAT算法，因此也算各有所长吧。不过值得高兴的是，MQBench这种采用设置dynamic range的方式推理速度和官方PTQ的没差别，所以使用MQBench做QAT得到的模型得到的推理速度提升依然会很明显。

三、使用onnx的Q/DQ算子

该方法与Q/DQ算子插入算子位置有关，具体可以参考trt官方文章和官方实现git中的量化工具。地址为https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization 既然dynamic range的方式可以用，这种方式留在以后研究吧。

最后更新: March 21, 2024
创建日期: March 21, 2024