onnx导出BERT示例

目录

BERT模型导出

1.基于transformers载入PyTorch模型
2.创建伪输入(dummy inputs),用它在模型中执行一次前向推理(inference),并在此过程中追踪、记录所执行的算子集合
3.在输入和输出tensors上定义动态轴
4.保存graph和网络参数

NLP与CV的导出流程区别不大,主要需要注意输入序列不定长的问题,也就是export方法中的dynamic_axes参数

BERT-Large, Uncased (Whole Word Masking): 24-layer, 1024-hidden, 16-heads, 340M parameters
BERT-Large, Cased (Whole Word Masking): 24-layer, 1024-hidden, 16-heads, 340M parameters
BERT-Base, 	Uncased: 12-layer, 768-hidden, 12-heads, 110M parameters
BERT-Large, Uncased: 24-layer, 1024-hidden, 16-heads, 340M parameters
BERT-Base, 	Cased: 12-layer, 768-hidden, 12-heads , 110M parameters
BERT-Large, Cased: 24-layer, 1024-hidden, 16-heads, 340M parameters
BERT-Base, 	Multilingual Cased (New, recommended): 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
BERT-Base, 	Multilingual Uncased (Orig, not recommended; use Multilingual Cased instead): 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
BERT-Base, 	Chinese: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters


前6个为英文模型,Multilingual代表多语言模型,最后一个是中文模型 (字级别)
Uncased 代表将字母全部转换成小写,而Cased代表保留了大小写

加载模型

from pathlib import Path
from transformers import BertConfig,BertModel, BertTokenizer
from transformers.convert_graph_to_onnx import convert

from transformers import AutoTokenizer
import torch

# Load the configuration, the model and the tokenizer from one checkpoint.
checkpoint = "bert-base-uncased"

config = BertConfig.from_pretrained(checkpoint)
print(config)

# add_pooling_layer=False requests the model without its pooling layer.
bert_model = BertModel.from_pretrained(
    checkpoint,
    config=config,
    add_pooling_layer=False,
)
print(bert_model.config)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# BertConfig {
#   "_name_or_path": "bert-base-uncased",
#   "architectures": [
#     "BertForMaskedLM"
#   ],
#   "attention_probs_dropout_prob": 0.1,
#   "classifier_dropout": null,
#   "gradient_checkpointing": false,
#   "hidden_act": "gelu",
#   "hidden_dropout_prob": 0.1,
#   "hidden_size": 768,
#   "initializer_range": 0.02,
#   "intermediate_size": 3072,
#   "layer_norm_eps": 1e-12,
#   "max_position_embeddings": 512,
#   "model_type": "bert",
#   "num_attention_heads": 12,
#   "num_hidden_layers": 12,
#   "pad_token_id": 0,
#   "position_embedding_type": "absolute",
#   "transformers_version": "4.36.2",
#   "type_vocab_size": 2,
#   "use_cache": true,
#   "vocab_size": 30522
# }

pt模型推理

from transformers import AutoModel, AutoConfig, AutoTokenizer
import torch
from itertools import chain

# Load the model, its config and the tokenizer from the same checkpoint.
pretrained_name = 'bert-base-uncased'
model = AutoModel.from_pretrained(pretrained_name)
config = AutoConfig.from_pretrained(pretrained_name)
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)

# Define a sentence and run it through the tokenizer (PyTorch tensors).
sentence = 'here is some text to encode'
inputs_pt = tokenizer(sentence, return_tensors='pt')
print(inputs_pt["input_ids"].shape)

# Forward pass, then list the attributes available on the output object.
outputs = model(**inputs_pt)
print(dir(outputs))

last_hidden_state = outputs.last_hidden_state
pooler_output = outputs.pooler_output
print(f"Token wise output: {last_hidden_state.shape}, Pooled output: {pooler_output.shape}")
print(last_hidden_state)

# Visual separator: 60 dashes.
print("-" * 60)
torch.Size([1, 9])
['__annotations__', '__class__', '__contains__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__post_init__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'attentions', 'clear', 'copy', 'cross_attentions', 'fromkeys', 'get', 'hidden_states', 'items', 'keys', 'last_hidden_state', 'move_to_end', 'past_key_values', 'pooler_output', 'pop', 'popitem', 'setdefault', 'to_tuple', 'update', 'values']
Token wise output: torch.Size([1, 9, 768]), Pooled output: torch.Size([1, 768])
tensor([[[-0.0549,  0.1053, -0.1065,  ..., -0.3551,  0.0686,  0.6506],
         [-0.5759, -0.3650, -0.1383,  ..., -0.6782,  0.2092, -0.1639],
         [-0.1641, -0.5597,  0.0150,  ..., -0.1603, -0.1346,  0.6216],
         ...,
         [ 0.2448,  0.1254,  0.1587,  ..., -0.2749, -0.1163,  0.8809],
         [ 0.0481,  0.4950, -0.2827,  ..., -0.6097, -0.1212,  0.2527],
         [ 0.9046,  0.2137, -0.5897,  ...,  0.3040, -0.6172, -0.1950]]],
       grad_fn=<NativeLayerNormBackward0>)
------------------------------------------------------------

重新导出config

# Build an ONNX export config from the model config.

from transformers.onnx.features import FeaturesManager

# FeaturesManager.get_config is the public accessor for the
# (model type, feature) -> OnnxConfig constructor lookup, so the private
# _SUPPORTED_MODEL_TYPE table does not have to be indexed directly.
onnx_config = FeaturesManager.get_config('bert', 'sequence-classification')(config)

# Each entry maps an input/output name to its dynamic axes.
print(onnx_config.inputs.items())
print(onnx_config.outputs.items())

odict_items([('input_ids', {0: 'batch', 1: 'sequence'}), ('attention_mask', {0: 'batch', 1: 'sequence'}), ('token_type_ids', {0: 'batch', 1: 'sequence'})])
odict_items([('logits', {0: 'batch'})])

# Computing the dummy inputs requires the tokenizer.
dummy_inputs = onnx_config.generate_dummy_inputs(
    tokenizer,
    framework='pt',
)

print(dummy_inputs)
# Keys of the returned mapping:
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

导出onnx模型

import torch
from transformers import AutoModel, AutoConfig, AutoTokenizer
from transformers.onnx.features import FeaturesManager
from transformers.convert_graph_to_onnx import convert


from pathlib import Path

# Load the model, its config and the tokenizer.
# The pooling layer is left out so the exported graph has a single output
# (the per-token hidden states), matching the single output name used below.
model = AutoModel.from_pretrained('bert-base-uncased', add_pooling_layer=False)
config = AutoConfig.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model.eval()

# Sample sentence (the export itself is traced with the dummy inputs below).
sentence = 'here is some text to encode'


# Build an ONNX config from the model config.
# Computing the dummy inputs requires the tokenizer.
onnx_config = FeaturesManager.get_config('bert', 'sequence-classification')(config)
dummy_inputs = onnx_config.generate_dummy_inputs(tokenizer, framework='pt')

output_onnx_path = "assets/bert_uncased.onnx"
# Make sure the target directory exists before the exporter writes the file.
Path(output_onnx_path).parent.mkdir(parents=True, exist_ok=True)

print("onnx  input",onnx_config.inputs.items())
print("onnx output",onnx_config.outputs.items())

input_ids = dummy_inputs['input_ids']
attention_masks = dummy_inputs['attention_mask']
token_type_ids = dummy_inputs['token_type_ids']

# Use the names declared by the ONNX config so the graph's input/output names
# and the dynamic_axes keys always agree.
input_names = list(onnx_config.inputs.keys())
output_names = list(onnx_config.outputs.keys())

dynamic_axes = {**onnx_config.inputs, **onnx_config.outputs}
# The exported model returns per-token hidden states shaped
# (batch, sequence, hidden), so the output's sequence axis must be declared
# dynamic too. The output name itself is kept unchanged.
for name in output_names:
    dynamic_axes[name] = {0: 'batch', 1: 'sequence'}

torch.onnx.export(
    model,
    (input_ids, attention_masks, token_type_ids),  # alternatively: (dummy_inputs,)
    f=output_onnx_path,
    verbose=True,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes=dynamic_axes,
    opset_version=onnx_config.default_onnx_opset,
)

print("转换完成")
onnx  input odict_items([('input_ids', {0: 'batch', 1: 'sequence'}), ('attention_mask', {0: 'batch', 1: 'sequence'}), ('token_type_ids', {0: 'batch', 1: 'sequence'})])
onnx output odict_items([('logits', {0: 'batch'})])
转换完成

加载onnx测试

import onnxruntime as ort

# Sample sentence
sentence = 'here is some text to encode'


options = ort.SessionOptions()  # initialize session options
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

# Prefer CUDA, but only request execution providers that this onnxruntime
# build actually offers, so a CPU-only install runs without provider warnings.
preferred_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
available_providers = ort.get_available_providers()
providers = [p for p in preferred_providers if p in available_providers]

# The path is the ONNX model saved in the previous section.
session = ort.InferenceSession(
    "assets/bert_uncased.onnx", sess_options=options, providers=providers
)

# disable session.run() fallback mechanism, it prevents for a reset of the execution provider
session.disable_fallback()

# Tokenize to PyTorch tensors, then convert to the NumPy arrays onnxruntime expects.
inputs = tokenizer(sentence, return_tensors='pt')
inputs = {k: v.detach().cpu().numpy() for k, v in inputs.items()}


print(inputs.keys())
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

# Run inference.
# 'logits' must match the output_names used at export time.
output = session.run(output_names=['logits'], input_feed=inputs)

print(output)
print(output[0].shape)

onnx output odict_items([('logits', {0: 'batch'})])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
[array([[[-0.05490887,  0.10528212, -0.10649522, ..., -0.3550497 ,
          0.06862388,  0.650573  ],
        [-0.5759427 , -0.36500782, -0.13834022, ..., -0.6781805 ,
          0.20923868, -0.16394015],
        [-0.16414754, -0.55971897,  0.01500742, ..., -0.16027743,
         -0.13455114,  0.62159723],
        ...,
        [ 0.2447815 ,  0.125429  ,  0.15869957, ..., -0.27489156,
         -0.11634777,  0.88089377],
        [ 0.0481048 ,  0.4950128 , -0.28274378, ..., -0.6097362 ,
         -0.12124838,  0.2527281 ],
        [ 0.9046008 ,  0.21367389, -0.5896968 , ...,  0.30398968,
         -0.61721766, -0.19498175]]], dtype=float32)]
(1, 9, 768)

参考资料

模型推理加速系列|如何用ONNX加速BERT特征抽取(附代码)

实践演练BERT Pytorch模型转ONNX模型及预测

Bert模型导出为onnx和pb格式

NLP实践——Bert转onnx格式简介与踩坑记录

实践演练BERT Pytorch模型转ONNX模型及预测