How many operators are needed to cover the mainstream large language models? To find out, I use a dump hook built on the __torch_dispatch__ mechanism to record the list of operators and their arguments for each model. To stay within device memory, every model is configured with only one transformer layer.
1. Reference Links
2. Download Links
3. Test Program
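The torch_hook module imported by the test program is not listed in this post. Below is a minimal sketch of what its TorchDumper / TorchDumpDispatchMode might look like, assuming the log only needs to record each ATen op name together with the shapes and dtypes of its arguments; the real implementation may store more detail or use a different pickle layout.

# Minimal sketch of a __torch_dispatch__-based op dumper (assumption: the real
# torch_hook module may differ in what it records and how it stores it).
import pickle
import torch
from torch.utils._python_dispatch import TorchDispatchMode

def _describe(arg):
    # Keep only metadata (shape/dtype), not tensor data, so the log stays small.
    if isinstance(arg, torch.Tensor):
        return ("Tensor", tuple(arg.shape), str(arg.dtype))
    if isinstance(arg, (list, tuple)):
        return [_describe(a) for a in arg]
    return arg

class TorchDumpDispatchMode(TorchDispatchMode):
    def __init__(self, dumper):
        super().__init__()
        self.dumper = dumper
    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        # Every dispatched ATen call lands here; record its name and argument metadata.
        self.dumper.op_log.append((str(func), _describe(args), _describe(kwargs)))
        return func(*args, **kwargs)

class TorchDumper:
    def __init__(self, mode_cls, op_log_path):
        self.op_log = []
        self.op_log_path = op_log_path
        self.mode = mode_cls(self)
    def __enter__(self):
        self.mode.__enter__()
        return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        self.mode.__exit__(exc_type, exc_val, exc_tb)
        with open(self.op_log_path, "wb") as f:
            pickle.dump(self.op_log, f)

The test program itself: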
import warnings
warnings.filterwarnings("ignore")
import copy
import sys
import torch
import multiprocessing as mp
from tqdm import tqdm
op_mapping={}
class llm_forward:
    def __init__(self,func):
        global op_mapping
        op_mapping[func.__name__]=func
        self.func=func
    def __call__(self,*args,**kwargs):
        return self.func(*args,**kwargs)
try:
    from torch_hook import TorchDumper,TorchDumpDispatchMode
except:
    class TorchDumpDispatchMode:
        pass
    class TorchDumper:
        def __init__(self,*args,**kwargs):
            pass
        def __enter__(self):
            pass
        def __exit__(self, exc_type, exc_val, exc_tb):
            pass
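# Each model below follows the same recipe: load its config.json, shrink the
# network to a single transformer layer (enough to cover its operator set while
# fitting in device memory), build the model with random weights, optionally cast
# to fp16, then run one forward pass on random token ids and one backward pass on
# a scalar loss inside TorchDumper, which writes the dispatched ops to a .pkl file.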
@llm_forward
def bert_base_chinese(use_half,device):
    from transformers import AutoModelForMaskedLM,BertConfig
    config=BertConfig.from_pretrained("bert_base_chinese/config.json")
    config.num_hidden_layers=1
    model = AutoModelForMaskedLM.from_config(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,config.max_position_embeddings))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="bert_base_chinese.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def Baichuan2_13B_Chat(use_half,device):
    import sys
    sys.path.insert(0,"./Baichuan2_13B_Chat")
    from configuration_baichuan2 import BaichuanConfig
    from modeling_baichuan2 import BaichuanForCausalLM
    config=BaichuanConfig.from_pretrained("Baichuan2_13B_Chat/config.json")
    config.num_hidden_layers=1
    model = BaichuanForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,config.model_max_length//4))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="Baichuan2_13B_Chat.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def baichuan_7B(use_half,device):
    import sys
    import os
    sys.path.insert(0,os.path.join(os.getcwd(),"baichuan_7B"))
    from configuration_baichuan import BaiChuanConfig
    from modeling_baichuan import BaiChuanForCausalLM
    config=BaiChuanConfig.from_pretrained("baichuan_7B/config.json")
    config.num_hidden_layers=1
    model = BaiChuanForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,config.max_position_embeddings//4))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="baichuan_7B.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def ChatGLM_6B(use_half,device):
    import sys
    sys.path.append("./ChatGLM_6B")
    from configuration_chatglm import ChatGLMConfig
    from modeling_chatglm import ChatGLMModel
    config=ChatGLMConfig.from_pretrained("ChatGLM_6B/config.json")
    config.num_layers=1
    model = ChatGLMModel(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,config.max_sequence_length))
    input_tokens[:,0]=config.bos_token_id
    input_tokens[:,2]=config.mask_token_id
    input_tokens[:,-1]=config.eos_token_id
    with TorchDumper(TorchDumpDispatchMode,op_log_path="ChatGLM_6B.pkl"):
        output=model(input_tokens.to(device))
        logits=output.last_hidden_state
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def ChatGLM2_6B(use_half,device):
    import sys
    sys.path.append("./ChatGLM2_6B")
    from configuration_chatglm import ChatGLMConfig
    from modeling_chatglm import ChatGLMModel
    config=ChatGLMConfig.from_pretrained("ChatGLM2_6B/config.json")
    config.num_layers=1
    model = ChatGLMModel(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.padded_vocab_size,(1,config.seq_length//10))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="ChatGLM2_6B.pkl"):
        output=model(input_tokens.to(device))
        logits=output.last_hidden_state
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def ChatGLM3_6B(use_half,device):
    import sys
    sys.path.append("./ChatGLM3_6B")
    from configuration_chatglm import ChatGLMConfig
    from modeling_chatglm import ChatGLMModel
    config=ChatGLMConfig.from_pretrained("ChatGLM3_6B/config.json")
    config.num_layers=1
    model = ChatGLMModel(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.padded_vocab_size,(1,config.seq_length//4))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="ChatGLM3_6B.pkl"):
        output=model(input_tokens.to(device))
        logits=output.last_hidden_state
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def deepseek_moe_16b_chat(use_half,device):
    import sys
    sys.path.append("./deepseek_moe_16b_chat")
    from configuration_deepseek import DeepseekConfig
    from modeling_deepseek import DeepseekForCausalLM
    config=DeepseekConfig.from_pretrained("deepseek_moe_16b_chat/config.json")
    config.num_hidden_layers=1
    model = DeepseekForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,config.max_position_embeddings))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="deepseek_moe_16b_chat.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def deepseek_coder_33b_base(use_half,device):
    from transformers.models.llama import LlamaForCausalLM, LlamaConfig
    config=LlamaConfig.from_pretrained("deepseek_coder_33b_base/config.json")
    config.num_hidden_layers=1
    model = LlamaForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,config.max_position_embeddings//10))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="deepseek_coder_33b_base.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def falcon_7b_instruct(use_half,device):
    import sys
    sys.path.append("./falcon_7b_instruct")
    from configuration_RW import RWConfig
    from modelling_RW import RWForCausalLM
    config=RWConfig.from_pretrained("falcon_7b_instruct/config.json")
    config.n_layer=1
    model = RWForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,512))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="falcon_7b_instruct.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def GPT2(use_half,device):
    from transformers import GPT2LMHeadModel, GPT2Config
    config=GPT2Config.from_pretrained("GPT2/config.json")
    config.n_layer=1
    model = GPT2LMHeadModel(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,512))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="GPT2.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
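# gemma_7b uses a standalone Gemma PyTorch implementation rather than the
# transformers causal-LM interface, so the KV caches, causal mask, position ids
# and sampling parameters all have to be constructed by hand before the call.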
@llm_forward
def gemma_7b(use_half,device):
    import sys
    sys.path.append("./gemma_7b")
    from config import GemmaConfig
    from model import GemmaForCausalLM
    config=GemmaConfig.from_pretrained("gemma_7b/config.json")
    config.num_hidden_layers=1
    model = GemmaForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    max_seq_len=512
    batch_size=1
    prompt_tokens=torch.randint(0,config.vocab_size,(batch_size,max_seq_len)).to(device)
    temperature= 0.95
    top_p = 1.0
    top_k = 100
    # build KV caches
    kv_caches = []
    for _ in range(config.num_hidden_layers):
        size = (batch_size, max_seq_len, config.num_key_value_heads,
                config.head_dim)
        dtype = config.get_dtype()
        k_cache = torch.zeros(size=size, dtype=dtype).to(device)
        v_cache = torch.zeros(size=size, dtype=dtype).to(device)
        kv_caches.append((k_cache, v_cache))
    # prepare inputs
    input_token_ids_tensor = torch.full((batch_size, max_seq_len),
                                        0,
                                        dtype=torch.int64)
    input_token_ids_tensor = input_token_ids_tensor.to(device)
    input_positions_tensor = torch.arange(0, max_seq_len,
                                          dtype=torch.int64).to(device)
    mask_tensor = torch.full((1, 1, max_seq_len, max_seq_len),
                             -2.3819763e38).to(torch.float)
    mask_tensor = torch.triu(mask_tensor, diagonal=1).to(device)
    output_positions_tensor = torch.LongTensor([max_seq_len - 1]).to(device)
    temperatures_tensor = None if not temperature else torch.FloatTensor(
        [temperature] * batch_size).to(device)
    top_ps_tensor = torch.FloatTensor([top_p] * batch_size).to(device)
    top_ks_tensor = torch.LongTensor([top_k] * batch_size).to(device)
    with TorchDumper(TorchDumpDispatchMode,op_log_path="gemma_7b.pkl"):
        output=model(prompt_tokens,input_positions_tensor,
                     None,kv_caches,mask_tensor,output_positions_tensor,
                     temperatures_tensor,top_ps_tensor,top_ks_tensor)
        _,logits=output
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def grok1_pytorch(use_half,device):
    import sys
    sys.path.append("./grok1_pytorch")
    from configuration_grok1 import Grok1Config
    from modeling_grok1 import Grok1ModelForCausalLM
    config=Grok1Config.from_pretrained("grok1_pytorch/config.json")
    config.num_hidden_layers=1
    config.num_experts=1
    config.num_experts_per_tok=1
    model = Grok1ModelForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,512))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="grok1_pytorch.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def internLM(use_half,device):
    import sys
    sys.path.append("./internLM")
    from configuration_internlm import InternLMConfig
    from modeling_internlm import InternLMForCausalLM
    config=InternLMConfig.from_pretrained("internLM/config.json")
    config.num_hidden_layers=1
    model = InternLMForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,512))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="internLM.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def internlm2_20b(use_half,device):
    import sys
    sys.path.append("./internlm2_20b")
    from configuration_internlm2 import InternLM2Config
    from modeling_internlm2 import InternLM2ForCausalLM
    config=InternLM2Config.from_pretrained("internlm2_20b/config.json")
    config.num_hidden_layers=1
    model = InternLM2ForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,512))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="internlm2_20b.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def llama_13b(use_half,device):
    from transformers.models.llama import LlamaForCausalLM, LlamaConfig
    config=LlamaConfig.from_pretrained("llama_13b/config.json")
    config.num_hidden_layers=1
    model = LlamaForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,config.max_sequence_length))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="llama_13b.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def Llama2_13B_chat(use_half,device):
    from transformers.models.llama import LlamaForCausalLM, LlamaConfig
    config=LlamaConfig.from_pretrained("Llama2_13B_chat/config.json")
    config.num_hidden_layers=1
    model = LlamaForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,128))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="Llama2_13B_chat.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def Llama3_8B_Chinese_Chat(use_half,device):
    from transformers.models.llama import LlamaForCausalLM, LlamaConfig
    config=LlamaConfig.from_pretrained("Llama3_8B_Chinese_Chat/config.json")
    config.num_hidden_layers=1
    model = LlamaForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,128))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="Llama3_8B_Chinese_Chat.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def Mixtral_8x22B(use_half,device):
    import sys
    sys.path.append("./Mixtral_8x22B")
    from configuration_mixtral import MixtralConfig
    from modeling_mixtral import MixtralForCausalLM
    config=MixtralConfig.from_pretrained("Mixtral_8x22B/config.json")
    config.num_hidden_layers=1
    model = MixtralForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,512))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="Mixtral_8x22B.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def OLMo_7B(use_half,device):
    import sys
    sys.path.append("./OLMo_7B")
    from configuration_olmo import OLMoConfig
    from modeling_olmo import OLMoForCausalLM
    config=OLMoConfig.from_pretrained("OLMo_7B/config.json")
    config.n_layers=1
    model = OLMoForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,512))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="OLMo_7B.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def Phi3_mini_4k_instruct(use_half,device):
    import sys
    sys.path.append("./Phi3_mini_4k_instruct")
    from configuration_phi3 import Phi3Config
    from modeling_phi3 import Phi3ForCausalLM
    config=Phi3Config.from_pretrained("Phi3_mini_4k_instruct/config.json")
    config.num_hidden_layers=1
    model = Phi3ForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,512))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="Phi3_mini_4k_instruct.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def OpenELM_3B(use_half,device):
    import sys
    sys.path.append("./OpenELM_3B")
    from configuration_openelm import OpenELMConfig
    from modeling_openelm import OpenELMForCausalLM
    config=OpenELMConfig.from_pretrained("OpenELM_3B/config.json")
    config.num_transformer_layers=1
    model = OpenELMForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,512))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="OpenELM_3B.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def Qwen_14B_Chat(use_half,device):
    import sys
    sys.path.append("./Qwen_14B_Chat")
    from configuration_qwen import QWenConfig
    from modeling_qwen import QWenLMHeadModel
    config=QWenConfig.from_pretrained("Qwen_14B_Chat/config.json")
    config.num_hidden_layers=1
    model = QWenLMHeadModel(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,512))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="Qwen_14B_Chat.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def Qwen1_5_7B(use_half,device):
    import sys
    sys.path.append("./Qwen1_5_7B")
    from configuration_qwen2 import Qwen2Config
    from modeling_qwen2 import Qwen2ForCausalLM
    config=Qwen2Config.from_pretrained("Qwen1_5_7B/config.json")
    config.num_hidden_layers=1
    model = Qwen2ForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,512))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="Qwen1_5_7B.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
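# t5_base is an encoder-decoder model, so it is driven through generate()
# instead of a plain forward pass; the backward pass is skipped for it.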
@llm_forward
def t5_base(use_half,device):
    import sys
    sys.path.append("./t5_base")
    from transformers import T5Config, T5ForConditionalGeneration
    config=T5Config.from_pretrained("t5_base/config.json")
    config.num_layers=1
    config.max_new_tokens=512
    model = T5ForConditionalGeneration(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,config.max_new_tokens))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="t5_base.pkl"):
        output=model.generate(input_tokens.to(device))
        #logits=output
        #loss=logits.mean()-1.0
        #loss.backward()
@llm_forward
def XVERSE_7B(use_half,device):
    import sys
    sys.path.append("./XVERSE_7B")
    from configuration_xverse import XverseConfig
    from modeling_xverse import XverseForCausalLM
    config=XverseConfig.from_pretrained("XVERSE_7B/config.json")
    config.num_hidden_layers=1
    model = XverseForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,512))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="XVERSE_7B.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def Yi_34B(use_half,device):
    from transformers.models.llama import LlamaForCausalLM, LlamaConfig
    config=LlamaConfig.from_pretrained("Yi_34B/config.json")
    config.num_hidden_layers=1
    model = LlamaForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,config.max_position_embeddings//10))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="Yi_34B.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
@llm_forward
def Yuan2_51B_hf(use_half,device):
    import sys
    sys.path.append("./Yuan2_51B_hf")
    from configuration_yuan import YuanConfig
    from yuan_hf_model import YuanForCausalLM
    config=YuanConfig.from_pretrained("Yuan2_51B_hf/config.json")
    config.num_hidden_layers=1
    config.intermediate_size=2048
    config.model_max_length=config.max_position_embeddings=2
    model = YuanForCausalLM(config)
    if use_half:
        model=model.half()
    model.train().to(device)
    input_tokens=torch.randint(0,config.vocab_size,(1,2))
    with TorchDumper(TorchDumpDispatchMode,op_log_path="Yuan2_51B_hf.pkl"):
        output=model(input_tokens.to(device))
        logits=output.logits
        loss=logits.mean()-1.0
        loss.backward()
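# Run every registered model in its own subprocess so that all GPU memory is
# released when the process exits; a single shared process would accumulate
# allocations across models.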
def main():
    global op_mapping
    device="cuda"
    use_half=True
    pbar=tqdm(list(op_mapping.keys()))
    for name in pbar:
        torch.manual_seed(1)
        p = mp.Process(target=op_mapping[name],args=(use_half,device))
        p.start()
        p.join()
        torch.cuda.empty_cache()
        pbar.set_description("%s" % (name))

if __name__=='__main__':
    main()
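After a run, each model leaves behind a <model>.pkl operator log. Assuming the record layout from the sketch above (a list of (op name, args, kwargs) entries; the real torch_hook format may differ), the covered operators can be summarized like this:

# Hypothetical inspection script; adjust the record layout to match torch_hook.
import pickle
from collections import Counter

with open("GPT2.pkl", "rb") as f:
    op_log = pickle.load(f)

counts = Counter(record[0] for record in op_log)  # op name -> call count
for name, count in counts.most_common():
    print(f"{count:6d}  {name}")
print("distinct ops:", len(counts))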