Instruction Fine-Tuning glm4-9b-chat with Hugging Face PEFT LoRA
😍Fine-tuning with the Hugging Face PEFT package
Install the PEFT package and import dependencies
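Only the imports are shown in the article; a typical installation from PyPI might look like this (versions are not pinned here, and accelerate is assumed because device_map="auto" is used further down):
pip install peft transformers datasets accelerate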
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
Preparing the training dataset
Training set format:
JSON, with three fields per record: instruction, input, output
[
  {
    "instruction": "你是农作物领域专门进行关系抽取的专家。请从给定的文本中抽取出关系三元组,不存在的关系返回空列表。请按照JSON字符串的格式回答。",
    "input": "煤是一种常见的化石燃料,家庭用煤经过了从\"煤球\"到\"蜂窝煤\"的演变。",
    "output": "[{\"head\": \"煤\", \"relation\": \"use\", \"tail\": \"燃料\"}]"
  },
  {
    "instruction": "你是农作物领域专门进行关系抽取的专家。请从给定的文本中抽取出关系三元组,不存在的关系返回空列表。请按照JSON字符串的格式回答。",
    "input": "内分泌疾病是指内分泌腺或内分泌组织本身的分泌功能和(或)结构异常时发生的症候群。",
    "output": "[{\"head\": \"腺\", \"relation\": \"use\", \"tail\": \"分泌\"}]"
  }
]
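A quick sanity check of this file can catch malformed records before training; this is a minimal sketch, assuming the file name crop_train.json used in the loading code below:
import json

with open("crop_train.json", encoding="utf-8") as f:
    records = json.load(f)

# Each record must expose exactly the three fields consumed later by process_func
for r in records:
    assert set(r) == {"instruction", "input", "output"}
print(f"{len(records)} records look well-formed")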
Data loading and preprocessing
Convert the training dataset into the format GLM expects.
The prompt template required by GLM is shown below.
Instruction fine-tuning requires rendering each <instruction, input, output> triple into the following form:
(Ordinary <input, output> fine-tuning does not need such an elaborate training data format.)
[gMASK]<sop>
<|system|> 你是关系抽取专家。
<|user|> 煤是一种常见的化石燃料,家庭用煤经过了从\"煤球\"到\"蜂窝煤\"的演变。
<|assistant|> [{\"head\": \"煤\", \"relation\": \"use\", \"tail\": \"燃料\"}] <|endoftext|>
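One way to confirm that the tokenizer reproduces this layout is to render the chat template as plain text instead of token ids; this is a quick check rather than part of the training pipeline, and it assumes the tokenizer loaded in the code below:
preview = tokenizer.apply_chat_template(
    [{"role": "system", "content": "你是关系抽取专家。"},
     {"role": "user", "content": "煤是一种常见的化石燃料,家庭用煤经过了从\"煤球\"到\"蜂窝煤\"的演变。"}],
    add_generation_prompt=True,
    tokenize=False
)
print(preview)  # should begin with [gMASK]<sop> and end with <|assistant|>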
Next, render every JSON object in crop_train.json into the template above, then tokenize it to produce the input_ids, attention_mask, and labels.
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
dataset = load_dataset("json", data_files="./crop_train.json", split="train")
print(f"dataset: {dataset}")
tokenizer = AutoTokenizer.from_pretrained("./glm-4-9b-chat", trust_remote_code=True)
print(f"tokenizer: {tokenizer}")
def process_func(example):
    MAX_LENGTH = 256
    input_ids, attention_mask, labels = [], [], []
    # Merge the example's instruction and input fields into a single query string
    instruction = f"{example['instruction']} {example['input']}".strip()  # query
    instruction = tokenizer.apply_chat_template([{"role": "user", "content": instruction}],
                                                add_generation_prompt=True,
                                                tokenize=True,
                                                return_tensors="pt",
                                                return_dict=True
                                                )  # '[gMASK] <sop> <|user|> \nquery <|assistant|>'
    # Check whether example["output"] is a list and handle it accordingly
    if isinstance(example["output"], list):
        response_text = "\n".join(example["output"])
    else:
        response_text = "\n" + example["output"]
    response = tokenizer(response_text, add_special_tokens=False)  # "\n" + response, still without the eos token
    input_ids = instruction["input_ids"][0].numpy().tolist() + response["input_ids"] + [tokenizer.eos_token_id]
    attention_mask = instruction["attention_mask"][0].numpy().tolist() + response["attention_mask"] + [1]
    labels = [-100] * len(instruction["input_ids"][0].numpy().tolist()) + response["input_ids"] + [tokenizer.eos_token_id]
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }
tokenized_ds = dataset.map(process_func, remove_columns=['instruction', 'input', 'output'])
print(f"All tokenizer tokens ids: {tokenized_ds}") # features: ['input_ids', 'attention_mask', 'labels'],
# tokenized_ds now holds the input_ids, attention_mask, and labels for every example
input_ids_1 = tokenized_ds[0]["input_ids"]
attention_mask_1 = tokenized_ds[0]["attention_mask"]
labels_1 = tokenized_ds[0]["labels"]
print(f"input_ids_1: {input_ids_1}")
print(f"attention_mask_1: {attention_mask_1}")
print(f"labels_1: {labels_1}")
input_text_1 = tokenizer.decode(input_ids_1)
print(f"input_ids_1_decode: {input_text_1}")
Download and create the model
import torch
model = AutoModelForCausalLM.from_pretrained("./glm-4-9b-chat", trust_remote_code=True, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, device_map="auto")
PEFT LoRA code
LoRA configuration (config)
from peft import LoraConfig, TaskType, get_peft_model, PeftModel
config = LoraConfig(target_modules=["query_key_value"], modules_to_save=["post_attention_layernorm"])
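The one-liner above leaves the adapter size to PEFT's defaults. A more explicit variant could look like the sketch below; the r / lora_alpha / lora_dropout values are common starting points, not settings taken from the article:
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,                   # causal-LM task, matching AutoModelForCausalLM
    target_modules=["query_key_value"],             # GLM's fused attention projection
    modules_to_save=["post_attention_layernorm"],   # also train (and save) these layers in full
    r=8,                                            # LoRA rank -- illustrative default
    lora_alpha=32,                                  # scaling factor -- illustrative default
    lora_dropout=0.1                                # illustrative default
)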
Create the PEFT model
model = get_peft_model(model, config)
model.print_trainable_parameters()  # prints trainable params vs. all params
Configure the training arguments
args = TrainingArguments(
    output_dir="./chatbot",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    logging_steps=100,
    num_train_epochs=10,
    learning_rate=1e-4,
    remove_unused_columns=False,
    save_strategy="epoch"
)
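One caveat: with gradient_checkpointing=True and a frozen base model, training can fail with an error like "element 0 of tensors does not require grad". A common workaround (not shown in the original article) is to let the embedding outputs require gradients:
# Keep a grad path through the frozen backbone when activation checkpointing is on
model.enable_input_require_grads()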
Create the trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds.select(range(10000)),
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)
Start training
trainer.train()
Save the weights
lora_path='./GLM4'
trainer.model.save_pretrained(lora_path)
tokenizer.save_pretrained(lora_path)
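Saving a PEFT model writes only the adapter, not the full 9B-parameter weights, so the checkpoint stays small. A quick look at the output directory (file names depend on the PEFT version):
import os
print(os.listdir(lora_path))  # typically adapter_config.json, adapter_model.safetensors, plus the tokenizer files saved above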
Loading the trained LoRA weights into the base model for inference
With the PeftModel class, the original model and the trained LoRA adapter can be loaded together.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel
model_path = '/root/autodl-tmp/glm-4-9b-chat/ZhipuAI/glm-4-9b-chat'  # path to the base glm-4-9b-chat model
lora_path = './GLM4_lora'  # path to the trained LoRA adapter (adjust to where it was saved, e.g. ./GLM4 above)
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Load the base model
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval()
# Load the LoRA weights
model = PeftModel.from_pretrained(model, model_id=lora_path)
prompt = "你是谁?"
inputs = tokenizer.apply_chat_template([{"role": "system", "content": "假设你是皇帝身边的女人--甄嬛。"},
                                        {"role": "user", "content": prompt}],
                                       add_generation_prompt=True,
                                       tokenize=True,
                                       return_tensors="pt",
                                       return_dict=True
                                       ).to('cuda')
gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
Merging the LoRA model into the base model
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, GenerationConfig
from peft import PeftModel
import torch

base_model = "./glm-4-9b-chat"               # path to the base model (placeholder -- not defined in the original article)
config_kwargs = {"trust_remote_code": True}  # extra kwargs forwarded to from_pretrained (assumed -- not defined in the original article)
# Load the pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    use_fast=True,
    padding_side="left", **config_kwargs)
print("Tokenizer Load Success!")
config = AutoConfig.from_pretrained(base_model, **config_kwargs)
# Load and prepare the pretrained model (without value head).
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    config=config,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    revision='main'
)
print('origin config =', model.config)
# Merge the LoRA weights into the base model
lora_path = "./save_lora"  # path to the trained LoRA adapter (adjust to where it was saved)
model = PeftModel.from_pretrained(model, lora_path)
model = model.merge_and_unload()
print('merge config =', model.config)
# Save the merged model
save_path = "./save_merge_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
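Once merged, the directory at save_path behaves like an ordinary checkpoint and can be loaded without PEFT. A minimal sketch (the generation settings here are illustrative, not from the article):
merged_model = AutoModelForCausalLM.from_pretrained(
    save_path, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
).eval()
merged_tokenizer = AutoTokenizer.from_pretrained(save_path, trust_remote_code=True)
inputs = merged_tokenizer.apply_chat_template(
    [{"role": "user", "content": "你是谁?"}],
    add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
).to(merged_model.device)
with torch.no_grad():
    out = merged_model.generate(**inputs, max_new_tokens=128)
    print(merged_tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))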
Original article: https://blog.csdn.net/weixin_45947938/article/details/144690884