
Hugging Face PEFT LoRA instruction fine-tuning of glm4-9b-chat

😍Fine-tuning with the Hugging Face PEFT package

Install the PEFT package + imports

from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
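
The imports above assume the relevant packages are already installed. A typical install command (the package list here is a reasonable guess, not from the original post; install PyTorch separately according to your CUDA setup) is:

pip install peft transformers datasets accelerate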

Preparing the training dataset

Training set format:

JSON format, made up of three fields: instruction, input, output

[
    {
        "instruction": "你是农作物领域专门进行关系抽取的专家。请从给定的文本中抽取出关系三元组,不存在的关系返回空列表。请按照JSON字符串的格式回答。",
        "input": "煤是一种常见的化石燃料,家庭用煤经过了从\"煤球\"到\"蜂窝煤\"的演变。",
        "output": "[{\"head\": \"煤\", \"relation\": \"use\", \"tail\": \"燃料\"}]"
    },
    {
        "instruction": "你是农作物领域专门进行关系抽取的专家。请从给定的文本中抽取出关系三元组,不存在的关系返回空列表。请按照JSON字符串的格式回答。",
        "input": "内分泌疾病是指内分泌腺或内分泌组织本身的分泌功能和(或)结构异常时发生的症候群。",
        "output": "[{\"head\": \"腺\", \"relation\": \"use\", \"tail\": \"分泌\"}]"
    }
]

Data loading and preprocessing

Convert the training dataset into the data format required by GLM.

The prompt template GLM expects is shown below. Instruction fine-tuning requires converting each <instruction, input, output> triple into this form; plain <input, output> fine-tuning does not need such an elaborate training-data format.

[gMASK]<sop>
<|system|> 你是关系抽取专家。
<|user|> 煤是一种常见的化石燃料,家庭用煤经过了从\"煤球\"到\"蜂窝煤\"的演变。
<|assistant|> [{\"head\": \"煤\", \"relation\": \"use\", \"tail\": \"燃料\"}] <|endoftext|>

Next, convert each JSON object in crop_train.json into the template form above, then tokenize and index it to produce the input_ids, attention_mask, and labels fields.

import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer


dataset = load_dataset("json", data_files="./crop_train.json", split="train")
print(f"dataset: {dataset}")

tokenizer = AutoTokenizer.from_pretrained("./glm-4-9b-chat", trust_remote_code=True)
print(f"tokenizer: {tokenizer}")

def process_func(example):
    MAX_LENGTH = 256
    input_ids, attention_mask, labels = [], [], []
    # Merge the example's instruction and input fields into a single query string
    instruction = f"{example['instruction']} {example['input']}".strip()  # query
    instruction = tokenizer.apply_chat_template([{"role": "user", "content": instruction}],
                                                add_generation_prompt=True,
                                                tokenize=True,
                                                return_tensors="pt",
                                                return_dict=True
                                                )  # '[gMASK] <sop> <|user|> \nquery <|assistant|>'

    # Handle example["output"] depending on whether it is a list or a plain string
    if isinstance(example["output"], list):
        response_text = "\n".join(example["output"])
    else:
        response_text = "\n" + example["output"]

    response = tokenizer(response_text, add_special_tokens=False)  # "\n" + response; the eos token is appended below
    input_ids = instruction["input_ids"][0].numpy().tolist() + response["input_ids"] + [tokenizer.eos_token_id]
    attention_mask = instruction["attention_mask"][0].numpy().tolist() + response["attention_mask"] + [1]
    labels = [-100] * len(instruction["input_ids"][0].numpy().tolist()) + response["input_ids"] + [tokenizer.eos_token_id]
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

tokenized_ds = dataset.map(process_func, remove_columns=['instruction', 'input', 'output'])
print(f"All tokenizer tokens ids: {tokenized_ds}")     # features: ['input_ids', 'attention_mask', 'labels'],

# Each element of tokenized_ds contains input_ids, attention_mask and labels
input_ids_1 = tokenized_ds[0]["input_ids"]
attention_mask_1 = tokenized_ds[0]["attention_mask"]
labels_1 = tokenized_ds[0]["labels"]
print(f"input_ids_1: {input_ids_1}")
print(f"attention_mask_1: {attention_mask_1}")
print(f"labels_1: {labels_1}")

input_text_1 = tokenizer.decode(input_ids_1)
print(f"input_ids_1_decode: {input_text_1}")


Download and create the model

import torch
model = AutoModelForCausalLM.from_pretrained("./glm-4-9b-chat", trust_remote_code=True, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, device_map="auto")

PEFT LoRA code

The LoRA config

from peft import LoraConfig, TaskType, get_peft_model, PeftModel
config = LoraConfig(target_modules=["query_key_value"], modules_to_save=["post_attention_layernorm"])
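
The config above falls back to PEFT's defaults for the LoRA rank and scaling. A more explicit variant looks like the sketch below; the hyperparameter values (r, lora_alpha, lora_dropout) are illustrative, not taken from the original post:

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,          # causal language modeling task
    target_modules=["query_key_value"],    # GLM's fused QKV projection
    modules_to_save=["post_attention_layernorm"],
    r=8,                                   # LoRA rank
    lora_alpha=32,                         # LoRA scaling factor
    lora_dropout=0.1,
)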

Create the PEFT model

model = get_peft_model(model, config)
model.print_trainable_parameters()   # reports trainable params vs. all params

Configure the training arguments

args = TrainingArguments(
    output_dir="./chatbot",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    logging_steps=100,
    num_train_epochs=10,
    learning_rate=1e-4,
    remove_unused_columns=False,
    save_strategy="epoch"
)
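
One practical note (not from the original post): with gradient_checkpointing=True and a frozen LoRA base model, backpropagation can fail because the checkpointed inputs do not require gradients. If that happens, call the following on the PEFT model before creating the Trainer:

# Make embedding outputs require grad so gradient checkpointing can backpropagate
# through the frozen base model (a common fix when training LoRA adapters).
model.enable_input_require_grads()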

Create the trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds.select(range(10000)),  # first 10,000 samples; shrink this if the dataset is smaller
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

Start training

trainer.train()

Save the weights

lora_path='./GLM4'
trainer.model.save_pretrained(lora_path)
tokenizer.save_pretrained(lora_path)

Inference with the base model plus the trained LoRA weights

With PeftModel, the base model and the LoRA-trained weights can be loaded together:

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

model_path = './glm-4-9b-chat'   # same base model directory used for training
lora_path = './GLM4'             # directory the LoRA weights were saved to above

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Load the base model
model = AutoModelForCausalLM.from_pretrained(
    model_path, 
    device_map="auto",
    torch_dtype=torch.bfloat16, 
    trust_remote_code=True).eval()

# Load the LoRA weights
model = PeftModel.from_pretrained(model, model_id=lora_path)

prompt = "你是谁?"
inputs = tokenizer.apply_chat_template([{"role": "system", "content": "假设你是皇帝身边的女人--甄嬛。"},
                                        {"role": "user", "content": prompt}],
                                       add_generation_prompt=True,
                                       tokenize=True,
                                       return_tensors="pt",
                                       return_dict=True
                                       ).to('cuda')


gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
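
Since the adapter was trained on the crop relation-extraction data above, a prompt that mirrors the training format is a better test of the fine-tuned behavior. The sketch below simply reuses the instruction and input from the dataset example:

# Illustrative: query the adapter with the same instruction format used in training
ie_prompt = ('你是农作物领域专门进行关系抽取的专家。请从给定的文本中抽取出关系三元组,'
             '不存在的关系返回空列表。请按照JSON字符串的格式回答。 '
             '煤是一种常见的化石燃料,家庭用煤经过了从"煤球"到"蜂窝煤"的演变。')
ie_inputs = tokenizer.apply_chat_template([{"role": "user", "content": ie_prompt}],
                                          add_generation_prompt=True,
                                          tokenize=True,
                                          return_tensors="pt",
                                          return_dict=True).to('cuda')
with torch.no_grad():
    ie_outputs = model.generate(**ie_inputs, max_new_tokens=256)
    ie_outputs = ie_outputs[:, ie_inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(ie_outputs[0], skip_special_tokens=True))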

Merging the LoRA model

import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, GenerationConfig
from peft import PeftModel

base_model = "./glm-4-9b-chat"               # path to the base model
config_kwargs = {"trust_remote_code": True}  # extra kwargs for from_pretrained (GLM needs remote code)

# Load the pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    use_fast=True,
    padding_side="left", **config_kwargs)
print("Tokenizer Load Success!")

config = AutoConfig.from_pretrained(base_model, **config_kwargs)

# Load and prepare pretrained models (without valuehead).
model = AutoModelForCausalLM.from_pretrained(
        base_model,
        config=config,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        revision='main'
)

print('origin config =', model.config)
# Merge the LoRA weights into the base model

lora_path = "./save_lora"
model = PeftModel.from_pretrained(model, lora_path)
model = model.merge_and_unload()
print('merge config =', model.config)
# Save the merged model
save_path = "./save_merge_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
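
To confirm the merge worked, the merged checkpoint can be loaded back as a plain Transformers model, with no PEFT wrapper needed (a minimal sketch):

# Illustrative check: the merged model loads like any regular HF checkpoint
merged_model = AutoModelForCausalLM.from_pretrained(
    save_path,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto").eval()
merged_tokenizer = AutoTokenizer.from_pretrained(save_path, trust_remote_code=True)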

Original article: https://blog.csdn.net/weixin_45947938/article/details/144690884
