Copyright © 2022-2024 aizws.net · 网站版本: v1.2.6·内部版本: v1.23.4·
页面加载耗时 0.00 毫秒·物理内存 102.0MB ·虚拟内存 1305.6MB
欢迎来到 AI 中文社区(简称 AI 中文社),这里是学习交流 AI 人工智能技术的中文社区。 为了更好的体验,本站推荐使用 Chrome 浏览器。
大语言模型的微调一直是说起来容易做起来难的事儿。近日 Hugging Face 技术主管 Philipp Schmid 发表了一篇博客,详细讲解了如何利用 Hugging Face 上的库和 fsdp 以及 Q-Lora 对大模型进行微调。
# Install Pytorch for FSDP and FA/SDPA
%pip install "torch==2.2.2" tensorboard
# Install Hugging Face librarie
s
%pip install --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
from datasets import load_dataset
# Convert dataset to OAI messages
system_message = """You are Llama, an AI assistant created by Philipp to be helpful and honest. Your knowledge spans a wide range of topics, allowing you to engage in substantive conversations and provide analysis on complex subjects."""
def create_conversation(sample):
if sample["messages"][0]["role"] == "system":
return sample
else:
sample["messages"] = [{"role": "system", "content": system_message}] + sample["messages"]
return sample
# Load dataset from the hub
dataset = load_dataset("HuggingFaceH4/no_robots")
# Add system message to each conversation
columns_to_remove = list(dataset["train"].features)
columns_to_remove.remove("messages")
dataset = dataset.map(create_conversation, remove_columns=columns_to_remove,batched=False)
# Filter out conversations which are corrupted with wrong turns, keep which have even number of turns after adding system message
dataset["train"] = dataset["train"].filter(lambda x: len(x["messages"][1:]) % 2 == 0)
dataset["test"] = dataset["test"].filter(lambda x: len(x["messages"][1:]) % 2 == 0)
# save datasets to disk
dataset["train"].to_json("train_dataset.json", orient="records", force_ascii=False)
dataset["test"].to_json("test_dataset.json", orient="records", force_ascii=False)
%%writefile llama_3_70b_fsdp_qlora.yaml
# script parameters
model_id: "meta-llama/Meta-Llama-3-70b" # Hugging Face model id
dataset_path: "." # path to dataset
max_seq_len: 3072 # 2048 # max sequence length for model and packing of the dataset
# training parameters
output_dir: "./llama-3-70b-hf-no-robot" # Temporary output directory for model checkpoints
report_to: "tensorboard" # report metrics to tensorboard
learning_rate: 0.0002 # learning rate 2e-4
lr_scheduler_type: "constant" # learning rate scheduler
num_train_epochs: 3 # number of training epochs
per_device_train_batch_size: 1 # batch size per device during training
per_device_eval_batch_size: 1 # batch size for evaluation
gradient_accumulation_steps: 2 # number of steps before performing a backward/update pass
optim: adamw_torch # use torch adamw optimizer
logging_steps: 10 # log every 10 steps
save_strategy: epoch # save checkpoint every epoch
evaluation_strategy: epoch # evaluate every epoch
max_grad_norm: 0.3 # max gradient norm
warmup_ratio: 0.03 # warmup ratio
bf16: true # use bfloat16 precision
tf32: true # use tf32 precision
gradient_checkpointing: true # use gradient checkpointing to save memory
# FSDP parameters: https://huggingface.co/docs/transformers/main/en/fsdp
fsdp: "full_shard auto_wrap offload" # remove offload if enough GPU memory
fsdp_config:
backward_prefetch: "backward_pre"
forward_prefetch: "false"
use_orig_params: "false"
!ACCELERATE_USE_FSDP=1 FSDP_CPU_RAM_EFFICIENT_LOADING=1 torchrun --nproc_per_node=4 ./scripts/run_fsdp_qlora.py --config llama_3_70b_fsdp_qlora.yaml
#### COMMENT IN TO MERGE PEFT AND BASE MODEL ####
# from peft import AutoPeftModelForCausalLM
# # Load PEFT model on CPU
# model = AutoPeftModelForCausalLM.from_pretrained(
# args.output_dir,
# torch_dtype=torch.float16,
# low_cpu_mem_usage=True,
# )
# # Merge LoRA and base model and save
# merged_model = model.merge_and_unload()
# merged_model.save_pretrained(args.output_dir,safe_serialization=True, max_shard_size="2GB")
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
peft_model_id = "./llama-3-70b-hf-no-robot"
# Load Model with PEFT adapter
model = AutoPeftModelForCausalLM.from_pretrained(
peft_model_id,
torch_dtype=torch.float16,
quantization_config= {"load_in_4bit": True},
device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
from datasets import load_dataset
from random import randint
# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
rand_idx = randint(0, len(eval_dataset))
messages = eval_dataset[rand_idx]["messages"][:2]
# Test on sample
input_ids = tokenizer.apply_chat_template(messages,add_generation_prompt=True,return_tensors="pt").to(model.device)
outputs = model.generate(
input_ids,
max_new_tokens=512,
eos_token_id= tokenizer.eos_token_id,
do_sample=True,
temperature=0.6,
top_p=0.9,
)
response = outputs[0][input_ids.shape[-1]:]
print(f"**Query:**\n{eval_dataset[rand_idx]['messages'][1]['content']}\n")
print(f"**Original Answer:**\n{eval_dataset[rand_idx]['messages'][2]['content']}\n")
print(f"**Generated Answer:**\n{tokenizer.decode(response,skip_special_tokens=True)}")
# **Query:**
# How long was the Revolutionary War?
# **Original Answer:**
# The American Revolutionary War lasted just over seven years. The war started on April 19, 1775, and ended on September 3, 1783.
# **Generated Answer:**
# The Revolutionary War, also known as the American Revolution, was an 18th-century war fought between the Kingdom of Great Britain and the Thirteen Colonies. The war lasted from 1775 to 1783.
声明:本文转载自机器之心,转载目的在于传递更多信息,并不代表本社区赞同其观点和对其真实性负责,本文只提供参考并不构成任何建议,若有版权等问题,点击这里。