Large Language Models
Concept
- A deep learning model trained on massive amounts of text data, able to understand and generate natural language
- Every word is represented as a vector; words with similar meanings have vectors that lie close together
- The next word is generated one step at a time, according to probability (see the sketch below)
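A minimal sketch of the "next word by probability" idea (not from the original notes); it assumes a local causal LM in ./model, the same path used in the demo below, and prints the most likely candidates for the next token:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("./model")
model = AutoModelForCausalLM.from_pretrained("./model", dtype=torch.float16, device_map="auto")

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
with torch.no_grad():
    logits = model(**inputs).logits              # shape: [batch, seq_len, vocab_size]
probs = torch.softmax(logits[0, -1], dim=-1)     # probability distribution over the next token
top = torch.topk(probs, k=5)
for prob, token_id in zip(top.values, top.indices):
    print(f"{tokenizer.decode([int(token_id)])!r}: {prob.item():.3f}")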
Deployment
Download
Environment
# NVIDIA GPU: check the driver version
nvidia-smi
# Check the CUDA version; if it is missing, install CUDA from the NVIDIA website
nvcc --version
# Look up the PyTorch build that matches your CUDA version
https://pytorch.org/get-started/locally
# Install
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126
# Verify
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
Usage
pip install transformers accelerate peft datasets tensorboard
# Environment
python:3.13
accelerate:1.13.0
torch:2.10.0+cu126
tokenizers:0.22.2
transformers:5.3.0
peft:0.18.1
datasets:4.8.2
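A quick way (not from the original notes) to confirm the installed versions match the list above; each of these packages exposes a __version__ attribute:
import torch, transformers, tokenizers, peft, datasets, accelerate
for pkg in (torch, transformers, tokenizers, peft, datasets, accelerate):
    print(pkg.__name__, pkg.__version__)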
# demo
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Path of the folder that contains all of the model files
model_name = "./model"
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the model; AutoModelForCausalLM detects the model type and loads the matching causal language model
model = AutoModelForCausalLM.from_pretrained(
model_name,
dtype=torch.float16,
device_map="auto"
)
if "cuda" in str(model.device):
print(f"模型运行在GPU上: {model.device}")
print(f"显存使用量: {torch.cuda.memory_allocated() / 1024 ** 3} GB")
else:
print("模型运行在CPU上")
# Prepare the input
text = "What should I eat on Thursday?"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
# Generate, with gradient computation disabled
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        num_return_sequences=3,  # number of sampled sequences to return
        max_new_tokens=999,      # maximum number of generated tokens
        do_sample=True,          # sample instead of greedy decoding
        temperature=0.5,         # sampling temperature, controls randomness
        top_p=0.9                # nucleus sampling, controls diversity
    )
# Decode and print every returned sequence (num_return_sequences=3 above)
for output in outputs:
    print(tokenizer.decode(output, skip_special_tokens=True))
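The demo above feeds raw text straight into the tokenizer. For an instruction/chat-tuned model the prompt usually needs to go through the tokenizer's chat template first (the same apply_chat_template used in the LoRA section below); a short sketch reusing the model and tokenizer already loaded above:
messages = [{"role": "user", "content": "What should I eat on Thursday?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)
# strip the prompt tokens so only the newly generated answer is printed
print(tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))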
LoRA fine-tuning
Fine-tuning
# LoRA fine-tuning
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
# To load previously trained LoRA weights: PeftModel.from_pretrained(base_model, "./lora_qwen_output/checkpoint-24")
from peft import PeftModel
# View the training logs with: tensorboard --logdir ./logs
import tensorboard  # only imported to confirm the package is installed; Trainer writes the logs itself
# Model path
model_name_or_path = "./model"
# Training output directory
output_dir = "./lora_qwen_output"
# Configure the LoRA hyperparameters
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # causal language modeling task
    r=8,                           # LoRA rank
    lora_alpha=32,                 # LoRA scaling factor
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # modules to apply LoRA to
    lora_dropout=0.1,              # dropout probability
    bias="none"                    # do not train bias terms
)
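# Background note (not from the original text): for each target module LoRA freezes the
# original weight W and learns two small matrices, A (r x d_in) and B (d_out x r); the
# effective weight becomes W + (lora_alpha / r) * B @ A, and only A and B receive gradients,
# which is why the number of trainable parameters printed later is so small.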
# Training parameters
epochs = 3                        # number of training epochs
batch_size = 1                    # batch size per GPU
gradient_accumulation_steps = 4   # gradient accumulation steps (effective batch size = 1 * 4 = 4)
learning_rate = 2e-4              # learning rate
max_length = 512                  # maximum input length
logging_steps = 10                # logging interval
save_steps = 50                   # save a checkpoint every this many steps
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=epochs,
    max_steps=120,                 # hard cap on optimizer steps; when set, it overrides num_train_epochs
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    logging_steps=logging_steps,
    logging_dir="./logs",
    save_steps=save_steps,
    save_total_limit=2,            # keep only the 2 most recent checkpoints
    fp16=True,                     # mixed precision training (matches the half-precision model)
    remove_unused_columns=False,   # keep all columns; the collator needs "input_ids" etc.
    report_to="tensorboard",       # send training logs to TensorBoard (the ./logs directory above)
)
tokenizer = AutoTokenizer.from_pretrained(
model_name_or_path,
trust_remote_code=True
)
# Some models define no padding token; fall back to the EOS token so batching works
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Training data: a tiny hand-written dialog set, repeated 5 times
dialogs = []
for i in range(5):
    dialogs.append({
        "messages": [
            {"role": "user", "content": "Hello! I am Qin Shi Huang!"},
            {"role": "assistant", "content": "Your Majesty!"}
        ]
    })
    dialogs.append({
        "messages": [
            {"role": "user", "content": "What is a large language model?"},
            {"role": "assistant", "content": "It is just next-word continuation"}
        ]
    })
    dialogs.append({
        "messages": [
            {"role": "user", "content": "Who is Amuro's arch-rival?"},
            {"role": "assistant", "content": "Char"}
        ]
    })
def format_dialog(example):
    text = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,                # do not tokenize, just return the formatted string
        add_generation_prompt=False    # no generation prompt during training; the assistant turn is part of the target
    )
    return {"text": text}
# Build a HuggingFace Dataset object
dataset = Dataset.from_list(dialogs)
dataset = dataset.map(format_dialog, remove_columns=["messages"])
# Tokenize the text
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        padding=False,       # no padding here; the DataCollator pads dynamically
        return_tensors=None  # return plain lists; the collator converts them later
    )
dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# At this point the dataset contains "input_ids" and "attention_mask" fields
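# (Optional check, not in the original notes) inspect one tokenized example to confirm the
# chat template and tokenizer produced something sensible
print(dataset[0]["input_ids"][:20])
print(tokenizer.decode(dataset[0]["input_ids"]))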
# Load the model in half precision
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    dtype=torch.float16,  # half precision, halves the GPU memory footprint
    device_map=None,
    trust_remote_code=True
)
model = model.cuda()
# To continue from existing LoRA weights, load them here instead of creating a new adapter:
# model = PeftModel.from_pretrained(model, "./lora_qwen_output/checkpoint-27")
# Enable gradient checkpointing (optional; saves more memory at the cost of speed)
# model.gradient_checkpointing_enable()
# Apply LoRA to the model
model = get_peft_model(model, lora_config)
# Print the number of trainable parameters to confirm LoRA is active
model.print_trainable_parameters()
# Data collator for causal language modeling: pads each batch dynamically and creates
# labels that are a copy of input_ids
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal LM, not masked LM
)
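# Background note (not from the original text): the collator also sets the label of padding
# positions to -100, which the loss function ignores, so padded tokens do not affect training.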
# Start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)
# Train from scratch
trainer.train(resume_from_checkpoint=None)
# Resume training from a checkpoint
# trainer.train(resume_from_checkpoint="./lora_qwen_output/checkpoint-114")
# Save the LoRA adapter
model.save_pretrained(output_dir)
# Save the tokenizer
tokenizer.save_pretrained(output_dir)
print(f"Training finished! Saved to {output_dir}")
Usage
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
# Base model path
base_model_path = "./model"  # local path (or a model id on the Hub)
lora_weights_path = "./lora_qwen_output"  # the directory where you saved the LoRA weights
# Load the base model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    dtype=torch.float16,
    device_map=None,
    trust_remote_code=True
)
base_model = base_model.cuda()
# Load the LoRA weights; either use the PeftModel directly or merge the weights
lora_model = PeftModel.from_pretrained(base_model, lora_weights_path)
# To merge the weights permanently (turning it into a plain model):
# merged_model = lora_model.merge_and_unload()
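# (Sketch, not in the original notes) a merged model can be saved and later loaded like any
# plain transformers model, with no peft dependency at inference time:
# merged_model.save_pretrained("./merged_model")   # "./merged_model" is a hypothetical path
# tokenizer.save_pretrained("./merged_model")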
# Inference function
def generate_response(model, prompt):
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7
    )
    # strip the prompt tokens so only the newly generated reply is returned
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response
# Test
test_prompts = [
    "Hello! I am Qin Shi Huang!",
    # "What is a large language model?",
    # "Who is Amuro's arch-rival?"
]
# Note: PeftModel wraps the base model's modules in place, so calling base_model directly
# would already include the adapter; disable it to see the real pre-fine-tuning behaviour.
print("=== Model before fine-tuning ===")
with lora_model.disable_adapter():
    for p in test_prompts:
        print(f"in: {p}")
        print(f"out: {generate_response(lora_model, p)}\n")
print("=== Model after fine-tuning ===")
for p in test_prompts:
    print(f"in: {p}")
    print(f"out: {generate_response(lora_model, p)}\n")