2024.06.05 - [인공지능/자연어 처리] - 자연어 처리 17강 - Parameter efficient Tuning
The material is summarized in the post above,
2024.07.21 - [인공지능/자연어 처리] - 자연어 처리 LLaMa 모델 분석하기
and it also came up a bit in this one.
For anything in the code below that is not explained here, refer to this post:
2024.07.21 - [인공지능/자연어 처리] - 자연어 처리 python 실습 - LLaMa instruction Tuning
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
Set the device!
from transformers import LlamaForCausalLM
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('daily_tokenizer_0721')
model = LlamaForCausalLM.from_pretrained('daily_llama_0721')
model
Load the pre-trained model!
from peft import get_peft_model, LoraConfig, TaskType
You can see LoRA here.
list(TaskType)
[<TaskType.SEQ_CLS: 'SEQ_CLS'>,
<TaskType.SEQ_2_SEQ_LM: 'SEQ_2_SEQ_LM'>,
<TaskType.CAUSAL_LM: 'CAUSAL_LM'>,
<TaskType.TOKEN_CLS: 'TOKEN_CLS'>]
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                         inference_mode=False,  # False because we are going to train
                         r=32,                  # LoRA rank; a larger r means more trainable parameters
                         lora_alpha=32,         # scaling factor applied to the LoRA update
                         lora_dropout=0.1)
model = get_peft_model(model, peft_config)
model.to(device)
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(50000, 512, padding_idx=0)
        (layers): ModuleList(
          (0-3): 4 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=512, out_features=512, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=512, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=512, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=512, out_features=512, bias=False)
              (v_proj): Linear(
                in_features=512, out_features=512, bias=False
                ...
              )
      (lm_head): Linear(in_features=512, out_features=50000, bias=False)
    )
  )
)
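In the printout each adapted projection (q_proj, v_proj) keeps its frozen 512x512 weight and gains a trainable lora_A (512 -> 32) and lora_B (32 -> 512) pair. Here is a minimal sketch of what that pair does in the forward pass, following the standard LoRA formulation rather than peft's exact internals:
import torch
import torch.nn as nn

d, r, alpha = 512, 32, 32            # hidden size, LoRA rank, lora_alpha
W = torch.randn(d, d)                # frozen pretrained weight (e.g. q_proj), never updated
A = torch.randn(r, d) * 0.01         # lora_A: trainable, projects d -> r
B = torch.zeros(d, r)                # lora_B: trainable, initialized to zero
drop = nn.Dropout(p=0.1)             # lora_dropout

x = torch.randn(1, d)
h = x @ W.T + (alpha / r) * (drop(x) @ A.T @ B.T)   # h = W x + (alpha/r) * B A x
print(h.shape)                       # torch.Size([1, 512])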
model.print_trainable_parameters()
trainable params: 262144 || all params: 64115200 || trainable%: 0.4088640447195049
Only a small fraction of all the parameters actually gets trained.
As r gets larger, the number of trainable parameters grows with it.
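As a sanity check (assuming LoRA was applied only to q_proj and v_proj, which matches the printout above), the 262,144 trainable parameters can be reproduced by hand:
n_layers, n_targets, d, r = 4, 2, 512, 32          # 4 decoder layers, q_proj + v_proj
print(n_layers * n_targets * (d * r + r * d))      # 262144 = lora_A + lora_B per adapted module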
from datasets import load_dataset
dataset_cate = load_dataset('heegyu/news-category-balanced-top10')
Load the dataset.
categories = dataset_cate['train'].to_pandas().category.unique().tolist()
categories.sort()
categories = categories[:4]
dataset_cate = dataset_cate.filter(lambda element: element['category'] in categories)
dataset_cate
DatasetDict({
    train: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date'],
        num_rows: 29026
    })
})
categories = [x.split(' ')[0].lower() for x in categories]
int2label_cate = {i: categories[i] for i in range(len(categories))}
label2int_cate = {int2label_cate[key]:key for key in int2label_cate}
def gen_label(element):
    category = element['category'].split(' ')[0].lower()
    return {'label': label2int_cate[category], 'category': category}
dataset_cate = dataset_cate.map(gen_label)
dataset_cate = dataset_cate['train'].train_test_split(test_size=0.1)
dataset_cate
DatasetDict({
    train: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'label'],
        num_rows: 26123
    })
    test: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'label'],
        num_rows: 2903
    })
})
from datasets import DatasetDict
from datasets import concatenate_datasets
import random
prompt_format1_cate = """Given the article, what is the topic of the article? article: %s answer: %s"""
prompt_format2_cate = """Determine the topic of the news article. article: %s answer: %s"""
prompt_format3_cate = """What is this article about? business/entertainment/food/healthy/parenting article: %s answer: %s"""
prompts_cate = [prompt_format1_cate, prompt_format2_cate, prompt_format3_cate]
def gen_prompt_cate(element):
    prompt_format = prompts_cate[random.randint(0, len(prompts_cate)-1)]
    return DatasetDict({'input': prompt_format%(element['headline'], int2label_cate[element['label']])})
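For a quick look at what this map produces, here is the prompt for a made-up element (the headline and label below are invented for illustration):
example = {'headline': 'Boeing shares climb after strong quarterly earnings', 'label': 0}   # invented row
print(gen_prompt_cate(example)['input'])   # one of the three templates with the headline and label text filled in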
train_cate = dataset_cate['train'].map(gen_prompt_cate, remove_columns=dataset_cate['train'].column_names)
train_dataset = train_cate
def tokenize(element):
    tokenizer.pad_token = tokenizer.eos_token
    outputs = tokenizer(
        element['input'],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=False,
        return_length=True,
        padding=True
    )
    return {"input_ids": outputs["input_ids"]}
context_length=128
tokenized_datasets = train_dataset.map(
tokenize, batched=True, remove_columns=train_dataset.column_names
)
tokenized_datasets
Dataset({
    features: ['input_ids'],
    num_rows: 26123
})
This is the same process as before.
from transformers import DataCollatorForLanguageModeling
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
out = data_collator([tokenized_datasets[i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
input_ids shape: torch.Size([5, 54])
attention_mask shape: torch.Size([5, 54])
labels shape: torch.Size([5, 54])
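With mlm=False the collator just does causal-LM label preparation: labels are a copy of input_ids, with every pad position replaced by -100 so the loss ignores it (and since pad_token was set to eos_token here, eos tokens are masked too because they share the same id). A quick check of that on the batch above:
import torch
# labels equal input_ids everywhere except masked positions, which are -100
print(torch.all((out['labels'] == out['input_ids']) | (out['labels'] == -100)))
# the masked positions are exactly the pad-token positions
print((out['labels'] == -100).sum() == (out['input_ids'] == tokenizer.pad_token_id).sum())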
from transformers import Trainer, TrainingArguments
args = TrainingArguments(
    output_dir="peft_llama",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=1_000,
    fp16=True,
    push_to_hub=False,
)
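One thing to note about these settings: with gradient accumulation, the effective batch size per optimizer step is the product of the two batch-related arguments (single GPU assumed).
per_device_train_batch_size, gradient_accumulation_steps = 4, 8
print(per_device_train_batch_size * gradient_accumulation_steps)   # 32 examples per optimizer update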
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)
trainer.train()
model_load.eval()  # model_load is the base model with the saved LoRA adapter attached (see the loading code further down)
prompt = """\
What is the topic of the collowing article? article: Boeing CEO says he assured Trump about Air Force One costs answer:"""
inputs = tokenizer(prompt, return_tensors="pt")
inputs.to(device)
# Generate
generate_ids = model_load.generate(input_ids=inputs.input_ids, max_length=40)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True,
clean_up_tokenization_spaces=False)[0]
"What is the topic of the collowing article? article: Boeing CEO says he assured Trump about Air Force One costs answer: business/entering the 'Socusing' answer: business/"
Interesting that it gets cut off right there; it was probably truncated because it hit the length limit.
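The cut-off is almost certainly max_length=40, which counts the prompt tokens as well as the continuation. A sketch of an alternative, assuming the same model_load and inputs, that limits only the newly generated tokens:
# max_new_tokens caps only the continuation, so the answer is not squeezed out by a long prompt
generate_ids = model_load.generate(input_ids=inputs.input_ids, max_new_tokens=10)
print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True)[0])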
tokenizer = AutoTokenizer.from_pretrained("daily_tokenizer_0612", padding_side='left')
prompt_format1 = """Given the article, what is the topic of the article? article: %s answer:"""
prompt_format2 = """Determine the topic of the news article. article: %s answer:"""
prompt_format3 = """What is this article about? business/entertainment/food/healthy/parenting article: %s answer:"""
prompts = [prompt_format1, prompt_format2, prompt_format3]
def gen_valid_prompt_cate(element):
    prompt_format = prompts[random.randint(0, len(prompts)-1)]
    return DatasetDict({'input': prompt_format%(element['headline'])})
valid_dataset = dataset_cate['test'].map(gen_valid_prompt_cate)
context_length=128
valid_dataset = valid_dataset.map(
tokenize, batched=True, remove_columns=['link', 'headline', 'category', 'short_description', 'authors', 'date', 'input']
)
valid_dataset
Dataset({
    features: ['label', 'input_ids'],
    num_rows: 2903
})
from torch.utils.data import DataLoader
batch_size=4
val_ds = valid_dataset.select(range(100))
val_ds.set_format(type='torch')
val_dl = DataLoader(val_ds, batch_size=batch_size)
import re
import torch
from tqdm import tqdm
def acc(pred, label):
    return torch.sum(torch.tensor(pred) == label.squeeze()).item()
model.eval()
val_acc = 0
for step, batch in enumerate(tqdm(val_dl)):
    label = batch['label']
    input_id = batch['input_ids'].to(device)
    pred = model.generate(input_ids=input_id, max_length=70)
    decoded_pred = tokenizer.batch_decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    decoded_pred = [re.findall("answer: ([a-z]+)", x)[0] if re.findall("answer: ([a-z]+)", x) else 'none' for x in decoded_pred]
    decoded_pred = [label2int_cate[x] if x in label2int_cate else -1 for x in decoded_pred]
    val_acc += acc(decoded_pred, label)
print("val acc: ", val_acc/len(val_dl.dataset))
The accuracy comes out to about 0.5.
That is probably because we fine-tuned only a small subset of the parameters rather than all of them.
model.save_pretrained('peft_llama_adapter__')
import os
os.stat('peft_llama_adapter__/adapter_model.bin').st_size/(1024*1024)
1.0055246353149414
Checking the size of the saved adapter file.
It is only about 1 MB.
os.stat('daily_llama_0721/pytorch_model.bin').st_size/(1024*1024)
243.59453010559082
The original model is about 243 MB.
So only a tiny fraction of the parameters was actually trained.
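The two file sizes line up with the parameter counts printed earlier, assuming 4 bytes per fp32 parameter (262,144 trainable vs. 64,115,200 - 262,144 = 63,853,056 frozen):
print(262144 * 4 / (1024 * 1024))       # ~1.0 MB  -> the LoRA adapter file
print(63853056 * 4 / (1024 * 1024))     # ~243.6 MB -> the base model weights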
from peft import PeftModel

model_load = LlamaForCausalLM.from_pretrained('daily_llama_0721')
model_load = PeftModel.from_pretrained(model_load, 'peft_llama_adapter')
model_load.to(device)
Load the model!
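Optionally (not part of the flow above), peft can merge the LoRA weights back into the base weights so inference runs as a plain model with no adapter overhead:
# merge_and_unload folds the B·A update into the frozen weights and returns the base model
merged_model = model_load.merge_and_unload()
merged_model.eval()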