
Converting Llama 3 Training Data to Train a Llama 3.2 Sparse Autoencoder

이게될까 2024. 9. 27. 20:05

Let's take the previously released training data and convert it into training data we can use for Llama 3.2.

import csv
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm  # tqdm for progress display

from huggingface_hub import HfApi, login

login('') # log in with a Hugging Face API key so we can pull the dataset and tokenizer


# Load the dataset
dataset = load_dataset("chanind/openwebtext-llama3")

# Load the Meta-Llama-3-8B tokenizer (for decoding)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

 

Now let's start decoding.

I'm the kind of person who saves everything first because I'm scared of data disappearing...

# Load the dataset
dataset = load_dataset("chanind/openwebtext-llama3")

# Load the Meta-Llama-3-8B tokenizer (for decoding)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

# Save the decoded text to a CSV file
with open("decoded_texts.csv", "w", newline='', encoding='utf-8') as csvfile_decoded:

    # Set up the CSV writer for the decoded text
    decoded_writer = csv.writer(csvfile_decoded)
    decoded_writer.writerow(['decoded_text'])  # write the header

    # Iterate over the dataset with tqdm and decode each sample
    for i in tqdm(range(len(dataset['train'])), desc="Processing dataset"):
        encoded_data = dataset['train'][i]['input_ids']  # or whichever field holds the token ids

        # 1. Decode the tokens into text
        decoded_text = tokenizer.decode(encoded_data, skip_special_tokens=True)

        # 2. Write the decoded text to the CSV file
        decoded_writer.writerow([decoded_text])
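Decoding one sample at a time like this works, but it is slow on a dataset this size. As a possible speed-up, here is just a sketch (the batch size of 1000 is an arbitrary choice, and it reuses the dataset, tokenizer, csv, and tqdm objects from the cell above) that uses tokenizer.batch_decode to decode many rows per call:

# Decode in batches instead of row by row (the batch size of 1000 is arbitrary)
decode_batch_size = 1000

with open("decoded_texts.csv", "w", newline='', encoding='utf-8') as csvfile_decoded:
    decoded_writer = csv.writer(csvfile_decoded)
    decoded_writer.writerow(['decoded_text'])

    train_split = dataset['train']
    for start in tqdm(range(0, len(train_split), decode_batch_size), desc="Processing dataset"):
        # Slicing a Hugging Face Dataset returns a dict of lists
        batch_ids = train_split[start:start + decode_batch_size]['input_ids']
        decoded_batch = tokenizer.batch_decode(batch_ids, skip_special_tokens=True)
        decoded_writer.writerows([[text] for text in decoded_batch])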

 

The decoded text is saved to the CSV file I specified.
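Before uploading, I like to peek at the file to make sure nothing was lost. A quick check with pandas (just a sketch; pandas isn't otherwise needed for this step):

import pandas as pd

# Read only the first few rows to confirm the CSV was written correctly
preview = pd.read_csv("decoded_texts.csv", nrows=5)
print(preview['decoded_text'].str.len())       # rough character length of each document
print(preview['decoded_text'].iloc[0][:200])   # first 200 characters of the first row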

 

# Create an HfApi instance
api = HfApi()

# Upload decoded_texts.csv to 'yoonLM/decoding_llama3'
api.upload_file(
    path_or_fileobj="decoded_texts.csv",  # path of the local file
    path_in_repo="decoded_texts.csv",     # path of the file inside the repo
    repo_id="yoonLM/decoding_llama3",     # Hugging Face repo name (format: 'username/repository')
    repo_type="dataset"                   # this is a dataset repo
)

Now we push this data to Hugging Face!

The file is a lot larger than I expected, so the upload takes a while.
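One caveat: upload_file expects the dataset repository to already exist on the Hub. If it hasn't been created yet, a minimal sketch of creating it first (exist_ok=True just makes the call safe to re-run):

# Create the dataset repo once before uploading (safe to re-run thanks to exist_ok=True)
api.create_repo(
    repo_id="yoonLM/decoding_llama3",
    repo_type="dataset",
    exist_ok=True,
)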

 

from sae_lens import PretokenizeRunner, PretokenizeRunnerConfig
from transformers import AutoTokenizer
from datasets import load_dataset
from huggingface_hub import HfApi, login
# Log in with a Hugging Face API token
login('')

# Load the model's tokenizer (needed below for its BOS/EOS token ids)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

context_size = 512

cfg = PretokenizeRunnerConfig(
    tokenizer_name="meta-llama/Llama-3.2-3B-Instruct",
    dataset_path="yoonLM/decoding_llama3", # this is just a tiny test dataset
    column_name="decoded_text",
    shuffle=True,
    num_proc=32, # increase this number depending on how many CPUs you have
    
    # tweak these settings depending on the model
    context_size=context_size,
    begin_batch_token=tokenizer.bos_token_id,
    begin_sequence_token=None,
    sequence_separator_token=tokenizer.eos_token_id,
    # streaming=True,  # optional: stream the dataset to keep memory usage down
    # upload the result to Hugging Face
    hf_repo_id=f"yoonLM/llama3.2_3b_tokenizingdata_{context_size}",
    #hf_num_shards=180,
    # save the dataset locally as well
    save_path=f"tokenized-2_3b_tokenizingdata_{context_size}"
    )

dataset = PretokenizeRunner(cfg).run()

Now we just re-encode the decoded data with the Llama 3.2 tokenizer, upload it, and we're done!

I ran this for every context size, which is why the repo name and save path are parameterized by context_size like that.
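As a quick sanity check, the pretokenized dataset can be streamed back to confirm that every row really holds context_size token ids. Just a sketch; the repo name assumes the context_size=512 run configured above:

from datasets import load_dataset

# Stream the pretokenized dataset instead of downloading it in full
tokenized = load_dataset(
    "yoonLM/llama3.2_3b_tokenizingdata_512",
    split="train",
    streaming=True,
)

# Each row should hold exactly context_size token ids in the 'input_ids' column
first_row = next(iter(tokenized))
print(len(first_row["input_ids"]))  # expected: 512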

 

Now it's time to train!

 

import os
from setproctitle import setproctitle

setproctitle("")
os.environ["CUDA_VISIBLE_DEVICES"] = "0 1"
gpu_num = 2
import torch
from tqdm import tqdm
import plotly.express as px
import pandas as pd

from huggingface_hub import login

# Log in with a Hugging Face API token
login('')

Log in again, set the process name, and assign the GPUs.

 

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

And pick the device too, CUDA if it's available!

 

from sae_lens import LanguageModelSAERunnerConfig, SAETrainingRunner

total_training_steps = 300000  # probably we should do more
batch_size = 4096
total_training_tokens = total_training_steps * batch_size

context_size = 2048

lr_warm_up_steps = 200
lr_decay_steps = total_training_steps // 5  # 20% of training
l1_warm_up_steps = total_training_steps // 20  # 5% of training

cfg = LanguageModelSAERunnerConfig(
    # Data Generating Function (Model + Training Distribution)
    model_name="meta-llama/Llama-3.2-3B-Instruct",  # our model (more options here: https://neelnanda-io.github.io/TransformerLens/generated/model_properties_table.html)
    hook_name="blocks.10.hook_resid_pre",  # A valid hook point (see more details here: https://neelnanda-io.github.io/TransformerLens/generated/demos/Main_Demo.html#Hook-Points)
    hook_layer=10,  # the layer index; must match the layer in hook_name.
    d_in=3072,  # the width of the residual stream (d_model of Llama-3.2-3B).
    dataset_path="yoonLM/llama3.2_3b_tokenizingdata", 
    is_dataset_tokenized=True,
    streaming=True,  # we could pre-download the token dataset if it was small.
    # SAE Parameters
    mse_loss_normalization=None,  # We won't normalize the mse loss,
    expansion_factor=16,  # the width of the SAE. Larger will result in better stats but slower training.
    b_dec_init_method="zeros",  # The geometric median can be used to initialize the decoder weights.
    apply_b_dec_to_input=False,  # We won't apply the decoder weights to the input.
    normalize_sae_decoder=False,
    scale_sparsity_penalty_by_decoder_norm=True,
    decoder_heuristic_init=True,
    init_encoder_as_decoder_transpose=True,
    normalize_activations="expected_average_only_in",
    # Training Parameters
    lr=5e-5,  # learning rate; lower tends to be more stable but slower to train.
    adam_beta1=0.9,  # adam params (default, but once upon a time we experimented with these.)
    adam_beta2=0.999,
    #lr_scheduler_name="constant",  # constant learning rate with warmup. Could be better schedules out there.
    lr_scheduler_name="cosine",
    lr_warm_up_steps=lr_warm_up_steps,  # this can help avoid too many dead features initially.
    lr_decay_steps=lr_decay_steps,  # this will help us avoid overfitting.
    l1_coefficient=10,  # will control how sparse the feature activations are
    l1_warm_up_steps=l1_warm_up_steps,  # this can help avoid too many dead features initially.
    lp_norm=1.0,  # the L1 penalty (and not a Lp for p < 1)
    train_batch_size_tokens=batch_size,
    context_size=context_size,  # controls the length of the prompts we feed to the model. Larger is better but slower.
    # Activation Store Parameters
    n_batches_in_buffer=64,  # controls how many activations we store / shuffle.
    training_tokens=total_training_tokens,  # 300k steps x 4096 tokens per batch ≈ 1.2B tokens in total. Get a coffee, come back.
    store_batch_size_prompts=16,
    # Resampling protocol
    use_ghost_grads=False,  # we don't use ghost grads anymore.
    feature_sampling_window=1000,  # this controls our reporting of feature sparsity stats
    dead_feature_window=1000,  # would affect resampling or ghost grads if we were using them.
    dead_feature_threshold=1e-4,  # would affect resampling or ghost grads if we were using them.
    # WANDB
    log_to_wandb=True,  # always use wandb unless you are just testing code.
    wandb_project="sae_LLaMa3.2_3B_instruction",
    wandb_log_frequency=30,
    eval_every_n_wandb_logs=20,
    model_from_pretrained_kwargs={"n_devices": gpu_num},
    # Misc
    device= device,
    seed=42,
    n_checkpoints=20,  # number of checkpoints to save across the run
    checkpoint_path=f"checkpoints_LLama3.2_3B_{context_size}",
    dtype="float32"
)
# look at the next cell to see some instruction for what to do while this is running.
sparse_autoencoder = SAETrainingRunner(cfg).run()

Set the options to match your setup and it runs just fine!

As for the training time...

well...

it comes out looking like this...
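Once training finishes (or at any intermediate checkpoint), the SAE can be loaded back for analysis. A minimal sketch, assuming your sae_lens version provides SAE.load_from_pretrained and using a hypothetical checkpoint folder name; the actual subdirectory layout under checkpoint_path depends on the run:

from sae_lens import SAE

# Load a saved SAE from disk; the subfolder name here is hypothetical
sae = SAE.load_from_pretrained(
    "checkpoints_LLama3.2_3B_2048/final_checkpoint",  # replace with an actual checkpoint folder
    device=device,
)

# With the config above: d_in = 3072, d_sae = 3072 * 16 = 49152
print(sae.cfg.d_in, sae.cfg.d_sae)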
