https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro
TIGER-Lab/MMLU-Pro · Datasets at Hugging Face
[ "Boycotts, Buyalls, Blockchain technology, Increased Sales", "Buycotts, Boycotts, Digital technology, Decreased Sales", "Boycotts, Buycotts, Digital technology, Decreased Sales", "Buycotts, Boycotts, Blockchain technology, Charitable donations", "Boycott
huggingface.co
To evaluate the model I'm building this time, I'm going to use MMLU-Pro, one of the standard benchmarks.

The reported scores look like this, and apparently they drop even further if you don't use CoT.
Chain-of-Thought Prompting Elicits Reasoning in Large Language Models - paper review (yoonschallenge.tistory.com)
https://arxiv.org/abs/2201.11903
Everyone knows CoT by now, so I'll skip the explanation...
https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro
MMLU-Pro Leaderboard - a Hugging Face Space by TIGER-Lab
You can also check the leaderboard here.
https://github.com/TIGER-AI-Lab/MMLU-Pro
GitHub - TIGER-AI-Lab/MMLU-Pro: The code and data for "MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark" [NeurIPS 2024]
You can run the evaluation directly from that repo, but I only noticed it far too late...
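Before the script itself, a quick look at the dataset helps. This is just a minimal sketch of loading the test split; the field names (question, options, answer, answer_index) are the ones my scripts below rely on:

from datasets import load_dataset

# Peek at the MMLU-Pro test split and the fields used by the scripts below
MMLU_Pro = load_dataset("TIGER-Lab/MMLU-Pro")['test']
print(len(MMLU_Pro))               # number of test questions
example = MMLU_Pro[0]
print(example['question'])         # question text
print(example['options'])          # list of answer options (MMLU-Pro uses up to 10)
print(example['answer'])           # gold answer as a letter, e.g. 'C'
print(example['answer_index'])     # gold answer as an index, e.g. 2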
Anyway, here is the code I ended up writing...
from datasets import load_dataset
from vllm import LLM, SamplingParams
import argparse, os, json
import torch


def load_model(repo):
    # Spread the model across all visible GPUs with tensor parallelism
    return LLM(model=repo, tensor_parallel_size=torch.cuda.device_count())


def load_json(data_path):
    with open(data_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_json(data, save_dir, path):
    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, path)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


def apply_MCQA_prompt(prompt, queries, options):
    # Turn each (question, options) pair into one MCQA prompt string,
    # labeling the options A, B, C, ...
    formatted_prompts = []
    for query, option_list in zip(queries, options):
        formatted_options = "\n".join(
            [f"{chr(65 + i)}. {opt}" for i, opt in enumerate(option_list)]
        )
        formatted_prompt = prompt.format(question=query, options=formatted_options)
        formatted_prompts.append(formatted_prompt)
    return formatted_prompts


def generate_outputs(dataset_name, prompts, queries, model, sampling_params):
    outputs = model.generate(prompts, sampling_params)
    output_texts = [output.outputs[0].text for output in outputs]
    question_answer_pairs = [{'question': question, 'answer': answer}
                             for question, answer in zip(queries, output_texts)]
    # Note: relies on the module-level `args` defined under __main__
    save_json(question_answer_pairs, args.save_dir,
              f"{args.model.split('/')[1]}-{dataset_name}.json")


def main(args):
    # PREPARE MODEL
    model = load_model(args.model)
    prompt = args.prompt  # plain (non-MCQA) prompt from the CLI; not used for MMLU-Pro
    prompt_MCQA = "Question: {question}\nOptions: {options}\nAnswer:"
    sampling_params = SamplingParams(temperature=0.2, max_tokens=512)
    # PREPARE DATASETS
    MMLU_Pro = load_dataset("TIGER-Lab/MMLU-Pro")['test']
    # EXTRACT QUERIES
    MMLU_Pro_queries = MMLU_Pro['question']
    MMLU_Pro_options = MMLU_Pro['options']
    # GENERATE PROMPTS
    MMLU_Pro_prompts = apply_MCQA_prompt(prompt_MCQA, MMLU_Pro_queries, MMLU_Pro_options)
    # GENERATE OUTPUTS
    generate_outputs("MMLU-Pro", MMLU_Pro_prompts, MMLU_Pro_queries, model, sampling_params)


if __name__ == '__main__':
    args = argparse.ArgumentParser()
    args.add_argument('--model', type=str, default="meta-llama/Llama-3.1-8B-Instruct")
    args.add_argument('--save_dir', type=str, default="outputs")
    args.add_argument('--prompt', type=str, default="Question: {question}\nAnswer:")
    args = args.parse_args()
    main(args)
Just run this and it's done.
My existing evaluation code wasn't written for MCQA, so I converted the prompt-formatting function to the MCQA format.
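For reference, this is roughly what a single formatted prompt ends up looking like. The question is a made-up toy example, and it assumes apply_MCQA_prompt from the script above is in scope:

# Toy illustration of the MCQA prompt format produced by apply_MCQA_prompt
prompt_MCQA = "Question: {question}\nOptions: {options}\nAnswer:"
toy_prompt = apply_MCQA_prompt(
    prompt_MCQA,
    ["What is the capital of France?"],
    [["Berlin", "Madrid", "Paris", "Rome"]],
)[0]
print(toy_prompt)
# Question: What is the capital of France?
# Options: A. Berlin
# B. Madrid
# C. Paris
# D. Rome
# Answer: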
One problem: since there are no few-shot examples, the model's answer format is all over the place... It would probably be better to add an instruction telling it to put the final answer inside an <answer> tag...
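A rough sketch of what that instruction and a matching extraction function could look like; prompt_MCQA_tagged and extract_tagged_answer are hypothetical names and the exact wording is just an assumption, not something I've tuned:

import re

# Hypothetical prompt that asks the model to wrap its final choice in an <answer> tag
prompt_MCQA_tagged = (
    "Question: {question}\nOptions: {options}\n"
    "Answer with the letter of the correct option inside <answer></answer> tags, "
    "e.g. <answer>A</answer>.\nAnswer:"
)

def extract_tagged_answer(text):
    # Pull the letter out of <answer>...</answer> if the model followed the instruction
    match = re.search(r"<answer>\s*([A-P])\s*</answer>", text, re.IGNORECASE)
    return match.group(1).upper() if match else None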
With several GPUs it finishes quickly.
Now it's time to evaluate the saved output files.
import re, argparse, os, json
from datasets import load_dataset


def load_json(data_path):
    with open(data_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def is_correct(model_response, gt):
    # Simple substring check for free-form (non-MCQA) answers
    assert gt != "[invalid]"
    return gt.lower() in model_response.lower()


def is_mcqa_correct(model_response, gt_answer_letter, gt_index, options):
    """
    model_response: raw string generated by the model
    gt_answer_letter: gold answer as a letter, e.g. 'H'
    gt_index: gold answer index (integer)
    options: list of answer options
    """
    # Gold answer text
    gt_text = options[gt_index].strip().lower()
    # Letter extracted from the model response
    predicted_letter = extract_answer_letter(model_response)
    predicted_letter = predicted_letter.upper() if predicted_letter else ""
    # Gold answer letter
    gt_letter = gt_answer_letter.upper()
    # Count as correct if either the letter or the option text matches
    is_correct_by_letter = (predicted_letter == gt_letter)
    is_correct_by_text = (gt_text in model_response.lower())
    return is_correct_by_letter or is_correct_by_text


def extract_answer_letter(text):
    """
    Extract the answer letter (A-P) from the model response.
    Pattern priority:
    1. "The answer is (B)" and similar phrasings
    2. "Answer: B"
    3. An option-style "A." or "A\n" at the start of the response
    """
    patterns = [
        r"[Tt]he answer is\s*\(?([A-P])\)?",
        r"[Aa]nswer:\s*([A-P])",
        r"[Tt]he best answer is\s*\(?([A-P])\)?",
        r"[Tt]he correct answer is\s*\(?([A-P])\)?",
    ]
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    # 3. "A." or "A\n" style at the very start of the response
    match = re.match(r"^\s*([A-P])[\.\)]?[\s\n]", text)
    if match:
        return match.group(1)
    return None


def main(args):
    # LOAD DATASET
    MMLU_Pro = load_dataset("TIGER-Lab/MMLU-Pro")['test']
    # EVALUATE every output file in the directory
    for output_result in os.listdir(args.output_dir):
        file_path = os.path.join(args.output_dir, output_result)
        with open(file_path, 'r', encoding='utf-8') as f:
            output = json.load(f)
        if 'gsm8k' in output_result:
            correct = 0  # placeholder for other datasets
        elif 'MMLU-Pro' in output_result:
            correct = 0
            for model_completion, example in zip(output, MMLU_Pro):
                try:
                    model_answer = model_completion['answer']
                    gt_letter = example['answer']        # e.g. 'H'
                    gt_index = example['answer_index']   # e.g. 7
                    options = example['options']
                    correct += is_mcqa_correct(model_answer, gt_letter, gt_index, options)
                except Exception as e:
                    print("⚠️ Error processing example:")
                    print(f"model_answer: {model_completion}")
                    print(f"example: {example}")
                    print(f"Error: {e}")
                    break
            print(f"MMLU-Pro Accuracy for {output_result}: {correct / len(output)}")


if __name__ == "__main__":
    args = argparse.ArgumentParser()
    args.add_argument("--output_dir", type=str, default="outputs")
    args.add_argument("--test_size", type=int, default=100)
    args = args.parse_args()
    main(args)
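One caveat: is_mcqa_correct also counts a response as correct whenever the gold option text merely appears in it, and extract_answer_letter is purely heuristic, so it's worth sanity-checking the extraction on a few hand-written responses first. A minimal check, assuming the functions above are in scope (the sample strings are made up):

# Hand-written responses to sanity-check the answer-extraction heuristics
samples = [
    ("The answer is (B) because of X.", "B"),
    ("Answer: D", "D"),
    ("C. Paris is the capital of France.", "C"),
    ("Not sure about this one.", None),
]
for response, expected in samples:
    got = extract_answer_letter(response)
    print(f"{response!r} -> {got} (expected {expected})")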
Running the evaluation script prints the scores...
LLaMA is really bad at this, haha...