"""
Script that uses a model to make predictions on a given dataset.
A model can make multiple predictions for a single problem to compute metrics such as pass@k or accuracy@k.
Outputs a .csv file with the predictions.
"""
import argparse
import glob
import math
import os
from collections import defaultdict
from copy import deepcopy

import datasets
import numpy as np
import pandas as pd
import torch
import tqdm
import transformers
from peft import AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM

from evaluate.prompts.romanian_prediction_prompt import PROMPT
from evaluate.utils import complete_prompts

# Fix the seed so that the sampled few-shot examples are identical across models / setups.
np.random.seed(42)

parser = argparse.ArgumentParser(description='Predict on dataset')
parser.add_argument('--model', type = str, default = 'Qwen/Qwen2-1.5B-Instruct', help = 'Model name')
parser.add_argument('--dataset', type = str, default = 'bac', help = 'Dataset name. (synthetic / bac / comps)')
parser.add_argument('--output', type = str, default = 'predictions/', help = 'Output folder.')
parser.add_argument('--batch_size', type = int, default = 1, help = 'Batch size.')
parser.add_argument('--shots', type = int, default = 0, help = 'Number of examples in the prompt.')
parser.add_argument('--temperature', type = float, default = 0.0, help = 'Temperature of model.')
args = parser.parse_args()
print("Running predictions for", args.__dict__)
HF_TOKEN = os.environ.get('HF_TOKEN', None)
def compute_max_length_power_of_two(dataset, tokenizer):
    """Returns the smallest power of two that covers the longest tokenized solution in the dataset."""
    max_length = 0
    for sample in tqdm.tqdm(dataset, total = len(dataset), desc = f"Computing max length for cosmadrian/romath-{args.dataset}"):
        content = f"\n### Soluția este:\n{sample['solution']}"
        tokens = tokenizer.encode(content, add_special_tokens = False)
        max_length = max(max_length, len(tokens))
    return 2**(math.ceil(math.log(max_length, 2)))
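# For instance, a longest solution of 900 tokens yields a budget of
# 2 ** ceil(log2(900)) = 1024 generated tokens (before the 2048 cap applied below).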
def populate_few_shot(template, train_dataset, shots = 0):
    """
    Populates a few-shot template with examples from the train dataset.
    The same examples are used for all models / setups, since the numpy seed is fixed.
    """
    template = deepcopy(template)
    if shots == 0:
        return template
    system_prompt = [template[0]]
    final_prompt = [template[-1]]
    sampled_idxs = np.random.choice(len(train_dataset), shots, replace = False)
    raw_shots = train_dataset.select(sampled_idxs)
    shot_list = []
    for example in raw_shots:
        # "Care este rezolvarea următoarei probleme?" = "What is the solution to the following problem?"
        shot_list.append({
            "role": "user",
            "content": f"""Care este rezolvarea următoarei probleme?\n{example['problem']}"""
        })
        # "Soluția este" = "The solution is"; for non-proof problems the final answer is
        # appended in a \boxed{} ("Răspunsul final este" = "The final answer is").
        content = f"\n### Soluția este:\n{example['solution']}"
        if 'answer' in example and example['answer'] != 'Proof':
            content = f"\n### Soluția este:\n{example['solution']}. Răspunsul final este: \\boxed{{{example['answer']}}}"
        shot_list.append({
            "role": "assistant",
            "content": content
        })
    return system_prompt + shot_list + final_prompt
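# Resulting chat layout: [system prompt, (user, assistant) x shots, final user prompt].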
# A local path is assumed to be a fine-tuned PEFT checkpoint; anything else is a hub model id.
is_fine_tuned = os.path.exists(args.model)
if is_fine_tuned:
    # Assumes the model directory contains exactly one checkpoint folder.
    checkpoint_name = glob.glob(args.model + '/*')[0]
    model = AutoPeftModelForCausalLM.from_pretrained(
        checkpoint_name,
        token = HF_TOKEN,
        device_map = "auto",
        load_in_8bit = True,
        trust_remote_code = True
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint_name, token = HF_TOKEN)
else:
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        token = HF_TOKEN,
        device_map = "auto",
        load_in_8bit = True,
        trust_remote_code = True
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(args.model, token = HF_TOKEN)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
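# Left padding keeps every prompt flush with the right edge of the batch, e.g.
#   [PAD, PAD, tok_1, tok_2]  rather than  [tok_1, tok_2, PAD, PAD],
# so generation starts at the same index for all rows and the slice
# responses_ids[:, input_ids.shape[1]:] below keeps only the newly generated tokens.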
# Load the train split (for few-shot examples) and the test split (for predictions).
train_dataset = datasets.load_dataset('cosmadrian/romath', args.dataset, split = 'train')
test_dataset = datasets.load_dataset('cosmadrian/romath', args.dataset, split = 'test')

outputs = defaultdict(list)

# Cap the generation budget at 2048 new tokens, even if some solutions tokenize longer.
max_length = min(compute_max_length_power_of_two(test_dataset, tokenizer), 2048)
print("Computed max length:", max_length)
message_batch = []
for i, example in enumerate(tqdm.tqdm(test_dataset, total = len(test_dataset))):
    question = example['problem']
    messages = complete_prompts(PROMPT, problem_statement = question)
    messages = populate_few_shot(messages, train_dataset, args.shots)
    message_batch.append({
        'messages': messages,
        'example': example
    })
    # Process a full batch, or whatever remains on the final iteration.
    if len(message_batch) == args.batch_size or i == len(test_dataset) - 1:
        all_messages = [b['messages'] for b in message_batch]
        tokens = tokenizer.apply_chat_template(
            all_messages,
            max_length = 2048,
            padding = 'max_length',
            return_tensors = 'pt',
            return_dict = True,
            truncation = True,
            add_generation_prompt = True
        )
        tokens = {k: v.to(model.device) for k, v in tokens.items()}
        with torch.no_grad():
            responses_ids = model.generate(
                temperature = args.temperature,
                # Greedy decoding at temperature 0.0; nucleus / min-p sampling otherwise.
                do_sample = args.temperature > 0.0,
                max_new_tokens = max_length,
                top_p = 0.9 if args.temperature > 0.0 else None,
                min_p = 0.1 if args.temperature > 0.0 else None,
                top_k = None,
                pad_token_id = tokenizer.eos_token_id,
                **tokens
            )
        # Remove the prompt part from responses_ids, keeping only the generated continuation.
        responses_ids = responses_ids[:, tokens['input_ids'].shape[1]:]
        responses = tokenizer.batch_decode(
            responses_ids,
            skip_special_tokens = True,
            clean_up_tokenization_spaces = True,
        )
        # Iterate over the actual batch length: the final batch may be smaller than batch_size.
        for j in range(len(message_batch)):
            content = responses[j]
            example = message_batch[j]['example']
            outputs['idx'].append(example['idx'])
            outputs['model'].append(args.model)
            outputs['dataset'].append(args.dataset)
            outputs['domain'].append(example['domain'])
            outputs['temperature'].append(args.temperature)
            outputs['shots'].append(args.shots)
            outputs['fine-tuned'].append(is_fine_tuned)
            outputs['problem'].append(example['problem'])
            outputs['solution'].append(example['solution'])
            if 'answer' in example:
                outputs['answer'].append(example['answer'])
            outputs['response'].append(content)
        message_batch = []
df = pd.DataFrame(outputs)

model_name = args.model.replace('/', '-')
if is_fine_tuned:
    model_name = os.path.basename(args.model)

os.makedirs(args.output, exist_ok = True)
df.to_csv(f"{args.output}/{model_name}_{args.dataset}_{args.shots}_{args.temperature}.csv", index = False)
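# With the default arguments this writes, e.g.,
#   predictions/Qwen-Qwen2-1.5B-Instruct_bac_0_0.0.csv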