# https://github.com/maszhongming/UniEval/tree/main

+from dataclasses import dataclass, field
+from tqdm import tqdm
import torch
from torch import nn
-from dataclasses import dataclass, field
-import asyncio
-from tqdm.asyncio import tqdm as tqdm_async
+import torch.multiprocessing as mp

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-from models.evaluate.base_evaluator import BaseEvaluator
-from utils import create_event_loop
-from models.text.text_pair import TextPair
-
+from models import TextPair
+
+
+def _add_questions(dimension: str, question: str, answer: str) -> str:
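+    # Build the Boolean-QA style prompt that UniEval scores: a dimension-specific
+    # question plus the response (and, for coherence, the dialogue history).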
+    if dimension == "naturalness":
+        cur_input = 'question: Is this a natural response in the dialogue? </s> response: ' + answer
+    elif dimension == "coherence":
+        cur_input = 'question: Is this a coherent response given the dialogue history? </s> response: ' \
+                    + answer + ' </s> dialogue history: ' + question
+    elif dimension == "understandability":
+        cur_input = 'question: Is this an understandable response in the dialogue? </s> response: ' + answer
+    else:
+        raise NotImplementedError(
+            'The input format for this dimension is still undefined. Please customize it first.')
+    return cur_input

@dataclass
-class UniEvaluator(BaseEvaluator):
+class UniEvaluator:
    model_name: str = "MingZhong/unieval-sum"
    dimensions: list = field(default_factory=lambda: ['naturalness', 'coherence', 'understandability'])
-    max_length: int = 1024
+    max_length: int = 2560

    def __post_init__(self):
-        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-
-        self.model.eval()
-        self.model.to("cuda")
+        self.num_gpus = torch.cuda.device_count()

-        self.softmax = nn.Softmax(dim=1)
+    @staticmethod
+    def process_chunk(rank, pairs, model_name, max_length, dimension, return_dict):
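+        # Per-GPU worker: loads its own model/tokenizer copy on cuda:{rank} and
+        # writes this chunk's scores into the shared return_dict under its rank.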
+        device = f'cuda:{rank}'
+        torch.cuda.set_device(rank)

-        self.pos_id = self.tokenizer("Yes")["input_ids"][0]
-        self.neg_id = self.tokenizer("No")["input_ids"][0]
+        rank_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        rank_model.to(device)
+        rank_model.eval()

-    def evaluate(self, pairs: list[TextPair], dimension: str) -> list[float]:
-        """
-        Evaluate the text and return a score.
-        """
-        return create_event_loop().run_until_complete(self.async_evaluate(pairs, dimension))
+        softmax = nn.Softmax(dim=1)

-    async def async_evaluate(self, pairs: list[TextPair], dimension: str) -> list[float]:
-        semaphore = asyncio.Semaphore(self.max_concurrent)
-
-        async def evaluate_with_semaphore(pair):
-            async with semaphore:
-                return await self.evaluate_single(pair, dimension)
+        pos_id = tokenizer("Yes")["input_ids"][0]
+        neg_id = tokenizer("No")["input_ids"][0]

        results = []
-        for result in tqdm_async(
-            asyncio.as_completed([evaluate_with_semaphore(pair) for pair in pairs]),
-            total=len(pairs),
-        ):
-            results.append(await result)
-        return results
-
-    async def evaluate_single(self, pair: TextPair, dimension: str) -> float:
-        text = self._add_questions(dimension, pair.question, pair.answer)
-        loop = create_event_loop()
-        return await loop.run_in_executor(None, self._score, text)
-
-    def get_average_score(self, pairs: list[TextPair], dimension: str) -> float:
+        with torch.no_grad():
+            for pair in tqdm(pairs):
+                text = _add_questions(dimension, pair.question, pair.answer)
+
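+                # T5's forward() still requires decoder input ids, so construct a
+                # one-token dummy target; its content has no effect on the score.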
+                tgt = "No"
+
+                encoded_src = tokenizer(
+                    text,
+                    max_length=max_length,
+                    truncation=True,
+                    padding=True,
+                    return_tensors='pt'
+                )
+                encoded_tgt = tokenizer(
+                    tgt,
+                    max_length=max_length,
+                    truncation=True,
+                    padding=True,
+                    return_tensors='pt'
+                )
+
+                src_tokens = encoded_src['input_ids'].to(device)
+                src_mask = encoded_src['attention_mask'].to(device)
+
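+                # Keep only the first target token: the score is read from the
+                # logits of the first decoding step.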
+                tgt_tokens = encoded_tgt['input_ids'].to(device)[:, 0].unsqueeze(-1)
+
+                output = rank_model(
+                    input_ids=src_tokens,
+                    attention_mask=src_mask,
+                    labels=tgt_tokens,
+                    use_cache=False
+                )
+
+                logits = output.logits.view(-1, rank_model.config.vocab_size)
+
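+                # final_score = positive_score / (positive_score + negative_score)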
+                pos_score = softmax(logits)[:, pos_id]  # Yes
+                neg_score = softmax(logits)[:, neg_id]  # No
+                score = pos_score / (pos_score + neg_score)
+
+                results.append(score.item())
+
+        return_dict[rank] = results
+
+    def evaluate(self, pairs: list[TextPair]) -> list[dict]:
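+        # Returns one {dimension: [scores]} dict per entry in self.dimensions.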
+        final_results = []
+        for dimension in self.dimensions:
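+            # Split the pairs evenly across GPUs; the last chunk absorbs the remainder.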
+            chunk_size = len(pairs) // self.num_gpus
+            chunks = []
+            for i in range(self.num_gpus):
+                start = i * chunk_size
+                end = start + chunk_size
+                if i == self.num_gpus - 1:
+                    end = len(pairs)
+                chunks.append(pairs[start:end])
+
+            # multi-process: use the 'spawn' context, since CUDA cannot be
+            # (re)initialized in forked worker processes
+            ctx = mp.get_context('spawn')
+            manager = ctx.Manager()
+            return_dict = manager.dict()
+            processes = []
+
+            for rank, chunk in enumerate(chunks):
+                p = ctx.Process(
+                    target=self.process_chunk,
+                    args=(rank, chunk, self.model_name, self.max_length, dimension, return_dict)
+                )
+                p.start()
+                processes.append(p)
+
+            for p in processes:
+                p.join()
+
+            # merge the per-rank results back in rank order
+            results = []
+            for rank in range(len(chunks)):
+                results.extend(return_dict[rank])
+
+            for p in processes:
+                if p.is_alive():
+                    p.terminate()
+                    p.join()
+
+            final_results.append({
+                dimension: results
+            })
+        return final_results
+
+    def get_average_score(self, pairs: list[TextPair]) -> dict:
        """
        Get the average score of a batch of texts.
        """
-        return sum(self.evaluate(pairs, dimension)) / len(pairs)
-
-    def _score(self, text: str) -> float:
-        """
-        Get scores for the given samples.
-        final_score = positive_score / (positive_score + negative_score)
-        """
-
-        # The implementation of "forward" in T5 still requires decoder_input_ids.
-        # Therefore, we construct a random one-word target sequence.
-        # The content of the target has no effect on the final scores.
-
-        tgt = "No"
-
-        with torch.no_grad():
-            encoded_src = self.tokenizer(
-                text,
-                max_length=self.max_length,
-                truncation=True,
-                padding=True,
-                return_tensors='pt'
-            )
-            encoded_tgt = self.tokenizer(
-                tgt,
-                max_length=self.max_length,
-                truncation=True,
-                padding=True,
-                return_tensors='pt'
-            )
-
-            src_tokens = encoded_src['input_ids'].to("cuda")
-            src_mask = encoded_src['attention_mask'].to("cuda")
-
-            tgt_tokens = encoded_tgt['input_ids'].to("cuda")[:, 0].unsqueeze(-1)
-
-            output = self.model(
-                input_ids=src_tokens,
-                attention_mask=src_mask,
-                labels=tgt_tokens
-            )
-
-            logits = output.logits.view(-1, self.model.config.vocab_size)
-
-            pos_score = self.softmax(logits)[:, self.pos_id]  # Yes
-            neg_score = self.softmax(logits)[:, self.neg_id]
-
-            score = pos_score / (pos_score + neg_score)
-
-            return score.item()
-
-    def _add_questions(self, dimension: str, question: str, answer: str):
-        if dimension == "naturalness":
-            cur_input = 'question: Is this a natural response in the dialogue? </s> response: ' + answer
-        elif dimension == "coherence":
-            cur_input = 'question: Is this a coherent response given the dialogue history? </s> response: ' \
-                        + answer + ' </s> dialogue history: ' + question
-        elif dimension == "understandability":
-            cur_input = 'question: Is this an understandable response in the dialogue? </s> response: ' + answer
-        else:
-            raise NotImplementedError(
-                'The input format for this dimension is still undefined. Please customize it first.')
-        return cur_input
+        results = self.evaluate(pairs)
+        final_results = {}
+        for result in results:
+            for key, value in result.items():
+                final_results[key] = sum(value) / len(value)
+        return final_results
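+
+
+# Minimal usage sketch (assumptions: TextPair accepts `question` and `answer`
+# keyword fields, and at least one CUDA device is visible). The __main__ guard
+# is required because worker processes are started with the 'spawn' method.
+if __name__ == "__main__":
+    pairs = [TextPair(question="dialogue history ...", answer="model response ...")]
+    evaluator = UniEvaluator()
+    print(evaluator.get_average_score(pairs))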