-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest.py
More file actions
31 lines (24 loc) · 1.02 KB
/
test.py
File metadata and controls
31 lines (24 loc) · 1.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from transformers import AutoTokenizer
from scripts.dataset import SquadDataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, random_split
from scripts.model import SquadModel
def test_small_chunking():
    """Smoke-test SquadDataset chunking on a single long-context example.

    Builds one QA example whose context is long enough (repeated filler
    text) that, with a small ``max_length`` and ``stride``, the tokenizer
    must split it into several overlapping chunks; then prints the chunks
    for manual inspection.
    """
    print("\n===== RUNNING CHUNKING TEST =====")
    context = "I live in Germany. Paris kappa kiakf fjkfn fnskdj" * 10
    answer_text = "Germany"
    # Derive the character span from the context itself instead of
    # hard-coding it: the previous offsets (start=68, end=91) did not
    # point at "Germany" — it first appears at index 10, and end should
    # be start + len(answer), i.e. 17, as the original comment intended.
    answer_start = context.index(answer_text)        # 10
    answer_end = answer_start + len(answer_text)     # 17
    examples = [
        {
            "qas_id": "q1",
            "question": "Where do I live?",
            "context": context,
            "answer_text": answer_text,   # must exist in context
            "answer_start": answer_start,
            "answer_end": answer_end,
            "is_impossible": False,
        }
    ]
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    # max_length=40 with stride=10 forces the ~500-char context into
    # multiple overlapping chunks, which is exactly what we want to see.
    dataset = SquadDataset(examples, tokenizer, max_length=40, stride=10)
    # NOTE(review): the original also built an unused DataLoader here
    # (batch_size=32, num_workers=8) — removed, it was never iterated.
    dataset.print_chunks("q1", tokenizer)
# Allow running this file directly as a standalone smoke test.
if __name__ == "__main__":
    test_small_chunking()