-
Notifications
You must be signed in to change notification settings - Fork 39
Open
Description
Seems like data = data.map(update) takes too much time. Could it be cached?
From data.py:
def load_qa(...):
...
def update(sample):
    """Build the few-shot demo text, retrieved-passage context, and answer
    fields for one QA sample.

    Returns a dict with keys "demos", "context", and "answer"; relies on
    the enclosing scope for demo_data, shots, dataset, key, demo_template,
    passage_template, and drop_duplicates.
    """
    demo_pool = demo_data
    demos_str = ""
    if shots > 0:
        if "popqa" in dataset:
            # popqa only has one split
            demo_pool = demo_data.filter(lambda x: x[key] != sample[key])
        # Derive a deterministic per-question seed so the same question
        # always gets the same demos: hashlib is stable across runs,
        # whereas hash() is randomized in Python >= 3.3. The modulo keeps
        # the seed a positive integer as the shuffle API requires.
        digest = hashlib.sha256(str(sample[key]).encode("utf-8")).hexdigest()
        seed = int(digest, 16) % 2**31
        chosen = drop_duplicates(demo_pool.shuffle(seed=seed), key).select(
            range(shots)
        )
        rendered = []
        for d in chosen:
            docs = "\n\n".join(
                passage_template.format(**c) for c in d["ctxs"]
            )
            rendered.append(
                demo_template.format(**d, documents=docs, answer=d["answers"][0])
            )
        demos_str = "\n\n".join(rendered) + "\n\n"
    context_str = ""
    if len(sample["ctxs"]) > 0:
        context_str = "\n\n".join(
            passage_template.format(**c) for c in sample["ctxs"]
        )
    return {
        "demos": demos_str,
        "context": context_str,
        "answer": sample["answers"],
    }
data = data.map(update)

Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels