Skip to content

Commit ef7f308

Browse files
committed
revert to origin
1 parent 1cc76e0 commit ef7f308

File tree

136 files changed

+352
-14982
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

136 files changed

+352
-14982
lines changed

.gitignore

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,7 @@ configs/process/experiments/
2020
**/meteorscorer/data/meteor-1.5.jar
2121
**/ciderscorer/coco-val-df.p
2222
**/data
23-
!dataflow/data/
2423
**/ckpt
25-
**/ckpr
2624
tmp.*
2725
configs/process/text_process_reasoner.yaml
28-
docs/src/getting_started/Dev.md
29-
configs/process/test_process_math.yaml
30-
Text2SqlPipeline/bird_example/
31-
32-
*.jsonl
33-
*.pt
34-
*.pth
35-
./.git/*
26+
docs/src/getting_started/Dev.md
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
model_cache_path: '../ckpt' # Path to cache models
2+
dependencies: [text]
3+
save_path: "../dataflow-develop/processed.jsonl"
4+
5+
data:
6+
text:
7+
use_hf: False # Whether to use huggingface_dataset, if used, ignore the local data path below
8+
dataset_name: 'yahma/alpaca-cleaned'
9+
dataset_split: 'train'
10+
name: 'default'
11+
revision: null
12+
data_path: './demos/text_process/reasoners/math_5_samples.json' # Local data path, supports json, jsonl, parquet formats
13+
formatter: "TextFormatter" # Data loader type
14+
keys: 'answer' # Key name to be processed, for sft data, it can be specified as ['instruction','input','output']
15+
16+
processors:
17+
AnswerFormatterFilter: {}
18+
AnswerNgramFilter:
19+
min_score: 0.1
20+
max_score: 1.0
21+
ngrams: 5
22+
AnswerGroundTruthFilter:
23+
compare_method: math_verify # exact or math_verify
24+
AnswerTokenLengthFilter:
25+
max_answer_token_length: 512
26+
tokenizer_dir: '../Qwen2.5-0.5B-Instruct'
27+
28+
29+

dataflow/Eval/Text/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,6 @@
110110
'DeitaComplexityScorer': ('dataflow/Eval/Text/models/deita_complexity_scorer.py', 'DeitaComplexityScorer'),
111111
'PresidioScorer': ('dataflow/Eval/Text/models/presidio_scorer.py', 'PresidioScorer'),
112112
'RMScorer': ('dataflow/Eval/Text/models/rm_scorer.py', 'RMScorer'),
113-
'PairQualScorer': ('dataflow/Eval/Text/models/pair_qual_scorer.py','PairQualScorer'),
114113
'VendiScorer': ('dataflow/Eval/Text/diversity/vendi_scorer.py', 'VendiScorer'),
115114
'Task2VecScorer': ('dataflow/Eval/Text/diversity/task2vec_scorer.py', 'Task2VecScorer'),
116115
'LangkitScorer': ('dataflow/Eval/Text/statistics/langkit_scorer.py', 'LangkitScorer'),

dataflow/Eval/Text/models/Qurating/modeling/modeling_flash_llama.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -525,10 +525,7 @@ def forward(
525525
((attention_mask is not None) and (not attention_mask.all().item()))
526526
and not use_cache
527527
):
528-
try: # for flash-attn latest version
529-
hidden_states, unpad_indices, cu_seqlens, max_seqlen, _ = unpad_input(hidden_states, attention_mask)
530-
except: # for flash-attn 2.3.3 verstion
531-
hidden_states, unpad_indices, cu_seqlens, max_seqlen = unpad_input(hidden_states, attention_mask)
528+
hidden_states, unpad_indices, cu_seqlens, max_seqlen = unpad_input(hidden_states, attention_mask)
532529
unpadded_lengths = (cu_seqlens, max_seqlen)
533530
else:
534531
unpadded_lengths = None

dataflow/Eval/Text/models/pair_qual_scorer.py

Lines changed: 0 additions & 48 deletions
This file was deleted.

dataflow/core/generator/reasoner.py

Lines changed: 0 additions & 78 deletions
This file was deleted.

dataflow/core/process/deduplicator.py

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
from datasets import Dataset
2-
from dataflow.format import TextFormatter
3-
from dataflow.utils.utils import get_logger
42

53
class Deduplicator:
64

@@ -13,20 +11,15 @@ def dedup_func(self, dataset):
1311
def __call__(self, dataset):
1412
init_len = len(dataset)
1513
deduped_dataset = self.dedup_func(dataset)
16-
print(f'Implemented {self.__class__.__name__}. Data Number: {init_len} -> {len(deduped_dataset)}')
14+
print(f'Implemented {self.__class__.__name__}. Data Number: {init_len} -> {len(deduped_dataset)}', flush=True)
1715

1816
return deduped_dataset
1917

2018
class TextDeduplicator(Deduplicator):
2119

2220
def __init__(self, args=None):
2321
self.data_type = "text"
24-
self.logger = get_logger()
25-
if "input_file" in args.keys():
26-
self.config = args
27-
self.formatter = TextFormatter(args)
28-
self.dataset = self.formatter.load_dataset()
29-
22+
3023
def __call__(self, dataset):
3124
init_len = len(dataset)
3225
labels = self.dedup_func(dataset)
@@ -37,12 +30,8 @@ def filter_by_labels(example, index):
3730
deduped_dataset = dataset
3831
else:
3932
deduped_dataset = dataset.filter(labels)
40-
self.logger.info(f'Implemented {self.dedupliactor_name}. Data Number: {init_len} -> {len(deduped_dataset)}')
33+
print(f'Implemented {self.dedupliactor_name}. Data Number: {init_len} -> {len(deduped_dataset)}')
4134
return deduped_dataset
42-
43-
def run(self):
44-
deduplicated_dataset = self.__call__(self.dataset)
45-
deduplicated_dataset.dump(self.config['output_file'])
4635

4736
class ImageDeduplicator(Deduplicator):
4837

dataflow/core/process/filter.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
from dataflow.data import DataFlowDataset
22
from dataflow.core import ScoreRecord
3-
from dataflow.format import TextFormatter
43
from datasets import Dataset
5-
from dataflow.utils.utils import get_logger
6-
74
class Filter():
85

96
def __init__(self, args=None):
@@ -19,11 +16,6 @@ class TextFilter(Filter):
1916

2017
def __init__(self, args=None):
2118
self.data_type = "text"
22-
self.logger = get_logger()
23-
if "input_file" in args.keys():
24-
self.config = args
25-
self.formatter = TextFormatter(args)
26-
self.dataset = self.formatter.load_dataset()
2719

2820
def __call__(self, dataset):
2921
init_len = len(dataset)
@@ -37,12 +29,9 @@ def filter_by_labels(example, index):
3729
filtered_dataset = dataset
3830
else:
3931
filtered_dataset = dataset.filter(labels)
40-
self.logger.info(f'Implemented {self.filter_name}. Data Number: {init_len} -> {len(filtered_dataset)}')
32+
33+
print(f'Implemented {self.filter_name}. Data Number: {init_len} -> {len(filtered_dataset)}', flush=True)
4134
return filtered_dataset
42-
43-
def run(self):
44-
filtered_dataset = self.__call__(self.dataset)
45-
filtered_dataset.dump(save_path=self.config['output_file'])
4635

4736
class ImageFilter(Filter):
4837

dataflow/core/process/reasoner.py

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from dataflow.data import DataFlowDataset
22
from dataflow.core import ScoreRecord
3-
from dataflow.format import TextFormatter
43
from datasets import Dataset
54

65
class Reasoner():
@@ -20,33 +19,11 @@ def __init__(self, args=None):
2019
self.filter_name = "ReasonerFilter"
2120
self.args = args
2221

23-
self.input_question_key = args.get("input_question_key","")
24-
self.max_worker = args.get("max_worker",1)
25-
26-
# answer format filter
27-
self.keys = args.get("input_keys","")
28-
# self.output_question_key = args.get("output_question_key","")
29-
30-
# answer gt verification
31-
self.test_answer_key = args.get("test_answer_key","")
32-
self.gt_answer_key = args.get("gt_answer_key","")
33-
34-
# ngram filter
35-
self.question_key = args.get("question_key","")
36-
self.answer_key = args.get("answer_key","")
37-
38-
# api args
3922
api_args = args.get('api_args', None)
4023
if api_args is not None:
4124
self.model_name = api_args['model_name']
4225
self.api_url = api_args['api_url']
4326
self.mode_test = api_args['mode_test']
44-
45-
if "input_file" in args.keys():
46-
self.formatter = TextFormatter(args)
47-
self.dataset = self.formatter.load_dataset()
48-
49-
5027
def filter_func(self, dataset):
5128
pass
5229

@@ -56,6 +33,7 @@ def __call__(self, dataset: DataFlowDataset):
5633
score_record = ScoreRecord()
5734
dataset.set_score_record(score_record)
5835
labels = self.filter_func(dataset)
36+
5937
if isinstance(dataset.dataset, Dataset):
6038
def filter_by_labels(example, index):
6139
return labels[index] == 1
@@ -65,8 +43,4 @@ def filter_by_labels(example, index):
6543
filtered_dataset = dataset.filter(labels)
6644

6745
print(f'Implemented {self.filter_name}. Data Number: {init_len} -> {len(filtered_dataset)}', flush=True)
68-
return filtered_dataset
69-
70-
def run(self):
71-
filtered_dataset = self.__call__(self.dataset)
72-
filtered_dataset.dump(save_path=self.args['output_file'])
46+
return filtered_dataset

dataflow/core/process/refiner.py

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
1-
from dataflow.format import TextFormatter
2-
from dataflow.utils.utils import get_logger
3-
41
class Refiner():
52

63
def __init__(self, args):
@@ -13,20 +10,9 @@ class TextRefiner(Refiner):
1310

1411
def __init__(self, args=None):
1512
self.data_type = "text"
16-
self.logger = get_logger()
17-
if "input_file" in args.keys():
18-
self.config = args
19-
self.formatter = TextFormatter(args)
20-
self.dataset = self.formatter.load_dataset()
21-
22-
2313

2414
def __call__(self, dataset):
2515
refined_dataset, numbers = self.refine_func(dataset)
26-
self.logger.info(f'Implemented {self.refiner_name}. {numbers} data refined.')
16+
print(f'Implemented {self.refiner_name}. {numbers} data refined.', flush=True)
2717

2818
return refined_dataset
29-
30-
def run(self):
31-
refined_dataset = self.__call__(self.dataset)
32-
refined_dataset.dump(self.config['output_file'])

0 commit comments

Comments (0)