Skip to content

Commit a0f78a3

Browse files
committed
Update for API test & API consistency
1 parent 4c42367 commit a0f78a3

File tree

10 files changed

+32
-22
lines changed

10 files changed

+32
-22
lines changed

configs/process/dedup_api.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
HashDeduplicator:
2+
hash_func: 'md5'
3+
CCNetDeduplicator:
4+
bit_length: 64 # should be a multiple of 8

configs/process/filter_api.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,4 @@ FineWebEduFilter:
77
min_score: 0
88
max_score: 100
99
scorer_args:
10-
model_name: 'HuggingFaceTB/fineweb-edu-classifier'
11-
device: 'cuda:4'
10+
model_name: 'HuggingFaceTB/fineweb-edu-classifier'

dataflow/core/process/deduplicator.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,15 @@ def __init__(self, args=None):
2323
def __call__(self, dataset):
2424
init_len = len(dataset)
2525
labels = self.dedup_func(dataset)
26-
if isinstance(dataset.dataset, Dataset):
27-
def filter_by_labels(example, index):
28-
return labels[index] == 1
29-
dataset.dataset = dataset.dataset.filter(filter_by_labels, with_indices=True)
30-
deduped_dataset = dataset
31-
else:
32-
deduped_dataset = dataset.filter(labels)
33-
print(f'Implemented {self.dedupliactor_name}. Data Number: {init_len} -> {len(deduped_dataset)}')
34-
return deduped_dataset
26+
# if isinstance(dataset.dataset, Dataset):
27+
# def filter_by_labels(example, index):
28+
# return labels[index] == 1
29+
# dataset.dataset = dataset.dataset.filter(filter_by_labels, with_indices=True)
30+
# deduped_dataset = dataset
31+
# else:
32+
# deduped_dataset = dataset.filter(labels)
33+
# print(f'Implemented {self.dedupliactor_name}. Data Number: {init_len} -> {len(deduped_dataset)}')
34+
return labels
3535

3636
class ImageDeduplicator(Deduplicator):
3737

dataflow/process/text/deduplicators/ccnet_deduplicator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,6 @@ def dedup_func(self, dataset):
2727
hash_values.append(hash_value)
2828
# print(json.dumps({"ccnet_hash_values": hash_values}))
2929

30-
return json.dumps({"ccnet_hash_values": hash_values})
30+
return {"ccnet_hash_values": hash_values}
3131

3232

dataflow/process/text/deduplicators/hash_deduplicator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,4 @@ def dedup_func(self, dataset):
3131
hash_value = self._compute_hash(text)
3232
hash_values.append(hash_value)
3333
# print(json.dumps({"hash_values": hash_values}))
34-
return json.dumps({"exact_hash_values": hash_values})
34+
return {"exact_hash_values": hash_values}

dataflow/process/text/deduplicators/minhash_deduplicator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def dedup_func(self, dataset):
3636
result = lsh.query(minhash)
3737
hash_values.append(result)
3838
# print(json.dumps({"hash_values": hash_values}))
39-
return json.dumps({"minhash_hash_values": hash_values})
39+
return {"minhash_hash_values": hash_values}
4040

4141

4242

dataflow/process/text/deduplicators/ngramhash_deduplicator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def dedup_func(self, dataset):
3333
hash_value = set(self._compute_hash(ngram) for ngram in ngrams)
3434
hash_values.append(hash_value)
3535
# print(json.dumps({"hash_values": hash_values}))
36-
return json.dumps({"ngram_hash_values": hash_values})
36+
return {"ngram_hash_values": hash_values}
3737

3838

3939

dataflow/process/text/deduplicators/sem_deduplicator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,4 @@ def dedup_func(self, dataset):
8888
embeddings = get_text_embedding(texts, self.tokenizer, self.model, self.device)
8989
embeddings = normalize(torch.tensor(embeddings), dim=1)
9090
# print(json.dumps({"embeddings": embeddings.tolist()}))
91-
return json.dumps({"semhash_embeddings": embeddings.tolist()})
91+
return {"semhash_embeddings": embeddings.tolist()}

dataflow/process/text/deduplicators/simhash_deduplicator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,6 @@ def get_similarity(simhash, another_simhash):
3232
simhash = Simhash(text, f=self.fingerprint_size)
3333
simhashes.append(simhash)
3434
# print(json.dumps({"hash_values": [simhash.value for simhash in simhashes]}))
35-
return json.dumps({"simhash_values": [simhash.value for simhash in simhashes]})
35+
return {"simhash_values": [simhash.value for simhash in simhashes]}
3636

3737

dataflow/utils/utils.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,11 @@ def filter():
286286
# dataset.dump(save_path)
287287
result[recorder] = True
288288
result = result.tolist()
289+
save_path = cfg['save_path']
290+
from bitarray import bitarray
291+
ba = bitarray(result)
292+
with open(save_path, 'wb') as f:
293+
ba.tofile(f)
289294
print(json.dumps({"bool": result}))
290295

291296
def refine():
@@ -299,7 +304,7 @@ def refine():
299304
if isinstance(cfg.yaml, str):
300305
with open(cfg.yaml, 'r') as f:
301306
cfg.yaml = yaml.safe_load(f) # 解析成字典
302-
307+
303308
for scorer_name, args in cfg.yaml.items():
304309
if "num_workers" in cfg:
305310
args["num_workers"] = cfg.num_workers
@@ -329,7 +334,7 @@ def deduplicate():
329334
if isinstance(cfg.yaml, str):
330335
with open(cfg.yaml, 'r') as f:
331336
cfg.yaml = yaml.safe_load(f) # 解析成字典
332-
337+
result = []
333338
for scorer_name, args in cfg.yaml.items():
334339
if "num_workers" in cfg:
335340
args["num_workers"] = cfg.num_workers
@@ -342,9 +347,11 @@ def deduplicate():
342347
dataset_dict[processor.data_type] = datasets
343348
else:
344349
datasets = dataset_dict[processor.data_type]
345-
processed_dataset = processor(datasets)
346-
dataset_dict[processor.data_type] = processed_dataset
347-
print(processed_dataset)
350+
result.append(processor(datasets))
351+
# dataset_dict[processor.data_type] = processed_dataset
352+
save_path = cfg['save_path']
353+
with open(save_path, 'w') as f:
354+
json.dump(result, f)
348355

349356

350357

0 commit comments

Comments (0)