|
1 | | -from tempfile import NamedTemporaryFile, TemporaryDirectory |
| 1 | +from tempfile import NamedTemporaryFile |
2 | 2 | from urllib import request |
3 | 3 | import csv |
4 | 4 | import gzip |
5 | 5 | import os |
6 | | -import random |
7 | 6 | import shutil |
8 | | -import zipfile |
9 | | -from typing import Dict, Generator, List, Optional |
| 7 | +from typing import Dict, List |
10 | 8 |
|
11 | | -from rdkit import Chem |
12 | 9 | from sklearn.model_selection import ( |
13 | 10 | GroupShuffleSplit, |
14 | 11 | train_test_split, |
15 | | - StratifiedShuffleSplit, |
16 | 12 | ) |
17 | 13 | import numpy as np |
18 | | -import pysmiles |
19 | 14 | import torch |
20 | | -from sklearn.preprocessing import LabelBinarizer |
21 | 15 |
|
22 | 16 | from chebai.preprocessing import reader as dr |
23 | | -from chebai.preprocessing.datasets.base import MergedDataset, XYBaseDataModule |
24 | | -from chebai.preprocessing.datasets.chebi import JCIExtendedTokenData |
25 | | -from chebai.preprocessing.datasets.pubchem import Hazardous |
| 17 | +from chebai.preprocessing.datasets.base import XYBaseDataModule |
26 | 18 |
|
27 | 19 |
|
28 | 20 | class ClinTox(XYBaseDataModule): |
@@ -76,7 +68,7 @@ def setup_processed(self) -> None: |
76 | 68 | """Processes and splits the dataset.""" |
77 | 69 | print("Create splits") |
78 | 70 | data = list( |
79 | | - self._load_data_from_file(os.path.join(self.raw_dir, f"clintox.csv")) |
| 71 | + self._load_data_from_file(os.path.join(self.raw_dir, "clintox.csv")) |
80 | 72 | ) |
81 | 73 | groups = np.array([d["group"] for d in data]) |
82 | 74 | if not all(g is None for g in groups): |
@@ -229,14 +221,14 @@ def download(self) -> None: |
229 | 221 | """Downloads and extracts the dataset.""" |
230 | 222 | with open(os.path.join(self.raw_dir, "bbbp.csv"), "ab") as dst: |
231 | 223 | with request.urlopen( |
232 | | - f"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv", |
| 224 | + "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv", |
233 | 225 | ) as src: |
234 | 226 | shutil.copyfileobj(src, dst) |
235 | 227 |
|
236 | 228 | def setup_processed(self) -> None: |
237 | 229 | """Processes and splits the dataset.""" |
238 | 230 | print("Create splits") |
239 | | - data = list(self._load_data_from_file(os.path.join(self.raw_dir, f"bbbp.csv"))) |
| 231 | + data = list(self._load_data_from_file(os.path.join(self.raw_dir, "bbbp.csv"))) |
240 | 232 | groups = np.array([d["group"] for d in data]) |
241 | 233 | if not all(g is None for g in groups): |
242 | 234 | print("Group shuffled") |
@@ -426,7 +418,7 @@ def download(self) -> None: |
426 | 418 | def setup_processed(self) -> None: |
427 | 419 | """Processes and splits the dataset.""" |
428 | 420 | print("Create splits") |
429 | | - data = list(self._load_data_from_file(os.path.join(self.raw_dir, f"sider.csv"))) |
| 421 | + data = list(self._load_data_from_file(os.path.join(self.raw_dir, "sider.csv"))) |
430 | 422 | groups = np.array([d["group"] for d in data]) |
431 | 423 | if not all(g is None for g in groups): |
432 | 424 | split_size = int( |
@@ -581,14 +573,14 @@ def download(self) -> None: |
581 | 573 | """Downloads and extracts the dataset.""" |
582 | 574 | with open(os.path.join(self.raw_dir, "bace.csv"), "ab") as dst: |
583 | 575 | with request.urlopen( |
584 | | - f"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv", |
| 576 | + "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv", |
585 | 577 | ) as src: |
586 | 578 | shutil.copyfileobj(src, dst) |
587 | 579 |
|
588 | 580 | def setup_processed(self) -> None: |
589 | 581 | """Processes and splits the dataset.""" |
590 | 582 | print("Create splits") |
591 | | - data = list(self._load_data_from_file(os.path.join(self.raw_dir, f"bace.csv"))) |
| 583 | + data = list(self._load_data_from_file(os.path.join(self.raw_dir, "bace.csv"))) |
592 | 584 | # groups = np.array([d.get("group") for d in data]) |
593 | 585 |
|
594 | 586 | # if not all(g is None for g in groups): |
@@ -729,14 +721,14 @@ def download(self) -> None: |
729 | 721 | """Downloads and extracts the dataset.""" |
730 | 722 | with open(os.path.join(self.raw_dir, "hiv.csv"), "ab") as dst: |
731 | 723 | with request.urlopen( |
732 | | - f"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/HIV.csv", |
| 724 | + "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/HIV.csv", |
733 | 725 | ) as src: |
734 | 726 | shutil.copyfileobj(src, dst) |
735 | 727 |
|
736 | 728 | def setup_processed(self) -> None: |
737 | 729 | """Processes and splits the dataset.""" |
738 | 730 | print("Create splits") |
739 | | - data = list(self._load_data_from_file(os.path.join(self.raw_dir, f"hiv.csv"))) |
| 731 | + data = list(self._load_data_from_file(os.path.join(self.raw_dir, "hiv.csv"))) |
740 | 732 | groups = np.array([d["group"] for d in data]) |
741 | 733 | if not all(g is None for g in groups): |
742 | 734 | print("Group shuffled") |
@@ -913,7 +905,7 @@ def download(self) -> None: |
913 | 905 | def setup_processed(self) -> None: |
914 | 906 | """Processes and splits the dataset.""" |
915 | 907 | print("Create splits") |
916 | | - data = list(self._load_data_from_file(os.path.join(self.raw_dir, f"muv.csv"))) |
| 908 | + data = list(self._load_data_from_file(os.path.join(self.raw_dir, "muv.csv"))) |
917 | 909 | groups = np.array([d["group"] for d in data]) |
918 | 910 | if not all(g is None for g in groups): |
919 | 911 | split_size = int( |
|
0 commit comments