Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,19 @@ wget https://physionet.org/static/published-projects/challenge-2020/classificati

3. Unzip the file and inside `data/cpsc/classification-of-12-lead-ecgs-the-physionetcomputing-in-cardiology-challenge-2020-1.0.2/training` move the `cpsc_2018` and `cpsc_2018_extra` folders into the `data/cpsc` directory. Then delete the `classification-of-12-lead-ecgs-the-physionetcomputing-in-cardiology-challenge-2020-1.0.2` folder.

#### ECG-QA-COT

1. Create an `ecg-qa-cot` folder inside the `data` directory.

2. Inside `data/ecg-qa-cot` execute the following command in the terminal:
```
wget "https://polybox.ethz.ch/index.php/s/D5QaJSEw4dXkzXm/download/ecg_qa_cot_final.zip" -O ecg_qa_cot_final.zip
```
3. Unzip the file using:
```
unzip ecg_qa_cot_final.zip
```

### Preprocessing

1. Execute the preprocessing script by `bash scripts/preprocess.sh`. We have provided default configurations for all the datasets used in our study but feel free to experiment with others!
Expand Down
1 change: 1 addition & 0 deletions ecg-plot
Submodule ecg-plot added at d7eb9a
1 change: 1 addition & 0 deletions ecg_bench/configs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"ecg_grounding_pulse",
"ecg_grounding",
"ecg_grounding_test",
"ecg_qa_cot"
]

# Hugging Face
Expand Down
11 changes: 11 additions & 0 deletions ecg_bench/configs/preprocessing/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
base_data: null
dev: null
map_data: ecg-qa_ptbxl
mix_data: null
num_cores: 12
preprocess: false
sampled_file: null
seed: 0
segment_len: 1250
target_sf: 250
toy: null
59 changes: 52 additions & 7 deletions ecg_bench/preprocessors/map_ecg.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from datasets import load_dataset
import argparse
import csv

from ecg_bench.configs.constants import MAPPED_DATASETS
from ecg_bench.utils.file_manager import FileManager
Expand Down Expand Up @@ -32,6 +33,8 @@ def map_data(self):
data = self._prepare_ecg_qa_ptb()
elif self.args.map_data == "ecg-qa_mimic-iv-ecg":
data = self._prepare_ecg_qa_mimic()
elif self.args.map_data == "ecg_qa_cot":
data = self._prepare_ecg_qa_cot()
elif self.args.map_data in ["ecg_grounding_pulse", "ecg_grounding", "ecg_grounding_test"]:
data = self._prepare_ecg_grounding()

Expand All @@ -52,15 +55,15 @@ def map_data(self):
print(f"Total instances for {self.args.map_data}: {len(data)}")
print(f"Length of available ecgs: {len(self.available_ecgs)}")
print(f"Valid instances: {len(valid_instances)}")
self.fm.save_json(valid_instances, f"./data/{self.args.map_data}_mapped_{self.args.segment_len}.json")
self.fm.save_json(valid_instances, f"./ecg_bench/data/{self.args.map_data}_mapped_{self.args.segment_len}.json")

def _process_mapping_instance(self, instance):
name = instance.get("name", "")

if self.args.map_data in ["ecg_instruct_45k", "pretrain_mimic"]:
text = instance["conversations"]
ecg_path = "_".join(instance["ecg"].split("/"))
preprocessed_dir = f"./data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
preprocessed_dir = f"./ecg_bench/data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}"

elif self.args.map_data == "ecg_instruct_pulse":
text = instance["conversations"]
Expand All @@ -70,9 +73,9 @@ def _process_mapping_instance(self, instance):
text = [instance["question_type"], instance["question"], instance["answer"]]
ecg_path = "_".join(instance["ecg_path"][0].split("/")[2:])
if self.args.map_data == "ecg-qa_ptbxl":
preprocessed_dir = f"./data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
else:
preprocessed_dir = f"./data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
preprocessed_dir = f"./ecg_bench/data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}"

elif self.args.map_data == "ecg_bench_pulse":
text = instance["conversations"]
Expand All @@ -85,6 +88,20 @@ def _process_mapping_instance(self, instance):
file_name = instance["ecg"]
ecg_path, preprocessed_dir = self._get_ecg_grounding_path(file_name)

elif self.args.map_data == "ecg_qa_cot":
text = [
instance["question_type"],
instance["question"],
instance["answer"],
instance.get("rationale", "")
]
# Parse ecg_id from format "[13625]" to "13625"
ecg_id = instance["ecg_id"].strip("[]")
# PTB-XL path structure: records500/{subfolder}/{ecg_id}_hr
subfolder = ecg_id[:2] + "000"
ecg_path = f"records500_{subfolder}_{ecg_id}_hr"
preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}"

return ecg_path, text, name, preprocessed_dir

def _prepare_ecg_grounding(self):
Expand Down Expand Up @@ -123,11 +140,11 @@ def _prepare_ecg_instruct_pulse(self):
return data

def _prepare_ecg_qa_ptb(self):
preprocessed_dir = f"./data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
self.available_ecgs.update(f.stem for f in Path(preprocessed_dir).glob("*"))
dataset_name = self.args.map_data.split("_")[1]
paraphrased_jsons = glob.glob(f"./data/ecg-qa/output/{dataset_name}/paraphrased/*/*.json")
template_jsons = glob.glob(f"./data/ecg-qa/output/{dataset_name}/template/*/*.json")
paraphrased_jsons = glob.glob(f"./ecg_bench/data/ecg-qa/output/{dataset_name}/paraphrased/*/*.json")
template_jsons = glob.glob(f"./ecg_bench/data/ecg-qa/output/{dataset_name}/template/*/*.json")
path_to_all_jsons = paraphrased_jsons + template_jsons
data = self.setup_ecg_qa(path_to_all_jsons)
return data
Expand All @@ -154,6 +171,13 @@ def _prepare_ecg_instruct_45k(self):
data = self.fm.open_json(f"./data/{self.args.map_data}/{self.args.map_data}.json")
return data

def _prepare_ecg_qa_cot(self):
"""Prepare ECG-QA dataset with Chain-of-Thought rationale from CSV files"""
preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
self.available_ecgs.update(f.stem for f in Path(preprocessed_dir).glob("*"))
data = self.setup_ecg_qa_cot()
return data

def _setup_ecg_bench_pulse(self, json_path):
self.list_of_hf_datasets = ["cpsc-test", "csn-test-no-cot", "code15-test", "ptb-test", "ptb-test-report", "ecgqa-test"]
data = []
Expand Down Expand Up @@ -248,3 +272,24 @@ def setup_ecg_qa(self, glob_paths, question_types=["single-verify", "single-choo
filtered_list = [item for item in loaded_file if item["question_type"] in question_types]
data.extend(filtered_list)
return data

def setup_ecg_qa_cot(self):
"""Load ECG-QA CoT data from CSV files (train, val, test combined)"""
data = []
splits = {
"train": "./ecg_bench/data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_train.csv",
"val": "./ecg_bench/data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_val.csv",
"test": "./ecg_bench/data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_test.csv"
}

for split_name, csv_file in splits.items():
if os.path.exists(csv_file):
with open(csv_file, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
data.append(row)
print(f"Loaded {split_name} split from {csv_file}")
else:
print(f"Warning: {csv_file} not found, skipping...")

return data
2 changes: 2 additions & 0 deletions ecg_bench/utils/file_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def decode_batch(batch: dict) -> dict:
@staticmethod
def save_config(save_path: Union[str, Path], args: argparse.Namespace):
args_dict = {k: v for k, v in vars(args).items() if not k.startswith("_")}
# Create directory if it doesn't exist
Path(save_path).mkdir(parents=True, exist_ok=True)
with open(f"{save_path}/config.yaml", "w") as f:
yaml.dump(args_dict, f, default_flow_style=False)

Expand Down
45 changes: 25 additions & 20 deletions scripts/preproccess.sh
Original file line number Diff line number Diff line change
@@ -1,21 +1,26 @@
BASE_DATA_VALUES=("ptb" "mimic" "code15" "cpsc" "csn")
SEG_LENS=(1250 2500 500)
# BASE_DATA_VALUES=("ptb" "mimic" "code15" "cpsc" "csn")
# SEG_LENS=(1250 2500 500)

for base_data in "${BASE_DATA_VALUES[@]}"; do
for seg_len in "${SEG_LENS[@]}"; do
if [ "$base_data" = "mimic" ]; then
echo "Sampling $base_data with seg_len=$seg_len"
python preprocess_ecg.py \
--base_data="$base_data" \
--seg_len="$seg_len" \
--preprocess_files \
--sample_files --random_sampling
else
echo "Preprocessing $base_data with seg_len=$seg_len"
python preprocess_ecg.py \
--base_data="$base_data" \
--seg_len="$seg_len" \
--preprocess_files
fi
done
done
# for base_data in "${BASE_DATA_VALUES[@]}"; do
# for seg_len in "${SEG_LENS[@]}"; do
# if [ "$base_data" = "mimic" ]; then
# echo "Sampling $base_data with seg_len=$seg_len"
# python preprocess_ecg.py \
# --base_data="$base_data" \
# --seg_len="$seg_len" \
# --preprocess_files \
# --sample_files --random_sampling
# else
# echo "Preprocessing $base_data with seg_len=$seg_len"
# python preprocess_ecg.py \
# --base_data="$base_data" \
# --seg_len="$seg_len" \
# --preprocess_files
# fi
# done
# done

python ecg_bench/preprocess.py \
--map_data="ecg-qa_ptbxl" \
--segment_len=1250 \
--target_sf=250
1 change: 1 addition & 0 deletions transformers
Submodule transformers added at 241c04
Loading