Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,19 @@ wget https://physionet.org/static/published-projects/challenge-2020/classificati

3. Unzip the file and inside `data/cpsc/classification-of-12-lead-ecgs-the-physionetcomputing-in-cardiology-challenge-2020-1.0.2/training` move the `cpsc_2018` and `cpsc_2018_extra` folders into the `data/cpsc` directory. Then delete the `classification-of-12-lead-ecgs-the-physionetcomputing-in-cardiology-challenge-2020-1.0.2` folder.

#### ECG-QA-COT

1. Create an `ecg-qa-cot` folder inside the `data` directory.

2. Inside `data/ecg-qa-cot` execute the following command in the terminal:
```
wget "https://polybox.ethz.ch/index.php/s/D5QaJSEw4dXkzXm/download/ecg_qa_cot_final.zip" -O ecg_qa_cot_final.zip
```
3. Unzip the file using:
```
unzip ecg_qa_cot_final.zip
```

### Preprocessing

1. Execute the preprocessing script by `bash scripts/preprocess.sh`. We have provided default configurations for all the datasets used in our study but feel free to experiment with others!
Expand Down
1 change: 1 addition & 0 deletions ecg-plot
Submodule ecg-plot added at d7eb9a
1 change: 1 addition & 0 deletions ecg_bench/configs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"ecg_grounding_pulse",
"ecg_grounding",
"ecg_grounding_test",
"ecg_qa_cot"
]

# Hugging Face
Expand Down
11 changes: 11 additions & 0 deletions ecg_bench/configs/preprocessing/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
base_data: null
dev: null
map_data: ecg-qa_ptbxl
mix_data: null
num_cores: 12
preprocess: false
sampled_file: null
seed: 0
segment_len: 1250
target_sf: 250
toy: null
59 changes: 52 additions & 7 deletions ecg_bench/preprocessors/map_ecg.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from datasets import load_dataset
import argparse
import csv

from ecg_bench.configs.constants import MAPPED_DATASETS
from ecg_bench.utils.file_manager import FileManager
Expand Down Expand Up @@ -32,6 +33,8 @@ def map_data(self):
data = self._prepare_ecg_qa_ptb()
elif self.args.map_data == "ecg-qa_mimic-iv-ecg":
data = self._prepare_ecg_qa_mimic()
elif self.args.map_data == "ecg_qa_cot":
data = self._prepare_ecg_qa_cot()
elif self.args.map_data in ["ecg_grounding_pulse", "ecg_grounding", "ecg_grounding_test"]:
data = self._prepare_ecg_grounding()

Expand All @@ -52,15 +55,15 @@ def map_data(self):
print(f"Total instances for {self.args.map_data}: {len(data)}")
print(f"Length of available ecgs: {len(self.available_ecgs)}")
print(f"Valid instances: {len(valid_instances)}")
self.fm.save_json(valid_instances, f"./data/{self.args.map_data}_mapped_{self.args.segment_len}.json")
self.fm.save_json(valid_instances, f"./ecg_bench/data/{self.args.map_data}_mapped_{self.args.segment_len}.json")

def _process_mapping_instance(self, instance):
name = instance.get("name", "")

if self.args.map_data in ["ecg_instruct_45k", "pretrain_mimic"]:
text = instance["conversations"]
ecg_path = "_".join(instance["ecg"].split("/"))
preprocessed_dir = f"./data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
preprocessed_dir = f"./ecg_bench/data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}"

elif self.args.map_data == "ecg_instruct_pulse":
text = instance["conversations"]
Expand All @@ -70,9 +73,9 @@ def _process_mapping_instance(self, instance):
text = [instance["question_type"], instance["question"], instance["answer"]]
ecg_path = "_".join(instance["ecg_path"][0].split("/")[2:])
if self.args.map_data == "ecg-qa_ptbxl":
preprocessed_dir = f"./data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
else:
preprocessed_dir = f"./data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
preprocessed_dir = f"./ecg_bench/data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}"

elif self.args.map_data == "ecg_bench_pulse":
text = instance["conversations"]
Expand All @@ -85,6 +88,20 @@ def _process_mapping_instance(self, instance):
file_name = instance["ecg"]
ecg_path, preprocessed_dir = self._get_ecg_grounding_path(file_name)

elif self.args.map_data == "ecg_qa_cot":
text = [
instance["question_type"],
instance["question"],
instance["answer"],
instance.get("rationale", "")
]
# Parse ecg_id from format "[13625]" to "13625"
ecg_id = instance["ecg_id"].strip("[]")
# PTB-XL path structure: records500/{subfolder}/{ecg_id}_hr
subfolder = ecg_id[:2] + "000"
ecg_path = f"records500_{subfolder}_{ecg_id}_hr"
preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}"

return ecg_path, text, name, preprocessed_dir

def _prepare_ecg_grounding(self):
Expand Down Expand Up @@ -123,11 +140,11 @@ def _prepare_ecg_instruct_pulse(self):
return data

def _prepare_ecg_qa_ptb(self):
preprocessed_dir = f"./data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
self.available_ecgs.update(f.stem for f in Path(preprocessed_dir).glob("*"))
dataset_name = self.args.map_data.split("_")[1]
paraphrased_jsons = glob.glob(f"./data/ecg-qa/output/{dataset_name}/paraphrased/*/*.json")
template_jsons = glob.glob(f"./data/ecg-qa/output/{dataset_name}/template/*/*.json")
paraphrased_jsons = glob.glob(f"./ecg_bench/data/ecg-qa/output/{dataset_name}/paraphrased/*/*.json")
template_jsons = glob.glob(f"./ecg_bench/data/ecg-qa/output/{dataset_name}/template/*/*.json")
path_to_all_jsons = paraphrased_jsons + template_jsons
data = self.setup_ecg_qa(path_to_all_jsons)
return data
Expand All @@ -154,6 +171,13 @@ def _prepare_ecg_instruct_45k(self):
data = self.fm.open_json(f"./data/{self.args.map_data}/{self.args.map_data}.json")
return data

def _prepare_ecg_qa_cot(self):
"""Prepare ECG-QA dataset with Chain-of-Thought rationale from CSV files"""
preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}"
self.available_ecgs.update(f.stem for f in Path(preprocessed_dir).glob("*"))
data = self.setup_ecg_qa_cot()
return data

def _setup_ecg_bench_pulse(self, json_path):
self.list_of_hf_datasets = ["cpsc-test", "csn-test-no-cot", "code15-test", "ptb-test", "ptb-test-report", "ecgqa-test"]
data = []
Expand Down Expand Up @@ -248,3 +272,24 @@ def setup_ecg_qa(self, glob_paths, question_types=["single-verify", "single-choo
filtered_list = [item for item in loaded_file if item["question_type"] in question_types]
data.extend(filtered_list)
return data

def setup_ecg_qa_cot(self):
"""Load ECG-QA CoT data from CSV files (train, val, test combined)"""
data = []
splits = {
"train": "./ecg_bench/data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_train.csv",
"val": "./ecg_bench/data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_val.csv",
"test": "./ecg_bench/data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_test.csv"
}

for split_name, csv_file in splits.items():
if os.path.exists(csv_file):
with open(csv_file, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
data.append(row)
print(f"Loaded {split_name} split from {csv_file}")
else:
print(f"Warning: {csv_file} not found, skipping...")

return data
2 changes: 2 additions & 0 deletions ecg_bench/utils/file_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def decode_batch(batch: dict) -> dict:
@staticmethod
def save_config(save_path: Union[str, Path], args: argparse.Namespace):
args_dict = {k: v for k, v in vars(args).items() if not k.startswith("_")}
# Create directory if it doesn't exist
Path(save_path).mkdir(parents=True, exist_ok=True)
with open(f"{save_path}/config.yaml", "w") as f:
yaml.dump(args_dict, f, default_flow_style=False)

Expand Down
45 changes: 25 additions & 20 deletions scripts/preproccess.sh
Original file line number Diff line number Diff line change
@@ -1,21 +1,26 @@
BASE_DATA_VALUES=("ptb" "mimic" "code15" "cpsc" "csn")
SEG_LENS=(1250 2500 500)
# BASE_DATA_VALUES=("ptb" "mimic" "code15" "cpsc" "csn")
# SEG_LENS=(1250 2500 500)

for base_data in "${BASE_DATA_VALUES[@]}"; do
for seg_len in "${SEG_LENS[@]}"; do
if [ "$base_data" = "mimic" ]; then
echo "Sampling $base_data with seg_len=$seg_len"
python preprocess_ecg.py \
--base_data="$base_data" \
--seg_len="$seg_len" \
--preprocess_files \
--sample_files --random_sampling
else
echo "Preprocessing $base_data with seg_len=$seg_len"
python preprocess_ecg.py \
--base_data="$base_data" \
--seg_len="$seg_len" \
--preprocess_files
fi
done
done
# for base_data in "${BASE_DATA_VALUES[@]}"; do
# for seg_len in "${SEG_LENS[@]}"; do
# if [ "$base_data" = "mimic" ]; then
# echo "Sampling $base_data with seg_len=$seg_len"
# python preprocess_ecg.py \
# --base_data="$base_data" \
# --seg_len="$seg_len" \
# --preprocess_files \
# --sample_files --random_sampling
# else
# echo "Preprocessing $base_data with seg_len=$seg_len"
# python preprocess_ecg.py \
# --base_data="$base_data" \
# --seg_len="$seg_len" \
# --preprocess_files
# fi
# done
# done

python ecg_bench/preprocess.py \
--map_data="ecg-qa_ptbxl" \
--segment_len=1250 \
--target_sf=250
1 change: 1 addition & 0 deletions transformers
Submodule transformers added at 241c04
Loading