Commit 1ae6543

Merge branch 'dev' into out_dim_dynamic
2 parents: c73c62a + 052677e

10 files changed: 268 additions & 147 deletions

chebai/loss/bce_weighted.py

Lines changed: 12 additions & 15 deletions
@@ -50,32 +50,29 @@ def set_pos_weight(self, input: torch.Tensor) -> None:
             and self.data_extractor is not None
             and all(
                 os.path.exists(
-                    os.path.join(self.data_extractor.processed_dir_main, file_name)
+                    os.path.join(self.data_extractor.processed_dir, file_name)
                 )
-                for file_name in self.data_extractor.processed_main_file_names
+                for file_name in self.data_extractor.processed_file_names
             )
             and self.pos_weight is None
         ):
             print(
                 f"Computing loss-weights based on v{self.data_extractor.chebi_version} dataset (beta={self.beta})"
             )
-            complete_data = pd.concat(
+            complete_labels = torch.concat(
                 [
-                    pd.read_pickle(
-                        open(
-                            os.path.join(
-                                self.data_extractor.processed_dir_main,
-                                file_name,
-                            ),
-                            "rb",
-                        )
+                    torch.stack(
+                        [
+                            torch.Tensor(row["labels"])
+                            for row in self.data_extractor.load_processed_data(
+                                filename=file_name
+                            )
+                        ]
                     )
-                    for file_name in self.data_extractor.processed_main_file_names
+                    for file_name in self.data_extractor.processed_file_names
                 ]
             )
-            value_counts = []
-            for c in complete_data.columns[3:]:
-                value_counts.append(len([v for v in complete_data[c] if v]))
+            value_counts = complete_labels.sum(dim=0)
             weights = [
                 (1 - self.beta) / (1 - pow(self.beta, value)) for value in value_counts
             ]
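
The weighting follows the "effective number of samples" class-balanced scheme (cf. Cui et al., 2019): w = (1 − β) / (1 − β^n) for a class with n positives. A minimal, self-contained sketch of the tensor-based counting that replaces the per-column Python loop (the `labels` tensor below is invented; the real code stacks rows returned by `load_processed_data`):

    import torch

    # Invented stand-in for the stacked label rows from the processed files
    labels = torch.tensor([[1.0, 0.0, 1.0],
                           [1.0, 1.0, 0.0],
                           [1.0, 0.0, 0.0]])

    beta = 0.99
    value_counts = labels.sum(dim=0)  # positives per class: tensor([3., 1., 1.])
    weights = (1 - beta) / (1 - torch.pow(beta, value_counts))
    loss = torch.nn.BCEWithLogitsLoss(pos_weight=weights)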

chebai/models/ffn.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ def _get_prediction_and_labels(self, data, labels, model_output):
         loss_kwargs = data.get("loss_kwargs", dict())
         if "non_null_labels" in loss_kwargs:
             n = loss_kwargs["non_null_labels"]
-            d = data[n]
+            d = d[n]
         return torch.sigmoid(d), labels.int() if labels is not None else None

     def _process_for_loss(
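
The one-character fix is the whole change: `data` is the batch mapping, while `d` presumably already holds the model output, so the non-null filter must index `d` itself. A toy illustration of the intended behavior (names and shapes are hypothetical):

    import torch

    d = torch.randn(5, 3)        # hypothetical model outputs for a batch of 5
    non_null_labels = [0, 2, 4]  # indices of samples that actually have labels

    d = d[non_null_labels]   # keep only labeled samples -> shape (3, 3)
    preds = torch.sigmoid(d)  # what _get_prediction_and_labels returns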

chebai/preprocessing/datasets/scope/scope.py

Lines changed: 99 additions & 20 deletions
@@ -72,10 +72,12 @@ def __init__(
         self,
         scope_version: str,
         scope_version_train: Optional[str] = None,
+        max_sequence_len: int = 1000,
         **kwargs,
     ):
         self.scope_version: str = scope_version
         self.scope_version_train: str = scope_version_train
+        self.max_sequence_len: int = max_sequence_len

         super(_SCOPeDataExtractor, self).__init__(**kwargs)

@@ -195,21 +197,93 @@ def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph:
         """
         print("Extracting class hierarchy...")
         df_scope = self._get_scope_data()
+        pdb_chain_df = self._parse_pdb_sequence_file()
+        pdb_id_set = set(pdb_chain_df["pdb_id"])  # set membership checks are O(1)
+
+        # Initialize sets and dictionaries for storing edges and attributes
+        parent_node_edges, node_child_edges = set(), set()
+        node_attrs = {}
+        px_level_nodes = set()
+        sequence_nodes = dict()
+        px_to_seq_edges = set()
+        required_graph_nodes = set()
+
+        # Create a lookup dictionary for PDB chain sequences
+        lookup_dict = (
+            pdb_chain_df.groupby("pdb_id")[["chain_id", "sequence"]]
+            .apply(lambda x: dict(zip(x["chain_id"], x["sequence"])))
+            .to_dict()
+        )

-        g = nx.DiGraph()
+        def add_sequence_nodes_edges(chain_sequence, px_sun_id):
+            """Adds sequence nodes and edges connecting px-level nodes to sequence nodes."""
+            if chain_sequence not in sequence_nodes:
+                sequence_nodes[chain_sequence] = f"seq_{len(sequence_nodes)}"
+            px_to_seq_edges.add((px_sun_id, sequence_nodes[chain_sequence]))
+
+        # Step 1: Build the graph structure and store node attributes
+        for row in df_scope.itertuples(index=False):
+            if row.level == "px":
+                pdb_id, chain_id = row.sid[1:5], row.sid[5]

-        egdes = []
-        for _, row in df_scope.iterrows():
-            g.add_node(row["sunid"], **{"sid": row["sid"], "level": row["level"]})
-            if row["parent_sunid"] != -1:
-                egdes.append((row["parent_sunid"], row["sunid"]))
+                if pdb_id not in pdb_id_set or chain_id == "_":
+                    # Skip domain-level nodes whose pdb_id is missing from the
+                    # pdb_sequences.txt file, and chain_id "_", which denotes no chain
+                    continue
+                px_level_nodes.add(row.sunid)

-            for children_id in row["children_sunids"]:
-                egdes.append((row["sunid"], children_id))
+                # Add edges between px-level nodes and sequence nodes
+                if chain_id != ".":
+                    if chain_id not in lookup_dict[pdb_id]:
+                        continue
+                    add_sequence_nodes_edges(lookup_dict[pdb_id][chain_id], row.sunid)
+                else:
+                    # If chain_id is '.', connect all chains of this PDB ID
+                    for chain, chain_sequence in lookup_dict[pdb_id].items():
+                        add_sequence_nodes_edges(chain_sequence, row.sunid)
+            else:
+                required_graph_nodes.add(row.sunid)

-        g.add_edges_from(egdes)
+            node_attrs[row.sunid] = {"sid": row.sid, "level": row.level}

-        print("Computing transitive closure")
+            if row.parent_sunid != -1:
+                parent_node_edges.add((row.parent_sunid, row.sunid))
+
+            for child_id in row.children_sunids:
+                node_child_edges.add((row.sunid, child_id))
+
+        del df_scope, pdb_chain_df, pdb_id_set
+
+        g = nx.DiGraph()
+        g.add_nodes_from(node_attrs.items())
+        # Note: `add_edges_from` internally creates a node if it doesn't exist already
+        g.add_edges_from({(p, c) for p, c in parent_node_edges if p in node_attrs})
+        g.add_edges_from({(p, c) for p, c in node_child_edges if c in node_attrs})
+
+        seq_nodes = set(sequence_nodes.values())
+        g.add_nodes_from([(seq_id, {"level": "sequence"}) for seq_id in seq_nodes])
+        g.add_edges_from(
+            {
+                (px_node, seq_node)
+                for px_node, seq_node in px_to_seq_edges
+                if px_node in node_attrs and seq_node in seq_nodes
+            }
+        )
+
+        # Step 2: Count sequence successors for required graph nodes only
+        for node in required_graph_nodes:
+            num_seq_successors = sum(
+                g.nodes[child]["level"] == "sequence"
+                for child in nx.descendants(g, node)
+            )
+            g.nodes[node]["num_seq_successors"] = num_seq_successors
+
+        # Step 3: Remove nodes that are not required before computing the
+        # transitive closure, for better efficiency
+        g.remove_nodes_from(px_level_nodes | seq_nodes)
+
+        print("Computing transitive closure...")
+        # The transitive closure is not needed by `select_classes`, but is required in _SCOPeOverXPartial
         return nx.transitive_closure_dag(g)

     def _get_scope_data(self) -> pd.DataFrame:

@@ -388,7 +462,8 @@ def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame:

         encoded_target_columns = []
         for level in hierarchy_levels:
-            encoded_target_columns.extend(lvl_to_target_cols_mapping[level])
+            if level in lvl_to_target_cols_mapping:
+                encoded_target_columns.extend(lvl_to_target_cols_mapping[level])

         print(
             f"{len(encoded_target_columns)} labels has been selected for specified threshold, "

@@ -471,12 +546,12 @@ def _parse_pdb_sequence_file(self) -> pd.DataFrame:
         for record in SeqIO.parse(
             os.path.join(self.scope_root_dir, self.raw_file_names_dict["PDB"]), "fasta"
         ):
+            if not record.seq or len(record.seq) > self.max_sequence_len:
+                continue
+
             pdb_id, chain = record.id.split("_")
-            sequence = (
-                re.sub(f"[^{valid_amino_acids}]", "X", str(record.seq))
-                if record.seq
-                else ""
-            )
+            sequence = re.sub(f"[^{valid_amino_acids}]", "X", str(record.seq))

             # Store as a dictionary entry (list of dicts -> DataFrame later)
             records.append(

@@ -777,12 +852,15 @@ def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> Dict[str, List[int]]
         """
         selected_sunids_for_level = {}
         for node, attr_dict in g.nodes(data=True):
-            if g.out_degree(node) >= self.THRESHOLD:
+            if attr_dict["level"] in {"root", "px", "sequence"}:
+                # Skip nodes at the "root", "px", or "sequence" level
+                continue
+
+            # Check if the number of "sequence"-level successors meets the threshold
+            if g.nodes[node]["num_seq_successors"] >= self.THRESHOLD:
                 selected_sunids_for_level.setdefault(attr_dict["level"], []).append(
                     node
                 )
-        # Remove root node, as it will True for all instances
-        selected_sunids_for_level.pop("root", None)
         return selected_sunids_for_level

@@ -876,7 +954,8 @@ class SCOPeOverPartial2000(_SCOPeOverXPartial):


 if __name__ == "__main__":
-    scope = SCOPeOver2000(scope_version="2.08")
+    scope = SCOPeOver50(scope_version="2.08")
+
     # g = scope._extract_class_hierarchy("dummy/path")
     # # Save graph
     # import pickle
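
The central change to the hierarchy extraction: rather than thresholding on a node's direct out-degree, each retained node is annotated with `num_seq_successors`, the number of sequence-level nodes reachable from it, which `select_classes` then compares against the threshold. A toy sketch of that counting step (the miniature graph is invented for illustration):

    import networkx as nx

    g = nx.DiGraph()
    # Invented mini-hierarchy: class -> fold -> px -> sequence
    g.add_nodes_from([
        (1, {"level": "cl"}), (2, {"level": "cf"}),
        (3, {"level": "px"}), (4, {"level": "px"}),
        ("seq_0", {"level": "sequence"}), ("seq_1", {"level": "sequence"}),
    ])
    g.add_edges_from([(1, 2), (2, 3), (2, 4), (3, "seq_0"), (4, "seq_1")])

    # Count sequence-level descendants of each non-px, non-sequence node
    for node in (1, 2):
        g.nodes[node]["num_seq_successors"] = sum(
            g.nodes[d]["level"] == "sequence" for d in nx.descendants(g, node)
        )

    print(g.nodes[1]["num_seq_successors"])  # 2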

chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py

Lines changed: 4 additions & 8 deletions
@@ -213,10 +213,8 @@ def _extract_required_data_from_splits(self) -> pd.DataFrame:
             "proteins",
             "accessions",
             "sequences",
-            # https://github.com/bio-ontology-research-group/deepgo2/blob/main/gendata/uni2pandas.py#L45-L58
-            "exp_annotations",  # Directly associated GO ids
             # https://github.com/bio-ontology-research-group/deepgo2/blob/main/gendata/uni2pandas.py#L60-L69
-            "prop_annotations",  # Transitively associated GO ids
+            "prop_annotations",  # Directly and transitively associated GO ids
             "esm2",
         ]

@@ -228,10 +226,8 @@ def _extract_required_data_from_splits(self) -> pd.DataFrame:
             ],
             ignore_index=True,
         )
-        new_df["go_ids"] = new_df.apply(
-            lambda row: self.extract_go_id(row["exp_annotations"])
-            + self.extract_go_id(row["prop_annotations"]),
-            axis=1,
+        new_df["go_ids"] = new_df["prop_annotations"].apply(
+            lambda x: self.extract_go_id(x)
         )

         data_df = pd.DataFrame(

@@ -270,7 +266,7 @@ def _generate_labels(self, data_df: pd.DataFrame) -> pd.DataFrame:
         """
         print("Generating labels based on terms.pkl file.......")
         parsed_go_ids: pd.Series = self._terms_df["gos"].apply(
-            lambda gos: DeepGO2MigratedData._parse_go_id(gos)
+            DeepGO2MigratedData._parse_go_id
         )
         all_go_ids_list = parsed_go_ids.values.tolist()
         self._classes = all_go_ids_list
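
Per the updated column comment, `prop_annotations` already contains the directly associated GO ids along with the transitively propagated ones, so also concatenating `exp_annotations` only duplicated entries. The `_parse_go_id` change is the usual simplification of passing a function to `Series.apply` directly instead of wrapping it in a lambda; both forms are equivalent, as this toy column shows (`extract_go_id` here is a hypothetical stand-in):

    import pandas as pd

    def extract_go_id(annotations):
        # Hypothetical parser: keep the numeric part of each "GO:XXXXXXX" id
        return [int(a.split(":")[1]) for a in annotations]

    df = pd.DataFrame({"prop_annotations": [["GO:0008150", "GO:0003674"]]})

    df["go_ids"] = df["prop_annotations"].apply(lambda x: extract_go_id(x))
    df["go_ids"] = df["prop_annotations"].apply(extract_go_id)  # equivalent
    print(df["go_ids"].iloc[0])  # [8150, 3674]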

chebai/result/classification.py

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ def print_metrics(
     print(f"Micro-Recall: {recall_micro(preds, labels):3f}")
     if markdown_output:
         print(
-            f"| Model | Macro-F1 | Micro-F1 | Macro-Precision | Micro-Precision | Macro-Recall | Micro-Recall | Balanced Accuracy"
+            f"| Model | Macro-F1 | Micro-F1 | Macro-Precision | Micro-Precision | Macro-Recall | Micro-Recall | Balanced Accuracy |"
         )
         print(f"| --- | --- | --- | --- | --- | --- | --- | --- |")
         print(

chebai/result/utils.py

Lines changed: 6 additions & 5 deletions
@@ -156,11 +156,12 @@ def evaluate_model(
             return test_preds, test_labels
         return test_preds, None
     elif len(preds_list) < 0:
-        torch.save(
-            _concat_tuple(preds_list),
-            os.path.join(buffer_dir, f"preds{save_ind:03d}.pt"),
-        )
-        if labels_list[0] is not None:
+        if len(preds_list) > 0 and preds_list[0] is not None:
+            torch.save(
+                _concat_tuple(preds_list),
+                os.path.join(buffer_dir, f"preds{save_ind:03d}.pt"),
+            )
+        if len(labels_list) > 0 and labels_list[0] is not None:
             torch.save(
                 _concat_tuple(labels_list),
                 os.path.join(buffer_dir, f"labels{save_ind:03d}.pt"),
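
The new branch writes a buffer only when it actually holds tensors. A minimal sketch of the guarded-save pattern (with `torch.cat` standing in for the repo's `_concat_tuple`, and an invented output path):

    import os
    import torch

    def save_if_nonempty(tensors, path):
        # Save the concatenated buffer only when there is something to save
        if len(tensors) > 0 and tensors[0] is not None:
            torch.save(torch.cat(tensors), path)

    buffer_dir = "buffer"  # hypothetical directory
    os.makedirs(buffer_dir, exist_ok=True)
    save_if_nonempty([torch.ones(2, 3)], os.path.join(buffer_dir, "preds000.pt"))
    save_if_nonempty([], os.path.join(buffer_dir, "labels000.pt"))  # no-op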

configs/data/scope/scope50.yml

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 class_path: chebai.preprocessing.datasets.scope.scope.SCOPeOver50
 init_args:
-  scope_version: 2.08
+  scope_version: "2.08"

configs/model/electra.yml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ init_args:
   optimizer_kwargs:
     lr: 1e-3
   config:
-    vocab_size: 8500
+    vocab_size: 1400
     max_position_embeddings: 1800
     num_attention_heads: 8
     num_hidden_layers: 6
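
These keys mirror the fields of Hugging Face's `ElectraConfig`; assuming the YAML `config` block is forwarded to `transformers` (not shown in this diff), the trimmed setting would instantiate roughly:

    from transformers import ElectraConfig

    # Assumed mapping of the YAML `config` block onto ElectraConfig kwargs
    config = ElectraConfig(
        vocab_size=1400,  # down from 8500, presumably sized to the actual tokenizer vocabulary
        max_position_embeddings=1800,
        num_attention_heads=8,
        num_hidden_layers=6,
    )
    print(config.hidden_size)  # remaining fields keep ElectraConfig defaults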

tutorials/data_exploration_go.ipynb

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@
    }
   },
   "outputs": [],
-  "source": "from chebai.preprocessing.datasets.deepGO.go_uniprot import GOUniProtOver250"
+  "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250"
  },
  {
   "cell_type": "code",
