sunlabuiuc
diff --git a/‎pyhealth/models/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎pyhealth/models/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎pyhealth/models/embedding.py‎
Lines changed: 62 additions & 2 deletions b/‎pyhealth/models/embedding.py‎
Lines changed: 62 additions & 2 deletions
diff --git a/‎pyhealth/models/logistic_regression.py‎
Lines changed: 264 additions & 0 deletions b/‎pyhealth/models/logistic_regression.py‎
Lines changed: 264 additions & 0 deletions
diff --git a/‎pyhealth/processors/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎pyhealth/processors/__init__.py‎
Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,7 @@
 from .deepr import Deepr, DeeprLayer
 from .embedding import EmbeddingModel
 from .gamenet import GAMENet, GAMENetLayer
+from .logistic_regression import LogisticRegression
 from .gan import GAN
 from .gnn import GAT, GCN
 from .graph_torchvision_model import Graph_TorchvisionModel
 
@@ -4,17 +4,46 @@
 import torch.nn as nn
 
 from ..datasets import SampleDataset
-from ..processors import SequenceProcessor, TimeseriesProcessor, TensorProcessor
+from ..processors import (
+    MultiHotProcessor,
+    SequenceProcessor,
+    TensorProcessor,
+    TimeseriesProcessor,
+)
 from .base_model import BaseModel
 
 
 class EmbeddingModel(BaseModel):
     """
     EmbeddingModel is responsible for creating embedding layers for different types of input data.
 
+    This model automatically creates appropriate embedding transformations based on the processor type:
+    
+    - SequenceProcessor: Creates nn.Embedding for categorical sequences (e.g., diagnosis codes)
+      Input: (batch, seq_len) with integer indices
+      Output: (batch, seq_len, embedding_dim)
+    
+    - TimeseriesProcessor: Creates nn.Linear for time series features
+      Input: (batch, seq_len, num_features)
+      Output: (batch, seq_len, embedding_dim)
+    
+    - TensorProcessor: Creates nn.Linear for fixed-size numerical features
+      Input: (batch, feature_size)
+      Output: (batch, embedding_dim)
+    
+    - MultiHotProcessor: Creates nn.Linear for multi-hot encoded categorical features
+      Input: (batch, num_categories) binary tensor
+      Output: (batch, embedding_dim)
+      Note: Converts sparse categorical representations to dense embeddings
+    
+    - Other processors with size(): Creates nn.Linear if processor reports a positive size
+      Input: (batch, size)
+      Output: (batch, embedding_dim)
+
     Attributes:
         dataset (SampleDataset): The dataset containing input processors.
         embedding_layers (nn.ModuleDict): A dictionary of embedding layers for each input field.
+        embedding_dim (int): The target embedding dimension for all features.
     """
 
     def __init__(self, dataset: SampleDataset, embedding_dim: int = 128):
@@ -26,6 +55,7 @@ def __init__(self, dataset: SampleDataset, embedding_dim: int = 128):
             embedding_dim (int): The dimension of the embedding space. Default is 128.
         """
         super().__init__(dataset)
+        self.embedding_dim = embedding_dim
         self.embedding_layers = nn.ModuleDict()
         for field_name, processor in self.dataset.input_processors.items():
             if isinstance(processor, SequenceProcessor):
@@ -54,6 +84,36 @@ def __init__(self, dataset: SampleDataset, embedding_dim: int = 128):
                     self.embedding_layers[field_name] = nn.Linear(
                         in_features=input_size, out_features=embedding_dim
                     )
+            elif isinstance(processor, MultiHotProcessor):
+                # MultiHotProcessor produces fixed-size binary vectors
+                # Use processor.size() to get the vocabulary size (num_categories)
+                num_categories = processor.size()
+                self.embedding_layers[field_name] = nn.Linear(
+                    in_features=num_categories, out_features=embedding_dim
+                )
+            else:
+                # Handle other processors with a size() method
+                size_attr = getattr(processor, "size", None)
+                if callable(size_attr):
+                    size_value = size_attr()
+                else:
+                    size_value = size_attr
+                
+                if isinstance(size_value, int) and size_value > 0:
+                    self.embedding_layers[field_name] = nn.Linear(
+                        in_features=size_value, out_features=embedding_dim
+                    )
+                else:
+                    # No valid size() method found - raise an error
+                    raise ValueError(
+                        f"Processor for field '{field_name}' (type: {type(processor).__name__}) "
+                        f"does not have a valid size() method or it returned an invalid value. "
+                        f"To use this processor with EmbeddingModel, it must either:\n"
+                        f"  1. Be a recognized processor type (SequenceProcessor, TimeseriesProcessor, "
+                        f"TensorProcessor, MultiHotProcessor), or\n"
+                        f"  2. Implement a size() method that returns a positive integer representing "
+                        f"the feature dimension."
+                    )
 
     def forward(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
         """
@@ -67,8 +127,8 @@ def forward(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
         """
         embedded = {}
         for field_name, tensor in inputs.items():
+            tensor = tensor.to(self.device)
             if field_name in self.embedding_layers:
-                tensor = tensor.to(self.device)
                 embedded[field_name] = self.embedding_layers[field_name](tensor)
             else:
                 embedded[field_name] = tensor  # passthrough for continuous features
 
@@ -0,0 +1,264 @@
+from typing import Dict
+
+import torch
+import torch.nn as nn
+
+from pyhealth.datasets import SampleDataset
+from pyhealth.models import BaseModel
+
+from .embedding import EmbeddingModel
+
+
+class LogisticRegression(BaseModel):
+    """Logistic/Linear regression baseline model.
+
+    This model uses embeddings from different input features and applies a single
+    linear transformation (no hidden layers or non-linearity) to produce predictions.
+    
+    - For classification tasks: acts as logistic regression
+    - For regression tasks: acts as linear regression
+    
+    The model automatically handles different input types through the EmbeddingModel,
+    pools sequence dimensions, concatenates all feature embeddings, and applies a
+    final linear layer.
+
+    Args:
+        dataset: the dataset to train the model. It is used to query certain
+            information such as the set of all tokens.
+        embedding_dim: the embedding dimension. Default is 128.
+        **kwargs: other parameters (for compatibility).
+
+    Examples:
+        >>> from pyhealth.datasets import SampleDataset
+        >>> samples = [
+        ...         {
+        ...             "patient_id": "patient-0",
+        ...             "visit_id": "visit-0",
+        ...             "conditions": ["cond-33", "cond-86", "cond-80"],
+        ...             "procedures": [1.0, 2.0, 3.5, 4],
+        ...             "label": 0,
+        ...         },
+        ...         {
+        ...             "patient_id": "patient-1",
+        ...             "visit_id": "visit-1",
+        ...             "conditions": ["cond-33", "cond-86", "cond-80"],
+        ...             "procedures": [5.0, 2.0, 3.5, 4],
+        ...             "label": 1,
+        ...         },
+        ...     ]
+        >>> input_schema = {"conditions": "sequence",
+        ...                 "procedures": "tensor"}
+        >>> output_schema = {"label": "binary"}
+        >>> dataset = SampleDataset(samples=samples,
+        ...                        input_schema=input_schema,
+        ...                        output_schema=output_schema,
+        ...                        dataset_name="test")
+        >>>
+        >>> from pyhealth.models import LogisticRegression
+        >>> model = LogisticRegression(dataset=dataset)
+        >>>
+        >>> from pyhealth.datasets import get_dataloader
+        >>> train_loader = get_dataloader(dataset, batch_size=2, shuffle=True)
+        >>> data_batch = next(iter(train_loader))
+        >>>
+        >>> ret = model(**data_batch)
+        >>> print(ret)
+        {
+            'loss': tensor(0.6931, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
+            'y_prob': tensor([[0.5123],
+                            [0.4987]], grad_fn=<SigmoidBackward0>),
+            'y_true': tensor([[1.],
+                            [0.]]),
+            'logit': tensor([[0.0492],
+                            [-0.0052]], grad_fn=<AddmmBackward0>)
+        }
+        >>>
+
+    """
+
+    def __init__(
+        self,
+        dataset: SampleDataset,
+        embedding_dim: int = 128,
+        **kwargs,
+    ):
+        super(LogisticRegression, self).__init__(dataset)
+        self.embedding_dim = embedding_dim
+
+        assert len(self.label_keys) == 1, "Only one label key is supported"
+        self.label_key = self.label_keys[0]
+
+        # Use the EmbeddingModel to handle embedding logic
+        self.embedding_model = EmbeddingModel(dataset, embedding_dim)
+
+        # Single linear layer (no hidden layers, no activation)
+        output_size = self.get_output_size()
+        self.fc = nn.Linear(len(self.feature_keys) * self.embedding_dim, output_size)
+
+    @staticmethod
+    def mean_pooling(x, mask):
+        """Mean pooling over the middle dimension of the tensor.
+
+        Args:
+            x: tensor of shape (batch_size, seq_len, embedding_dim)
+            mask: tensor of shape (batch_size, seq_len)
+
+        Returns:
+            x: tensor of shape (batch_size, embedding_dim)
+
+        Examples:
+            >>> x.shape
+            [128, 5, 32]
+            >>> mean_pooling(x, mask).shape
+            [128, 32]
+        """
+        return x.sum(dim=1) / mask.sum(dim=1, keepdim=True)
+
+    def forward(self, **kwargs) -> Dict[str, torch.Tensor]:
+        """Forward propagation.
+
+        Args:
+            **kwargs: keyword arguments for the model. The keys must contain
+                all the feature keys and the label key.
+
+        Returns:
+            Dict[str, torch.Tensor]: A dictionary with the following keys:
+                - loss: a scalar tensor representing the loss.
+                - y_prob: a tensor representing the predicted probabilities.
+                - y_true: a tensor representing the true labels.
+                - logit: a tensor representing the logits.
+                - embed (optional): a tensor representing the patient
+                    embeddings if requested.
+        """
+        patient_emb = []
+
+        # Preprocess inputs for EmbeddingModel
+        processed_inputs = {}
+        reshape_info = {}  # Track which inputs were reshaped
+
+        for feature_key in self.feature_keys:
+            x = kwargs[feature_key]
+
+            # Convert to tensor if not already
+            if not isinstance(x, torch.Tensor):
+                x = torch.tensor(x, device=self.device)
+            else:
+                x = x.to(self.device)
+
+            # Handle 3D input: (patient, event, # of codes) -> flatten to 2D
+            if x.dim() == 3:
+                batch_size, seq_len, inner_len = x.shape
+                x = x.view(batch_size, seq_len * inner_len)
+                reshape_info[feature_key] = {
+                    "original_shape": (batch_size, seq_len, inner_len),
+                    "was_3d": True,
+                    "expanded": False,
+                }
+            elif x.dim() == 1:
+                x = x.unsqueeze(0)
+                reshape_info[feature_key] = {"was_3d": False, "expanded": True}
+            else:
+                reshape_info[feature_key] = {"was_3d": False, "expanded": False}
+
+            processed_inputs[feature_key] = x
+
+        # Pass through EmbeddingModel
+        embedded = self.embedding_model(processed_inputs)
+
+        for feature_key in self.feature_keys:
+            x = embedded[feature_key]
+
+            info = reshape_info[feature_key]
+            if info.get("expanded") and x.dim() > 1:
+                x = x.squeeze(0)
+
+            # Handle different tensor dimensions for pooling
+            if x.dim() == 3:
+                # Case: (batch, seq_len, embedding_dim) - apply mean pooling
+                mask = (x.sum(dim=-1) != 0).float()
+                if mask.sum(dim=-1, keepdim=True).any():
+                    x = self.mean_pooling(x, mask)
+                else:
+                    x = x.mean(dim=1)
+            elif x.dim() == 2:
+                # Case: (batch, embedding_dim) - already pooled, use as is
+                pass
+            else:
+                raise ValueError(f"Unsupported tensor dimension: {x.dim()}")
+
+            patient_emb.append(x)
+
+        # Concatenate all feature embeddings
+        patient_emb = torch.cat(patient_emb, dim=1)
+
+        # Apply single linear layer (no activation)
+        logits = self.fc(patient_emb)
+        
+        # Obtain y_true, loss, y_prob
+        y_true = kwargs[self.label_key].to(self.device)
+        loss = self.get_loss_function()(logits, y_true)
+        y_prob = self.prepare_y_prob(logits)
+        
+        results = {
+            "loss": loss,
+            "y_prob": y_prob,
+            "y_true": y_true,
+            "logit": logits,
+        }
+        if kwargs.get("embed", False):
+            results["embed"] = patient_emb
+        return results
+
+
+if __name__ == "__main__":
+    from pyhealth.datasets import SampleDataset
+
+    samples = [
+        {
+            "patient_id": "patient-0",
+            "visit_id": "visit-0",
+            "conditions": ["cond-33", "cond-86", "cond-80"],
+            "procedures": [1.0, 2.0, 3.5, 4],
+            "label": 0,
+        },
+        {
+            "patient_id": "patient-1",
+            "visit_id": "visit-1",
+            "conditions": ["cond-33", "cond-86", "cond-80"],
+            "procedures": [5.0, 2.0, 3.5, 4],
+            "label": 1,
+        },
+    ]
+
+    # Define input and output schemas
+    input_schema = {
+        "conditions": "sequence",  # sequence of condition codes
+        "procedures": "tensor",  # tensor of procedure values
+    }
+    output_schema = {"label": "binary"}  # binary classification
+
+    # dataset
+    dataset = SampleDataset(
+        samples=samples,
+        input_schema=input_schema,
+        output_schema=output_schema,
+        dataset_name="test",
+    )
+
+    # data loader
+    from pyhealth.datasets import get_dataloader
+
+    train_loader = get_dataloader(dataset, batch_size=2, shuffle=True)
+
+    # model
+    model = LogisticRegression(dataset=dataset)
+
+    # data batch
+    data_batch = next(iter(train_loader))
+
+    # try the model
+    ret = model(**data_batch)
+    print(ret)
+
+    # try loss backward
+    ret["loss"].backward()
@@ -25,6 +25,7 @@ def get_processor(name: str):
     MultiLabelProcessor,
     RegressionLabelProcessor,
 )
+from .multi_hot_processor import MultiHotProcessor
 from .raw_processor import RawProcessor
 from .sequence_processor import SequenceProcessor
 from .signal_processor import SignalProcessor
@@ -40,6 +41,7 @@ def get_processor(name: str):
     "TensorProcessor",
     "TimeseriesProcessor",
     "SignalProcessor",
+    "MultiHotProcessor",
     "BinaryLabelProcessor",
     "MultiClassLabelProcessor",
     "MultiLabelProcessor",