Commit 577104b

Support TP 🎉 (#72)

* Initial support for TP
* Use random initialization
* Fix PP forward
* Downgrade to torch 2.6.0
* Fix env setting for MAX_JOBS
* Downgrade to torch 2.5.1
* Fix TP group init
* Fix annotation
* Make llama compatible for tp
* Make chatglm compatible for TP
* Make Qwen3 compatible for TP
* Remove weight_loader in fused_moe
* Make fused_moe compatible for TP; Abstract weight load function
* Make qwen_moe compatible for tp
* Make mixtral compatible for TP
* Update readme
* Abstract module attention; Clean up code for TP attention; Clean up code for model weights loading for glm
* Add MoE tuing config for A100 PCIE 40GB
* Refactor scheduler.py and AllocatorID
* Refactor IDAllocator
* Refactor worker scheduler
* Update readme
* Make embed_tokens and lm_head compatible for TP
* Fix multi-node zmq_comm
* Bump version to 0.1.0
1 parent e38eae8 commit 577104b

33 files changed (+1528, -539 lines)

CMakeLists.txt (1 addition, 1 deletion)

@@ -21,7 +21,7 @@ set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
 
 
 # Supported/expected torch versions for CUDA.
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
 
 #
 # Try to find python package with an executable that exactly matches

README.md (6 additions, 6 deletions)

@@ -15,9 +15,10 @@ Global Balanced Pipeline Parallelism System for Distributed LLM Serving with Tok
 <img src=doc/pic/overview.svg width=500>
 </p>
 
-Integreted with features like **continuous batching**, **paged attention**, **chunked prefill**, **prefix caching**, **token throttling** and **pipeline parallelism**, gLLM provides basic functionality (offline/online inference and interactive chat) to support large language model inference. gLLM provides **equivalent or superior** offline/online inference speed with mainstream inference engine and **minimal** (~4k loc) code base. You can also see gLLM as a LLM inference playground for doing experiment or academic research.
+Integreted with features like **continuous batching**, **paged attention**, **chunked prefill**, **prefix caching**, **token throttling**, **pipeline parallelism** and **tensor parallelism**, gLLM provides basic functionality (**offline/online inference and interactive chat**) to deploy distributed LLMs (**supported in huggingface**) inference. gLLM provides **equivalent or superior** offline/online inference speed with mainstream inference engine and **minimal** (~6k loc) code base. You can also see gLLM as a LLM inference playground for doing experiment or academic research.
 
 *Latest News* :fire:
+- [2025/06/14]: Tensor parallelism is now integrated, allowing joint deploying with pipeline parallelism :sunglasses:
 - [2025/05/05]: MoE architecture is supported. Try Qwen2/3 MoE models :star_struck:
 - [2025/04/29]: Qwen3 day 1 support. Come and try Qwen3 :tada:
 - [2025/04/27]: gLLM is open sourced :earth_asia:

@@ -43,7 +44,7 @@
 
 ## Install gLLM
 ```
-pip install torch==2.7.0
+pip install torch==2.5.1
 pip install -v -e .
 ```
 

@@ -73,7 +74,7 @@ python benchmarks/benchmark_throughput.py --model $MODEL \
 ```
 # To see the description of args, run 'python -m gllm.entrypoints.api_server -h'
 python -m gllm.entrypoints.api_server --port $PORT --model-path $MODEL_PATH \
-    --enable-prefix-caching --pp $PP
+    --enable-prefix-caching --pp $PP --tp $TP
 ```
 
 ### Launch OpenAI-Compatible Server (Multi-node)

@@ -142,13 +143,12 @@ python evaluations/evaluate_MMLU_pro.py --model $MODEL --port $PORT
 ## Supported Models
 
 - Qwen Series: Qwen3, Qwen2.5, Qwen2
-- Llama Series: Llama3.1, Llama3, Llama2 and deepseek-coder
+- Llama Series: Llama3.2, Llama3.1, Llama3, Llama2 and deepseek-coder
 - Mixtral Series: Mixtral-8x7B, Mixtral-8x22B
-- ChatGLM Series: Chatglm3 and glm4
+- ChatGLM Series: Glm4 and Chatglm3
 
 ## Roadmap
 
-- [ ] Support TP
 - [ ] Support more models
 

examples/chat_client.py (4 additions, 4 deletions)

@@ -22,14 +22,14 @@
 messages = []
 
 print("\nWelcome to the chatbot!\n"
-      "Type '\exit' to exit the chatbot.\n"
-      "Type '\clear' to clear the chatbot's history.\n")
+      "Type '\\exit' to exit the chatbot.\n"
+      "Type '\\clear' to clear the chatbot's history.\n")
 
 while True:
     prompt = input('>>> ')
-    if prompt == '\exit':
+    if prompt == '\\exit':
         break
-    elif prompt == '\clear':
+    elif prompt == '\\clear':
         messages = []
     messages.append({'role': 'user', 'content': prompt})
     chat_completion = client.chat.completions.create(
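The doubled backslashes in this hunk are more than style: `\e` and `\c` are not recognized escape sequences, so Python keeps the backslash literally but recent interpreters warn about it (and future versions may reject it). A quick standalone sketch of why both spellings compare equal today:

```python
# '\e' is an unrecognized escape: Python 3 leaves the backslash in place,
# but warns on recent versions. The commit's doubled form produces the
# identical 5-character string without the warning.
implicit = '\exit'     # relies on the unrecognized-escape fallback
explicit = '\\exit'    # the spelling the commit switches to
assert implicit == explicit
assert explicit == chr(92) + 'exit'   # backslash followed by "exit"
assert len(explicit) == 5
```

So the change is behavior-preserving; it only removes the deprecation hazard.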

gllm/async_llm_engine.py (10 additions, 8 deletions)

@@ -13,7 +13,7 @@
 from gllm.worker import Worker, run_worker
 from gllm.input_data import InputData
 from gllm.sequence import Sequence
-from gllm.scheduler import IPCPackage
+from gllm.frontend_scheduler import IPCPackage
 from gllm.zmq_comm import zmqComm
 
 

@@ -142,7 +142,7 @@ def __init__(self, *args, **kwargs):
             self.act_worker_ranks = [int(i) for i in self.worker_ranks.split(',')]
             assert len(self.act_worker_ranks) != 0
         else:
-            self.act_worker_ranks = list(range(self.pp_size))
+            self.act_worker_ranks = list(range(self.pp_size*self.tp_size))
         self.num_workers = len(self.act_worker_ranks)
 
         self.ctx = mp.get_context('spawn')

@@ -156,7 +156,7 @@ def __init__(self, *args, **kwargs):
         self.token_path = f'ipc:///tmp/{ipc_path_prefix}_gllm_token'
 
         self.comm = zmqComm(self.host, self.zmq_port_base, self.launch_mode, self.master_addr,
-                            0, 0, self.schedule_path, self.output_path, self.token_path)
+                            self.schedule_path, self.output_path, self.token_path, frontend=True)
         self.comm.init()
 
         logger.info(f'Launching worker {self.act_worker_ranks} ...')

@@ -166,8 +166,10 @@ def __init__(self, *args, **kwargs):
             logger.warning(f'Multi-node support is an experimental feature')
 
         self.process_list = []
-        for local_rank, pp_rank in enumerate(self.act_worker_ranks):
-            self.start_worker(local_rank, pp_rank)
+        for local_rank, rank in enumerate(self.act_worker_ranks):
+            pp_rank = rank // self.tp_size
+            tp_rank = rank % self.tp_size
+            self.start_worker(local_rank, pp_rank, tp_rank)
 
         if kwargs['load_format'] == 'auto':
             self.load_progress()

@@ -256,21 +258,21 @@ async def run_schedule_engine(self):
             self.send_ipc_package()
             await asyncio.sleep(0)
 
-    def start_worker(self, local_rank, pp_rank):
+    def start_worker(self, local_rank, pp_rank, tp_rank):
         worker_cls = Worker if not self.use_async_worker else AsyncWorker
         comm = zmqComm(self.host,
                        self.zmq_port_base,
                        self.launch_mode,
                        self.master_addr,
-                       pp_rank,
-                       self.pp_size,
                        self.schedule_path,
                        self.output_path,
                        self.token_path)
         worker = worker_cls(self.model_runner,
                             local_rank,
                             pp_rank,
+                            tp_rank,
                             self.pp_size,
+                            self.tp_size,
                             self.master_addr,
                             self.master_port,
                             comm,
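The launch loop above commits to a pp-major rank layout: all tensor-parallel ranks of a pipeline stage are contiguous. A standalone sketch of the mapping (function names here are illustrative, not part of gLLM):

```python
def split_rank(rank: int, tp_size: int):
    """Decompose a global worker rank the way the launch loop does:
    pp_rank = rank // tp_size, tp_rank = rank % tp_size."""
    return rank // tp_size, rank % tp_size

def merge_rank(pp_rank: int, tp_rank: int, tp_size: int) -> int:
    """Inverse mapping, matching _RANK = pp_rank * tp_size + tp_rank
    in the dist_utils changes below in this commit."""
    return pp_rank * tp_size + tp_rank

# pp=2, tp=2 gives four workers laid out pp-major:
layout = {r: split_rank(r, 2) for r in range(4)}
# {0: (0, 0), 1: (0, 1), 2: (1, 0), 3: (1, 1)}
```

The two functions are exact inverses for any rank in `range(pp_size * tp_size)`, which is what lets the engine keep a single flat `act_worker_ranks` list.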

gllm/async_worker.py (7 additions, 1 deletion)

@@ -18,6 +18,10 @@ def __init__(self, *args, **kwargs):
    async def run_driver(self):
        return super().run_driver()
 
+    @async_wrapper
+    async def run_first_tp(self):
+        return super().run_first_tp()
+
    @async_wrapper
    async def run_other(self):
        return super().run_other()

@@ -36,8 +40,10 @@ async def launch_async_tasks(worker: AsyncWorker):
     worker.init()
 
     ats = AsyncTasks()
-    if worker.pp_rank == 0:
+    if worker.rank == 0:
         ats.add_task(worker.run_driver)
+    elif worker.pp_rank == 0:
+        ats.add_task(worker.run_first_tp)
     else:
         ats.add_task(worker.run_other)
     await ats.wait()
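With TP added, `launch_async_tasks` now routes each worker to one of three loops: the global rank-0 driver, the remaining tensor-parallel ranks of the first pipeline stage, and everything downstream. The branching can be summarized as a pure function (a sketch; the string labels stand in for the coroutines):

```python
def select_task(rank: int, pp_rank: int) -> str:
    """Mirror the branch order in launch_async_tasks above."""
    if rank == 0:
        return 'run_driver'      # global rank 0 drives scheduling
    elif pp_rank == 0:
        return 'run_first_tp'    # other TP ranks of the first stage
    return 'run_other'           # every rank of later stages

# pp=2, tp=2, pp-major layout (pp_rank = rank // tp_size):
routing = {r: select_task(r, r // 2) for r in range(4)}
# {0: 'run_driver', 1: 'run_first_tp', 2: 'run_other', 3: 'run_other'}
```

Note the branch order matters: rank 0 also has `pp_rank == 0`, so the driver check must come first.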

gllm/dist_utils.py (135 additions, 11 deletions)

@@ -2,24 +2,25 @@
 import torch
 
 from logger import logger
+from collections.abc import Sequence
 
 def send_pp_data(output, dst):
     if type(output) == tuple:
         assert len(output) == 2
-        dist.isend(output[0],dst)
-        dist.isend(output[1],dst)
+        dist.isend(output[0], dst)
+        dist.isend(output[1], dst)
     else:
-        dist.isend(output,dst)
+        dist.isend(output, dst)
 
 def recv_pp_data(src, shape, has_residual):
     hidden_states = torch.zeros(torch.Size(shape))
     if has_residual:
         residual = hidden_states.clone().detach()
-        hidden_states_future = dist.irecv(hidden_states,src)
-        residual_future = dist.irecv(residual,src)
+        hidden_states_future = dist.irecv(hidden_states, src)
+        residual_future = dist.irecv(residual, src)
         return hidden_states_future, residual_future, hidden_states, residual
     else:
-        hidden_states_future = dist.irecv(hidden_states,src)
+        hidden_states_future = dist.irecv(hidden_states, src)
         return hidden_states_future, hidden_states
 
 def send_obj_list(obj_list, dst):
@@ -28,36 +29,88 @@ def send_obj_list(obj_list, dst):
 def recv_obj_list(obj_list, src):
     dist.recv_object_list(obj_list, src=src)
 
+_RANK=0
 _PP_RANK=0
+_TP_RANK=0
 _LOCAL_RANK=0
 _PP_SIZE=1
+_TP_SIZE=1
+_WORLD_SIZE=1
 _ASSIGNED_LAYERS=None
+_TP_GROUP=None
+
+def get_rank():
+    return _RANK
+
+def get_world_size():
+    return _WORLD_SIZE
 
 def get_pp_rank():
     return _PP_RANK
 
+def get_tp_rank():
+    return _TP_RANK
+
 def get_local_rank():
     return _LOCAL_RANK
 
-def is_pp_last_rank():
+def get_output_rank():
+    return (get_pp_size() - 1) * get_tp_size()
+
+def is_output_rank():
+    return is_last_pp_rank() and is_first_tp_rank()
+
+def is_first_tp_rank():
+    return get_tp_rank() == 0
+
+def is_last_pp_rank():
     return get_pp_rank() == get_pp_size() - 1
 
+def get_next_pp_rank():
+    return get_rank() + get_tp_size()
+
+def get_last_pp_rank():
+    return get_rank() - get_tp_size()
+
 def get_pp_size():
     return _PP_SIZE
 
+def get_tp_size():
+    return _TP_SIZE
+
 def get_assigned_layers():
     return _ASSIGNED_LAYERS
 
-def init_dist(pp_size, local_rank, pp_rank, master_addr, master_port, assigned_layers):
-    global _PP_RANK, _PP_SIZE, _ASSIGNED_LAYERS, _LOCAL_RANK
+def get_tp_group():
+    return _TP_GROUP
+
+def init_tp_group():
+    global _TP_GROUP
+    tp_groups = [list(range(_pp_rank*get_tp_size(), (_pp_rank+1)*get_tp_size())) for _pp_rank in range(get_pp_size())]
+    for tp_ranks in tp_groups:
+        tp_group = dist.new_group(tp_ranks)
+        if _RANK in tp_ranks:
+            _TP_GROUP = tp_group
+
+def init_dist(pp_size, tp_size, local_rank, pp_rank, tp_rank, master_addr, master_port, assigned_layers):
+    global _RANK, _PP_RANK, _TP_RANK, _PP_SIZE, _TP_SIZE, _WORLD_SIZE, _ASSIGNED_LAYERS, _LOCAL_RANK, _TP_GROUP, _PP_GROUP
+    _RANK = pp_rank * tp_size + tp_rank
     _PP_RANK = pp_rank
+    _TP_RANK = tp_rank
     _LOCAL_RANK = local_rank
     _PP_SIZE = pp_size
+    _TP_SIZE = tp_size
+    _WORLD_SIZE = pp_size * tp_size
     _ASSIGNED_LAYERS = assigned_layers
+
+    self_tp_ranks = list(range(pp_rank*tp_size, (pp_rank+1)*tp_size))
+
     init_method = f'tcp://{master_addr}:{master_port}'
     backend = 'nccl'
-    logger.info(f'NCCL: Init_method {init_method}, Backend {backend}, Word_size {pp_size}')
-    dist.init_process_group(init_method=init_method, backend=backend, world_size=pp_size, rank=pp_rank)
+    logger.info(f'NCCL: Init_method {init_method}, Backend {backend}, Rank {_RANK}, TP Groups {self_tp_ranks}, Word_size {_WORLD_SIZE}')
+    dist.init_process_group(init_method=init_method, backend=backend, world_size=_WORLD_SIZE, rank=_RANK)
+
+    init_tp_group()
 
 def get_pp_layers(num_layers):
     if _ASSIGNED_LAYERS is None:
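The new `init_tp_group` builds one communicator per pipeline stage. The group enumeration itself is plain arithmetic and can be checked without torch; this sketch replaces `dist.new_group` with plain rank lists (note that in the real code every rank must still execute `new_group` for every group, because process-group creation is a collective operation):

```python
def enumerate_tp_groups(pp_size: int, tp_size: int) -> list:
    """Ranks of each stage's TP group under the pp-major layout:
    stage p owns ranks [p*tp_size, (p+1)*tp_size)."""
    return [list(range(p * tp_size, (p + 1) * tp_size))
            for p in range(pp_size)]

enumerate_tp_groups(2, 4)  # [[0, 1, 2, 3], [4, 5, 6, 7]]
```

Each rank then keeps only the group containing its own `_RANK`, which is exactly the `if _RANK in tp_ranks` branch in the diff.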
@@ -93,3 +146,74 @@ def resolve_pp_layer(layer_name, idx, start_layer_idx):
         return '.'.join(layer_name_list)
     else:
         return layer_name
+
+def tensor_model_parallel_all_gather(input_: torch.Tensor, dim=-1) -> torch.Tensor:
+    """All-gather the input tensor across model parallel group."""
+    if dim < 0:
+        # Convert negative dim to positive.
+        dim += input_.dim()
+    input_size = input_.size()
+    # NOTE: we have to use concat-style all-gather here,
+    # stack-style all-gather has compatibility issues with
+    # torch.compile . see https://github.com/pytorch/pytorch/issues/138795
+    output_size = (input_size[0] * get_tp_size(), ) + input_size[1:]
+    # Allocate output tensor.
+    output_tensor = torch.empty(output_size,
+                                dtype=input_.dtype,
+                                device=input_.device)
+    # All-gather.
+    dist.all_gather_into_tensor(output_tensor,
+                                input_,
+                                group=get_tp_group())
+    # Reshape
+    output_tensor = output_tensor.reshape((get_tp_size(), ) + input_size)
+    output_tensor = output_tensor.movedim(0, dim)
+    output_tensor = output_tensor.reshape(input_size[:dim] +
+                                          (get_tp_size() *
+                                           input_size[dim], ) +
+                                          input_size[dim + 1:])
+    return output_tensor
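The reshape/movedim dance after `all_gather_into_tensor` exists because the concat-style gather stacks the shards along dim 0, while the caller wants them interleaved along `dim`. For the common last-dim case the net effect can be emulated with plain lists (no torch; the function name is illustrative):

```python
def emulate_all_gather_last_dim(shards):
    """shards[r] is rank r's 2-D shard as a list of rows. The concat-style
    gather yields [shard0; shard1; ...] along dim 0; the reshape + movedim
    then glues row i of every shard together, i.e. gathers along dim -1."""
    n_rows = len(shards[0])
    return [sum((shard[i] for shard in shards), []) for i in range(n_rows)]

# tp_size=2, each rank holds a (2, 2) shard of a (2, 4) matrix:
shards = [[[1, 2], [5, 6]],   # rank 0: left half columns
          [[3, 4], [7, 8]]]   # rank 1: right half columns
emulate_all_gather_last_dim(shards)  # [[1, 2, 3, 4], [5, 6, 7, 8]]
```

This is why column-parallel layers can shard the hidden dimension and still hand a full-width activation to the next op.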
+
+def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
+    """All-reduce the input tensor across model parallel group."""
+    dist.all_reduce(input_, group=get_tp_group())
+    return input_
+
+def ensure_divisibility(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator."""
+    assert numerator % denominator == 0, "{} is not divisible by {}".format(
+        numerator, denominator)
+
+
+def divide(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator and return
+    the division value."""
+    ensure_divisibility(numerator, denominator)
+    return numerator // denominator
+
+def split_tensor_along_last_dim(
+    tensor: torch.Tensor,
+    num_partitions: int,
+    contiguous_split_chunks: bool = False,
+) -> Sequence[torch.Tensor]:
+    """ Split a tensor along its last dimension.
+
+    Arguments:
+        tensor: input tensor.
+        num_partitions: number of partitions to split the tensor
+        contiguous_split_chunks: If True, make each chunk contiguous
+                                 in memory.
+
+    Returns:
+        A list of Tensors
+    """
+    # Get the size and dimension.
+    last_dim = tensor.dim() - 1
+    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
+    # Split.
+    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+    # NOTE: torch.split does not create contiguous tensors by default.
+    if contiguous_split_chunks:
+        return tuple(chunk.contiguous() for chunk in tensor_list)
+
+    return tensor_list
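`split_tensor_along_last_dim` is the inverse of the gather above; its partitioning arithmetic (exact division into equal chunks) is easy to sanity-check on a 1-D list. A pure-Python stand-in, not the torch code:

```python
def divide(numerator: int, denominator: int) -> int:
    """Same contract as the dist_utils helper: exact division or AssertionError."""
    assert numerator % denominator == 0, \
        "{} is not divisible by {}".format(numerator, denominator)
    return numerator // denominator

def split_along_last_dim(values: list, num_partitions: int) -> list:
    """Cut a flat list into num_partitions equal contiguous chunks,
    mirroring torch.split with a fixed chunk size."""
    chunk = divide(len(values), num_partitions)
    return [values[i * chunk:(i + 1) * chunk] for i in range(num_partitions)]

split_along_last_dim(list(range(8)), 4)  # [[0, 1], [2, 3], [4, 5], [6, 7]]
```

The hard divisibility assert is deliberate: a hidden size that does not divide evenly by `tp_size` should fail loudly at load time rather than silently mis-shard weights.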

gllm/entrypoints/api_server.py (2 additions, 0 deletions)

@@ -100,6 +100,7 @@ async def run_server(args):
     parser.add_argument('--use-naive-schedule', help='Use scheduling policy in Sarathi-Serve', action='store_true')
     parser.add_argument('--enable-prefix-caching', help='Enable KV cache reuse across requests', action='store_true')
     parser.add_argument('--pp', type=int, help='Number of pipeline stages', default=1)
+    parser.add_argument('--tp', type=int, help='Number of tensor parallel degrees', default=1)
     parser.add_argument('--load-format', type=str, choices=['auto','dummy'], help='auto: actually load model weights; dummy: initialize the model with random values', default='auto')
     parser.add_argument('--assigned-layers', type=str, help='If the model have 64 layers, we can set it to 16,16,16,16 or 16,16,17,15', default=None)
     parser.add_argument('--use-async-worker', help='Experimental feature for worker implemented by async', action='store_true')

@@ -125,6 +126,7 @@ async def run_server(args):
                         kvthresh=args.kvthresh,
                         enable_prefix_caching=args.enable_prefix_caching,
                         pp_size=args.pp,
+                        tp_size=args.tp,
                         assigned_layers=args.assigned_layers,
                         use_naive_schedule=args.use_naive_schedule,
                         use_async_worker=args.use_async_worker)
Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@ def __init__(self, schedule_lists: List[Sequence]):
         self.act_schedule_ids = []
         self.next_tokens = []
 
-class Scheduler:
+class FrontendScheduler:
     def __init__(self, maxd: int, maxp: int, kvthresh: float,
                  page_size: int) -> None:
         self.prompt_lists: List[Sequence] = [] # seqs to prefill
