Commit d96e0e8

Add [ICCAD 25] AdapMoE: Adaptive Sensitivity-based Expert Gating and Management for Efficient MoE Inference
1 parent 08dbf02 commit d96e0e8

File tree

2 files changed: +16 -0 lines changed

main.py

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@
     "FAST": "File and Storage Technologies",
     "OSDI": "Operating Systems Design and Implementation",
     "SOSP": "Symposium on Operating Systems Principles",
+    "ICCAD": "International Conference on Computer-Aided Design",
 }
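The main.py change above extends what appears to be a venue-abbreviation dictionary. As a minimal sketch (assuming a mapping of this shape; the VENUES name and expand_venue helper are hypothetical and not part of the repository), here is how such a table might be used to expand an abbreviated venue name:

# Hypothetical usage sketch, not code from this repository.
VENUES = {
    "FAST": "File and Storage Technologies",
    "OSDI": "Operating Systems Design and Implementation",
    "SOSP": "Symposium on Operating Systems Principles",
    "ICCAD": "International Conference on Computer-Aided Design",
}

def expand_venue(abbrev: str) -> str:
    """Return the full venue name for an abbreviation, or the input unchanged."""
    return VENUES.get(abbrev, abbrev)

print(expand_venue("ICCAD"))  # International Conference on Computer-Aided Design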
model-inference-systems.bib

Lines changed: 15 additions & 0 deletions
@@ -136,3 +136,18 @@ @inproceedings{10.1145/3669940.3707267
 series = {ASPLOS '25},
 code = {https://github.com/caoshiyi/artifacts/blob/asplos25},
 }
+
+@inbook{10.1145/3676536.3676741,
+author = {Zhong, Shuzhang and Liang, Ling and Wang, Yuan and Wang, Runsheng and Huang, Ru and Li, Meng},
+title = {AdapMoE: Adaptive Sensitivity-based Expert Gating and Management for Efficient MoE Inference},
+year = {2025},
+isbn = {9798400710773},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/3676536.3676741},
+abstract = {Mixture-of-Experts (MoE) models are designed to enhance the efficiency of large language models (LLMs) without proportionally increasing the computational demands. However, their deployment on edge devices still faces significant challenges due to high on-demand loading overheads from managing sparsely activated experts. This paper introduces AdapMoE, an algorithm-system co-design framework for efficient MoE inference. AdapMoE features adaptive expert gating and management to reduce the on-demand loading overheads. We observe the heterogeneity of experts loading across layers and tokens, based on which we propose a sensitivity-based strategy to adjust the number of activated experts dynamically. Meanwhile, we also integrate advanced prefetching and cache management techniques to further reduce the loading latency. Through comprehensive evaluations on various platforms, we demonstrate AdapMoE consistently outperforms existing techniques, reducing the average number of activated experts by 25\% and achieving a 1.35\texttimes{} speedup without accuracy degradation. Code is available at: https://github.com/PKU-SEC-Lab/AdapMoE.},
+booktitle = {Proceedings of the 43rd IEEE/ACM International Conference on Computer-Aided Design},
+articleno = {51},
+numpages = {9},
+code = {https://github.com/PKU-SEC-Lab/AdapMoE},
+}
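The abstract above describes a sensitivity-based strategy that adjusts the number of activated experts dynamically per token. As a loose illustration of dynamic expert activation in general (not AdapMoE's actual algorithm; the function name, threshold value, and cumulative-probability rule are assumptions for this sketch), a minimal Python example follows:

# Illustrative sketch only; not AdapMoE's method. Activates a variable number of
# experts per token by keeping the highest-scoring experts until their softmax
# gate probability mass exceeds a threshold.
import numpy as np

def adaptive_expert_selection(gate_logits: np.ndarray, threshold: float = 0.9):
    """Return indices of experts to activate for a single token."""
    probs = np.exp(gate_logits - gate_logits.max())
    probs /= probs.sum()                      # softmax over experts
    order = np.argsort(probs)[::-1]           # experts sorted by gate probability
    cumulative, selected = 0.0, []
    for idx in order:
        selected.append(int(idx))
        cumulative += probs[idx]
        if cumulative >= threshold:           # enough probability mass covered
            break
    return selected

# Confident tokens may need only one expert; harder tokens activate more.
print(adaptive_expert_selection(np.array([4.0, 1.0, 0.5, 0.2])))  # [0]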
