Skip to content

Feature Request: allow mmap to take advantage of the hugepage feature, which gives a 10x speedup #12444

@nickhuang99

Description

@nickhuang99

Prerequisites

  • I am running the latest code. Mention the version if possible as well.
  • I carefully followed the README.md.
  • I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
  • I reviewed the Discussions, and have a new and useful enhancement to share.

Feature Description

Using the Linux kernel "hugetlbfs" feature gives a more than 10x speedup when loading extremely large models, e.g. DeepSeek-r1:671b, which is about 400 GB in size. And the code change is very minimal (fewer than 10 lines — see below).
Here is the kernel documentation about hugepages.
I have tested this on my Ubuntu 22.04 system, a second-hand Dell server with 1.5 TB of memory and no GPU, loading and running the huge DeepSeek model. The performance improvement from "hugetlbfs" is significant.

Motivation

The Linux kernel allows users to take advantage of huge pages of RAM. This is highly efficient when loading extremely large models. In my experiment with DeepSeek-r1:671b (377 GB), the speedup is 10x! And the code change is really minimal on top of the current implementation of "llama-mmap.cpp". The only missing pieces are page-size alignment and the "mmap" flag "MAP_HUGETLB".
I have a rough video to explain what is done .

Here are the steps for using huge pages (my OS is Ubuntu 22.04):

  1. use sysctl to allocate huge pages, depending on how much physical memory you have; in my case, I reserved 800G of RAM: i.e. sudo sysctl -w vm.nr_hugepages=400000
  2. check it is done: cat /proc/meminfo | grep -i huge
  3. creating mount point of "hugetlbfs": sudo mkdir /mnt/hugepages && sudo mount -t hugetlbfs -o uid=1000,gid=1000,rw none /mnt/hugepages
  4. using a small program "mmap" to "copy" model file to the mount point with source code below

#include <sys/mman.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

// NOTE: do NOT declare `extern int errno;` manually — <errno.h> already
// provides errno, and on modern glibc it is a thread-local macro
// (*__errno_location()), so a plain extern declaration is wrong.

// Size of one huge page: 2 MiB, the default huge-page size on x86-64 Linux.
// mmap() lengths and file sizes on hugetlbfs must be multiples of this.
#define HUGE_PAGE_SIZE 2097152

// Large-file safety: the copy logic below does 64-bit size arithmetic, so
// off_t must be 64 bits (build with -D_FILE_OFFSET_BITS=64 on 32-bit hosts).
static_assert(sizeof(off_t) == 8, "64-bit off_t required (define _FILE_OFFSET_BITS=64)");

/*
 * hugecp: copy a (model) file onto a hugetlbfs mount through mmap().
 *
 * usage: hugecp srcfile tgtfile
 *
 * The target file is created with O_EXCL (never clobbers an existing file)
 * and mapped with MAP_HUGETLB; the source is then read() directly into the
 * mapping, so the target ends up backed by huge pages.
 *
 * Returns 0 on success, a negative code on any failure.
 */
int main(int argc, char**argv)
{
        if (argc != 3) {
                printf("usage: %s srcfile tgtfile\n", argv[0]);
                return -1;
        }
        struct stat st;

        if (stat(argv[1], &st) != 0) {
                printf("source file %s is not valid file: %s\n", argv[1], strerror(errno));
                return -2;
        }
        off_t srcSize = st.st_size;
        // Round the mapping length up to a whole number of huge pages:
        // mmap() with MAP_HUGETLB requires a huge-page-aligned length.
        int64_t pageNumber = (srcSize + HUGE_PAGE_SIZE - 1) / HUGE_PAGE_SIZE;
        int64_t tgtSize = pageNumber * HUGE_PAGE_SIZE;
        int src_fd = open(argv[1], O_RDONLY);
        if (src_fd == -1) {
                printf("source file %s cannot be opened! %s\n", argv[1], strerror(errno));
                return -5;
        }
        // O_EXCL: refuse to overwrite an existing file on the mount point.
        int tgt_fd = open(argv[2], O_CREAT|O_RDWR|O_EXCL, 0666);
        if (tgt_fd == -1) {
                printf("target file %s cannot be opened! %s\n", argv[2], strerror(errno));
                close(src_fd);
                return -6;
        }

        // Map the newly created target; on hugetlbfs the file grows as the
        // mapped pages are written, so no ftruncate() is needed here.
        void* ptr = mmap(NULL, tgtSize, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_HUGETLB, tgt_fd, 0);
        if (ptr == MAP_FAILED) {
                printf("mmap target file %s failed %s\n", argv[2], strerror(errno));
                close(tgt_fd); // was leaked here before
                close(src_fd);
                return -7;
        }

        // Copy by read()ing straight into the mapping: no 2 MiB stack buffer,
        // no intermediate memcpy, and short reads are simply resumed instead
        // of aborting the copy.
        char* dst = (char*)ptr;
        off_t remaining = srcSize;
        int rc = 0;
        while (remaining > 0) {
                size_t chunk = remaining < (off_t)HUGE_PAGE_SIZE ? (size_t)remaining : (size_t)HUGE_PAGE_SIZE;
                ssize_t got = read(src_fd, dst, chunk); // ssize_t: read() can return > INT_MAX-safe sizes
                if (got < 0) {
                        if (errno == EINTR) continue; // interrupted by a signal: retry
                        printf("read source file %s failed with error %s\n", argv[1], strerror(errno));
                        rc = -8;
                        break;
                }
                if (got == 0) {
                        // premature EOF: the source shrank while we were copying
                        printf("unexpected end of file in %s with %ld bytes left\n", argv[1], (long)remaining);
                        rc = -9;
                        break;
                }
                dst += got;
                remaining -= got;
        }
        // Only report success (and return 0) if the whole file was copied;
        // the original printed the success line even after a failed copy.
        if (rc == 0) {
                printf("copy from %s to target %s finished for file size %ld\n", argv[1], argv[2], (long)tgtSize);
        }
        munmap(ptr, tgtSize);
        close(tgt_fd); // immediately close is better
        close(src_fd);
        return rc;
}
  1. assume above source code file is hugecp.cpp: g++ -g hugecp.cpp -o hugecp
  2. now "hugecp" model file to mount point above: hugecp /DeepSeek/671b/model/path /mnt/hugepages/model_file
  3. apply the patch below to llama-mmap.cpp to rebuild llama.cpp and run:
  4. ./llama-cli -m /mnt/hugepages/model_file --conv

Possible Implementation

For mmap, we just need to make sure mmap's size parameter is huge-page aligned, and bitwise-OR the flag MAP_HUGETLB into the mmap flags.

As for munmap, the default 4K page size has to be changed to the actual 2M huge-page size. (I didn't try the 1G page size, as not many platforms support it.)



diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index 3970b748..ef0f9a1a 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -268,15 +268,18 @@ void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_ra
 void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
 
 // llama_mmap
+#define HUGE_PAGE_SIZE 2097152
 
 struct llama_mmap::impl {
 #ifdef _POSIX_MAPPED_FILES
     std::vector<std::pair<size_t, size_t>> mapped_fragments;
 
     impl(struct llama_file * file, size_t prefetch, bool numa) {
-        size = file->size();
+        size_t number_huge_pages = (file->size() + HUGE_PAGE_SIZE -1) / HUGE_PAGE_SIZE;
+        size = number_huge_pages * HUGE_PAGE_SIZE;
         int fd = file->file_id();
-        int flags = MAP_SHARED;
+        // better to add hugetable support
+        int flags = MAP_SHARED|MAP_HUGETLB;
         if (numa) { prefetch = 0; }
 #ifdef __linux__
         if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
@@ -285,25 +288,25 @@ struct llama_mmap::impl {
         }
         if (prefetch) { flags |= MAP_POPULATE; }
 #endif
-        addr = mmap(NULL, file->size(), PROT_READ, flags, fd, 0);
+        addr = mmap(NULL, size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
         if (prefetch > 0) {
-            if (posix_madvise(addr, std::min(file->size(), prefetch), POSIX_MADV_WILLNEED)) {
+            if (posix_madvise(addr, std::min(size, prefetch), POSIX_MADV_WILLNEED)) {
                 LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
         }
         if (numa) {
-            if (posix_madvise(addr, file->size(), POSIX_MADV_RANDOM)) {
+            if (posix_madvise(addr, size, POSIX_MADV_RANDOM)) {
                 LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
                         strerror(errno));
             }
         }
 
-        mapped_fragments.emplace_back(0, file->size());
+        mapped_fragments.emplace_back(0, size);
     }
 
     static void align_range(size_t * first, size_t * last, size_t page_size) {
@@ -319,7 +322,8 @@ struct llama_mmap::impl {
     }
 
     void unmap_fragment(size_t first, size_t last) {
-        int page_size = sysconf(_SC_PAGESIZE);
+        //int page_size = sysconf(_SC_PAGESIZE);
+        int page_size = HUGE_PAGE_SIZE;
         align_range(&first, &last, page_size);
         size_t len = last - first;
 


Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions