mmap实现分析
本文不是介绍mmap函数的使用方法,而是分析其内核实现,相关使用方法网上已经有很多资料。Mmap的本质其实就是:为当前进程分配(或找到)一个合适的vma,然后为该vma设置对应的缺页处理函数。
我们知道mmap按照flag可以分为匿名映射和非匿名映射,又可分为shared映射和private映射。这样从两个维度,我们就得到了四种映射。
(1) 匿名shared映射:fd为-1,可用于父子进程通信。
(2) 匿名private映射:例如malloc大块的内存(大于128k)。
(3) 非匿名shared映射:常见的用于进程通信方式。
(4) 非匿名private映射:例如程序在启动时加载so时,就是用的这种方式,相当于“写时拷贝”。
下面我们就看下内核中几种方式的区别。
内核中mmap主要由sys_mmap_pgoff函数负责实现,该函数定义在mm/mmap.c中。
点击(此处)折叠或打开
- SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
- unsigned long, prot, unsigned long, flags,
- unsigned long, fd, unsigned long, pgoff)
- { /* mmap_pgoff syscall entry: resolve fd to a struct file (unless MAP_ANONYMOUS is set) and delegate to vm_mmap_pgoff() */
- struct file *file = NULL;
- unsigned long retval = -EBADF; /* returned unchanged if fget() below fails */
- if (!(flags & MAP_ANONYMOUS)) { /* file-backed (non-anonymous) mapping */
- audit_mmap_fd(fd, flags);
- if (unlikely(flags & MAP_HUGETLB)) /* MAP_HUGETLB without MAP_ANONYMOUS is rejected */
- return -EINVAL;
- file = fget(fd); /* look up the struct file that corresponds to fd */
- if (!file)
- goto out;
- if (is_file_hugepages(file))
- len = ALIGN(len, huge_page_size(hstate_file(file))); /* align len to the file's huge page size */
- } else if (flags & MAP_HUGETLB) { /* anonymous mapping with MAP_HUGETLB */
- /*......*/
- }
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); /* clear these two flags before mapping */
- retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- if (file)
- fput(file); /* release the file obtained via fget() */
- out:
- return retval;
- }
该函数主要功能由vm_mmap_pgoff来实现,而vm_mmap_pgoff主要逻辑就是调用了do_mmap_pgoff。下面我们看do_mmap_pgoff的实现。
● do_mmap_pgoff
点击(此处)折叠或打开
- unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
- unsigned long len, unsigned long prot,
- unsigned long flags, unsigned long pgoff,
- unsigned long *populate)
- { /* As excerpted: pick the address with get_unmapped_area(), then hand off to mmap_region() and return its result */
- struct mm_struct * mm = current->mm;
- struct inode *inode;
- /*......*/
- /* Obtain the address to map to. we verify (or select) it and ensure
- * that it represents a valid section of the address space.
- */
- addr = get_unmapped_area(file, addr, len, pgoff, flags);
- if (addr & ~PAGE_MASK) /* result is not page-aligned: returned unchanged (presumably an error value -- confirm) */
- return addr;
- /*......*/
- addr = mmap_region(file, addr, len, vm_flags, pgoff); /* vm_flags is not defined in the visible excerpt (presumably set in omitted code) */
- /*......*/
- return addr;
- }
这个函数首先通过get_unmapped_area找到(或校验)一段合适的未映射虚拟地址区间,然后调用mmap_region为该区间建立vma并进行设置。我们具体看下mmap_region的实现。
● mmap_region
点击(此处)折叠或打开
- unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
- { /* As excerpted: sets up the vma according to mapping type (file-backed, shared anonymous, or private anonymous) */
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
- int correct_wcount = 0;
- int error;
- struct rb_node **rb_link, *rb_parent;
- unsigned long charged = 0;
- struct inode *inode = file ? file_inode(file) : NULL; /* NULL when no file was passed */
- /*......*/
- if (file) { /* file-backed (non-anonymous) mapping */
- if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
- goto free_vma;
- if (vm_flags & VM_DENYWRITE) {
- error = deny_write_access(file);
- if (error)
- goto free_vma;
- correct_wcount = 1; /* records that deny_write_access() succeeded */
- }
- vma->vm_file = get_file(file);
- error = file->f_op->mmap(file, vma); /* call the mmap handler supplied by the file's f_op (filesystem-specific) */
- if (error)
- goto unmap_and_free_vma;
- addr = vma->vm_start; /* re-read these values from the vma after the handler has run */
- pgoff = vma->vm_pgoff;
- vm_flags = vma->vm_flags;
- } else if (vm_flags & VM_SHARED) { /* shared anonymous mapping */
- if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
- goto free_vma;
- error = shmem_zero_setup(vma);
- if (error)
- goto free_vma;
- } /* private anonymous mapping: neither branch above runs */
- file = vma->vm_file;
- /*......*/
- }
如果传入了fd,则调用对应文件系统的mmap函数。以ext4文件系统为例。其mmap函数为 ext4_file_mmap。
● ext4_file_mmap
点击(此处)折叠或打开
- static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
- { /* mmap handler for ext4 files: installs ext4_file_vm_ops on the vma; returns 0, or -ENOEXEC if the mapping has no readpage operation */
- struct address_space *mapping = file->f_mapping;
- if (!mapping->a_ops->readpage) /* refuse to map when a_ops provides no readpage */
- return -ENOEXEC;
- file_accessed(file);
- vma->vm_ops = &ext4_file_vm_ops; /* later faults on this vma go through these operations */
- return 0;
- }
可以看到这个函数只是设置vma->vm_ops为当前文件系统的处理函数。
点击(此处)折叠或打开
- static const struct vm_operations_struct ext4_file_vm_ops = { /* vm operations that ext4_file_mmap() installs on a vma */
- .fault = filemap_fault, /* fault handler for these vmas */
- .page_mkwrite = ext4_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
- };
如果是匿名映射(不传入fd),且传入了shared flag。则调用shmem_zero_setup。
● shmem_zero_setup
点击(此处)折叠或打开
- int shmem_zero_setup(struct vm_area_struct *vma)
- { /* Backs the vma with a file from shmem_file_setup() and installs shmem_vm_ops; returns 0, or the error from shmem_file_setup() */
- struct file *file;
- loff_t size = vma->vm_end - vma->vm_start; /* length of the vma's address range */
- file = shmem_file_setup("dev/zero", size, vma->vm_flags);
- if (IS_ERR(file))
- return PTR_ERR(file);
- if (vma->vm_file) /* release any file already attached to the vma */
- fput(vma->vm_file);
- vma->vm_file = file;
- vma->vm_ops = &shmem_vm_ops;
- return 0;
- }
可以看到这里将vma->vm_ops设置为tmpfs文件系统的shmem_vm_ops。
点击(此处)折叠或打开
- static const struct vm_operations_struct shmem_vm_ops = { /* vm operations that shmem_zero_setup() installs on a vma */
- .fault = shmem_fault, /* fault handler for these vmas */
- #ifdef CONFIG_NUMA
- .set_policy = shmem_set_policy,
- .get_policy = shmem_get_policy,
- #endif /* CONFIG_NUMA */
- .remap_pages = generic_file_remap_pages,
- };
整个mmap函数的处理过程如下:
我们知道mmap函数只是为进程分配了虚拟内存空间,并没有真的建立虚拟内存和物理内存的映射。这个建立映射的过程是到缺页中断的函数中进行的。
缺页中断的处理过程大体如下:
点击(此处)折叠或打开
- int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pmd_t *pmd, unsigned int flags)
- { /* Excerpt of page-fault dispatch: only the case of a pte that is neither present nor populated is shown */
- pte_t entry;
- spinlock_t *ptl;
- /*......*/
- entry = *pte;
- if (!pte_present(entry)) {
- if (pte_none(entry)) {
- if (vma->vm_ops) /* vm_ops is set: take the do_linear_fault() path */
- return do_linear_fault(mm, vma, address,
- pte, pmd, flags, entry);
- /* vm_ops is NULL: private anonymous mapping */
- return do_anonymous_page(mm, vma, address,
- pte, pmd, flags);
- }
- }
- return 0;
- }
我们看到只有当vma->vm_ops为空时才会调用do_anonymous_page,否则调用do_linear_fault。这里需要注意,有人看到函数名就以为do_anonymous_page处理的是所有匿名映射的逻辑,但是根据前面的代码分析,匿名shared映射的时候也是会设置vma->vm_ops的。只有一种情况不会设置,那就是匿名private映射。
所以综上,有以下结论:
(1)非匿名shared映射:调用文件各自文件系统的缺页函数;
(2)非匿名private映射:调用文件各自文件系统的缺页函数;
(3)匿名shared映射:调用tmpfs文件系统的缺页函数;
(4)匿名private映射:do_anonymous_page处理缺页,也是目前唯一支持THP(透明大页)的方式。
另外补充:其实我们常用的posix和systemV共享内存底层都是通过tmpfs实现的,详见http://hustcat.github.io/shared-memory-tmpfs/ 。但注意其实内核是有两个tmpfs文件系统的,一个是内核启动自行挂载的用于共享匿名映射和systemV共享内存,而另一个通过mount挂载,其大小默认为系统内存的1/2,用于posix共享内存。