Linux内存管理子系统(1)

Linux内存管理子系统概述 很长时间没有更新博客了。主要是这段时间比较忙，第一次独立承担项目，事情比较多。另一方面在这段时间花费了很多功夫在学习硬件的一些知识上。暂时中断了linux内核的积淀。在硬件方面做了一些入门之后，又要转向内核的研究和积淀了，毕竟这是我的老本行。而且，对于硬件的门外汉来说，也许觉得设计电路画板子是很牛的事情。但是对于懂行的人来说也许就觉得硬件无非就是那些套路，外部接口是什么，内存用什么，flash用什么，都是有固定套路的。最终产品要有好的应用还得靠软件。软件为王。扯了那么多，言归正传，下面一段时间我想主要分析一下linux内核中的内存管理系统。这部分内容比较复杂，涉及到的方面众多，但是这部分内容在内功修炼中很重要，搞懂了很多方面触类旁通，搞不懂在学习块设备驱动、页面回收、文件映射等内容的时候就会遇到问题。为了对linux内存管理系统有一个系统的认识，我把其中主要涉及到的方面归纳为下面的层次结构。首先是linux内存管理框架，这部分内容主要是介绍一下内核中内存管理相关的数据结构，和一些比较杂的内容。其次是伙伴系统，这是内存管理的核心。然后涉及slab机制、进程地址空间、vmalloc机制等。还有一些没有涉及到的，等归纳完成这几个部分再做归纳。 数据结构介绍 首先说linux内存管理系统框架。系统内存模式一般可分为UMA和NUMA两大类。详细去研究UMA和NUMA不是本文的重点，本文用UMA系统做例子讲述。你可以把UMA就理解成系统中只有一片地址连续的内存可以供使用，那这个系统就是UMA系统。如果系统中有多片内存，这些内存地址空间还不连在一起，那么这样的系统可以认为是NUMA系统。 内存节点 由于对NUMA系统的支持，linux内核中引入内存节点的概念，通俗地说一个节点管理一片内存（地址连续的内存区域）。 内存域 上面介绍了内存节点的概念，下面要说一说内存域的概念。一个内存节点代表了一片连续的内存区域。在这样的一片内存区域中，我们又可以将它分为多个域，比如PC机上的最起始位置的几M内存是可以给硬件做DMA操作使用的，又比如超过896M的内存内核不能直接映射，我们把超过896M的内存叫做高端内存。因此这种节点内部不同区域的划分叫做域。一般而言，常见的有DMA域、NORMAL域、高端内存域。 内存页面 其次还要介绍一下的是页的概念，这个概念凡是了解一点内核的应该都不陌生，虚拟地址到物理地址的转换一般以Page为单位。 内存节点管理数据结构pglist_data 下面列举了内存节点的管理结构体pglist_data和内存域的管理结构zone。pglist_data中主要的元素有node_zones，这个是该内存节点中的各个域。node_zonelists是内存节点的备用列表，当在该内存节点上分配不到内存的时候就会使用备用列表试图在其它内存节点上进行内存分配。node_mem_map保存了一块区域的地址，这块区域保存该节点内对应页的page数据结构。bdata表示该节点的自举内存分配器相关的数据结构。 typedef struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_CGROUP_MEM_RES_CTLR struct page_cgroup *node_page_cgroup; #endif #endif struct bootmem_data *bdata; #ifdef CONFIG_MEMORY_HOTPLUG /* * Must be held any time you expect node_start_pfn, node_present_pages * or node_spanned_pages stay constant. Holding this will also * guarantee that any pfn_valid() stays that way. * * Nests above zone->lock and zone->size_seqlock. 
*/ spinlock_t node_size_lock; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; } pg_data_t; 内存域管理数据结构Zone Zone数据结构中主要的成员有如下，首先是pages_min等三个变量，这就是传说中的水线，主要控制在内存分配过程中页面回收等行为。另外pageset是和冷热页机制相关的一个数据结构，比较重要。其次就是free_area，这个至关重要的数据和伙伴系统相关。 struct zone { /* Fields commonly accessed by the page allocator */ unsigned long pages_min, pages_low, pages_high; /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several * GB of ram we must reserve some of the lower zone memory (otherwise we risk * to run OOM on the lower zones despite there's tons of freeable ram * on the higher zones). This array is recalculated at runtime if the * sysctl_lowmem_reserve_ratio sysctl changes. */ unsigned long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA int node; /* * zone reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; struct per_cpu_pageset *pageset[NR_CPUS]; #else struct per_cpu_pageset pageset[NR_CPUS]; #endif /* * free areas of different sizes */ spinlock_t lock; #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; #endif struct free_area free_area[MAX_ORDER]; #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. 
* In SPARSEMEM, this map is stored in struct mem_section */ unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ ZONE_PADDING(_pad1_) /* Fields commonly accessed by the page reclaim scanner */ spinlock_t lru_lock; struct { struct list_head list; unsigned long nr_scan; } lru[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; unsigned long pages_scanned; /* since last reclaim */ unsigned long flags; /* zone flags, see below */ /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; /* * prev_priority holds the scanning priority for this zone. It is * defined as the scanning priority at which we achieved our reclaim * target at the previous try_to_free_pages() or balance_pgdat() * invokation. * * We use prev_priority as a measure of how much stress page reclaim is * under - it drives the swappiness decision: whether to unmap mapped * pages. * * Access to both this field is quite racy even on uniprocessor. But * it is expected to average out OK. */ int prev_priority; /* * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on * this zone's LRU. Maintained by the pageout code. */ unsigned int inactive_ratio; ZONE_PADDING(_pad2_) /* Rarely used or read-mostly fields */ /* * wait_table -- the array holding the hash table * wait_table_hash_nr_entries -- the size of the hash table array * wait_table_bits -- wait_table_size == (1 << wait_table_bits) * * The purpose of all these is to keep track of the people * waiting for a page to become available and make them * runnable again when possible. The trouble is that this * consumes a lot of space, especially when so few things * wait on pages at a given time. So instead of using * per-page waitqueues, we use a waitqueue hash table. * * The bucket discipline is to sleep on the same queue when * colliding and wake all in that wait queue when removing. * When something wakes, it must check to be sure its page is * truly available, a la thundering herd. 
The cost of a * collision is great, but given the expected load of the * table, they should be so rare as to be outweighed by the * benefits from the saved space. * * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the * primary users of these fields, and in mm/page_alloc.c * free_area_init_core() performs the initialization of them. */ wait_queue_head_t *wait_table; unsigned long wait_table_hash_nr_entries; unsigned long wait_table_bits; /* * Discontig memory support fields. */ struct pglist_data *zone_pgdat; /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; /* * zone_start_pfn, spanned_pages and present_pages are all * protected by span_seqlock. It is a seqlock because it has * to be read outside of zone->lock, and it is done in the main * allocator path. But, it is written quite infrequently. * * The lock is declared along with zone->lock because it is * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. */ unsigned long spanned_pages; /* total size, including holes */ unsigned long present_pages; /* amount of memory (excluding holes) */ /* * rarely used fields: */ const char *name; } ____cacheline_internodealigned_in_smp;

alloysystem

Linux内存管理子系统(1)