Zone initialization

Once page table initialization is complete, the kernel can begin managing memory. The kernel does not treat all pages uniformly, however; it manages them in zones. The main members of the struct zone data structure are shown below:

```c
struct zone {
/* Read-mostly fields */

/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long watermark[NR_WMARK];

/*
* We don't know if the memory that we're going to allocate will be freeable
* or/and it will be released eventually, so to avoid totally wasting several
* GB of ram we must reserve some of the lower zone memory (otherwise we risk
* to run OOM on the lower zones despite there's tons of freeable ram
* on the higher zones). This array is recalculated at runtime if the
* sysctl_lowmem_reserve_ratio sysctl changes.
*/
long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
int node;
#endif

/*
* The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
* this zone's LRU. Maintained by the pageout code.
*/
unsigned int inactive_ratio;

struct pglist_data *zone_pgdat;
struct per_cpu_pageset __percpu *pageset;

/*
* This is a per-zone reserve of pages that should not be
* considered dirtyable memory.
*/
unsigned long dirty_balance_reserve;

#ifndef CONFIG_SPARSEMEM
/*
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
* In SPARSEMEM, this map is stored in struct mem_section
*/
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

#ifdef CONFIG_NUMA
/*
* zone reclaim becomes active if more unmapped pages exist.
*/
unsigned long min_unmapped_pages;

unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */

/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;

/*
* spanned_pages is the total pages spanned by the zone, including
* holes, which is calculated as:
* spanned_pages = zone_end_pfn - zone_start_pfn;
*
* present_pages is physical pages existing within the zone, which
* is calculated as:
* present_pages = spanned_pages - absent_pages(pages in holes);
*
* managed_pages is present pages managed by the buddy system, which
* is calculated as (reserved_pages includes pages allocated by the
* bootmem allocator):
* managed_pages = present_pages - reserved_pages;
*
* So present_pages may be used by memory hotplug or memory power
* management logic to figure out unmanaged pages by checking
* (present_pages - managed_pages). And managed_pages should be used
* by page allocator and vm scanner to calculate all kinds of watermarks
* and thresholds.
*
* Locking rules:
*
* zone_start_pfn and spanned_pages are protected by span_seqlock.
* It is a seqlock because it has to be read outside of zone->lock,
* and it is done in the main allocator path. But, it is written
* quite infrequently.
*
* The span_seq lock is declared along with zone->lock because it is
* frequently read in proximity to zone->lock. It's good to
* give them a chance of being in the same cacheline.
*
* Write access to present_pages at runtime should be protected by
* mem_hotplug_begin/end(). Any reader who can't tolerant drift of
* present_pages should get_online_mems() to get a stable value.
*
* Read access to managed_pages should be safe because it's unsigned
* long. Write access to zone->managed_pages and totalram_pages are
* protected by managed_page_count_lock at runtime. Idealy only
* adjust_managed_page_count() should be used instead of directly
* touching zone->managed_pages and totalram_pages.
*/
unsigned long managed_pages;
unsigned long spanned_pages;
unsigned long present_pages;


const char *name;

/*
* Number of MIGRATE_RESERVE page block. To maintain for just
* optimization. Protected by zone->lock.
*/
int nr_migrate_reserve_block;

#ifdef CONFIG_MEMORY_ISOLATION
/*
* Number of isolated pageblock. It is used to solve incorrect
* freepage counting problem due to racy retrieving migratetype
* of pageblock. Protected by zone->lock.
*/
unsigned long nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif

/*
* wait_table -- the array holding the hash table
* wait_table_hash_nr_entries -- the size of the hash table array
* wait_table_bits -- wait_table_size == (1 << wait_table_bits)
*
* The purpose of all these is to keep track of the people
* waiting for a page to become available and make them
* runnable again when possible. The trouble is that this
* consumes a lot of space, especially when so few things
* wait on pages at a given time. So instead of using
* per-page waitqueues, we use a waitqueue hash table.
*
* The bucket discipline is to sleep on the same queue when
* colliding and wake all in that wait queue when removing.
* When something wakes, it must check to be sure its page is
* truly available, a la thundering herd. The cost of a
* collision is great, but given the expected load of the
* table, they should be so rare as to be outweighed by the
* benefits from the saved space.
*
* __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
* primary users of these fields, and in mm/page_alloc.c
* free_area_init_core() performs the initialization of them.
*/
wait_queue_head_t *wait_table;
unsigned long wait_table_hash_nr_entries;
unsigned long wait_table_bits;


ZONE_PADDING(_pad1_)
/* free areas of different sizes */
struct free_area free_area[MAX_ORDER];

/* zone flags, see below */
unsigned long flags;

/* Write-intensive fields used from the page allocator */
spinlock_t lock;

ZONE_PADDING(_pad2_)

/* Write-intensive fields used by page reclaim */

/* Fields commonly accessed by the page reclaim scanner */
spinlock_t lru_lock;
struct lruvec lruvec;

/* Evictions & activations on the inactive file list */
atomic_long_t inactive_age;

/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
* drift allowing watermarks to be breached
*/
unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* pfn where compaction free scanner should start */
unsigned long compact_cached_free_pfn;
/* pfn where async and sync compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[2];
#endif

#ifdef CONFIG_COMPACTION
/*
* On compaction failure, 1<<compact_defer_shift compactions
* are skipped before trying again. The number attempted since
* last failure is tracked with compact_considered.
*/
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* Set to true when the PG_migrate_skip bits should be cleared */
bool compact_blockskip_flush;
#endif


ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
```

First, since struct zone is accessed very frequently, the structure is required to be L1-cache aligned. In addition, the ZONE_PADDING() markers let the two heavily contended locks zone->lock and zone->lru_lock fall into different cache lines. A memory node has at most a few zones, so struct zone does not need to watch its size the way struct page does, and ZONE_PADDING() can trade space for performance. During the development of memory management, kernel developers gradually found that some spinlocks were heavily contended and hard to acquire. Locks such as zone->lock and zone->lru_lock sometimes need to be held at the same time, so ensuring that they live in different cache lines is a common kernel optimization technique.
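
For reference, ZONE_PADDING() is defined in include/linux/mmzone.h along these lines: a zero-length, cacheline-aligned member that consumes no storage of its own but forces the next field onto a new cache line on SMP builds.

```c
/* include/linux/mmzone.h (abridged) */
#if defined(CONFIG_SMP)
struct zone_padding {
	char x[0];
} ____cacheline_internodealigned_in_smp;
#define ZONE_PADDING(name)	struct zone_padding name;
#else
#define ZONE_PADDING(name)
#endif
```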

  • watermark: each zone computes three watermarks at boot, WMARK_MIN, WMARK_LOW and WMARK_HIGH; they are used by the page allocator and by kswapd page reclaim (see the macro sketch after this list);
  • lowmem_reserve: memory reserved in the zone;
  • zone_pgdat: points to the memory node this zone belongs to;
  • pageset: maintains per-CPU sets of pages to reduce spinlock contention;
  • zone_start_pfn: the page frame number at which the zone starts;
  • managed_pages: the number of pages in the zone managed by the buddy system;
  • spanned_pages: the total number of pages spanned by the zone, including holes;
  • present_pages: the number of physical pages actually present in the zone; on some architectures it equals spanned_pages;
  • free_area: array managing the free areas, including the free lists;
  • lock: spinlock protecting the zone against parallel access;
  • lru_lock: spinlock protecting the zone's LRU lists against parallel access;
  • lruvec: the set of LRU lists;
  • vm_stat: zone statistics counters.
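
For reference, the *_wmark_pages(zone) accessors mentioned in the watermark comment are thin macros over the watermark[] array, roughly as defined in include/linux/mmzone.h:

```c
#define min_wmark_pages(z)  (z->watermark[WMARK_MIN])
#define low_wmark_pages(z)  (z->watermark[WMARK_LOW])
#define high_wmark_pages(z) (z->watermark[WMARK_HIGH])
```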

Normally the kernel's zones are ZONE_DMA, ZONE_DMA32, ZONE_NORMAL and ZONE_HIGHMEM. On the ARM Vexpress platform, CONFIG_ZONE_DMA and CONFIG_ZONE_DMA32 are not defined, so only ZONE_NORMAL and ZONE_HIGHMEM exist. The zone types are defined in include/linux/mmzone.h:

```c
enum zone_type {
#ifdef CONFIG_ZONE_DMA
/*
* ZONE_DMA is used when there are devices that are not able
* to do DMA to all of addressable memory (ZONE_NORMAL). Then we
* carve out the portion of memory that is needed for these devices.
* The range is arch specific.
*
* Some examples
*
* Architecture Limit
* ---------------------------
* parisc, ia64, sparc <4G
* s390 <2G
* arm Various
* alpha Unlimited or 0-16MB.
*
* i386, x86_64 and multiple other arches
* <16M.
*/
ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
/*
* x86_64 needs two ZONE_DMAs because it supports devices that are
* only able to do DMA to the lower 16M but also 32 bit devices that
* can only do DMA areas below 4G.
*/
ZONE_DMA32,
#endif
/*
* Normal addressable memory is in ZONE_NORMAL. DMA operations can be
* performed on pages in ZONE_NORMAL if the DMA devices support
* transfers to all addressable memory.
*/
ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
/*
* A memory area that is only addressable by the kernel through
* mapping portions into its own address space. This is for example
* used by i386 to allow the kernel to address the memory beyond
* 900MB. The kernel will set up special mappings (page
* table entries on i386) for each page that the kernel needs to
* access.
*/
ZONE_HIGHMEM,
#endif
ZONE_MOVABLE,
__MAX_NR_ZONES
};
```

Zone initialization is driven from bootmem_init(), which must first determine the range of each zone. The find_limits() function computes the three values min_low_pfn, max_low_pfn and max_pfn. Here min_low_pfn (0x60000) is the page frame number of the start of memory, max_low_pfn (0x8f800) is the page frame number of the end of the normal zone, and max_pfn is the page frame number of the end of memory.
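
On ARM these limits come straight from memblock; a minimal sketch of find_limits() in arch/arm/mm/init.c from kernels of this era:

```c
static void __init find_limits(unsigned long *min, unsigned long *max_low,
			       unsigned long *max_high)
{
	/* lowmem ends at the memblock allocation limit (arm_lowmem_limit) */
	*max_low = PFN_DOWN(memblock_get_current_limit());
	/* first page frame of DRAM (0x60000 on Vexpress) */
	*min = PFN_UP(memblock_start_of_DRAM());
	/* last page frame of DRAM */
	*max_high = PFN_DOWN(memblock_end_of_DRAM());
}
```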


Below is the zone information printed when running on the ARM Vexpress platform:

It can be seen that the ARM Vexpress platform has two zones, ZONE_NORMAL and ZONE_HIGHMEM. ZONE_NORMAL runs from 0xc0000000 to 0xef800000. How many pages does this address space contain?

(0xef800000-0xc0000000)/4096 = 194560

So ZONE_NORMAL has 194560 pages.

Also, the end of ZONE_NORMAL's virtual address range is 0xef800000; subtracting PAGE_OFFSET (0xc0000000) and adding PHYS_OFFSET (0x60000000) gives exactly 0x8f800000, which equals the arm_lowmem_limit value we computed earlier.

PHYS_OFFSET is 0x60000000 because the RAM on Vexpress is mapped starting at that physical address, which is why this offset has to be added back:

```c
static inline phys_addr_t __virt_to_phys(unsigned long x)
{
  return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
}

static inline unsigned long __phys_to_virt(phys_addr_t x)
{
  return x - PHYS_OFFSET + PAGE_OFFSET;
}
```
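
Plugging the lowmem boundary into __virt_to_phys() reproduces the arithmetic above (a quick sanity-check sketch; 0x8f800000 is the arm_lowmem_limit value just discussed):

```c
/* 0xef800000 - PAGE_OFFSET (0xc0000000) + PHYS_OFFSET (0x60000000) */
phys_addr_t phys = __virt_to_phys(0xef800000UL);
/* phys == 0x8f800000, i.e. arm_lowmem_limit */
```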

ZONE_NORMAL is 760MB, from 0xc0000000 to 0xef800000; ZONE_HIGHMEM is 264MB, from 0xef800000 to 0xffffffff.

ZONE_HIGHMEM is not the same thing as the vmalloc area: there is also an 8MB hole and 16MB of space at the very top, so the vmalloc area is 264 - 8 - 16 = 240MB.


Zones are initialized in free_area_init_core(), reached via the following call path:

```
start_kernel
  -> setup_arch
    -> paging_init
      -> bootmem_init
        -> zone_sizes_init
          -> free_area_init_node
            -> free_area_init_core
```
```c
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * NOTE: pgdat should get zeroed by caller.
 */
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
unsigned long node_start_pfn, unsigned long node_end_pfn,
unsigned long *zones_size, unsigned long *zholes_size)
{
enum zone_type j;
int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;

pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
spin_lock_init(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
#endif
// initialize the memory node's page-reclaim (kswapd) wait queue
init_waitqueue_head(&pgdat->kswapd_wait);
// the pfmemalloc_wait queue holds tasks waiting to be woken by kswapd to retry allocation
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_ext_init(pgdat);

/* iterate over every zone */
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
/* size is the number of page frames spanned by this zone, including any holes */
size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
node_end_pfn, zones_size);
/* realsize is the number of page frames in the zone, excluding holes */
realsize = freesize = size - zone_absent_pages_in_node(nid, j,
node_start_pfn,
node_end_pfn,
zholes_size);

/*
* Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations.
* (i.e. shrink freesize by the memory the struct page array occupies)
*/
// memmap_pages is the space taken by the struct pages of all frames, including holes
memmap_pages = calc_memmap_size(size, realsize);
if (!is_highmem_idx(j)) {
if (freesize >= memmap_pages) {
freesize -= memmap_pages;
if (memmap_pages)
printk(KERN_DEBUG
      " %s zone: %lu pages used for memmap\\n",
      zone_names[j], memmap_pages);
} else /* 內存不夠存放page結構體 */
printk(KERN_WARNING
" %s zone: %lu pages exceeds freesize %lu\\n",
zone_names[j], memmap_pages, freesize);
}

/* Account for reserved pages */
// adjust freesize again, subtracting the DMA-reserved pages
if (j == 0 && freesize > dma_reserve) {
freesize -= dma_reserve;
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
// HIGHMEM pages are not counted into nr_kernel_pages
if (!is_highmem_idx(j))
nr_kernel_pages += freesize;
/* Charge for highmem memmap if there are enough kernel pages */
else if (nr_kernel_pages > memmap_pages * 2)
nr_kernel_pages -= memmap_pages;
nr_all_pages += freesize;

zone->spanned_pages = size;     /* page frames including holes */
zone->present_pages = realsize; /* page frames excluding holes */
/*
* Set an approximate value for lowmem here, it will be adjusted
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/

zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
/* set the node id this zone belongs to */
zone->node = nid;
/* minimum number of unmapped pages needed for zone reclaim to run */
zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
/ 100;
/* minimum number of slab pages needed for slab reclaim */
zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
/* set the zone's name */
zone->name = zone_names[j];
/* initialize the locks */
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
/* point back to the pg_data_t of the node this zone belongs to */
zone->zone_pgdat = pgdat;
/* initialize the per-CPU page caches */
zone_pcp_init(zone);

/* For bootup, initialized properly in watermark setup */
mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);

/* initialize the LRU-related members */
lruvec_init(&zone->lruvec);
if (!size)
continue;

set_pageblock_order();
/* with CONFIG_SPARSEMEM defined, this function is a no-op */
setup_usemap(pgdat, zone, zone_start_pfn, size);
/* set pgdat->nr_zones and zone->zone_start_pfn,
 * initialize zone->free_area,
 * initialize zone->wait_table and related members
 */
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
BUG_ON(ret);
/* initialize the struct pages belonging to this zone */

memmap_init(size, nid, j, zone_start_pfn);
zone_start_pfn += size;
}
}
```
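
The memmap accounting above hinges on calc_memmap_size(), which essentially converts the size of the struct page array into a page count. A simplified sketch (the real function also has a SPARSEMEM refinement that bases the count on present_pages):

```c
static unsigned long calc_memmap_size(unsigned long spanned_pages,
				      unsigned long present_pages)
{
	/* one struct page per spanned page frame, rounded up to full pages */
	return PAGE_ALIGN(spanned_pages * sizeof(struct page)) >> PAGE_SHIFT;
}
```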

In addition, the system maintains a zonelist data structure, and the buddy allocator allocates from the zonelist. The zonelist contains an array of zoneref entries, each of which points to a zone. The zone referenced by the first zoneref entry is the page allocator's first candidate; the remaining entries are tried only after the first candidate fails, in order of decreasing priority. The zonelist initialization path is as follows:

```
start_kernel
  -> build_all_zonelists
    -> build_all_zonelists_init
      -> __build_all_zonelists
        -> build_zonelists
          -> build_zonelists_node
```
```c
/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 */
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
int nr_zones)
{
struct zone *zone;
enum zone_type zone_type = MAX_NR_ZONES;

do {
zone_type--;
zone = pgdat->node_zones + zone_type;
if (populated_zone(zone)) {
zoneref_set_zone(zone,
&zonelist->_zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
} while (zone_type);

return nr_zones;
}
```

Here, starting from the highest zone (MAX_NR_ZONES - 1) and working down, each populated zone is set into the _zonerefs[] array, so the highest zone ends up in _zonerefs[0]. On the ARM Vexpress platform this produces:

HighMem _zonerefs[0]->zone_index=1

Normal _zonerefs[1]->zone_index=0
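
To see how this ordering is consumed, here is a hedged sketch of walking the zonelist with the kernel's stock for_each_zone_zonelist() iterator (GFP_HIGHUSER is chosen here only so that the walk starts at ZONE_HIGHMEM):

```c
struct zonelist *zonelist = node_zonelist(0, GFP_HIGHUSER);
struct zoneref *z;
struct zone *zone;

/* on Vexpress this visits HighMem first, then falls back to Normal */
for_each_zone_zonelist(zone, z, zonelist, gfp_zone(GFP_HIGHUSER))
	pr_info("candidate zone: %s\n", zone->name);
```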

Finally, there is another very important global variable: mem_map. It is an array of struct page, one entry per physical page frame, and for the kernel's linearly mapped region it enables fast conversion between virtual addresses and physical addresses. It is initialized in free_area_init_node()->alloc_node_mem_map().
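
With FLATMEM (as on Vexpress), the pfn/struct page conversions are plain pointer arithmetic on mem_map, as in include/asm-generic/memory_model.h:

```c
#define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
#define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
				 ARCH_PFN_OFFSET)
```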

