linux内存管理之伙伴系统(建立)

来源:岁月联盟 编辑:exp 时间:2012-01-10
内核使用伙伴系统来解决内存分配引起的外部碎片问题。
一、数据结构描述
 
结构zone中的free_area数组用于描述伙伴系统,该数组的每个元素为一个free_area结构。
 
www.2cto.com
/*
 * Excerpt of struct zone: only the buddy-allocator member is shown.
 * free_area[] holds one struct free_area per allocation order
 * (indices 0 .. MAX_ORDER - 1).
 */
struct zone {
    /* ... other members omitted ... */
    struct free_area    free_area[MAX_ORDER];
    /* ... other members omitted ... */
};
www.2cto.com
/*
 * Free-block bookkeeping for a single order of the buddy allocator.
 * free_list: one list head per migrate type (MIGRATE_TYPES entries).
 * nr_free:   counter kept for this order; zeroed by zone_init_free_lists().
 */
struct free_area {/* five list types here; the per-type split is a newer addition */ 
    struct list_head    free_list[MIGRATE_TYPES]; 
    unsigned long       nr_free; 
}; 
下图为伙伴系统在管理区中的表示。
 
 
(此处原文为伙伴系统在管理区中的示意图,图片在转载时丢失。)
 
 
 
二、伙伴系统的初始化
 
伙伴系统是在初始化物理管理区的时候一并初始化的,具体实现在下面的调用链的最后一个函数中:
 
start_kernel()->setup_arch()->paging_init()->zone_sizes_init()->free_area_init_nodes()->free_area_init_node()->free_area_init_core()->init_currently_empty_zone()->zone_init_free_lists()
 
www.2cto.com
/*
 * zone_init_free_lists - reset the buddy free lists of a zone.
 * @zone: zone whose free_area[] array is initialised
 *
 * For every (order, migrate type) pair visited by
 * for_each_migratetype_order(), empty the matching free list and zero
 * the nr_free counter of that order.
 */
static void __meminit zone_init_free_lists(struct zone *zone)
{
    int order, t;

    for_each_migratetype_order(order, t) {
        INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
        zone->free_area[order].nr_free = 0;
    }
}
三、伙伴系统中数据初始化
 
将bootmem分配器中的数据回收到伙伴系统中
 
start_kernel()->mm_init()->mem_init()
 
www.2cto.com
/*
 * mem_init - hand boot-time memory over to the buddy allocator.
 *
 * Releases everything the bootmem allocator still tracks, counts the
 * reserved low-memory RAM pages, adds the highmem zones, prints the
 * memory summary and the kernel virtual memory layout, sanity-checks
 * that layout, and finally removes the low mappings.
 */
void __init mem_init(void)
{
    int codesize, reservedpages, datasize, initsize;
    int tmp;

    /* hardware specific */
    pci_iommu_alloc();

#ifdef CONFIG_FLATMEM
    BUG_ON(!mem_map);
#endif
    /* this will put all low memory onto the freelists */
    /*
     * Release the bootmem-managed memory, including the pages that hold
     * the bootmem bitmap, to the buddy system; the return value is the
     * total number of pages released.
     */
    totalram_pages += free_all_bootmem();

    reservedpages = 0;
    for (tmp = 0; tmp < max_low_pfn; tmp++)
        /*
         * Only count reserved RAM pages:
         */
        if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
            reservedpages++;

    /* initialise the highmem zones and put their pages into the buddy system */
    set_highmem_pages_init();

    /* sizes of the kernel text, data and init sections */
    codesize =  (unsigned long) &_etext - (unsigned long) &_text;
    datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
    initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

    /* report the memory figures now that initialisation is done */
    printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
            "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
        nr_free_pages() << (PAGE_SHIFT-10),
        num_physpages << (PAGE_SHIFT-10),
        codesize >> 10,
        reservedpages << (PAGE_SHIFT-10),
        datasize >> 10,
        initsize >> 10,
        (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
           );

    printk(KERN_INFO "virtual kernel memory layout:\n"
        "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
        "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
#endif
        "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
        "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
        "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
        "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
        "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
        FIXADDR_START, FIXADDR_TOP,
        (FIXADDR_TOP - FIXADDR_START) >> 10,

#ifdef CONFIG_HIGHMEM
        PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
        (LAST_PKMAP*PAGE_SIZE) >> 10,
#endif

        VMALLOC_START, VMALLOC_END,
        (VMALLOC_END - VMALLOC_START) >> 20,

        (unsigned long)__va(0), (unsigned long)high_memory,
        ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,

        (unsigned long)&__init_begin, (unsigned long)&__init_end,
        ((unsigned long)&__init_end -
         (unsigned long)&__init_begin) >> 10,

        (unsigned long)&_etext, (unsigned long)&_edata,
        ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,

        (unsigned long)&_text, (unsigned long)&_etext,
        ((unsigned long)&_etext - (unsigned long)&_text) >> 10);

    /*
     * Check boundaries twice: Some fundamental inconsistencies can
     * be detected at build time already.
     */
#define __FIXADDR_TOP (-PAGE_SIZE)
#ifdef CONFIG_HIGHMEM
    BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE  > FIXADDR_START);
    BUILD_BUG_ON(VMALLOC_END            > PKMAP_BASE);
#endif
#define high_memory (-128UL << 20)
    BUILD_BUG_ON(VMALLOC_START          >= VMALLOC_END);
#undef high_memory
#undef __FIXADDR_TOP

    /* the same layout checks again, this time with the run-time values */
#ifdef CONFIG_HIGHMEM
    BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE    > FIXADDR_START);
    BUG_ON(VMALLOC_END              > PKMAP_BASE);
#endif
    BUG_ON(VMALLOC_START                >= VMALLOC_END);
    BUG_ON((unsigned long)high_memory       > VMALLOC_START);

    if (boot_cpu_data.wp_works_ok < 0)
        test_wp_bit();

    save_pg_dir();
    /*
     * zap_low_mappings() clears the low-memory mappings: kernel threads
     * only touch kernel space and must not reach user space.  According
     * to the article, the low mappings in question are the identity
     * mapping set up for the first 8MB, which filled entries 0 and 1 of
     * the temporary kernel page global directory; the PGD entries that
     * cover user space (below 3G) are cleared here.
     */
    zap_low_mappings(true);
}
www.2cto.com
/**
 * free_all_bootmem - release free pages to the buddy allocator
 *
 * Thin wrapper that runs free_all_bootmem_core() on the bootmem data of
 * node 0.
 *
 * Returns the number of pages actually released.
 */
unsigned long __init free_all_bootmem(void)
{
    return free_all_bootmem_core(NODE_DATA(0)->bdata);
}
www.2cto.com
/*
 * free_all_bootmem_core - release one node's bootmem pages to the buddy system.
 * @bdata: bootmem descriptor of the node
 *
 * Walks the node's bootmem bitmap one machine word (BITS_PER_LONG pages)
 * at a time and frees every page whose bit is clear, then frees the pages
 * that hold the bitmap itself.
 *
 * Returns the number of pages released, or 0 when the node has no bitmap.
 */
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
    int aligned;
    struct page *page;
    unsigned long start, end, pages, count = 0;

    if (!bdata->node_bootmem_map)
        return 0;

    /* first and last page frame number covered by this node's bitmap */
    start = bdata->node_min_pfn;
    end = bdata->node_low_pfn;

    /*
     * If the start is aligned to the machines wordsize, we might
     * be able to free pages in bulks of that order.
     */
    aligned = !(start & (BITS_PER_LONG - 1));

    bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
        bdata - bootmem_node_data, start, end, aligned);

    /* release all memory tracked by this bootmem descriptor */
    while (start < end) {
        unsigned long *map, idx, vec;

        map = bdata->node_bootmem_map;
        idx = start - bdata->node_min_pfn;  /* offset from the first pfn */
        vec = ~map[idx / BITS_PER_LONG];    /* inverted word: set bit = free page */

        /*
         * Bulk path: the start is word aligned, all BITS_PER_LONG pages
         * of this word are free, and the whole run lies below 'end'.
         * Free them as one block of order ilog2(BITS_PER_LONG).
         */
        if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
            int order = ilog2(BITS_PER_LONG);   /* 5 when BITS_PER_LONG is 32 */

            /* hand the whole block to the buddy system */
            __free_pages_bootmem(pfn_to_page(start), order);
            count += BITS_PER_LONG;
        } else {
            unsigned long off = 0;

            /*
             * Slow path: a non-zero vec means free pages remain in this
             * word; 'off' is the bit index inside the word, from 0.
             */
            while (vec && off < BITS_PER_LONG) {
                if (vec & 1) {  /* this page is free */
                    page = pfn_to_page(start + off);
                    __free_pages_bootmem(page, 0);  /* free a single page */
                    count++;
                }
                vec >>= 1;  /* move on to the next page's bit */
                off++;
            }
        }
        start += BITS_PER_LONG; /* advance to the next bitmap word */
    }

    /*
     * Now release the pages occupied by the bitmap itself: convert the
     * bitmap's virtual address to its first struct page.
     */
    page = virt_to_page(bdata->node_bootmem_map);
    pages = bdata->node_low_pfn - bdata->node_min_pfn;

    /* number of pages the bitmap needs to describe that many page frames */
    pages = bootmem_bootmap_pages(pages);
    count += pages;
    while (pages--)     /* one page per iteration */
        __free_pages_bootmem(page++, 0);

    bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);

    return count;
}
www.2cto.com
/*
 * permit the bootmem allocator to evade page validation on high-order frees
 *
 * @page:  first page of the block to release
 * @order: 0 for a single page, otherwise the order of the block
 *
 * Clears the reserved flag and resets the reference count of the page(s),
 * then releases them to the buddy system.
 */
void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
{
    if (order == 0) {
        __ClearPageReserved(page);
        set_page_count(page, 0);        /* reset the reference count to 0 */
        set_page_refcounted(page);      /* then set it to 1 */
        __free_page(page);              /* release the single page */
    } else {
        int loop;

        /*
         * prefetchw() is a performance hint that asks the CPU to fetch
         * the cache line for writing ahead of use; it does not change
         * the result.
         *
         * NOTE: the loop always walks BITS_PER_LONG pages regardless of
         * 'order'; the only high-order caller shown in this article
         * passes ilog2(BITS_PER_LONG), which matches.
         */
        prefetchw(page);
        for (loop = 0; loop < BITS_PER_LONG; loop++) {
            struct page *p = &page[loop];

            if (loop + 1 < BITS_PER_LONG)
                prefetchw(p + 1);
            __ClearPageReserved(p);
            set_page_count(p, 0);
        }

        set_page_refcounted(page);      /* only the head page gets count 1 */
        /*
         * Which migrate-type list the block ends up on is decided from
         * information attached to the page itself (per the article; the
         * lookup is not shown here).
         */
        __free_pages(page, order);      /* release the block of 2^order pages */
    }
}
www.2cto.com
/*
 * set_highmem_pages_init - release all highmem zones to the buddy system.
 *
 * For every highmem zone, logs its pfn range and feeds the pages of that
 * range to the buddy system, then adds the number of highmem pages to
 * totalram_pages.
 */
void __init set_highmem_pages_init(void)
{
    struct zone *zone;
    int nid;

    for_each_zone(zone) {
        unsigned long zone_start_pfn, zone_end_pfn;

        /* only highmem zones are handled here; skip everything else */
        if (!is_highmem(zone))
            continue;

        zone_start_pfn = zone->zone_start_pfn;
        zone_end_pfn = zone_start_pfn + zone->spanned_pages;

        /* id of the node this zone belongs to */
        nid = zone_to_nid(zone);
        printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
                zone->name, nid, zone_start_pfn, zone_end_pfn);

        /* put the memory of this pfn range into the buddy system */
        add_highpages_with_active_regions(nid, zone_start_pfn,
                 zone_end_pfn);
    }
    totalram_pages += totalhigh_pages;
}
www.2cto.com
/*
 * add_highpages_with_active_regions - free a pfn range of a node's highmem.
 * @nid:       node id
 * @start_pfn: first page frame number of the range
 * @end_pfn:   end page frame number of the range (exclusive, as used by
 *             add_highpages_work_fn())
 *
 * Packs the range into a struct add_highpages_data and runs
 * add_highpages_work_fn() over every active region of the node.
 */
void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
                          unsigned long end_pfn)
{
    struct add_highpages_data data;

    data.start_pfn = start_pfn;
    data.end_pfn = end_pfn;

    /* release the pages of each active region of the node to the buddy system */
    work_with_active_regions(nid, add_highpages_work_fn, &data);
}
www.2cto.com
/*
 * work_with_active_regions - apply a callback to a node's active regions.
 * @nid:     node id
 * @work_fn: callback, called with (start_pfn, end_pfn, data) of each region
 * @data:    opaque pointer passed through to @work_fn
 *
 * Used during highmem initialisation.  The walk stops as soon as the
 * callback returns a non-zero value.
 */
void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
{
    int i;
    int ret;

    for_each_active_range_index_in_nid(i, nid) {
        ret = work_fn(early_node_map[i].start_pfn,
                  early_node_map[i].end_pfn, data);
        if (ret)
            break;
    }
}
www.2cto.com
/*
 * add_highpages_work_fn - free the highmem pages of one active region.
 * @start_pfn: first pfn of the active region
 * @end_pfn:   end pfn of the active region (exclusive)
 * @datax:     struct add_highpages_data * holding the range of interest
 *
 * Intersects the active region with the requested range and releases
 * every valid page frame of the intersection to the buddy system.
 *
 * Always returns 0, so the caller's walk over the regions never stops early.
 */
static int __init add_highpages_work_fn(unsigned long start_pfn,
                     unsigned long end_pfn, void *datax)
{
    int node_pfn;
    struct page *page;
    unsigned long final_start_pfn, final_end_pfn;
    struct add_highpages_data *data;

    data = (struct add_highpages_data *)datax;

    /* intersection of the active region with the requested range */
    final_start_pfn = max(start_pfn, data->start_pfn);
    final_end_pfn = min(end_pfn, data->end_pfn);
    if (final_start_pfn >= final_end_pfn)
        return 0;

    for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
         node_pfn++) {
        if (!pfn_valid(node_pfn))   /* skip invalid page frames */
            continue;
        page = pfn_to_page(node_pfn);   /* pfn -> struct page */
        /* initialise the page count and free the page to the buddy system */
        add_one_highpage_init(page, node_pfn);
    }

    return 0;
}
www.2cto.com
/*
 * add_one_highpage_init - release one highmem page to the buddy system.
 * @page: page to release
 * @pfn:  page frame number of @page (not used in the body)
 */
static void __init add_one_highpage_init(struct page *page, int pfn)
{
    /* drop the reserved flag: the page now belongs to dynamic memory */
    ClearPageReserved(page);
    init_page_count(page);  /* set the page count to 1 */
    __free_page(page);      /* release the page to the buddy system */
    totalhigh_pages++;      /* one more highmem page accounted for */
}
www.2cto.com
/*
 * zap_low_mappings - remove the initial low-memory page-directory entries.
 * @early: true selects the local __flush_tlb(), false selects flush_tlb_all()
 *
 * Overwrites the first KERNEL_PGD_BOUNDARY entries of swapper_pg_dir and
 * then flushes the TLB.
 */
void zap_low_mappings(bool early)
{
    int i;

    /*
     * Zap initial low-memory mappings.
     *
     * Note that "pgd_clear()" doesn't do it for
     * us, because pgd_clear() is a no-op on i386.
     */
    /*
     * Per the article: this simply resets the leading page global
     * directory entries that were set up in arch/x86/kernel/head_32.S.
     * How many entries?  (0xc0000000 >> 22) & 1023 = 768; those entries
     * cover the first 3G of virtual addresses, i.e. the user area, and
     * all of them are reset here.
     */
    for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
#ifdef CONFIG_X86_PAE
        set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
#else
        set_pgd(swapper_pg_dir+i, __pgd(0));
#endif
    }

    if (early)
        __flush_tlb();
    else
        flush_tlb_all();
}
到此,伙伴系统已经建立,并且其中已经存放了应有的内存数据。要从伙伴系统中分配内存,还必须有相应的分配和释放机制,后面的文章将总结具体的分配和释放工作。