diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/include/linux/bootmem.h bootmem/include/linux/bootmem.h
--- linux-2.5/include/linux/bootmem.h	Thu Dec  5 20:54:47 2002
+++ bootmem/include/linux/bootmem.h	Fri Dec  6 09:17:41 2002
@@ -1,5 +1,6 @@
 /*
  * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ * Segment tree-based memory reservation system, William Irwin, IBM, Oct 2001
  */
 #ifndef _LINUX_BOOTMEM_H
 #define _LINUX_BOOTMEM_H
@@ -9,6 +10,7 @@
 #include
 #include
 #include
+#include <linux/segment_tree.h>

 /*
  * simple boot-time physical memory area allocator.
@@ -30,8 +32,10 @@
 	unsigned long node_boot_start;
 	unsigned long node_low_pfn;
 	void *node_bootmem_map;
-	unsigned long last_offset;
-	unsigned long last_pos;
+	segment_tree_root_t segment_tree;
+	segment_buf_t *free_segments;
+	unsigned long nr_free_segments;
+	int recursion_depth;
 } bootmem_data_t;

 extern unsigned long __init bootmem_bootmap_pages (unsigned long);
diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/include/linux/segment_tree.h bootmem/include/linux/segment_tree.h
--- linux-2.5/include/linux/segment_tree.h	Wed Dec 31 16:00:00 1969
+++ bootmem/include/linux/segment_tree.h	Fri Dec  6 09:17:44 2002
@@ -0,0 +1,128 @@
+/*
+ * linux/include/linux/segment_tree.h
+ *
+ * Copyright (C) Oct 2001 William Irwin, IBM
+ *
+ * Segment trees augmented with length information.
+ */
+
+#ifndef _SEGMENT_TREE_H
+#define _SEGMENT_TREE_H
+
+#include <linux/treap.h>
+#include <linux/kernel.h>
+
+typedef struct segment_tree_node {
+	struct treap start;
+	struct treap length;
+} segment_tree_node_t;
+
+typedef union segment_buf {
+	segment_tree_node_t segment;
+	union segment_buf *next;
+} segment_buf_t;
+
+typedef struct segment_tree_root {
+	struct treap *start_tree;
+	struct treap *length_tree;
+} segment_tree_root_t;
+
+#define segment_length(node)	((node)->length.value)
+#define segment_start(node)	((node)->start.value)
+#define segment_end(node)	((node)->start.value + (node)->length.value - 1)
+
+#define segment_above_point(node, point) \
+	(segment_end(node) > (point))
+
+#define segment_below_point(node, point) \
+	(segment_start(node) < (point))
+
+#define segment_contains_point(node, point) \
+	(segment_start(node) <= (point) && segment_end(node) >= (point))
+
+#define segment_above(node1, node2) \
+	(segment_start(node1) > segment_end(node2))
+
+#define segment_below(node1, node2) \
+	(segment_end(node1) < segment_start(node2))
+
+#define segment_disjoint(node1, node2) \
+	(segment_above(node1, node2) || segment_below(node1, node2))
+
+#define segment_intersect(node1, node2) \
+	(segment_start(node1) <= segment_end(node2) \
+		&& segment_start(node2) <= segment_end(node1))
+
+#define segment_contains(node1, node2) \
+	(segment_start(node1) <= segment_start(node2) \
+		&& segment_end(node1) >= segment_end(node2))
+
+#define segment_set_endpoints(node, start, end) \
+	do { \
+		segment_length(node) = (end) - (start) + 1; \
+		segment_start(node) = (start); \
+	} while(0)
+
+#define segment_unite(node1, node2) \
+	segment_set_endpoints(node1, \
+		min(segment_start(node1), segment_start(node2)), \
+		max(segment_end(node1), segment_end(node2)))
+
+#define segment_union(seg_union, node1, node2) \
+	segment_set_endpoints(seg_union, \
+		min(segment_start(node1), segment_start(node2)), \
+		max(segment_end(node1), segment_end(node2)))
+
+#define segment_intersection(intersect, node1, node2) \
+	segment_set_endpoints(intersect, \
+		max(segment_start(node1), segment_start(node2)), \
+		min(segment_end(node1), segment_end(node2)))
+
+#define segment_set_start(node, start) \
+	segment_set_endpoints(node, start, segment_end(node))
+
+#define segment_set_end(node, end) \
+	segment_set_endpoints(node, segment_start(node), end)
+
+#define start_segment_treap(node) \
+	treap_entry((node), segment_tree_node_t, start)
+#define length_segment_treap(node) \
+	treap_entry((node), segment_tree_node_t, length)
+
+#define start_treap(node)	segment_start(start_segment_treap(node))
+#define end_treap(node)		segment_end(start_segment_treap(node))
+
+#define segment_length_link(node) \
+	treap_node_link(&start_segment_treap(node)->length)
+
+#define segment_start_link(node) \
+	treap_node_link(&start_segment_treap(node)->start)
+
+#define segment_delete(node) \
+	do { \
+		treap_root_delete(segment_start_link(node)); \
+		treap_root_delete(segment_length_link(node)); \
+	} while(0)
+
+unsigned segment_tree_contains_point(segment_tree_node_t *root,
+					unsigned long point);
+
+unsigned segment_tree_intersects(segment_tree_node_t *root,
+					segment_tree_node_t *segment);
+
+void segment_complement(segment_tree_node_t **segment,
+				segment_tree_node_t *to_remove,
+				segment_tree_node_t **fragment);
+
+void segment_end_split(struct treap **root,
+			unsigned long end,
+			struct treap **less,
+			struct treap **more);
+
+void segment_all_intersect(struct treap **root,
+				unsigned long start,
+				unsigned long end,
+				struct treap **intersect);
+
+#endif /* _SEGMENT_TREE_H */
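A quick illustration of the closed-interval conventions above (illustrative
only, not part of the patch; it uses only the macros just defined and leaves
the embedded treaps untouched):

	segment_tree_node_t a, b;

	segment_set_endpoints(&a, 0, 9);	/* [0,9], length 10 */
	segment_set_endpoints(&b, 10, 19);	/* [10,19]          */

	/* Adjacent closed intervals do not intersect... */
	BUG_ON(!segment_disjoint(&a, &b));
	/* ...and [0,9] lies strictly below [10,19]. */
	BUG_ON(!segment_below(&a, &b));

	segment_unite(&a, &b);			/* a becomes [0,19] */
	BUG_ON(segment_length(&a) != 20);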
diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/include/linux/treap.h bootmem/include/linux/treap.h
--- linux-2.5/include/linux/treap.h	Wed Dec 31 16:00:00 1969
+++ bootmem/include/linux/treap.h	Fri Dec  6 09:17:44 2002
@@ -0,0 +1,80 @@
+/*
+ * linux/include/linux/treap.h
+ *
+ * Copyright (C) 2001 William Irwin, IBM
+ *
+ * Simple treap implementation, following Aragon and Seidel.
+ */
+
+#ifndef _TREAP_H
+#define _TREAP_H
+
+#include <linux/kernel.h>
+#include <asm/bug.h>
+
+struct treap {
+	unsigned long priority;
+	unsigned long value;
+	struct treap *left, *right, *parent;
+	unsigned long marker;
+};
+
+#define TREAP_INIT(root) \
+	do { \
+		*(root) = NULL; \
+	} while(0)
+
+/* XXX: kill this */
+#define treap_entry(ptr, type, member) container_of(ptr, type, member)
+
+static inline struct treap **treap_node_link(struct treap *node)
+{
+	if (!node || !node->parent)
+		return NULL;
+	else if (node == node->parent->left)
+		return &node->parent->left;
+	else if (node == node->parent->right)
+		return &node->parent->right;
+
+	BUG();
+	return NULL;
+}
+
+/* XXX: un-macro-ize this */
+#define treap_find_parent_and_remove_child(tmp, parent) \
+	do { \
+		parent = (tmp)->parent; \
+		if (parent && (parent)->left == tmp) \
+			(parent)->left = NULL; \
+		else if (parent && (parent)->right == tmp) \
+			(parent)->right = NULL; \
+		else \
+			BUG_ON(parent); \
+	} while(0)
+
+/* XXX: un-macro-ize this */
+#define treap_find_leftmost_leaf(node) \
+	do { \
+		if (!node) \
+			break; \
+		while (1) { \
+			if ((node)->left) \
+				node = (node)->left; \
+			else if (node->right) \
+				node = node->right; \
+			else \
+				break; \
+		} \
+	} while(0)
+
+void treap_rotate_left(struct treap **);
+void treap_rotate_right(struct treap **);
+struct treap *treap_root_delete(struct treap **);
+void treap_insert(struct treap **, struct treap *);
+struct treap *treap_delete(struct treap **, unsigned long);
+void treap_join(struct treap **, struct treap **, struct treap **);
+void treap_split(struct treap **, unsigned long, struct treap **, struct treap **);
+
+#endif /* _TREAP_H */
diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/lib/Makefile bootmem/lib/Makefile
--- linux-2.5/lib/Makefile	Thu Dec  5 20:55:08 2002
+++ bootmem/lib/Makefile	Fri Dec  6 09:17:47 2002
@@ -11,8 +11,8 @@
 export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o \
 	   crc32.o rbtree.o radix-tree.o kobject.o

-obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o \
-	 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
+obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o treap.o \
+	 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o segment_tree.o \
 	 kobject.o

 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/lib/segment_tree.c bootmem/lib/segment_tree.c
--- linux-2.5/lib/segment_tree.c	Wed Dec 31 16:00:00 1969
+++ bootmem/lib/segment_tree.c	Fri Dec  6 09:17:47 2002
@@ -0,0 +1,267 @@
+/*
+ * linux/lib/segment_tree.c
+ *
+ * Copyright (C) Oct 2001 William Irwin, IBM
+ *
+ * Implementation of segment trees augmented with length information.
+ *
+ * In this context, "segment" refers to "line segment". In particular,
+ * I am storing closed intervals of numbers in this tree. One very
+ * important invariant maintained is that all the intervals in the
+ * tree are disjoint. This fact is actually used to help with efficient
+ * search: since they are all disjoint, they are ordered according to
+ * any representative, in particular the starting and ending points.
+ *
+ * The separate tree on length is used to help with searches for
+ * intervals of at least a particular length, and does not have
+ * any special properties otherwise.
+ */
+
+#include <linux/segment_tree.h>
+#include <linux/init.h>
+
+unsigned __init segment_tree_contains_point(segment_tree_node_t *root,
+						unsigned long point)
+{
+	struct treap *node;
+
+	if(!root)
+		return 0;
+
+	node = &root->start;
+	while(node) {
+		if(segment_contains_point(start_segment_treap(node), point))
+			return 1;
+		else if(segment_below_point(start_segment_treap(node), point))
+			node = node->right;
+		else if(segment_above_point(start_segment_treap(node), point))
+			node = node->left;
+		else
+			BUG();
+	}
+	return 0;
+}
+
+unsigned __init segment_tree_intersects(segment_tree_node_t *root,
+					segment_tree_node_t *segment)
+{
+	struct treap *node;
+
+	if(!root)
+		return 0;
+
+	node = &root->start;
+	while(node) {
+		if(segment_intersect(start_segment_treap(node), segment))
+			return 1;
+		else if(segment_below(start_segment_treap(node), segment))
+			node = node->right;
+		else if(segment_above(start_segment_treap(node), segment))
+			node = node->left;
+		else
+			BUG();
+	}
+	return 0;
+}
+
+/*
+ * There are five cases here.
+ * (1) the segments are disjoint
+ * (2) the entire segment is removed
+ * (3) something from the beginning of the segment is removed
+ * (4) something from the end of the segment is removed
+ * (5) the segment is split into two fragments
+ */
+void __init segment_complement(segment_tree_node_t **segment,
+				segment_tree_node_t *to_remove,
+				segment_tree_node_t **fragment)
+{
+
+	if(segment_disjoint(*segment, to_remove)) {
+
+		*fragment = NULL;
+
+	} else if(segment_contains(to_remove, *segment)) {
+
+		*segment = *fragment = NULL;
+
+	} else if(segment_start(*segment) >= segment_start(to_remove)) {
+		unsigned long start, end;
+		*fragment = NULL;
+		start = segment_end(to_remove) + 1;
+		end = segment_end(*segment);
+		segment_set_endpoints(*segment, start, end);
+
+	} else if(segment_end(*segment) <= segment_end(to_remove)) {
+		unsigned long start, end;
+		*fragment = NULL;
+		start = segment_start(*segment);
+		end = segment_start(to_remove) - 1;
+		segment_set_endpoints(*segment, start, end);
+
+	} else {
+		unsigned long start_seg, end_seg, start_frag, end_frag;
+
+		start_seg = segment_start(*segment);
+		end_seg = segment_start(to_remove) - 1;
+
+		start_frag = segment_end(to_remove) + 1;
+		end_frag = segment_end(*segment);
+
+		segment_set_endpoints(*segment, start_seg, end_seg);
+		segment_set_endpoints(*fragment, start_frag, end_frag);
+
+	}
+}
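A worked example of case (5), the split case (illustrative only, not part of
the patch): removing [4,7] from the available interval [0,9] leaves [0,3]
behind the original pointer and returns the fragment [8,9]; in the disjoint
case (1) the fragment would instead come back NULL.

	segment_tree_node_t avail, hole, frag;
	segment_tree_node_t *seg = &avail, *rest = &frag;

	segment_set_endpoints(&avail, 0, 9);	/* available [0,9] */
	segment_set_endpoints(&hole, 4, 7);	/* to_remove [4,7] */

	segment_complement(&seg, &hole, &rest);
	/* now avail is [0,3] and frag is [8,9] */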
+/*
+ * Efficiently determining all possible line segments which intersect
+ * with another line segment requires splitting the start treap according
+ * to the endpoints. This is a derived key, so it unfortunately may not
+ * be shared with the generic treap implementation.
+ */
+void __init segment_end_split(struct treap **root,
+				unsigned long end,
+				struct treap **less,
+				struct treap **more)
+{
+	struct treap **tree = root;
+	struct treap sentinel;
+
+	sentinel.value = end;
+	sentinel.priority = ULONG_MAX;
+	sentinel.left = sentinel.right = sentinel.parent = NULL;
+
+	while(1) {
+		if(!*root) {
+			*root = &sentinel;
+			goto finish;
+		} else if(end > end_treap(*root) && !(*root)->right) {
+			(*root)->right = &sentinel;
+			sentinel.parent = *root;
+			root = &(*root)->right;
+			goto upward;
+		} else if(end <= end_treap(*root) && !(*root)->left) {
+			(*root)->left = &sentinel;
+			sentinel.parent = *root;
+			root = &(*root)->left;
+			goto upward;
+		} else if(end > end_treap(*root))
+			root = &(*root)->right;
+		else /* end <= end_treap(*root) */
+			root = &(*root)->left;
+	}
+
+upward:
+	while(1) {
+		if((*root)->left && (*root)->left->priority > (*root)->priority)
+			treap_rotate_right(root);
+		else if((*root)->right
+				&& (*root)->right->priority > (*root)->priority)
+			treap_rotate_left(root);
+
+		if(!(*root)->parent)
+			goto finish;
+		else if(!(*root)->parent->parent)
+			root = tree;
+		else if((*root)->parent->parent->left == (*root)->parent)
+			root = &(*root)->parent->parent->left;
+		else if((*root)->parent->parent->right == (*root)->parent)
+			root = &(*root)->parent->parent->right;
+	}
+
+finish:
+	*less = (*root)->left;
+	*more = (*root)->right;
+
+	if(*less) (*less)->parent = NULL;
+	if(*more) (*more)->parent = NULL;
+
+	*root = NULL;
+}
+
+void __init segment_all_intersect(struct treap **root,
+					unsigned long start,
+					unsigned long end,
+					struct treap **intersect)
+{
+	struct treap *less_end, *more_end, *more_start, *less_start;
+	less_start = more_start = NULL;
+
+	if(start) {
+		less_end = more_end = NULL;
+		segment_end_split(root, start, &less_end, &more_end);
+		treap_split(&more_end, end + 1, &less_start, &more_start);
+		*root = NULL;
+		treap_join(root, &less_end, &more_start);
+	} else {
+		treap_split(root, end + 1, &less_start, &more_start);
+		*root = more_start;
+	}
+	*intersect = less_start;
+}
+
+#if 0
+/*
+ * If there is a reason to visualize the trees, the following routines
+ * may serve as examples of how they may be rendered using dot from
+ * AT&T's graphviz.
+ */
+
+extern void early_printk(const char *fmt, ...);
+
+void print_ptr_graph(struct treap **root) {
+	if(!*root)
+		return;
+	else if(!(*root)->marker) {
+		segment_tree_node_t *seg = start_segment_treap(*root);
+		(*root)->marker = 1UL;
+		early_printk("x%p [label=\"%p, start=%lu,\\nlength=%lu\"];\n",
+			*root, *root, segment_start(seg), segment_length(seg));
+		if((*root)->parent)
+			early_printk("x%p -> x%p [label=\"parent\"];\n",
+				*root, (*root)->parent);
+		if((*root)->left)
+			early_printk("x%p -> x%p [label=\"left\"];\n",
+				*root, (*root)->left);
+		if((*root)->right)
+			early_printk("x%p -> x%p [label=\"right\"];\n",
+				*root, (*root)->right);
+
+		print_ptr_graph(&(*root)->parent);
+		print_ptr_graph(&(*root)->left);
+		print_ptr_graph(&(*root)->right);
+		(*root)->marker = 0UL;
+	}
+	/*
+	 * This is no good for cycle detection since we also traverse
+	 * the parent links. It's -very- cyclic with those.
+	 */
+}
+
+static void print_length_graph(struct treap **root) {
+	if(!*root)
+		return;
+	else if(!(*root)->marker) {
+		segment_tree_node_t *seg = length_segment_treap(*root);
+		(*root)->marker = 1UL;
+		early_printk("x%p [label=\"%p: start=%lu,\\nlength=%lu\"];\n",
+			*root, *root, segment_start(seg), segment_length(seg));
+		if((*root)->parent)
+			early_printk("x%p -> x%p [label=\"parent\"];\n",
+				*root, (*root)->parent);
+		if((*root)->left)
+			early_printk("x%p -> x%p [label=\"left\"];\n",
+				*root, (*root)->left);
+		if((*root)->right)
+			early_printk("x%p -> x%p [label=\"right\"];\n",
+				*root, (*root)->right);
+
+		print_length_graph(&(*root)->parent);
+		print_length_graph(&(*root)->left);
+		print_length_graph(&(*root)->right);
+		(*root)->marker = 0UL;
+	}
+}
+#endif /* 0 */
diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/lib/treap.c bootmem/lib/treap.c
--- linux-2.5/lib/treap.c	Wed Dec 31 16:00:00 1969
+++ bootmem/lib/treap.c	Fri Dec  6 09:17:47 2002
@@ -0,0 +1,251 @@
+/*
+ * linux/lib/treap.c
+ *
+ * Copyright (C) 2001 William Irwin, IBM
+ *
+ * Simple treap implementation, following Aragon and Seidel.
+ *
+ * Treaps are a simple binary search tree structure, with a twist that
+ * radically simplifies their management. The twist is that they keep
+ * both the search key and a randomly generated priority. They are then
+ * both heap-ordered according to the priority and binary search tree
+ * ordered according to the search keys. They are specifically designed
+ * for, and reputed to be effective at, range tree and segment tree
+ * structures according to Knuth, and at dynamic sets according to the
+ * Blelloch/Reid-Miller paper.
+ *
+ * The rotations themselves are simple, and they are done less often
+ * than for some kinds of trees; splay trees were specifically
+ * mentioned by Knuth. The decision process as to when to perform a
+ * rotation is simplified by the heap structure. Rotations are done in
+ * two instances: when rotating a node down to a leaf position before
+ * deletion, and in restoring the heap ordering after an insertion.
+ *
+ * Treaps also support fast splitting and joining operations, which
+ * make them convenient for interval searches.
+ *
+ * One important fact to observe is that when joining, all of the
+ * members of the left tree must be less than all the members of
+ * the right tree, or otherwise the search tree ordering breaks.
+ */
+
+#include <linux/treap.h>
+#include <linux/init.h>
+
+/*
+ * The diagram according to which the assignments in rotation are done:
+ *
+ *        T                    T
+ *        |                    |
+ *        y     <- left        x
+ *       / \                  / \
+ *      x   C    right ->    A   y
+ *     / \                      / \
+ *    A   B                    B   C
+ *
+ * Some of these assignments are not necessary, as the edges do
+ * not change. In these cases the assignments are retained as comments.
+ */ + +void __init treap_rotate_left(struct treap **root) +{ + struct treap *x, *y, *B, *T; + /* struct treap *A, *C; */ + + if(*root) { + x = *root; + T = x->parent; + y = x->right; + if(y) { + if(T && T->left == x) T->left = y; + if(T && T->right == x) T->right = y; + + y->parent = T; + *root = y; + + /* A = x->left; */ + + B = y->left; + + /* C = y->right; */ + + y->left = x; + x->parent = y; + + /* + x->left = A; + if(A) A->parent = x; + */ + + x->right = B; + if(B) B->parent = x; + + /* + y->right = C; + if(C) C->parent = y; + */ + } + } +} + +void __init treap_rotate_right(struct treap **root) +{ + struct treap *x, *y, *B, *T; + /* struct treap *A, *C; */ + + if(*root) { + y = *root; + T = y->parent; + x = y->left; + if(x) { + if(T && T->left == y) T->left = x; + if(T && T->right == y) T->right = x; + + x->parent = T; + *root = x; + + /* A = x->left; */ + + B = x->right; + + /* C = y->right; */ + + x->right = y; + y->parent = x; + + /* + x->left = A; + if(A) A->parent = x; + */ + + y->left = B; + if(B) B->parent = y; + + /* + y->right = C; + if(C) C->parent = y; + */ + } + } +} + +struct treap * __init treap_root_delete(struct treap **root) +{ + struct treap *tmp; + + while(1) { + + if(!root || !*root) return NULL; + else if(!(*root)->left && !(*root)->right) { + tmp = *root; + *root = tmp->parent = NULL; + return tmp; + } else if(!(*root)->left) { + treap_rotate_left(root); + root = &(*root)->left; + } else if(!(*root)->right) { + treap_rotate_right(root); + root = &(*root)->right; + } else if((*root)->left->priority > (*root)->right->priority) { + treap_rotate_right(root); + root = &(*root)->right; + } else { + treap_rotate_left(root); + root = &(*root)->left; + } + } +} + +void __init treap_insert(struct treap **root, struct treap *node) +{ + struct treap **tree = root; + node->left = node->right = node->parent = NULL; + + while(1) { + if(!*root) { + *root = node; + return; + } else if(node->value <= (*root)->value && !(*root)->left) { + (*root)->left = node; + node->parent = *root; + root = &(*root)->left; + break; + } else if(node->value > (*root)->value && !(*root)->right) { + (*root)->right = node; + node->parent = *root; + root = &(*root)->right; + break; + } else if(node->value <= (*root)->value) { + root = &(*root)->left; + } else { /* node->value > (*root)->value */ + root = &(*root)->right; + } + } + while(1) { + if(!*root) return; + else if((*root)->left + && (*root)->left->priority > (*root)->priority) + treap_rotate_right(root); + else if((*root)->right + && (*root)->right->priority > (*root)->priority) + treap_rotate_left(root); + + if(!(*root)->parent) + return; + else if(!(*root)->parent->parent) + root = tree; + else if((*root)->parent == (*root)->parent->parent->left) + root = &(*root)->parent->parent->left; + else if((*root)->parent == (*root)->parent->parent->right) + root = &(*root)->parent->parent->right; + + } +} + +struct treap * __init treap_delete(struct treap **root, unsigned long k) +{ + while(1) { + if(!*root) return NULL; + else if(k < (*root)->value) root = &(*root)->left; + else if(k > (*root)->value) root = &(*root)->right; + else return treap_root_delete(root); + } +} + +void __init treap_split(struct treap **root, + unsigned long k, + struct treap **less, + struct treap **more) +{ + struct treap sentinel; + + sentinel.value = k; + sentinel.priority = ULONG_MAX; + sentinel.parent = sentinel.left = sentinel.right = NULL; + + treap_insert(root, &sentinel); + *less = (*root)->left; + *more = (*root)->right; + + if(*less) (*less)->parent = NULL; + 
if(*more) (*more)->parent = NULL;
+
+	*root = NULL;
+}
+
+void __init treap_join(struct treap **root,
+			struct treap **left,
+			struct treap **right)
+{
+	struct treap sentinel;
+	sentinel.priority = 0UL;
+	sentinel.left = *left;
+	sentinel.right = *right;
+	sentinel.parent = NULL;
+
+	if(*left) (*left)->parent = &sentinel;
+	if(*right) (*right)->parent = &sentinel;
+
+	*root = &sentinel;
+	treap_root_delete(root);
+}
diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/mm/bootmem.c bootmem/mm/bootmem.c
--- linux-2.5/mm/bootmem.c	Thu Dec  5 20:55:08 2002
+++ bootmem/mm/bootmem.c	Fri Dec  6 09:17:47 2002
@@ -3,8 +3,9 @@
  *
  *  Copyright (C) 1999 Ingo Molnar
  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ *  Segment tree memory reservation system, William Irwin, IBM, Oct 2001
  *
- * simple boot-time physical memory area allocator and
+ * Simple boot-time physical memory area allocator and
  * free memory collector. It's used to deal with reserved
  * system memory and memory holes as well.
  */
@@ -16,358 +17,1030 @@
 #include
 #include
 #include
+#include <linux/segment_tree.h>
 #include
 #include

 /*
- * Access to this subsystem has to be serialized externally. (this is
- * true for the boot process anyway)
+ * Design notes:
+ *
+ * This design was arrived at by considering four principal concerns,
+ * beyond properly representing discontiguous memory machines:
+ *
+ * (1) Machines on which the physical address space is highly fragmented.
+ * (2) Machines where nodes' memory fragments may be interleaved.
+ * (3) Machines whose physical address space layouts are irregular.
+ * (4) Machines requiring heavy boot-time memory reservation activity.
+ *
+ * These design concerns led to an implementation which represents
+ * available physical memory explicitly in terms of intervals, to save
+ * space, and which utilizes an efficient search structure. These
+ * design concerns may not be universally important; however, small
+ * benefits should be seen even on low-memory machines, or machines
+ * without significant boot-time memory reservation activity.
+ *
+ * Concern (3) is perhaps the principal concern. In this situation,
+ * there is very little prior knowledge of memory range to node
+ * mappings, so perhaps a large portion of the work the bootmem
+ * allocator is intended to do must be done "up front" when bitmaps
+ * associated with memory ranges are used to represent availability
+ * information. While it is possible to use bitmaps for that purpose,
+ * it is my belief that the reduced space overhead of the segment
+ * trees and the obliviousness of their storage management with
+ * respect to the address ranges they represent is advantageous.
+ *
+ * In order to motivate how (2) is addressed, the notion of
+ * "residency" is useful. When a memory range is associated with
+ * a node, only a certain portion of it is actually available.
+ * The ratio of available memory to the size of the memory range
+ * being tracked, sizeof(available memory)/sizeof(memory in map),
+ * is what I call the residency of the range. When the map of the
+ * available memory requires a contiguous range of memory that is
+ * a larger proportion of the range of memory being tracked than
+ * the residency of that range, the algorithm can no longer
+ * properly function.
+ *
+ * So to address that, a representation has been chosen which does
+ * not grow with the size of the range of memory being represented.
+ * The residency requirements of the bitmap-based representation
+ * are 1/(8*sizeof(page)) on byte-addressed machines, but the range
+ * set representation has no specific residency requirements:
+ * segment pools need not be drawn from a contiguous range of memory
+ * larger than the combined size of a header for tracking all the
+ * segment pools and the size of a single range structure. Dynamic
+ * addition of segment pools is not implemented here yet.
+ */
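To put numbers to the residency argument above (illustrative arithmetic,
assuming 4 KiB pages): a bitmap tracking a 1 GiB physical range needs
2^30 / (8 * 4096) = 32 KiB of contiguous memory resident inside that range,
i.e. residency of at least 1/32768. If the range holds only a few scattered
pages of actual RAM, no such contiguous run exists, while the segment pool
below needs only a single page, placed anywhere.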
+
+/*
+ * Access to this subsystem has to be serialized externally. (This is
+ * true for the boot process anyway.)
+ */
+
+/*
+ * Alignment has to be a power of 2 value.
+ * These macros abstract out common address calculations for alignments.
+ */
+#define RND_DN(x,n) ((x) & ~((n)-1))
+#define RND_UP(x,n) RND_DN((x) + (n) - 1, n)
+#define DIV_DN(x,n) ((x) / (n))
+#define DIV_UP(x,n) DIV_DN((x) + ((n) - 1), n)
+
+
+/*
+ * The number of segments to start with.
+ */
+#define NR_SEGMENTS (PAGE_SIZE/sizeof(segment_buf_t))
+
+/*
+ * The minimum number of segments to allow in the free segment pool.
+ */
+#define MIN_FREE_SEGMENTS 4
+
+/*
+ * The highest and lowest page frame numbers on the system.
+ * These refer to physical addresses backed by memory regardless
+ * of runtime availability.
  */
 unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;

-/* return the number of _pages_ that will be allocated for the boot bitmap */
-unsigned long __init bootmem_bootmap_pages (unsigned long pages)
+/*
+ * A fixed value is a poor choice of random seed, but it gives
+ * deterministic behavior during debugging. Oddly enough it does
+ * not seem to damage the structure of the trees.
+ */
+static unsigned long __initdata random_seed = 1UL;
+
+/*
+ * Park-Miller random number generator, using Schrage's technique
+ * for overflow handling (m = 2^31 - 1 = a*q + r, q = m/a, r = m%a).
+ */
+static unsigned long __init rand(void)
+{
+	unsigned long a = 16807;
+	unsigned long q = 127773;
+	unsigned long r = 2836;
+	unsigned long k;
+
+	k = random_seed / q;
+	random_seed = a*(random_seed - k*q) - r*k;
+	return random_seed;
+}
+
+/*
+ * Initialize the segment pool, which occupies node_bootmem_map.
+ * This is the memory from which the tree nodes tracking available
+ * memory are allocated.
+ */
+static void __init segment_pool_init(bootmem_data_t *bdata)
+{
+	unsigned k;
+	segment_buf_t *segment_pool;
+
+	segment_pool = (segment_buf_t *)bdata->node_bootmem_map;
+
+	for (k = 0; k < NR_SEGMENTS - 1; ++k)
+		segment_pool[k].next = &segment_pool[k+1];
+	segment_pool[NR_SEGMENTS-1].next = NULL;
+
+	bdata->free_segments = segment_pool;
+	bdata->nr_free_segments = NR_SEGMENTS;
+}
+
+/*
+ * Allocates a tree node from a node's segment pool, initializing the
+ * whole of the memory block to zeroes.
+ */
+static segment_tree_node_t * __init segment_alloc(bootmem_data_t *bdata)
+{
+	segment_tree_node_t *segment;
+
+	segment = (segment_tree_node_t *)bdata->free_segments;
+
+	/*
+	 * Even though we're stealing pages, we've backed ourselves
+	 * into a corner at this point and can no longer proceed.
+	 */
+	if (!bdata->free_segments)
+		return NULL;
+
+	bdata->free_segments = bdata->free_segments->next;
+	bdata->nr_free_segments--;
+
+	memset(segment, 0, sizeof(segment_tree_node_t));
+
+	return segment;
+}
+
+/*
+ * Dynamically refill the segment pool, with prevention of infinite
+ * recursion by means of a flag in the bootmem control data.
+ */
+static void * __init __alloc_bootmem_core( bootmem_data_t *,
+						unsigned long,
+						unsigned long,
+						unsigned long);
+static void __init segment_pool_refill(bootmem_data_t *bdata)
+{
+	int k;
+	segment_buf_t *new_segments, *end_segment;
+
+	if (bdata->nr_free_segments >= MIN_FREE_SEGMENTS)
+		return;
+
+	/*
+	 * This prevents segment_pool_refill() from recursing
+	 * indefinitely: if this check fails, refilling proceeds with
+	 * the flag raised, and at the next level of recursion the
+	 * check succeeds and terminates the recursion.
+	 */
+	if (bdata->recursion_depth)
+		return;
+
+	bdata->recursion_depth++;
+
+	/*
+	 * The choice of 0 as a goal always prevents additional
+	 * fragmentation of the segment tree, as it's the lowest
+	 * possible starting goal address.
+	 */
+	new_segments = __alloc_bootmem_core(bdata, PAGE_SIZE, 0, 0);
+
+	if (!new_segments)
+		goto out;
+
+	for (k = 0; k < NR_SEGMENTS - 1; ++k)
+		new_segments[k].next = &new_segments[k+1];
+	new_segments[NR_SEGMENTS-1].next = NULL;
+
+	bdata->nr_free_segments += NR_SEGMENTS;
+	end_segment = bdata->free_segments;
+
+	if (!end_segment) {
+		bdata->free_segments = new_segments;
+		goto out;
+	}
+
+	while (end_segment->next)
+		end_segment = end_segment->next;
+
+	end_segment->next = new_segments;
+
+out:
+	bdata->recursion_depth--;
+}
+
+/*
+ * Convenience operation to insert a tree node into both
+ * of the segment trees associated with a node. The randomized
+ * priorities are used here.
+ */
+static void __init segment_insert(segment_tree_root_t *root,
+					segment_tree_node_t *node)
 {
-	unsigned long mapsize;
+	node->start.priority = rand();
+	node->length.priority = rand();
+	treap_insert(&root->start_tree, &node->start);
+	treap_insert(&root->length_tree, &node->length);
+}

-	mapsize = (pages+7)/8;
-	mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK;
-	mapsize >>= PAGE_SHIFT;
+/*
+ * Returns a segment tree node to the node-local pool of available
+ * tree nodes.
+ */
+static void __init segment_free(bootmem_data_t *bdata,
+				segment_tree_node_t *node)
+{
+	segment_buf_t *tmp;

-	return mapsize;
+	if (!node)
+		return;
+
+	tmp = (segment_buf_t *)node;
+	tmp->next = bdata->free_segments;
+	bdata->free_segments = tmp;
+	bdata->nr_free_segments++;
+}
+
+/*
+ * Return the number of _pages_ that will be allocated for the bootmem
+ * segment pool. Its sole purpose is to warn callers of the bootmem
+ * interface in advance of its size, so that a suitably large range of
+ * physical memory may be found to hold it.
+ */
+unsigned long __init bootmem_bootmap_pages (unsigned long pages)
+{
+	return DIV_UP(NR_SEGMENTS*sizeof(segment_buf_t), PAGE_SIZE);
 }
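For scale (illustrative arithmetic, assuming 32-bit longs and 4 KiB pages):
sizeof(segment_buf_t) equals sizeof(segment_tree_node_t), two struct treap
of six unsigned longs each, i.e. 48 bytes, so NR_SEGMENTS = 4096/48 = 85 and
bootmem_bootmap_pages() reports DIV_UP(85 * 48, 4096) = 1 page, independent
of its argument.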
 /*
  * Called once to set up the allocator itself.
+ * Its responsibilities are to manipulate the bootmem_data_t within
+ * a node, initializing its address range and node-local segment
+ * pool fields. It is supposed to calculate the amount of memory
+ * required for the node_bootmem_map, but this is not possible
+ * without a change of interface.
  */
 static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
 	unsigned long mapstart, unsigned long start, unsigned long end)
 {
 	bootmem_data_t *bdata = pgdat->bdata;
-	unsigned long mapsize = ((end - start)+7)/8;

 	pgdat->pgdat_next = pgdat_list;
 	pgdat_list = pgdat;

-	mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);
 	bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
 	bdata->node_boot_start = (start << PAGE_SHIFT);
 	bdata->node_low_pfn = end;
+	bdata->nr_free_segments = 0;
+	bdata->recursion_depth = 0;

 	/*
 	 * Initially all pages are reserved - setup_arch() has to
 	 * register free RAM areas explicitly.
 	 */
-	memset(bdata->node_bootmem_map, 0xff, mapsize);
+	bdata->segment_tree.start_tree = NULL;
+	bdata->segment_tree.length_tree = NULL;
+	segment_pool_init(bdata);

-	return mapsize;
+	return RND_UP(NR_SEGMENTS*sizeof(segment_buf_t), PAGE_SIZE);
 }

 /*
- * Marks a particular physical memory range as unallocatable. Usable RAM
- * might be used for boot-time allocations - or it might get added
- * to the free page pool later on.
+ * reserve_bootmem_core marks a particular segment of physical
+ * memory as unavailable. Available memory might be used for boot-time
+ * allocations, or it might be made available again later on.
+ *
+ * Its behavior is to mark the specified range of physical memory
+ * as unavailable, irrespective of alignment constraints (in contrast
+ * to prior incarnations, which page-aligned the starting and ending
+ * addresses of the unavailable interval of memory).
  */
-static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
+static void __init reserve_bootmem_core(bootmem_data_t *bdata,
+					unsigned long addr, unsigned long size)
 {
-	unsigned long i;
+	unsigned long start;
+	unsigned long end;
+	segment_tree_node_t split_segment, segment;
+	segment_tree_node_t reserved_left, reserved_right;
+	segment_tree_node_t *multiple_left, *multiple_right;
+	struct treap *tmp, *parent, *intersect;
+
+	/*
+	 * The interval is kept at byte granularity; partially
+	 * reserved pages are ultimately considered fully reserved.
+	 */
+	start = addr;
+	end = start + size - 1;
+
+	segment_set_endpoints(&segment, start, end);
+
+	segment_all_intersect(&bdata->segment_tree.start_tree,
+				start, end, &intersect);
+
+	/*
+	 * If the set of intersecting intervals is empty, report
+	 * the entire interval as multiply-reserved. Then the
+	 * condition of the loop ensures a proper exit will follow.
+	 */
+	if (!intersect)
+		printk(KERN_WARNING "the interval [%lu, %lu] "
+			"was multiply reserved (!intersect)\n",
+			segment_start(&segment),
+			segment_end(&segment));
+
 	/*
-	 * round up, partially reserved pages are considered
-	 * fully reserved.
+	 * For error-checking, this must be called only for a single
+	 * node per reservation. The next step in strict error checking
+	 * would be to track the fragments of the interval to reserve
+	 * that do not lie within any available interval and then report
+	 * them as multiply-reserved.
+	 *
+	 * Unfortunately, error checking that way appears to require
+	 * unbounded allocations in order to maintain the set of multiply
+	 * reserved intervals, so it is not entirely robust.
+	 *
+	 * For the moment, a cruder form of error checking is done:
+	 * if the available interval does not contain the interval
+	 * to be reserved, then the complement of the reserved
+	 * interval with respect to the available interval is reported
+	 * as multiply reserved.
This may multiply report multiply + * reserved ranges, but it is still less verbose than the + * mechanism used in the bitmap-based allocator. */ - unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE; - unsigned long eidx = (addr + size - bdata->node_boot_start + - PAGE_SIZE-1)/PAGE_SIZE; - unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; - if (!size) BUG(); + /* + * Destructive post-order traversal of the set of + * intersecting intervals. + */ + tmp = intersect; + treap_find_leftmost_leaf(tmp); + while (tmp) { + segment_tree_node_t *fragment = &split_segment; + segment_tree_node_t *avail = start_segment_treap(tmp); + treap_find_parent_and_remove_child(tmp, parent); + + multiple_left = &reserved_left; + multiple_right = &reserved_right; + + if (!segment_contains(avail, &segment)) { + segment_set_endpoints(multiple_left, + segment_start(&segment), + segment_end(&segment)); + segment_complement(&multiple_left, avail, + &multiple_right); + if (multiple_left) + printk(KERN_WARNING "the interval [%lu, %lu] " + " was multiply reserved (left)\n", + segment_start(multiple_left), + segment_end(multiple_left)); + if (multiple_right) + printk(KERN_WARNING "the interval [%lu, %lu] " + " was multiply reserved (right)\n", + segment_start(multiple_right), + segment_end(multiple_right)); + } + + if (!treap_root_delete(segment_length_link(tmp))) + treap_root_delete(&bdata->segment_tree.length_tree); - if (sidx < 0) - BUG(); - if (eidx < 0) - BUG(); - if (sidx >= eidx) - BUG(); - if ((addr >> PAGE_SHIFT) >= bdata->node_low_pfn) - BUG(); - if (end > bdata->node_low_pfn) - BUG(); - for (i = sidx; i < eidx; i++) - if (test_and_set_bit(i, bdata->node_bootmem_map)) - printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); + segment_complement(&avail, &segment, &fragment); + + if (!avail) + segment_free(bdata, start_segment_treap(tmp)); + else + segment_insert(&bdata->segment_tree, avail); + + if (fragment) { + + avail = segment_alloc(bdata); + BUG_ON(!avail); + + segment_set_endpoints(avail, segment_start(fragment), + segment_end(fragment)); + segment_insert(&bdata->segment_tree, avail); + } + + tmp = parent; + treap_find_leftmost_leaf(tmp); + } } -static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) +/* + * free_bootmem_core marks a particular segment of the physical + * address space as available. Its semantics are to make the range + * of addresses available, irrespective of alignment constraints. + */ +static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, + unsigned long size) { - unsigned long i; - unsigned long start; + unsigned long start, end; + segment_tree_node_t segment, *avail, intersection, freed; + struct treap *tmp, *parent, *intersect = NULL; + + start = addr; + end = start + size - 1; + + segment_set_endpoints(&segment, start, end); + segment_set_endpoints(&freed, start, end); + + segment_all_intersect(&bdata->segment_tree.start_tree, + start ? start - 1 : start, end + 1, &intersect); + /* - * round down end of usable mem, partially free pages are - * considered reserved. + * Error checking here is simple: + * If the available segment and the segment being freed truly + * intersect, their intersection should be reported as multiply + * made available. */ - unsigned long sidx; - unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE; - unsigned long end = (addr + size)/PAGE_SIZE; - - if (!size) BUG(); - if (end > bdata->node_low_pfn) - BUG(); /* - * Round up the beginning of the address. 
+	 * Destructive post-order traversal of the set of intervals
+	 * intersecting with the freed interval expanded by one. This
+	 * provides for merging of available intervals, as all the
+	 * adjacent intervals are united with the newly available interval.
 	 */
-	start = (addr + PAGE_SIZE-1) / PAGE_SIZE;
-	sidx = start - (bdata->node_boot_start/PAGE_SIZE);
+	tmp = intersect;
+	treap_find_leftmost_leaf(tmp);
+	while (tmp) {
+
+		avail = start_segment_treap(tmp);
+		treap_find_parent_and_remove_child(tmp, parent);
+
+		if (segment_intersect(&freed, avail)) {
+			segment_intersection(&intersection, &freed, avail);
+			printk(KERN_WARNING "the interval [%lu, %lu] "
+				"was multiply made available\n",
+				segment_start(&intersection),
+				segment_end(&intersection));
+		}

-	for (i = sidx; i < eidx; i++) {
-		if (!test_and_clear_bit(i, bdata->node_bootmem_map))
-			BUG();
+		segment_unite(&segment, avail);
+
+		if (!treap_root_delete(segment_length_link(tmp)))
+			treap_root_delete(&bdata->segment_tree.length_tree);
+
+		segment_free(bdata, avail);
+
+		tmp = parent;
+		treap_find_leftmost_leaf(tmp);
 	}
+
+	avail = segment_alloc(bdata);
+	BUG_ON(!avail);
+
+	segment_set_endpoints(avail, segment_start(&segment),
+				segment_end(&segment));
+
+	segment_insert(&bdata->segment_tree, avail);
 }

 /*
- * We 'merge' subsequent allocations to save space. We might 'lose'
- * some fraction of a page if allocations cannot be satisfied due to
- * size constraints on boxes where there is physical RAM space
- * fragmentation - in these cases (mostly large memory boxes) this
- * is not a problem.
+ * The terms are borrowed from linear programming.
+ * A feasible line segment is one which contains a subinterval
+ * aligned on the appropriate boundary of sufficient length.
 *
- * On low memory boxes we get it right in 100% of the cases.
+ * The objective function is the magnitude of the least residue
+ * of the smallest aligned address within the subinterval minus the
+ * goal, modulo the size of the address space. A conditional is used
+ * instead of a remainder so as to avoid the overhead of division.
+ *
+ * The idea here is to iterate over the feasible set and minimize
+ * the objective function (by exhaustive search). The search space
+ * is "thinned" prior to the iteration by using the heuristic that
+ * the interval must be at least of the length requested, though
+ * that is not sufficient because of alignment constraints.
 */
+#define FEASIBLE(seg, len, align)					\
+(									\
+	(segment_end(seg) >= RND_UP(segment_start(seg), align))	\
+		&&							\
+	((segment_end(seg) - RND_UP(segment_start(seg), align)) > (len))\
+)
+
+#define STARTS_BELOW(seg,goal,align,len) \
+	(RND_UP(segment_start(seg), align) <= (goal))
+
+#define ENDS_ABOVE(seg, goal, align, len) \
+	((segment_end(seg) > (goal)) && ((segment_end(seg) - (goal)) > (len)))
+
+#define GOAL_WITHIN(seg,goal,align,len) \
+	(STARTS_BELOW(seg,goal,align,len) && ENDS_ABOVE(seg,goal,align,len))
+
+#define GOAL_ABOVE(seg, goal, align) \
+	((goal) > segment_end(seg))
+
+#define DISTANCE_BELOW(seg, goal, align) \
+	(segment_start(seg) - (goal))
+
+#define DISTANCE_ABOVE(seg, goal, align) \
+	(((ULONG_MAX - (goal)) + 1) + segment_start(seg))
+
+#define OBJECTIVE(seg, goal, align, len)		\
+( GOAL_WITHIN(seg,goal,align,len)			\
+	? 0UL						\
+	: (						\
+		GOAL_ABOVE(seg, goal, align)		\
+		? DISTANCE_ABOVE(seg, goal, align)	\
+		: DISTANCE_BELOW(seg, goal, align)	\
+	)						\
+)
+
+#define UNVISITED 0
+#define LEFT_SEARCHED 1
+#define RIGHT_SEARCHED 2
+#define VISITED 3
+
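The wrap-around distance that OBJECTIVE computes is easiest to see with
small numbers (illustrative sketch, not part of the patch; goal = 100,
align = 1, small len):

	/* candidate segment -> objective value
	 * [ 90,110]  ->  0                goal lies inside the segment
	 * [120,140]  ->  120 - 100 = 20   nearest start above the goal
	 * [ 10, 60]  ->  ULONG_MAX - 89   wraps past the top of the
	 *                                 address space; last resort
	 */

Minimizing the objective therefore prefers the lowest feasible start at or
above the goal, falling back to the lowest start overall.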
 /*
- * alignment has to be a power of 2 value.
+ * __alloc_bootmem_core attempts to satisfy reservation requests
+ * of a certain size with alignment constraints, so that the beginning
+ * of the allocated line segment is as near as possible to the goal
+ * in the following sense:
+ *
+ * The beginning of the allocated line segment is either the lowest
+ * possible address above the goal, or the lowest possible address
+ * overall. This actually has a simple notion of distance, namely
+ * (start - goal) % (MAX_ADDR + 1). The OBJECTIVE macro measures
+ * this distance, albeit with some arithmetic complications.
+ *
+ * The algorithm proceeds as follows:
+ * (1) Divide the set of available intervals into those which are
+ *     long enough and those which are not long enough, ignoring
+ *     alignment constraints.
+ * (2) Perform depth-first search over the tree of supposedly
+ *     long enough intervals for the best possible interval.
+ *
+ * The FEASIBLE macro is used to determine whether it is truly
+ * possible to place an aligned interval of sufficient length
+ * within the interval, and it is needed because the true length
+ * of the interval is not sufficient to determine that, and
+ * because it is not truly possible to subdivide the set of available
+ * intervals according to this criterion with pure tree operations.
+ *
+ * As address ranges are the granularity of available interval tracking,
+ * this should provide optimal merging behavior.
  */
+
 static void * __init __alloc_bootmem_core (bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal)
 {
-	unsigned long i, start = 0;
+	unsigned long length;
+	segment_tree_node_t left_half, right_half, reserved, *left, *right;
+	segment_tree_node_t *optimum, *node;
+	struct treap *tmp, *infeasible, *feasible;
 	void *ret;
-	unsigned long offset, remaining_size;
-	unsigned long areasize, preferred, incr;
-	unsigned long eidx = bdata->node_low_pfn - (bdata->node_boot_start >>
-							PAGE_SHIFT);
-
-	if (!size) BUG();
-
-	if (align & (align-1))
-		BUG();
-
-	offset = 0;
-	if (align &&
-	    (bdata->node_boot_start & (align - 1UL)) != 0)
-		offset = (align - (bdata->node_boot_start & (align - 1UL)));
-	offset >>= PAGE_SHIFT;
-
-	/*
-	 * We try to allocate bootmem pages above 'goal'
-	 * first, then we try to allocate lower pages.
-	 */
-	if (goal && (goal >= bdata->node_boot_start) &&
-			((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {
-		preferred = goal - bdata->node_boot_start;
-	} else
-		preferred = 0;
-
-	preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT;
-	preferred += offset;
-	areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
-	incr = align >> PAGE_SHIFT ?
: 1; - -restart_scan: - for (i = preferred; i < eidx; i += incr) { - unsigned long j; - if (test_bit(i, bdata->node_bootmem_map)) + + feasible = infeasible = NULL; + + if (!align) + align = 1; + + length = size; + BUG_ON(!length); + + treap_split(&bdata->segment_tree.length_tree, length, &infeasible, + &feasible); + optimum = NULL; + + tmp = feasible; + while (tmp) { + + if (tmp->marker == UNVISITED) { + if (tmp->left) { + tmp->marker = LEFT_SEARCHED; + tmp = tmp->left; + continue; + } else if (tmp->right) { + tmp->marker = RIGHT_SEARCHED; + tmp = tmp->right; + continue; + } else + tmp->marker = VISITED; + } else if (tmp->marker == LEFT_SEARCHED) { + if (tmp->right) { + tmp->marker = RIGHT_SEARCHED; + tmp = tmp->right; + continue; + } else + tmp->marker = VISITED; + } else if (tmp->marker == RIGHT_SEARCHED) + tmp->marker = VISITED; + else if (tmp->marker == VISITED) { + tmp->marker = UNVISITED; + tmp = tmp->parent; continue; - for (j = i + 1; j < i + areasize; ++j) { - if (j >= eidx) - goto fail_block; - if (test_bit (j, bdata->node_bootmem_map)) - goto fail_block; - } - start = i; - goto found; - fail_block:; - } - if (preferred) { - preferred = offset; - goto restart_scan; + } else + BUG(); + + if (!tmp) + break; + + node = length_segment_treap(tmp); + + if (!optimum && FEASIBLE(node, length, align)) + + optimum = node; + + else if (FEASIBLE(node, length, align) + && (OBJECTIVE(node, goal, align, length) + < OBJECTIVE(optimum, goal, align, length))) + + optimum = node; + } - return NULL; -found: - if (start >= eidx) - BUG(); /* - * Is the next page of the previous allocation-end the start - * of this allocation's buffer? If yes then we can 'merge' - * the previous partial page with this allocation. - */ - if (align <= PAGE_SIZE - && bdata->last_offset && bdata->last_pos+1 == start) { - offset = (bdata->last_offset+align-1) & ~(align-1); - if (offset > PAGE_SIZE) - BUG(); - remaining_size = PAGE_SIZE-offset; - if (size < remaining_size) { - areasize = 0; - // last_pos unchanged - bdata->last_offset = offset+size; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + - bdata->node_boot_start); - } else { - remaining_size = size - remaining_size; - areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + - bdata->node_boot_start); - bdata->last_pos = start+areasize-1; - bdata->last_offset = remaining_size; - } - bdata->last_offset &= ~PAGE_MASK; - } else { - bdata->last_pos = start + areasize - 1; - bdata->last_offset = size & ~PAGE_MASK; - ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); + * Restore the set of available intervals keyed by length, + * taking into account the need to remove the optimum from + * the set if it has been determined. + */ + if (!optimum) { + treap_join(&bdata->segment_tree.length_tree, &feasible, + &infeasible); + return NULL; } + + if (!treap_root_delete(treap_node_link(&optimum->start))) + treap_root_delete(&bdata->segment_tree.start_tree); + + if (!treap_root_delete(treap_node_link(&optimum->length))) + treap_root_delete(&feasible); + + treap_join(&bdata->segment_tree.length_tree, &infeasible, &feasible); + /* - * Reserve the area now: + * Now the iteration has converged to the optimal feasible interval. + * Within that interval we must now choose a subinterval + * satisfying the alignment constraints and do the appropriate + * splitting of the interval from which it was drawn. 
 	 */
-	for (i = start; i < start+areasize; i++)
-		if (test_and_set_bit(i, bdata->node_bootmem_map))
-			BUG();
+
+	segment_set_endpoints(&reserved, goal, goal + length - 1);
+
+	if (!segment_contains_point(optimum, goal)
+			|| !segment_contains(optimum, &reserved))
+
+		segment_set_endpoints(&reserved,
+			RND_UP(segment_start(optimum), align),
+			RND_UP(segment_start(optimum), align) + length - 1);
+
+	segment_set_endpoints(&left_half, segment_start(optimum),
+				segment_end(optimum));
+
+	left = &left_half;
+	right = &right_half;
+	segment_complement(&left, &reserved, &right);
+
+	if (!left && !right)
+		segment_free(bdata, optimum);
+
+	if (left) {
+		segment_set_endpoints(optimum, segment_start(left),
+					segment_end(left));
+		segment_insert(&bdata->segment_tree, optimum);
+	}
+
+	if (right) {
+		segment_tree_node_t *segment = segment_alloc(bdata);
+		BUG_ON(!segment);
+		segment_set_endpoints(segment, segment_start(right),
+					segment_end(right));
+		segment_insert(&bdata->segment_tree, segment);
+	}
+
+	/*
+	 * Convert the physical address to a kernel virtual address,
+	 * zero out the memory within the interval, and return it.
+	 */
+	ret = (void *)(phys_to_virt(segment_start(&reserved)));
 	memset(ret, 0, size);
+
+	segment_pool_refill(bdata);
+
 	return ret;
 }

+/*
+ * Determine the order of the largest page block which may be freed.
+ */
+static unsigned long __init find_order(unsigned long start, unsigned long end)
+{
+	unsigned long order = 0;
+	unsigned long max_order;
+
+	/*
+	 * The highest-order page usable for __free_pages() is limited
+	 * by the alignment of the starting pageframe number.
+	 */
+	max_order = ffz(~start & (~0UL >> (BITS_PER_LONG - MAX_ORDER + 1)));
+
+	while (order + 1 < max_order) {
+		unsigned long new_order = (order + max_order)/2;
+		if (1 << new_order <= end - start)
+			order = new_order;
+		else
+			max_order = new_order;
+	}
+
+	/*
+	 * Clean up after the possible difference between order and max_order.
+	 */
+	if (1 << max_order <= end - start)
+		return max_order;
+
+	return order;
+}
+
+static void __init free_bootmem_range( struct page *node_mem_map,
+					unsigned long node_start,
+					unsigned long start,
+					unsigned long end)
+{
+	unsigned long start_index = start - node_start;
+	unsigned long end_index = end - node_start - 1;
+
+	while (end_index >= start_index) {
+		unsigned long order = find_order(start_index, end_index);
+		__free_pages(node_mem_map + start_index, order);
+		start_index += 1 << order;
+	}
+}
+
+/*
+ * free_all_bootmem_core's responsibilities are to initialize the
+ * node_mem_map array of struct page with the availability information
+ * regarding physical memory, and to make available the memory the
+ * bootmem allocator itself used for tracking available physical memory.
+ * Here the prior behavior with respect to page alignment is emulated
+ * by reducing the granularity of the address ranges to page frames,
+ * using the conservative approximation of the largest page-aligned
+ * interval lying within the interval seen to be available, or making
+ * no memory available if the interval is smaller than a page in length.
+ */
 static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 {
-	struct page *page = pgdat->node_mem_map;
-	bootmem_data_t *bdata = pgdat->bdata;
-	unsigned long i, count, total = 0;
-	unsigned long idx;
-	unsigned long *map;
-
-	if (!bdata->node_bootmem_map) BUG();
-
-	count = 0;
-	idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
-	map = bdata->node_bootmem_map;
-	for (i = 0; i < idx; ) {
-		unsigned long v = ~map[i / BITS_PER_LONG];
-		if (v) {
-			unsigned long m;
-			for (m = 1; m && i < idx; m<<=1, page++, i++) {
-				if (v & m) {
-					count++;
-					ClearPageReserved(page);
-					set_page_count(page, 1);
-					__free_page(page);
+	unsigned long total = 0UL, mapstart, start, end;
+	unsigned long node_start = pgdat->bdata->node_boot_start >> PAGE_SHIFT;
+	struct page *page;
+	struct treap *parent, *tmp;
+
+	mapstart = virt_to_phys(pgdat->bdata->node_bootmem_map);
+
+#ifdef DEBUG_BOOTMEM
+	printk("Available physical memory:\n");
+#endif /* DEBUG_BOOTMEM */
+
+	free_bootmem_core(pgdat->bdata, mapstart,
+			RND_UP(NR_SEGMENTS*sizeof(segment_buf_t), PAGE_SIZE));
+
+	/*
+	 * Destructive post-order traversal of the length tree.
+	 * The tree is never used again, so no attempt is made
+	 * to restore it to working order.
+	 */
+	tmp = pgdat->bdata->segment_tree.length_tree;
+	treap_find_leftmost_leaf(tmp);
+	while (tmp) {
+		segment_tree_node_t *segment = length_segment_treap(tmp);
+
+		/*
+		 * This calculation differs from that in prior
+		 * incarnations in this subsystem, so I describe it
+		 * in some detail here.
+		 *
+		 ******************************************************
+		 *
+		 * We want start to be the least pfn with
+		 *
+		 *	PAGE_SIZE * start >= segment_start(segment)
+		 *
+		 * so after division and ceiling:
+		 *
+		 *	start = DIV_UP(segment_start(segment), PAGE_SIZE)
+		 *
+		 ******************************************************
+		 *
+		 * Now the last pfn is the greatest pfn such that
+		 *
+		 *	PAGE_SIZE * last + PAGE_SIZE - 1 <= segment_end(segment)
+		 *
+		 * -or-
+		 *
+		 *	PAGE_SIZE * (last + 1) <= segment_end(segment) + 1
+		 *
+		 * giving us after division and flooring:
+		 *
+		 *	last + 1 = DIV_DN(segment_end(segment) + 1, PAGE_SIZE)
+		 *
+		 * or using end as a -strict- upper bound (i.e. end > pfn),
+		 * we have
+		 *
+		 *	end = DIV_DN(segment_end(segment) + 1, PAGE_SIZE)
+		 */
+
+		start = DIV_UP(segment_start(segment), PAGE_SIZE);
+		end = DIV_DN(segment_end(segment) + 1, PAGE_SIZE);
+
+#ifdef DEBUG_BOOTMEM
+		if (start < end)
+			printk("available segment: [%lx,%lx]\n",
+				start * PAGE_SIZE,
+				end * PAGE_SIZE - 1);
+#endif /* DEBUG_BOOTMEM */
+
+		for ( page = pgdat->node_mem_map + (start - node_start);
+		      page < pgdat->node_mem_map + (end - node_start);
+		      ++page) {
+
+			ClearPageReserved(page);
+			set_page_count(page, 1);
 				}
+
+		free_bootmem_range( pgdat->node_mem_map,
+					node_start,
+					start,
+					end);
+
+		/*
+		 * In most calculations in this file, closed intervals
+		 * are considered. In this instance, a half-open interval
+		 * is being considered, and so the usual end - start + 1
+		 * calculation does not apply.
+ */ + if (start < end) + total += end - start; + + treap_find_parent_and_remove_child(tmp, parent); + tmp = parent; + treap_find_leftmost_leaf(tmp); } - } else { - i+=BITS_PER_LONG; - page+=BITS_PER_LONG; - } - } - total += count; - - /* - * Now free the allocator bitmap itself, it's not - * needed anymore: - */ - page = virt_to_page(bdata->node_bootmem_map); - count = 0; - for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { - count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - } - total += count; - bdata->node_bootmem_map = NULL; return total; } -unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) +/* + * Wrappers around the core routines so that they operate on the + * per-node memory structures (pg_data_t *pgdat). + */ +unsigned long __init init_bootmem_node (pg_data_t *pgdat, + unsigned long freepfn, + unsigned long startpfn, + unsigned long endpfn) { - return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); + return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); } -void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) { reserve_bootmem_core(pgdat->bdata, physaddr, size); } -void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) { - return(free_bootmem_core(pgdat->bdata, physaddr, size)); + void *ptr; + + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); + if (ptr) + return ptr; + + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) +{ + free_bootmem_core(pgdat->bdata, physaddr, size); } unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) { - return(free_all_bootmem_core(pgdat)); + return free_all_bootmem_core(pgdat); } #ifndef CONFIG_DISCONTIGMEM +/* + * Non-node-aware wrappers for the core routines. The per-node + * structures are hidden by using the global variable contig_page_data. + */ + unsigned long __init init_bootmem (unsigned long start, unsigned long pages) { max_low_pfn = pages; min_low_pfn = start; - return(init_bootmem_core(&contig_page_data, start, 0, pages)); + return init_bootmem_core(&contig_page_data, start, 0, pages); } -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE -void __init reserve_bootmem (unsigned long addr, unsigned long size) -{ - reserve_bootmem_core(contig_page_data.bdata, addr, size); -} -#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ - +/* + * In multinode configurations it is not desirable to make memory + * available without information about the node assignment of the + * memory range, so even though reserve_bootmem() may operate + * without node information this cannot. + * + * This apparent inconsistency in the interface actually makes + * some sense, as when presented with irregular node to memory range + * assignments in firmware tables, the original request to make memory + * available will be aware of its node assignment. But an outstanding + * issue is that a non-node-aware memory reservation request (via + * alloc_bootmem()) will not know to which node to return the memory. 
+ * + * Resolving that issue would involve tracking dynamic allocations + * separately from assertions regarding the presence of physical + * memory, which is feasible given a change of interface, or perhaps a + * separate tree in each node for memory reserved by dynamic allocations. + */ void __init free_bootmem (unsigned long addr, unsigned long size) { - return(free_bootmem_core(contig_page_data.bdata, addr, size)); -} - -unsigned long __init free_all_bootmem (void) -{ - return(free_all_bootmem_core(&contig_page_data)); + free_bootmem_core(contig_page_data.bdata, addr, size); } #endif /* !CONFIG_DISCONTIGMEM */ -void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal) +/* + * reserve_bootmem operates without node information, yet is node + * aware. In situations where it may not be clear to where a given + * physical memory range is assigned this performs the task of + * searching the nodes on behalf of the caller. + */ +#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE +void __init reserve_bootmem (unsigned long addr, unsigned long size) { + unsigned long start, end; + unsigned in_any_node = 0; + segment_tree_node_t segment, *tree; pg_data_t *pgdat = pgdat_list; - void *ptr; - for_each_pgdat(pgdat) - if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, - align, goal))) - return(ptr); + start = addr; + end = start + size - 1; + + segment_set_endpoints(&segment, start, end); /* - * Whoops, we cannot satisfy the allocation request. + * For error checking, this must determine the node(s) within + * which an interval to be reserved lies. Otherwise, once the + * error checking is in place, the memory will be reported as + * multiply-reserved on those nodes not containing the memory. */ - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; + for_each_pgdat (pgdat) { + unsigned in_node; + + tree = start_segment_treap(pgdat->bdata->segment_tree.start_tree); + in_node = segment_tree_intersects(tree, &segment); + in_any_node |= in_node; + + if (in_node) + reserve_bootmem_node(pgdat, addr, size); + } + + if (!in_any_node) + printk(KERN_WARNING "the interval [%lu, %lu] " + "was multiply reserved\n", + segment_start(&segment), + segment_end(&segment)); +} +#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ + +/* + * free_all_bootmem is now a convenience function, and iterates over + * all the nodes, performing free_all_bootmem_core. + */ +unsigned long __init free_all_bootmem (void) +{ + pg_data_t *pgdat = pgdat_list; + unsigned long total = 0UL; + + for_each_pgdat (pgdat) + total += free_all_bootmem_core(pgdat); + + return total; } -void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) +/* + * __alloc_bootmem performs a search over all nodes in order to satisfy + * an allocation request, for when it is unimportant from which node + * the memory used to satisfy an allocation is drawn. + */ +void * __init __alloc_bootmem (unsigned long size, unsigned long align, + unsigned long goal) { + pg_data_t *pgdat = pgdat_list; void *ptr; - ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); - if (ptr) - return (ptr); + for_each_pgdat (pgdat) { + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); + + if (ptr) + return ptr; + } - /* - * Whoops, we cannot satisfy the allocation request. - */ printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } -