1 diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
2 index 081c497..75bde3d 100644
3 --- a/Documentation/vm/00-INDEX
4 +++ b/Documentation/vm/00-INDEX
5 @@ -16,6 +16,8 @@ hwpoison.txt
6 - explains what hwpoison is
8 - how to use the Kernel Samepage Merging feature.
10 + - introduction to Ultra KSM
12 - information about NUMA specific code in the Linux vm.
13 numa_memory_policy.txt
14 diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
16 index 0000000..08bd645
18 +++ b/Documentation/vm/uksm.txt
20 +The Ultra Kernel Samepage Merging feature
21 +----------------------------------------------
23 + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
25 + * This is an improvement upon KSM. Some basic data structures and routines
26 + * are borrowed from ksm.c .
29 + * 1. Full system scan:
30 + * It automatically scans all user processes' anonymous VMAs. Kernel-user
31 + * interaction to submit a memory area to KSM is no longer needed.
33 + * 2. Rich area detection:
34 + * It automatically detects rich areas containing abundant duplicated
35 + * pages. Rich areas are given a full scan speed. Poor areas are
36 + * sampled at a reasonable speed with very low CPU consumption.
38 + * 3. Ultra Per-page scan speed improvement:
39 + * A new hash algorithm is proposed. As a result, on a machine with
40 + * a Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
41 + * can scan memory areas that do not contain duplicated pages at a speed of
42 + * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
43 + * 477MB/sec ~ 923MB/sec.
45 + * 4. Thrashing area avoidance:
46 + * Thrashing areas (VMAs that have frequent KSM page break-outs) can be
47 + * filtered out. My benchmark shows this is more efficient than KSM's
48 + * per-page hash-value-based volatile page detection.
51 + * 5. Misc changes upon KSM:
52 + * * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
53 + * comparison. It's much faster than the default C version on x86.
54 + * * rmap_item now has a struct page *page member to loosely cache an
55 + * address-->page mapping, which avoids much of the time-costly
57 + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
58 + * * try_to_merge_two_pages() can now revert a pte if it fails. No break_
59 + * ksm is needed for this case.
61 + * 6. Full Zero Page consideration (contributed by Figo Zhang):
62 + * uksmd now considers full zero pages as special pages and merges them
63 + * into a special unswappable uksm zero page.
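+ *
+ *    The full-zero check itself is conceptually simple; a minimal portable
+ *    sketch is shown below (the in-kernel code uses optimized x86 assembly
+ *    where available):
+ *
+ *        static int is_full_zero(const void *s1, size_t len)
+ *        {
+ *                const unsigned long *src = s1;
+ *                size_t i;
+ *
+ *                for (i = 0; i < len / sizeof(*src); i++) {
+ *                        if (src[i])
+ *                                return 0;
+ *                }
+ *                return 1;
+ *        }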
68 +2012-05-05 The creation of this Doc
69 +2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
70 +2012-05-28 UKSM 0.1.1.2 bug fix release
71 +2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
72 +2012-07-02 UKSM 0.1.2-beta2
73 +2012-07-10 UKSM 0.1.2-beta3
74 +2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
75 +2012-10-13 UKSM 0.1.2.1 Bug fixes.
76 +2012-12-31 UKSM 0.1.2.2 Minor bug fixes
77 +2014-07-02 UKSM 0.1.2.3 Fix a "__this_cpu_read() in preemptible" bug
78 diff --git a/fs/exec.c b/fs/exec.c
79 index 7302b75..84b0df5 100644
83 * current->executable is only used by the procfs. This allows a dispatch
84 * table to check for several different types of binary formats. We keep
85 * trying until we recognize the file or we run out of supported binary
90 #include <linux/slab.h>
92 #include <linux/pipe_fs_i.h>
93 #include <linux/oom.h>
94 #include <linux/compat.h>
95 +#include <linux/ksm.h>
97 #include <asm/uaccess.h>
98 #include <asm/mmu_context.h>
99 @@ -1128,6 +1129,7 @@ void setup_new_exec(struct linux_binprm * bprm)
100 /* An exec changes our domain. We are no longer part of the thread
102 current->self_exec_id++;
104 flush_signal_handlers(current, 0);
105 do_close_on_exec(current->files);
107 diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
108 index aa1eee0..5a9149c 100644
109 --- a/fs/proc/meminfo.c
110 +++ b/fs/proc/meminfo.c
111 @@ -121,6 +121,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
112 "SUnreclaim: %8lu kB\n"
113 "KernelStack: %8lu kB\n"
114 "PageTables: %8lu kB\n"
116 + "KsmZeroPages: %8lu kB\n"
118 #ifdef CONFIG_QUICKLIST
119 "Quicklists: %8lu kB\n"
121 @@ -175,6 +178,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
122 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
123 global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
124 K(global_page_state(NR_PAGETABLE)),
126 + K(global_page_state(NR_UKSM_ZERO_PAGES)),
128 #ifdef CONFIG_QUICKLIST
129 K(quicklist_total_size()),
131 diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
132 index 752e30d..1e7c826 100644
133 --- a/include/asm-generic/pgtable.h
134 +++ b/include/asm-generic/pgtable.h
135 @@ -537,12 +537,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
140 +static inline int is_uksm_zero_pfn(unsigned long pfn)
142 + extern unsigned long uksm_zero_pfn;
143 + return pfn == uksm_zero_pfn;
146 +static inline int is_uksm_zero_pfn(unsigned long pfn)
152 #ifdef __HAVE_COLOR_ZERO_PAGE
153 static inline int is_zero_pfn(unsigned long pfn)
155 extern unsigned long zero_pfn;
156 unsigned long offset_from_zero_pfn = pfn - zero_pfn;
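+	/* uksm's dedicated zero page is treated as a zero pfn as well */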
157 - return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
158 + return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn);
161 #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
162 @@ -551,7 +564,7 @@ static inline int is_zero_pfn(unsigned long pfn)
163 static inline int is_zero_pfn(unsigned long pfn)
165 extern unsigned long zero_pfn;
166 - return pfn == zero_pfn;
167 + return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
170 static inline unsigned long my_zero_pfn(unsigned long addr)
171 diff --git a/include/linux/ksm.h b/include/linux/ksm.h
172 index 3be6bb1..51557d1 100644
173 --- a/include/linux/ksm.h
174 +++ b/include/linux/ksm.h
175 @@ -19,21 +19,6 @@ struct mem_cgroup;
177 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
178 unsigned long end, int advice, unsigned long *vm_flags);
179 -int __ksm_enter(struct mm_struct *mm);
180 -void __ksm_exit(struct mm_struct *mm);
182 -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
184 - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
185 - return __ksm_enter(mm);
189 -static inline void ksm_exit(struct mm_struct *mm)
191 - if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
196 * A KSM page is one of those write-protected "shared pages" or "merged pages"
197 @@ -76,6 +61,33 @@ struct page *ksm_might_need_to_copy(struct page *page,
198 int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
199 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
201 +#ifdef CONFIG_KSM_LEGACY
202 +int __ksm_enter(struct mm_struct *mm);
203 +void __ksm_exit(struct mm_struct *mm);
204 +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
206 + if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
207 + return __ksm_enter(mm);
211 +static inline void ksm_exit(struct mm_struct *mm)
213 + if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
217 +#elif defined(CONFIG_UKSM)
218 +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
223 +static inline void ksm_exit(struct mm_struct *mm)
226 +#endif /* !CONFIG_UKSM */
228 #else /* !CONFIG_KSM */
230 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
231 @@ -123,4 +135,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
232 #endif /* CONFIG_MMU */
233 #endif /* !CONFIG_KSM */
235 +#include <linux/uksm.h>
237 #endif /* __LINUX_KSM_H */
238 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
239 index 6e0b286..ebd6243 100644
240 --- a/include/linux/mm_types.h
241 +++ b/include/linux/mm_types.h
242 @@ -308,6 +308,9 @@ struct vm_area_struct {
244 struct mempolicy *vm_policy; /* NUMA policy for the VMA */
247 + struct vma_slot *uksm_vma_slot;
252 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
253 index ffe66e3..355f0e9 100644
254 --- a/include/linux/mmzone.h
255 +++ b/include/linux/mmzone.h
256 @@ -157,6 +157,9 @@ enum zone_stat_item {
257 WORKINGSET_NODERECLAIM,
258 NR_ANON_TRANSPARENT_HUGEPAGES,
261 + NR_UKSM_ZERO_PAGES,
263 NR_VM_ZONE_STAT_ITEMS };
266 @@ -865,7 +868,7 @@ static inline int is_highmem_idx(enum zone_type idx)
270 - * is_highmem - helper function to quickly check if a struct zone is a
271 + * is_highmem - helper function to quickly check if a struct zone is a
272 * highmem zone or not. This is an attempt to keep references
273 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
274 * @zone - pointer to struct zone variable
275 diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h
277 index 0000000..6780fdb
279 +++ b/include/linux/sradix-tree.h
281 +#ifndef _LINUX_SRADIX_TREE_H
282 +#define _LINUX_SRADIX_TREE_H
285 +#define INIT_SRADIX_TREE(root, mask) \
287 + (root)->height = 0; \
288 + (root)->gfp_mask = (mask); \
289 + (root)->rnode = NULL; \
292 +#define ULONG_BITS (sizeof(unsigned long) * 8)
293 +#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
294 +//#define SRADIX_TREE_MAP_SHIFT 6
295 +//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT)
296 +//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1)
298 +struct sradix_tree_node {
299 + unsigned int height; /* Height from the bottom */
300 + unsigned int count;
301 + unsigned int fulls; /* Number of full sublevel trees */
302 + struct sradix_tree_node *parent;
306 +/* A simple radix tree implementation */
307 +struct sradix_tree_root {
308 + unsigned int height;
309 + struct sradix_tree_node *rnode;
311 + /* The node last found to have empty stores available in its sublevels */
312 + struct sradix_tree_node *enter_node;
313 + unsigned int shift;
314 + unsigned int stores_size;
316 + unsigned long min; /* The first hole index */
318 + //unsigned long *height_to_maxindex;
320 + /* How the node is allocated and freed. */
321 + struct sradix_tree_node *(*alloc)(void);
322 + void (*free)(struct sradix_tree_node *node);
324 + /* Callbacks invoked when a node is added or removed */
325 + void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child);
326 + void (*assign)(struct sradix_tree_node *node, unsigned index, void *item);
327 + void (*rm)(struct sradix_tree_node *node, unsigned offset);
330 +struct sradix_tree_path {
331 + struct sradix_tree_node *node;
336 +void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift)
339 + root->rnode = NULL;
340 + root->shift = shift;
341 + root->stores_size = 1UL << shift;
342 + root->mask = root->stores_size - 1;
346 +extern void *sradix_tree_next(struct sradix_tree_root *root,
347 + struct sradix_tree_node *node, unsigned long index,
348 + int (*iter)(void *, unsigned long));
350 +extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num);
352 +extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
353 + struct sradix_tree_node *node, unsigned long index);
355 +extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index);
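+
+/*
+ * Typical usage (a sketch only; see mm/uksm.c for the real client): the
+ * caller provides node allocation and bookkeeping callbacks, then inserts
+ * and looks up items by index, e.g.
+ *
+ *	struct sradix_tree_root root;
+ *
+ *	init_sradix_tree_root(&root, MY_SHIFT);
+ *	root.alloc  = my_node_alloc;
+ *	root.free   = my_node_free;
+ *	root.extend = my_node_extend;
+ *	root.assign = my_node_assign;
+ *	root.rm     = my_node_rm;
+ *
+ *	sradix_tree_enter(&root, &item, 1);
+ *	item = sradix_tree_lookup(&root, index);
+ *
+ * MY_SHIFT and the my_node_* callbacks are illustrative names only.
+ */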
357 +#endif /* _LINUX_SRADIX_TREE_H */
358 diff --git a/include/linux/uksm.h b/include/linux/uksm.h
360 index 0000000..a644bca
362 +++ b/include/linux/uksm.h
364 +#ifndef __LINUX_UKSM_H
365 +#define __LINUX_UKSM_H
367 + * Memory merging support.
369 + * This code enables dynamic sharing of identical pages found in different
370 + * memory areas, even if they are not shared by fork().
373 +/* if !CONFIG_UKSM this file should not be compiled at all. */
376 +#include <linux/bitops.h>
377 +#include <linux/mm.h>
378 +#include <linux/pagemap.h>
379 +#include <linux/rmap.h>
380 +#include <linux/sched.h>
382 +extern unsigned long zero_pfn __read_mostly;
383 +extern unsigned long uksm_zero_pfn __read_mostly;
384 +extern struct page *empty_uksm_zero_page;
386 +/* must be called before the vma is linked to its mm */
387 +extern void uksm_vma_add_new(struct vm_area_struct *vma);
388 +extern void uksm_remove_vma(struct vm_area_struct *vma);
390 +#define UKSM_SLOT_NEED_SORT (1 << 0)
391 +#define UKSM_SLOT_NEED_RERAND (1 << 1)
392 +#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */
393 +#define UKSM_SLOT_FUL_SCANNED (1 << 3)
394 +#define UKSM_SLOT_IN_UKSM (1 << 4)
397 + struct sradix_tree_node *snode;
398 + unsigned long sindex;
400 + struct list_head slot_list;
401 + unsigned long fully_scanned_round;
402 + unsigned long dedup_num;
403 + unsigned long pages_scanned;
404 + unsigned long last_scanned;
405 + unsigned long pages_to_scan;
406 + struct scan_rung *rung;
407 + struct page **rmap_list_pool;
408 + unsigned int *pool_counts;
409 + unsigned long pool_size;
410 + struct vm_area_struct *vma;
411 + struct mm_struct *mm;
412 + unsigned long ctime_j;
413 + unsigned long pages;
414 + unsigned long flags;
415 + unsigned long pages_cowed; /* pages cowed this round */
416 + unsigned long pages_merged; /* pages merged this round */
417 + unsigned long pages_bemerged;
419 + /* linked into the dedup list when it has pages merged in this eval round */
420 + struct list_head dedup_list;
423 +static inline void uksm_unmap_zero_page(pte_t pte)
425 + if (pte_pfn(pte) == uksm_zero_pfn)
426 + __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
429 +static inline void uksm_map_zero_page(pte_t pte)
431 + if (pte_pfn(pte) == uksm_zero_pfn)
432 + __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
435 +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
437 + if (vma->uksm_vma_slot && PageKsm(page))
438 + vma->uksm_vma_slot->pages_cowed++;
441 +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
443 + if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
444 + vma->uksm_vma_slot->pages_cowed++;
447 +static inline int uksm_flags_can_scan(unsigned long vm_flags)
452 + return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND |
453 + VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP |
454 + VM_SHARED | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN | VM_SAO));
457 +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
459 + if (uksm_flags_can_scan(*vm_flags_p))
460 + *vm_flags_p |= VM_MERGEABLE;
464 + * Just a BUG_ON wrapper for places where the uksm zero page must not appear.
465 + * TODO: remove it once the uksm zero page patch is stable enough.
467 +static inline void uksm_bugon_zeropage(pte_t pte)
469 + BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
472 +static inline void uksm_vma_add_new(struct vm_area_struct *vma)
476 +static inline void uksm_remove_vma(struct vm_area_struct *vma)
480 +static inline void uksm_unmap_zero_page(pte_t pte)
484 +static inline void uksm_map_zero_page(pte_t pte)
488 +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
492 +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
496 +static inline int uksm_flags_can_scan(unsigned long vm_flags)
501 +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
505 +static inline void uksm_bugon_zeropage(pte_t pte)
508 +#endif /* !CONFIG_UKSM */
509 +#endif /* __LINUX_UKSM_H */
510 diff --git a/kernel/fork.c b/kernel/fork.c
511 index 9b7d746..73ad90d 100644
514 @@ -412,7 +412,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
518 - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
519 + tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
523 @@ -467,7 +467,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
524 __vma_link_rb(mm, tmp, rb_link, rb_parent);
525 rb_link = &tmp->vm_rb.rb_right;
526 rb_parent = &tmp->vm_rb;
528 + uksm_vma_add_new(tmp);
530 retval = copy_page_range(mm, oldmm, mpnt);
532 diff --git a/lib/Makefile b/lib/Makefile
533 index 0211d2b..426536f 100644
536 @@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
539 lib-y := ctype.o string.o vsprintf.o cmdline.o \
540 - rbtree.o radix-tree.o dump_stack.o timerqueue.o\
541 + rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\
542 idr.o int_sqrt.o extable.o \
543 sha1.o md5.o irq_regs.o argv_split.o \
544 proportions.o flex_proportions.o ratelimit.o show_mem.o \
545 diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c
547 index 0000000..8d06329
549 +++ b/lib/sradix-tree.c
551 +#include <linux/errno.h>
552 +#include <linux/mm.h>
553 +#include <linux/mman.h>
554 +#include <linux/spinlock.h>
555 +#include <linux/slab.h>
556 +#include <linux/gcd.h>
557 +#include <linux/sradix-tree.h>
559 +static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node)
561 + return node->fulls == root->stores_size ||
562 + (node->height == 1 && node->count == root->stores_size);
566 + * Extend a sradix tree so it can store key @index.
568 +static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
570 + struct sradix_tree_node *node;
571 + unsigned int height;
573 + if (unlikely(root->rnode == NULL)) {
574 + if (!(node = root->alloc()))
578 + root->rnode = node;
582 + /* Figure out what the height should be. */
583 + height = root->height;
584 + index >>= root->shift * height;
587 + index >>= root->shift;
591 + while (height > root->height) {
592 + unsigned int newheight;
593 + if (!(node = root->alloc()))
596 + /* Increase the height. */
597 + node->stores[0] = root->rnode;
598 + root->rnode->parent = node;
600 + root->extend(node, root->rnode);
602 + newheight = root->height + 1;
603 + node->height = newheight;
605 + if (sradix_node_full(root, root->rnode))
608 + root->rnode = node;
609 + root->height = newheight;
616 + * Search for the next item from the current node that is not NULL
617 + * and satisfies the given iter() callback.
619 +void *sradix_tree_next(struct sradix_tree_root *root,
620 + struct sradix_tree_node *node, unsigned long index,
621 + int (*iter)(void *item, unsigned long height))
623 + unsigned long offset;
626 + if (unlikely(node == NULL)) {
627 + node = root->rnode;
628 + for (offset = 0; offset < root->stores_size; offset++) {
629 + item = node->stores[offset];
630 + if (item && (!iter || iter(item, node->height)))
634 + if (unlikely(offset >= root->stores_size))
637 + if (node->height == 1)
644 + offset = (index & root->mask) + 1;
645 + for (;offset < root->stores_size; offset++) {
646 + item = node->stores[offset];
647 + if (item && (!iter || iter(item, node->height)))
651 + if (offset < root->stores_size)
654 + node = node->parent;
655 + index >>= root->shift;
661 + while (node->height > 1) {
664 + for (offset = 0; offset < root->stores_size; offset++) {
665 + item = node->stores[offset];
666 + if (item && (!iter || iter(item, node->height)))
670 + if (unlikely(offset >= root->stores_size))
674 + BUG_ON(offset > root->stores_size);
680 + * Blindly insert the item into the tree. Typically, we reuse the
681 + * first empty store item.
683 +int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
685 + unsigned long index;
686 + unsigned int height;
687 + struct sradix_tree_node *node, *tmp = NULL;
688 + int offset, offset_saved;
689 + void **store = NULL;
690 + int error, i, j, shift;
695 + if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
696 + node = root->enter_node;
697 + BUG_ON((index >> (root->shift * root->height)));
699 + node = root->rnode;
700 + if (node == NULL || (index >> (root->shift * root->height))
701 + || sradix_node_full(root, node)) {
702 + error = sradix_tree_extend(root, index);
706 + node = root->rnode;
711 + height = node->height;
712 + shift = (height - 1) * root->shift;
713 + offset = (index >> shift) & root->mask;
714 + while (shift > 0) {
715 + offset_saved = offset;
716 + for (; offset < root->stores_size; offset++) {
717 + store = &node->stores[offset];
720 + if (!tmp || !sradix_node_full(root, tmp))
723 + BUG_ON(offset >= root->stores_size);
725 + if (offset != offset_saved) {
726 + index += (offset - offset_saved) << shift;
727 + index &= ~((1UL << shift) - 1);
731 + if (!(tmp = root->alloc()))
734 + tmp->height = shift / root->shift;
736 + tmp->parent = node;
738 +// if (root->extend)
739 +// root->extend(node, tmp);
743 + shift -= root->shift;
744 + offset = (index >> shift) & root->mask;
747 + BUG_ON(node->height != 1);
750 + store = &node->stores[offset];
752 + j < root->stores_size - node->count &&
753 + i < root->stores_size - offset && j < num; i++) {
755 + store[i] = item[j];
757 + root->assign(node, index + i, item[j]);
766 + while (sradix_node_full(root, node)) {
767 + node = node->parent;
774 + if (unlikely(!node)) {
775 + /* All nodes are full */
776 + root->min = 1 << (root->height * root->shift);
777 + root->enter_node = NULL;
779 + root->min = index + i - 1;
780 + root->min |= (1UL << (node->height - 1)) - 1;
782 + root->enter_node = node;
795 + * sradix_tree_shrink - shrink the height of a sradix tree to the minimum
796 + * @root: sradix tree root
799 +static inline void sradix_tree_shrink(struct sradix_tree_root *root)
801 + /* try to shrink tree height */
802 + while (root->height > 1) {
803 + struct sradix_tree_node *to_free = root->rnode;
806 + * If the candidate node has more than one child, or its child
807 + * is not at the leftmost store, we cannot shrink.
809 + if (to_free->count != 1 || !to_free->stores[0])
812 + root->rnode = to_free->stores[0];
813 + root->rnode->parent = NULL;
815 + if (unlikely(root->enter_node == to_free)) {
816 + root->enter_node = NULL;
818 + root->free(to_free);
823 + * Delete the item at the known leaf node and index
825 +void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
826 + struct sradix_tree_node *node, unsigned long index)
828 + unsigned int offset;
829 + struct sradix_tree_node *start, *end;
831 + BUG_ON(node->height != 1);
834 + while (node && !(--node->count))
835 + node = node->parent;
839 + root->rnode = NULL;
843 + root->enter_node = NULL;
845 + offset = (index >> (root->shift * (node->height - 1))) & root->mask;
847 + root->rm(node, offset);
848 + node->stores[offset] = NULL;
850 + if (root->min > index) {
852 + root->enter_node = node;
856 + if (start != end) {
859 + start = start->parent;
860 + if (unlikely(root->enter_node == node))
861 + root->enter_node = end;
863 + } while (start != end);
866 + * Note that shrink may free "end", so enter_node still needs to
867 + * be checked inside.
869 + sradix_tree_shrink(root);
870 + } else if (node->count == root->stores_size - 1) {
871 + /* It WAS a full leaf node. Update the ancestors */
872 + node = node->parent;
875 + if (node->fulls != root->stores_size - 1)
878 + node = node->parent;
883 +void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
885 + unsigned int height, offset;
886 + struct sradix_tree_node *node;
889 + node = root->rnode;
890 + if (node == NULL || (index >> (root->shift * root->height)))
893 + height = root->height;
894 + shift = (height - 1) * root->shift;
897 + offset = (index >> shift) & root->mask;
898 + node = node->stores[offset];
902 + shift -= root->shift;
903 + } while (shift >= 0);
909 + * Return the item if it exists, otherwise create it in place
910 + * and return the created item.
912 +void *sradix_tree_lookup_create(struct sradix_tree_root *root,
913 + unsigned long index, void *(*item_alloc)(void))
915 + unsigned int height, offset;
916 + struct sradix_tree_node *node, *tmp;
920 + if (root->rnode == NULL || (index >> (root->shift * root->height))) {
922 + error = sradix_tree_extend(root, index);
930 + node = root->rnode;
931 + height = root->height;
932 + shift = (height - 1) * root->shift;
935 + offset = (index >> shift) & root->mask;
936 + if (!node->stores[offset]) {
937 + if (!(tmp = root->alloc()))
940 + tmp->height = shift / root->shift;
941 + node->stores[offset] = tmp;
942 + tmp->parent = node;
946 + node = node->stores[offset];
949 + shift -= root->shift;
950 + } while (shift > 0);
952 + BUG_ON(node->height != 1);
953 + offset = index & root->mask;
954 + if (node->stores[offset]) {
955 + return node->stores[offset];
956 + } else if (item_alloc) {
957 + if (!(item = item_alloc()))
960 + node->stores[offset] = item;
963 + * NOTE: we do NOT call root->assign here, since this item was just
964 + * created by us and carries no meaning yet. The caller can invoke it
965 + * later if necessary.
971 + while (sradix_node_full(root, node)) {
972 + node = node->parent;
979 + if (unlikely(!node)) {
980 + /* All nodes are full */
981 + root->min = 1 << (root->height * root->shift);
983 + if (root->min == index) {
984 + root->min |= (1UL << (node->height - 1)) - 1;
986 + root->enter_node = node;
997 +int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
999 + unsigned int height, offset;
1000 + struct sradix_tree_node *node;
1003 + node = root->rnode;
1004 + if (node == NULL || (index >> (root->shift * root->height)))
1007 + height = root->height;
1008 + shift = (height - 1) * root->shift;
1011 + offset = (index >> shift) & root->mask;
1012 + node = node->stores[offset];
1016 + shift -= root->shift;
1017 + } while (shift > 0);
1019 + offset = index & root->mask;
1020 + if (!node->stores[offset])
1023 + sradix_tree_delete_from_leaf(root, node, index);
1027 diff --git a/mm/Kconfig b/mm/Kconfig
1028 index 1d1ae6b..511dde6 100644
1031 @@ -339,6 +339,32 @@ config KSM
1032 See Documentation/vm/ksm.txt for more information: KSM is inactive
1033 until a program has madvised that an area is MADV_MERGEABLE, and
1034 root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
1036 + prompt "Choose UKSM/KSM strategy"
1040 + This option allows you to select a UKSM/KSM strategy.
1043 + bool "Ultra-KSM for page merging"
1046 + UKSM is inspired by the Linux kernel's KSM (Kernel Samepage
1047 + Merging), but with a fundamentally rewritten core algorithm. With
1048 + this advanced algorithm, UKSM can transparently scan all anonymously
1049 + mapped user space memory with significantly improved scan speed and
1050 + CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit
1051 + from UKSM. UKSM has had its first stable release and its first
1052 + real-world enterprise user. For more information, please visit its
1053 + project page at www.kerneldedup.org.
1056 + bool "Legacy KSM implementation"
1059 + The legacy KSM implementation from Red Hat.
1062 config DEFAULT_MMAP_MIN_ADDR
1063 int "Low address space to protect from user allocation"
1064 diff --git a/mm/Makefile b/mm/Makefile
1065 index 8405eb0..7689f0c 100644
1068 @@ -44,7 +44,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o
1069 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
1070 obj-$(CONFIG_SLOB) += slob.o
1071 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
1072 -obj-$(CONFIG_KSM) += ksm.o
1073 +obj-$(CONFIG_KSM_LEGACY) += ksm.o
1074 +obj-$(CONFIG_UKSM) += uksm.o
1075 obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
1076 obj-$(CONFIG_SLAB) += slab.o
1077 obj-$(CONFIG_SLUB) += slub.o
1078 diff --git a/mm/memory.c b/mm/memory.c
1079 index d5f2ae9..86b5d09 100644
1082 @@ -120,6 +120,28 @@ unsigned long highest_memmap_pfn __read_mostly;
1084 EXPORT_SYMBOL(zero_pfn);
1087 +unsigned long uksm_zero_pfn __read_mostly;
1088 +EXPORT_SYMBOL_GPL(uksm_zero_pfn);
1089 +struct page *empty_uksm_zero_page;
1091 +static int __init setup_uksm_zero_page(void)
1093 + unsigned long addr;
1094 + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0);
1096 + panic("Oh boy, that early out of memory?");
1098 + empty_uksm_zero_page = virt_to_page((void *) addr);
1099 + SetPageReserved(empty_uksm_zero_page);
1101 + uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
1105 +core_initcall(setup_uksm_zero_page);
1109 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
1111 @@ -131,6 +153,7 @@ static int __init init_zero_pfn(void)
1112 core_initcall(init_zero_pfn);
1116 #if defined(SPLIT_RSS_COUNTING)
1118 void sync_mm_rss(struct mm_struct *mm)
1119 @@ -878,6 +901,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1120 rss[MM_ANONPAGES]++;
1122 rss[MM_FILEPAGES]++;
1124 + /* vm_normal_page() should have returned NULL for the uksm zero page */
1125 + uksm_bugon_zeropage(pte);
1127 + uksm_map_zero_page(pte);
1131 @@ -1120,8 +1148,10 @@ again:
1132 ptent = ptep_get_and_clear_full(mm, addr, pte,
1134 tlb_remove_tlb_entry(tlb, pte, addr);
1135 - if (unlikely(!page))
1136 + if (unlikely(!page)) {
1137 + uksm_unmap_zero_page(ptent);
1140 if (unlikely(details) && details->nonlinear_vma
1141 && linear_page_index(details->nonlinear_vma,
1142 addr) != page->index) {
1143 @@ -1983,8 +2013,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
1145 kunmap_atomic(kaddr);
1146 flush_dcache_page(dst);
1149 copy_user_highpage(dst, src, va, vma);
1150 + uksm_cow_page(vma, src);
1155 @@ -2198,6 +2230,7 @@ gotten:
1156 new_page = alloc_zeroed_user_highpage_movable(vma, address);
1159 + uksm_cow_pte(vma, orig_pte);
1161 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1163 @@ -2223,8 +2256,11 @@ gotten:
1164 dec_mm_counter_fast(mm, MM_FILEPAGES);
1165 inc_mm_counter_fast(mm, MM_ANONPAGES);
1168 + uksm_bugon_zeropage(orig_pte);
1170 + uksm_unmap_zero_page(orig_pte);
1171 inc_mm_counter_fast(mm, MM_ANONPAGES);
1173 flush_cache_page(vma, address, pte_pfn(orig_pte));
1174 entry = mk_pte(new_page, vma->vm_page_prot);
1175 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1176 diff --git a/mm/mmap.c b/mm/mmap.c
1177 index ae91989..844f366 100644
1181 #include <linux/notifier.h>
1182 #include <linux/memory.h>
1183 #include <linux/printk.h>
1184 +#include <linux/ksm.h>
1186 #include <asm/uaccess.h>
1187 #include <asm/cacheflush.h>
1188 @@ -279,6 +280,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
1191 mpol_put(vma_policy(vma));
1192 + uksm_remove_vma(vma);
1193 kmem_cache_free(vm_area_cachep, vma);
1196 @@ -739,9 +741,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
1197 long adjust_next = 0;
1198 int remove_next = 0;
1201 + * To avoid deadlock, uksm_remove_vma() must be called before any spin_lock is
1204 + uksm_remove_vma(vma);
1206 if (next && !insert) {
1207 struct vm_area_struct *exporter = NULL;
1209 + uksm_remove_vma(next);
1210 if (end >= next->vm_end) {
1212 * vma expands, overlapping all the next, and
1213 @@ -838,6 +847,7 @@ again: remove_next = 1 + (end > next->vm_end);
1216 vma->vm_pgoff = pgoff;
1219 next->vm_start += adjust_next << PAGE_SHIFT;
1220 next->vm_pgoff += adjust_next;
1221 @@ -908,16 +918,22 @@ again: remove_next = 1 + (end > next->vm_end);
1222 * up the code too much to do both in one go.
1224 next = vma->vm_next;
1225 - if (remove_next == 2)
1226 + if (remove_next == 2) {
1227 + uksm_remove_vma(next);
1230 + } else if (next) {
1231 vma_gap_update(next);
1234 mm->highest_vm_end = end;
1237 + if (next && !insert)
1238 + uksm_vma_add_new(next);
1241 uprobe_mmap(insert);
1243 + uksm_vma_add_new(vma);
1247 @@ -1310,6 +1326,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1248 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
1249 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1251 + /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */
1252 + uksm_vm_flags_mod(&vm_flags);
1254 if (flags & MAP_LOCKED)
1255 if (!can_do_mlock())
1257 @@ -1651,6 +1670,7 @@ munmap_back:
1258 allow_write_access(file);
1260 file = vma->vm_file;
1261 + uksm_vma_add_new(vma);
1263 perf_event_mmap(vma);
1265 @@ -1692,6 +1712,7 @@ allow_write_and_free_vma:
1266 if (vm_flags & VM_DENYWRITE)
1267 allow_write_access(file);
1269 + uksm_remove_vma(vma);
1270 kmem_cache_free(vm_area_cachep, vma);
1273 @@ -2488,6 +2509,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1275 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1277 + uksm_vma_add_new(new);
1282 @@ -2654,6 +2677,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
1285 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1286 + uksm_vm_flags_mod(&flags);
1288 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
1289 if (error & ~PAGE_MASK)
1290 @@ -2712,6 +2736,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
1291 vma->vm_flags = flags;
1292 vma->vm_page_prot = vm_get_page_prot(flags);
1293 vma_link(mm, vma, prev, rb_link, rb_parent);
1294 + uksm_vma_add_new(vma);
1296 perf_event_mmap(vma);
1297 mm->total_vm += len >> PAGE_SHIFT;
1298 @@ -2747,6 +2772,12 @@ void exit_mmap(struct mm_struct *mm)
1299 /* mm's last user has gone, and its about to be pulled down */
1300 mmu_notifier_release(mm);
1303 + * Taking the write lock on mmap_sem does not harm others,
1304 + * but it's crucial for uksm to avoid races.
1306 + down_write(&mm->mmap_sem);
1308 if (mm->locked_vm) {
1311 @@ -2783,6 +2814,11 @@ void exit_mmap(struct mm_struct *mm)
1313 vm_unacct_memory(nr_accounted);
1316 + mm->mm_rb = RB_ROOT;
1317 + vmacache_invalidate(mm);
1318 + up_write(&mm->mmap_sem);
1320 WARN_ON(atomic_long_read(&mm->nr_ptes) >
1321 (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
1323 @@ -2891,6 +2927,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
1324 new_vma->vm_ops->open(new_vma);
1325 vma_link(mm, new_vma, prev, rb_link, rb_parent);
1326 *need_rmap_locks = false;
1327 + uksm_vma_add_new(new_vma);
1331 @@ -3004,10 +3041,10 @@ static struct vm_area_struct *__install_special_mapping(
1332 ret = insert_vm_struct(mm, vma);
1336 mm->total_vm += len >> PAGE_SHIFT;
1338 perf_event_mmap(vma);
1339 + uksm_vma_add_new(vma);
1343 diff --git a/mm/rmap.c b/mm/rmap.c
1344 index 3e4c721..d39d8a3 100644
1347 @@ -908,9 +908,9 @@ void page_move_anon_rmap(struct page *page,
1350 * __page_set_anon_rmap - set up new anonymous rmap
1351 - * @page: Page to add to rmap
1352 + * @page: Page to add to rmap
1353 * @vma: VM area to add page to.
1354 - * @address: User virtual address of the mapping
1355 + * @address: User virtual address of the mapping
1356 * @exclusive: the page is exclusively owned by the current process
1358 static void __page_set_anon_rmap(struct page *page,
1359 diff --git a/mm/uksm.c b/mm/uksm.c
1360 new file mode 100644
1361 index 0000000..c76fcfc
1366 + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
1368 + * This is an improvement upon KSM. Some basic data structures and routines
1369 + * are borrowed from ksm.c .
1371 + * Its new features:
1372 + * 1. Full system scan:
1373 + * It automatically scans all user processes' anonymous VMAs. Kernel-user
1374 + * interaction to submit a memory area to KSM is no longer needed.
1376 + * 2. Rich area detection:
1377 + * It automatically detects rich areas containing abundant duplicated
1378 + * pages. Rich areas are given a full scan speed. Poor areas are
1379 + * sampled at a reasonable speed with very low CPU consumption.
1381 + * 3. Ultra Per-page scan speed improvement:
1382 + * A new hash algorithm is proposed. As a result, on a machine with
1383 + * a Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
1384 + * can scan memory areas that do not contain duplicated pages at a speed of
1385 + * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
1386 + * 477MB/sec ~ 923MB/sec.
1388 + * 4. Thrashing area avoidance:
1389 + * Thrashing areas (VMAs that have frequent KSM page break-outs) can be
1390 + * filtered out. My benchmark shows this is more efficient than KSM's
1391 + * per-page hash-value-based volatile page detection.
1394 + * 5. Misc changes upon KSM:
1395 + * * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
1396 + * comparison. It's much faster than the default C version on x86.
1397 + * * rmap_item now has a struct page *page member to loosely cache an
1398 + * address-->page mapping, which avoids much of the time-costly
1400 + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
1401 + * * try_to_merge_two_pages() can now revert a pte if it fails. No break_
1402 + * ksm is needed for this case.
1404 + * 6. Full Zero Page consideration (contributed by Figo Zhang):
1405 + * uksmd now considers full zero pages as special pages and merges them
1406 + * into a special unswappable uksm zero page.
1409 +#include <linux/errno.h>
1410 +#include <linux/mm.h>
1411 +#include <linux/fs.h>
1412 +#include <linux/mman.h>
1413 +#include <linux/sched.h>
1414 +#include <linux/rwsem.h>
1415 +#include <linux/pagemap.h>
1416 +#include <linux/rmap.h>
1417 +#include <linux/spinlock.h>
1418 +#include <linux/jhash.h>
1419 +#include <linux/delay.h>
1420 +#include <linux/kthread.h>
1421 +#include <linux/wait.h>
1422 +#include <linux/slab.h>
1423 +#include <linux/rbtree.h>
1424 +#include <linux/memory.h>
1425 +#include <linux/mmu_notifier.h>
1426 +#include <linux/swap.h>
1427 +#include <linux/ksm.h>
1428 +#include <linux/crypto.h>
1429 +#include <linux/scatterlist.h>
1430 +#include <crypto/hash.h>
1431 +#include <linux/random.h>
1432 +#include <linux/math64.h>
1433 +#include <linux/gcd.h>
1434 +#include <linux/freezer.h>
1435 +#include <linux/sradix-tree.h>
1437 +#include <asm/tlbflush.h>
1438 +#include "internal.h"
1443 +#ifdef CONFIG_X86_32
1444 +#define memcmp memcmpx86_32
1446 + * Compare the 4-byte-aligned addresses s1 and s2, of length n
1448 +int memcmpx86_32(void *s1, void *s2, size_t n)
1450 + size_t num = n / 4;
1453 + __asm__ __volatile__
1461 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1469 + * Check whether the page is all zeros.
1471 +static int is_full_zero(const void *s1, size_t len)
1473 + unsigned char same;
1477 + __asm__ __volatile__
1480 + : "=qm" (same), "+D" (s1), "+c" (len)
1488 +#elif defined(CONFIG_X86_64)
1489 +#define memcmp memcmpx86_64
1491 + * Compare the 8-byte-aligned addresses s1 and s2, of length n
1493 +int memcmpx86_64(void *s1, void *s2, size_t n)
1495 + size_t num = n / 8;
1498 + __asm__ __volatile__
1500 + "testq %q3,%q3\n\t"
1503 + "sbbq %q0,%q0\n\t"
1506 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1513 +static int is_full_zero(const void *s1, size_t len)
1515 + unsigned char same;
1519 + __asm__ __volatile__
1522 + : "=qm" (same), "+D" (s1), "+c" (len)
1531 +static int is_full_zero(const void *s1, size_t len)
1533 + unsigned long *src = s1;
1536 + len /= sizeof(*src);
1538 + for (i = 0; i < len; i++) {
1547 +#define UKSM_RUNG_ROUND_FINISHED (1 << 0)
1548 +#define TIME_RATIO_SCALE 10000
1550 +#define SLOT_TREE_NODE_SHIFT 8
1551 +#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT)
1552 +struct slot_tree_node {
1553 + unsigned long size;
1554 + struct sradix_tree_node snode;
1555 + void *stores[SLOT_TREE_NODE_STORE_SIZE];
1558 +static struct kmem_cache *slot_tree_node_cachep;
1560 +static struct sradix_tree_node *slot_tree_node_alloc(void)
1562 + struct slot_tree_node *p;
1563 + p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL);
1570 +static void slot_tree_node_free(struct sradix_tree_node *node)
1572 + struct slot_tree_node *p;
1574 + p = container_of(node, struct slot_tree_node, snode);
1575 + kmem_cache_free(slot_tree_node_cachep, p);
1578 +static void slot_tree_node_extend(struct sradix_tree_node *parent,
1579 + struct sradix_tree_node *child)
1581 + struct slot_tree_node *p, *c;
1583 + p = container_of(parent, struct slot_tree_node, snode);
1584 + c = container_of(child, struct slot_tree_node, snode);
1586 + p->size += c->size;
1589 +void slot_tree_node_assign(struct sradix_tree_node *node,
1590 + unsigned index, void *item)
1592 + struct vma_slot *slot = item;
1593 + struct slot_tree_node *cur;
1595 + slot->snode = node;
1596 + slot->sindex = index;
1599 + cur = container_of(node, struct slot_tree_node, snode);
1600 + cur->size += slot->pages;
1601 + node = node->parent;
1605 +void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset)
1607 + struct vma_slot *slot;
1608 + struct slot_tree_node *cur;
1609 + unsigned long pages;
1611 + if (node->height == 1) {
1612 + slot = node->stores[offset];
1613 + pages = slot->pages;
1615 + cur = container_of(node->stores[offset],
1616 + struct slot_tree_node, snode);
1617 + pages = cur->size;
1621 + cur = container_of(node, struct slot_tree_node, snode);
1622 + cur->size -= pages;
1623 + node = node->parent;
1627 +unsigned long slot_iter_index;
1628 +int slot_iter(void *item, unsigned long height)
1630 + struct slot_tree_node *node;
1631 + struct vma_slot *slot;
1633 + if (height == 1) {
1635 + if (slot_iter_index < slot->pages) {
1639 + slot_iter_index -= slot->pages;
1644 + node = container_of(item, struct slot_tree_node, snode);
1645 + if (slot_iter_index < node->size) {
1649 + slot_iter_index -= node->size;
1656 +static inline void slot_tree_init_root(struct sradix_tree_root *root)
1658 + init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
1659 + root->alloc = slot_tree_node_alloc;
1660 + root->free = slot_tree_node_free;
1661 + root->extend = slot_tree_node_extend;
1662 + root->assign = slot_tree_node_assign;
1663 + root->rm = slot_tree_node_rm;
1666 +void slot_tree_init(void)
1668 + slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
1669 + sizeof(struct slot_tree_node), 0,
1670 + SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
1675 +/* Each rung of this ladder is a list of VMAs having the same scan ratio */
1677 + //struct list_head scanned_list;
1678 + struct sradix_tree_root vma_root;
1679 + struct sradix_tree_root vma_root2;
1681 + struct vma_slot *current_scan;
1682 + unsigned long current_offset;
1685 + * The initial value for current_offset; it should loop over
1686 + * [0 ~ step - 1] so that every slot gets a chance to be scanned.
1688 + unsigned long offset_init;
1689 + unsigned long step; /* dynamic step for current_offset */
1690 + unsigned int flags;
1691 + unsigned long pages_to_scan;
1692 + //unsigned long fully_scanned_slots;
1694 + * A little bit tricky - if cpu_time_ratio > 0, then the value is the
1695 + * cpu time ratio it can spend in rung_i for every scan
1696 + * period. If < 0, then it is the cpu time ratio relative to the
1697 + * max cpu percentage the user specified. Both are in units of
1698 + * 1/TIME_RATIO_SCALE.
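+ * For example, with TIME_RATIO_SCALE == 10000, a value of 40 means 0.4%
+ * of CPU time, while -2500 means 25% of the user-specified max CPU
+ * percentage (see uksm_cpu_preset below).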
1703 + * How long will it take for all slots in this rung to be fully
1704 + * scanned? If it's zero, we don't care about the cover time:
1705 + * it's fully scanned.
1707 + unsigned int cover_msecs;
1708 + //unsigned long vma_num;
1709 + //unsigned long pages; /* Sum of all slot's pages in rung */
1713 + * node of either the stable or unstable rbtree
1717 + struct rb_node node; /* link in the main (un)stable rbtree */
1718 + struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
1720 + unsigned long count; /* TODO: merged with sub_root */
1721 + struct list_head all_list; /* all tree nodes in stable/unstable tree */
1725 + * struct stable_node - node of the stable rbtree
1726 + * @node: rb node of this ksm page in the stable tree
1727 + * @hlist: hlist head of rmap_items using this ksm page
1728 + * @kpfn: page frame number of this ksm page
1730 +struct stable_node {
1731 + struct rb_node node; /* link in sub-rbtree */
1732 + struct tree_node *tree_node; /* its tree_node root in the stable tree, NULL if it's in the hell list */
1733 + struct hlist_head hlist;
1734 + unsigned long kpfn;
1735 + u32 hash_max; /* if == 0, it has not been calculated yet */
1736 + struct list_head all_list; /* in a list for all stable nodes */
1740 + * struct node_vma - group of rmap_items linked in the same stable
1745 + struct vma_slot *slot;
1746 + unsigned long key; /* slot is used as key sorted on hlist */
1748 + struct hlist_node hlist;
1749 + struct hlist_head rmap_hlist;
1750 + struct stable_node *head;
1754 + * struct rmap_item - reverse mapping item for virtual addresses
1755 + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
1756 + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
1757 + * @mm: the memory structure this rmap_item is pointing into
1758 + * @address: the virtual address this rmap_item tracks (+ flags in low bits)
1759 + * @node: rb node of this rmap_item in the unstable tree
1760 + * @head: pointer to stable_node heading this list in the stable tree
1761 + * @hlist: link into hlist of rmap_items hanging off that stable_node
1764 + struct vma_slot *slot;
1765 + struct page *page;
1766 + unsigned long address; /* + low bits used for flags below */
1767 + unsigned long hash_round;
1768 + unsigned long entry_index;
1770 + struct {/* when in unstable tree */
1771 + struct rb_node node;
1772 + struct tree_node *tree_node;
1775 + struct { /* when in stable tree */
1776 + struct node_vma *head;
1777 + struct hlist_node hlist;
1778 + struct anon_vma *anon_vma;
1781 +} __attribute__((aligned(4)));
1783 +struct rmap_list_entry {
1785 + struct rmap_item *item;
1786 + unsigned long addr;
1788 + /* lowest bit is used for is_addr tag */
1789 +} __attribute__((aligned(4))); /* 4-byte aligned to fit into pages */
1792 +/* Basic data structure definition ends */
1796 + * Flags for rmap_item, indicating whether it is listed in the stable/unstable tree.
1797 + * The flags use the low bits of rmap_item.address.
1799 +#define UNSTABLE_FLAG 0x1
1800 +#define STABLE_FLAG 0x2
1801 +#define get_rmap_addr(x) ((x)->address & PAGE_MASK)
1804 + * rmap_list_entry helpers
1806 +#define IS_ADDR_FLAG 1
1807 +#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG)
1808 +#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG)
1809 +#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
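+/*
+ * An entry holding a bare address (rather than an item pointer) is tagged by
+ * setting bit 0; get_clean_addr() strips that tag before the address is used.
+ */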
1813 + * High speed caches for frequently allocated and freed structs
1815 +static struct kmem_cache *rmap_item_cache;
1816 +static struct kmem_cache *stable_node_cache;
1817 +static struct kmem_cache *node_vma_cache;
1818 +static struct kmem_cache *vma_slot_cache;
1819 +static struct kmem_cache *tree_node_cache;
1820 +#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
1821 + sizeof(struct __struct), __alignof__(struct __struct),\
1824 +/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
1825 +#define SCAN_LADDER_SIZE 4
1826 +static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
1828 +/* The evaluation rounds uksmd has finished */
1829 +static unsigned long long uksm_eval_round = 1;
1832 + * we add 1 to this var when we consider we should rebuild the whole
1835 +static unsigned long uksm_hash_round = 1;
1838 + * How many times the whole memory has been scanned.
1840 +static unsigned long long fully_scanned_round = 1;
1842 +/* The total number of virtual pages of all vma slots */
1843 +static u64 uksm_pages_total;
1845 +/* The number of pages that have been scanned since startup */
1846 +static u64 uksm_pages_scanned;
1848 +static u64 scanned_virtual_pages;
1850 +/* The number of pages scanned since the last encode_benefit call */
1851 +static u64 uksm_pages_scanned_last;
1853 +/* If the scanned number is too large, we encode it here */
1854 +static u64 pages_scanned_stored;
1856 +static unsigned long pages_scanned_base;
1858 +/* The number of nodes in the stable tree */
1859 +static unsigned long uksm_pages_shared;
1861 +/* The number of page slots additionally sharing those nodes */
1862 +static unsigned long uksm_pages_sharing;
1864 +/* The number of nodes in the unstable tree */
1865 +static unsigned long uksm_pages_unshared;
1868 + * Milliseconds ksmd should sleep between scans,
1869 + * >= 100ms to be consistent with
1870 + * scan_time_to_sleep_msec()
1872 +static unsigned int uksm_sleep_jiffies;
1874 +/* The real value for the uksmd next sleep */
1875 +static unsigned int uksm_sleep_real;
1877 +/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
1878 +static unsigned int uksm_sleep_saved;
1880 +/* Max percentage of cpu utilization ksmd can take to scan in one batch */
1881 +static unsigned int uksm_max_cpu_percentage;
1883 +static int uksm_cpu_governor;
1885 +static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
1887 +struct uksm_cpu_preset_s {
1888 + int cpu_ratio[SCAN_LADDER_SIZE];
1889 + unsigned int cover_msecs[SCAN_LADDER_SIZE];
1890 + unsigned int max_cpu; /* percentage */
1893 +struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
1894 + { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
1895 + { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
1896 + { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
1897 + { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
1900 +/* The default value for uksm_ema_page_time if it's not initialized */
1901 +#define UKSM_PAGE_TIME_DEFAULT 500
1903 +/* cost to scan one page, as an exponential moving average, in nsecs */
1904 +static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
1906 +/* The exponential moving average alpha weight, in percentage. */
1907 +#define EMA_ALPHA 20
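+
+/*
+ * The exponential-moving-average update this weight implies (a sketch of the
+ * usual formula; the helper that applies it is not shown in this excerpt):
+ *
+ *	ema_page_time = (EMA_ALPHA * sample + (100 - EMA_ALPHA) * ema_page_time) / 100;
+ */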
1910 + * The threshold used to filter out thrashing areas.
1911 + * If it == 0, filtering is disabled; otherwise it is the percentage upper bound
1912 + * on the thrashing ratio of all areas. Any area with a bigger thrashing ratio
1913 + * will be considered as having a zero duplication ratio.
1915 +static unsigned int uksm_thrash_threshold = 50;
1917 +/* The dedup ratio that is considered to be abundant */
1918 +static unsigned int uksm_abundant_threshold = 10;
1920 +/* All slots having merged pages in this eval round. */
1921 +struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
1923 +/* How many times uksmd has slept since startup */
1924 +static unsigned long long uksm_sleep_times;
1926 +#define UKSM_RUN_STOP 0
1927 +#define UKSM_RUN_MERGE 1
1928 +static unsigned int uksm_run = 1;
1930 +static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
1931 +static DEFINE_MUTEX(uksm_thread_mutex);
1934 + * List vma_slot_new is for newly created vma_slots waiting to be added by
1935 + * uksmd. If one cannot be added (e.g. because it's too small), it's moved to
1936 + * vma_slot_noadd. vma_slot_del is the list for vma_slots whose corresponding
1937 + * VMAs have been removed/freed.
1939 +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
1940 +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
1941 +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
1942 +static DEFINE_SPINLOCK(vma_slot_list_lock);
1944 +/* The unstable tree heads */
1945 +static struct rb_root root_unstable_tree = RB_ROOT;
1948 + * All tree_nodes are in a list to be freed at once when unstable tree is
1949 + * freed after each scan round.
1951 +static struct list_head unstable_tree_node_list =
1952 + LIST_HEAD_INIT(unstable_tree_node_list);
1954 +/* List contains all stable nodes */
1955 +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
1958 + * When the hash strength is changed, the stable tree must be delta_hashed and
1959 + * re-structured. We use the two sets of structs below to speed up the
1960 + * re-structuring of the stable tree.
1962 +static struct list_head
1963 +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
1964 + LIST_HEAD_INIT(stable_tree_node_list[1])};
1966 +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
1967 +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
1968 +static struct rb_root *root_stable_treep = &root_stable_tree[0];
1969 +static unsigned long stable_tree_index;
1971 +/* The hash strength needed to hash a full page */
1972 +#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32))
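+/* e.g. with 4KB pages this is 4096 / 4 = 1024 sampled u32 words per page */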
1974 +/* The hash strength needed for loop-back hashing */
1975 +#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10)
1977 +/* The random offsets in a page */
1978 +static u32 *random_nums;
1980 +/* The hash strength */
1981 +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
1983 +/* The delta value each time the hash strength increases or decreases */
1984 +static unsigned long hash_strength_delta;
1985 +#define HASH_STRENGTH_DELTA_MAX 5
1987 +/* The time we have saved due to random_sample_hash */
1988 +static u64 rshash_pos;
1990 +/* The time we have wasted due to hash collision */
1991 +static u64 rshash_neg;
1993 +struct uksm_benefit {
1997 + unsigned long base;
2001 + * The relative cost of memcmp, compared to 1 time unit of random sample
2002 + * hash; this value is measured when the uksm module is initialized.
2004 +static unsigned long memcmp_cost;
2006 +static unsigned long rshash_neg_cont_zero;
2007 +static unsigned long rshash_cont_obscure;
2009 +/* The possible states of hash strength adjustment heuristic */
2010 +enum rshash_states {
2018 +/* The possible direction we are about to adjust hash strength */
2019 +enum rshash_direct {
2026 +/* random sampling hash state machine */
2028 + enum rshash_states state;
2029 + enum rshash_direct pre_direct;
2031 + /* Keep a lookup window of size 5; if above_count/below_count > 3
2032 + * within this window, we stop trying.
2034 + u8 lookup_window_index;
2035 + u64 stable_benefit;
2036 + unsigned long turn_point_down;
2037 + unsigned long turn_benefit_down;
2038 + unsigned long turn_point_up;
2039 + unsigned long turn_benefit_up;
2040 + unsigned long stable_point;
2043 +/* zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX] */
2044 +static u32 *zero_hash_table;
2046 +static inline struct node_vma *alloc_node_vma(void)
2048 + struct node_vma *node_vma;
2049 + node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
2051 + INIT_HLIST_HEAD(&node_vma->rmap_hlist);
2052 + INIT_HLIST_NODE(&node_vma->hlist);
2057 +static inline void free_node_vma(struct node_vma *node_vma)
2059 + kmem_cache_free(node_vma_cache, node_vma);
2063 +static inline struct vma_slot *alloc_vma_slot(void)
2065 + struct vma_slot *slot;
2068 + * In case uksm has not been initialized yet.
2069 + * TODO: reconsider the call site of uksm_init() in the future.
2071 + if (!vma_slot_cache)
2074 + slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
2076 + INIT_LIST_HEAD(&slot->slot_list);
2077 + INIT_LIST_HEAD(&slot->dedup_list);
2078 + slot->flags |= UKSM_SLOT_NEED_RERAND;
2083 +static inline void free_vma_slot(struct vma_slot *vma_slot)
2085 + kmem_cache_free(vma_slot_cache, vma_slot);
2090 +static inline struct rmap_item *alloc_rmap_item(void)
2092 + struct rmap_item *rmap_item;
2094 + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
2096 + /* BUG if the lowest bit is not clear; it is reserved for flag use */
2097 + BUG_ON(is_addr(rmap_item));
2102 +static inline void free_rmap_item(struct rmap_item *rmap_item)
2104 + rmap_item->slot = NULL; /* debug safety */
2105 + kmem_cache_free(rmap_item_cache, rmap_item);
2108 +static inline struct stable_node *alloc_stable_node(void)
2110 + struct stable_node *node;
2111 + node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
2115 + INIT_HLIST_HEAD(&node->hlist);
2116 + list_add(&node->all_list, &stable_node_list);
2120 +static inline void free_stable_node(struct stable_node *stable_node)
2122 + list_del(&stable_node->all_list);
2123 + kmem_cache_free(stable_node_cache, stable_node);
2126 +static inline struct tree_node *alloc_tree_node(struct list_head *list)
2128 + struct tree_node *node;
2129 + node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
2133 + list_add(&node->all_list, list);
2137 +static inline void free_tree_node(struct tree_node *node)
2139 + list_del(&node->all_list);
2140 + kmem_cache_free(tree_node_cache, node);
2143 +static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
2145 + struct anon_vma *anon_vma = rmap_item->anon_vma;
2147 + put_anon_vma(anon_vma);
2152 + * Remove a stable node from the stable_tree; it may be unlinked from its tree_node,
2153 + * and its parent tree_node may be removed if no other stable node is pending.
2155 + * @stable_node: The node to be removed
2156 + * @unlink_rb: Will this node be unlinked from the rbtree?
2157 + * @remove_tree_node: Will its tree_node be removed if empty?
2159 +static void remove_node_from_stable_tree(struct stable_node *stable_node,
2160 + int unlink_rb, int remove_tree_node)
2162 + struct node_vma *node_vma;
2163 + struct rmap_item *rmap_item;
2164 + struct hlist_node *n;
2166 + if (!hlist_empty(&stable_node->hlist)) {
2167 + hlist_for_each_entry_safe(node_vma, n,
2168 + &stable_node->hlist, hlist) {
2169 + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
2170 + uksm_pages_sharing--;
2172 + uksm_drop_anon_vma(rmap_item);
2173 + rmap_item->address &= PAGE_MASK;
2175 + free_node_vma(node_vma);
2179 + /* the last one is counted as shared */
2180 + uksm_pages_shared--;
2181 + uksm_pages_sharing++;
2184 + if (stable_node->tree_node && unlink_rb) {
2185 + rb_erase(&stable_node->node,
2186 + &stable_node->tree_node->sub_root);
2188 + if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
2189 + remove_tree_node) {
2190 + rb_erase(&stable_node->tree_node->node,
2191 + root_stable_treep);
2192 + free_tree_node(stable_node->tree_node);
2194 + stable_node->tree_node->count--;
2198 + free_stable_node(stable_node);
2203 + * get_uksm_page: checks if the page indicated by the stable node
2204 + * is still its ksm page, despite having held no reference to it.
2205 + * In which case we can trust the content of the page, and it
2206 + * returns the gotten page; but if the page has now been zapped,
2207 + * remove the stale node from the stable tree and return NULL.
2209 + * You would expect the stable_node to hold a reference to the ksm page.
2210 + * But if it increments the page's count, swapping out has to wait for
2211 + * ksmd to come around again before it can free the page, which may take
2212 + * seconds or even minutes: much too unresponsive. So instead we use a
2213 + * "keyhole reference": access to the ksm page from the stable node peeps
2214 + * out through its keyhole to see if that page still holds the right key,
2215 + * pointing back to this stable node. This relies on freeing a PageAnon
2216 + * page to reset its page->mapping to NULL, and relies on no other use of
2217 + * a page to put something that might look like our key in page->mapping.
2219 + * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
2220 + * but this is different - made simpler by uksm_thread_mutex being held, but
2221 + * interesting for assuming that no other use of the struct page could ever
2222 + * put our expected_mapping into page->mapping (or a field of the union which
2223 + * coincides with page->mapping). The RCU calls are not for KSM at all, but
2224 + * to keep the page_count protocol described with page_cache_get_speculative.
2226 + * Note: it is possible that get_uksm_page() will return NULL one moment,
2227 + * then page the next, if the page is in between page_freeze_refs() and
2228 + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
2229 + * is on its way to being freed; but it is an anomaly to bear in mind.
2231 + * @unlink_rb: whether removing this node should first unlink it from
2232 + * its rbtree. stable_node_reinsert will prevent this when restructuring the
2233 + * node out of its old tree.
2235 + * @remove_tree_node: if this is the last one in its tree_node, will the
2236 + * tree_node be freed? If we are inserting a stable node, this tree_node may
2237 + * be reused, so don't free it.
2239 +static struct page *get_uksm_page(struct stable_node *stable_node,
2240 + int unlink_rb, int remove_tree_node)
2242 + struct page *page;
2243 + void *expected_mapping;
2245 + page = pfn_to_page(stable_node->kpfn);
2246 + expected_mapping = (void *)stable_node +
2247 + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
2249 + if (page->mapping != expected_mapping)
2251 + if (!get_page_unless_zero(page))
2253 + if (page->mapping != expected_mapping) {
2257 + rcu_read_unlock();
2260 + rcu_read_unlock();
2261 + remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
2267 + * Removing rmap_item from stable or unstable tree.
2268 + * This function will clean the information from the stable/unstable tree.
2270 +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
2272 + if (rmap_item->address & STABLE_FLAG) {
2273 + struct stable_node *stable_node;
2274 + struct node_vma *node_vma;
2275 + struct page *page;
2277 + node_vma = rmap_item->head;
2278 + stable_node = node_vma->head;
2279 + page = get_uksm_page(stable_node, 1, 1);
2284 + * page lock is needed because it's racing with
2285 + * try_to_unmap_ksm(), etc.
2288 + hlist_del(&rmap_item->hlist);
2290 + if (hlist_empty(&node_vma->rmap_hlist)) {
2291 + hlist_del(&node_vma->hlist);
2292 + free_node_vma(node_vma);
2294 + unlock_page(page);
2297 + if (hlist_empty(&stable_node->hlist)) {
2298 + /* do NOT call remove_node_from_stable_tree() here,
2299 + * it's possible for a forked rmap_item not in
2300 + * stable tree while the in-tree rmap_items were
2303 + uksm_pages_shared--;
2305 + uksm_pages_sharing--;
2308 + uksm_drop_anon_vma(rmap_item);
2309 + } else if (rmap_item->address & UNSTABLE_FLAG) {
2310 + if (rmap_item->hash_round == uksm_hash_round) {
2312 + rb_erase(&rmap_item->node,
2313 + &rmap_item->tree_node->sub_root);
2314 + if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
2315 + rb_erase(&rmap_item->tree_node->node,
2316 + &root_unstable_tree);
2318 + free_tree_node(rmap_item->tree_node);
2320 + rmap_item->tree_node->count--;
2322 + uksm_pages_unshared--;
2325 + rmap_item->address &= PAGE_MASK;
2326 + rmap_item->hash_max = 0;
2329 + cond_resched(); /* we're called from many long loops */
2332 +static inline int slot_in_uksm(struct vma_slot *slot)
2334 + return list_empty(&slot->slot_list);
2338 + * Test if the mm is exiting
2340 +static inline bool uksm_test_exit(struct mm_struct *mm)
2342 + return atomic_read(&mm->mm_users) == 0;
2346 + * Need to do two things:
2347 + * 1. check if slot was moved to del list
2348 + * 2. make sure the mmap_sem is taken while the vma is still valid.
2350 + * My concern here is that in some cases, this may cause
2351 + * vma_slot_list_lock() waiters to be serialized further by some
2352 + * sem->wait_lock; can this really be expensive?
2356 + * 0: if successfully locked mmap_sem
2357 + * -ENOENT: this slot was moved to del list
2358 + * -EBUSY: vma lock failed
2360 +static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
2362 + struct vm_area_struct *vma;
2363 + struct mm_struct *mm;
2364 + struct rw_semaphore *sem;
2366 + spin_lock(&vma_slot_list_lock);
2368 + /* the slot_list was removed and re-initialized from the new list when
2369 + * it entered the uksm list. If it is not empty now, it must have been moved to the del list
2371 + if (!slot_in_uksm(slot)) {
2372 + spin_unlock(&vma_slot_list_lock);
2376 + BUG_ON(slot->pages != vma_pages(slot->vma));
2377 + /* Ok, vma still valid */
2380 + sem = &mm->mmap_sem;
2382 + if (uksm_test_exit(mm)) {
2383 + spin_unlock(&vma_slot_list_lock);
2387 + if (down_read_trylock(sem)) {
2388 + spin_unlock(&vma_slot_list_lock);
2392 + spin_unlock(&vma_slot_list_lock);
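+/*
+ * A minimal usage sketch for the helper above (see
+ * get_mergeable_page_lock_mmap() below for the real caller):
+ *
+ *	if (try_down_read_slot_mmap_sem(slot))
+ *		return;
+ *	...use slot->vma with mmap_sem held for read...
+ *	up_read(&slot->vma->vm_mm->mmap_sem);
+ */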
2396 +static inline unsigned long
2397 +vma_page_address(struct page *page, struct vm_area_struct *vma)
2399 + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
2400 + unsigned long address;
2402 + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
2403 + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
2404 + /* page should be within @vma mapping range */
2411 +/* return 0 on success with the item's mmap_sem locked */
2412 +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
2414 + struct mm_struct *mm;
2415 + struct vma_slot *slot = item->slot;
2416 + int err = -EINVAL;
2418 + struct page *page;
2421 + * try_down_read_slot_mmap_sem() returns non-zero if the slot has been
2422 + * removed by uksm_remove_vma() or if the mmap_sem could not be taken.
2424 + if (try_down_read_slot_mmap_sem(slot))
2427 + mm = slot->vma->vm_mm;
2429 + if (uksm_test_exit(mm))
2432 + page = item->page;
2434 + if (!get_page_unless_zero(page)) {
2435 + rcu_read_unlock();
2439 + /* No need to consider huge page here. */
2440 + if (item->slot->vma->anon_vma != page_anon_vma(page) ||
2441 + vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
2444 + * should we release this item because of its stale page
2448 + rcu_read_unlock();
2451 + rcu_read_unlock();
2455 + up_read(&mm->mmap_sem);
2460 + * What kind of VMA is considered?
2462 +static inline int vma_can_enter(struct vm_area_struct *vma)
2464 + return uksm_flags_can_scan(vma->vm_flags);
2468 + * Called whenever a fresh new vma is created. A new vma_slot
2469 + * is created and inserted into a global list. Must be called
2470 + * after the vma is inserted into its mm.
2472 +void uksm_vma_add_new(struct vm_area_struct *vma)
2474 + struct vma_slot *slot;
2476 + if (!vma_can_enter(vma)) {
2477 + vma->uksm_vma_slot = NULL;
2481 + slot = alloc_vma_slot();
2483 + vma->uksm_vma_slot = NULL;
2487 + vma->uksm_vma_slot = slot;
2488 + vma->vm_flags |= VM_MERGEABLE;
2490 + slot->mm = vma->vm_mm;
2491 + slot->ctime_j = jiffies;
2492 + slot->pages = vma_pages(vma);
2493 + spin_lock(&vma_slot_list_lock);
2494 + list_add_tail(&slot->slot_list, &vma_slot_new);
2495 + spin_unlock(&vma_slot_list_lock);
2499 + * Called after vma is unlinked from its mm
2501 +void uksm_remove_vma(struct vm_area_struct *vma)
2503 + struct vma_slot *slot;
2505 + if (!vma->uksm_vma_slot)
2508 + slot = vma->uksm_vma_slot;
2509 + spin_lock(&vma_slot_list_lock);
2510 + if (slot_in_uksm(slot)) {
2512 + * This slot has been added by ksmd, so move it to the del list
2513 + * and wait for ksmd to free it.
2515 + list_add_tail(&slot->slot_list, &vma_slot_del);
2518 + * It's still on the new list, so it's OK to free the slot directly.
2520 + list_del(&slot->slot_list);
2521 + free_vma_slot(slot);
2523 + spin_unlock(&vma_slot_list_lock);
2524 + vma->uksm_vma_slot = NULL;
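+/*
+ * A rough sketch of the vma_slot life cycle implied by the two hooks above:
+ *
+ *	uksm_vma_add_new():  slot is put on the vma_slot_new list
+ *	ksmd picks it up:    slot_list is emptied, i.e. slot_in_uksm()
+ *	uksm_remove_vma():   in uksm   -> moved to vma_slot_del, freed by ksmd
+ *	                     still new -> freed immediately
+ */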
2527 +/* 32/3 < they < 32/2 */
2531 +#define HASH_FROM_TO(from, to) \
2532 +for (index = from; index < to; index++) { \
2533 + pos = random_nums[index]; \
2534 + hash += key[pos]; \
2535 + hash += (hash << shiftl); \
2536 + hash ^= (hash >> shiftr); \
2540 +#define HASH_FROM_DOWN_TO(from, to) \
2541 +for (index = from - 1; index >= to; index--) { \
2542 + hash ^= (hash >> shiftr); \
2543 + hash ^= (hash >> (shiftr*2)); \
2544 + hash -= (hash << shiftl); \
2545 + hash += (hash << (shiftl*2)); \
2546 + pos = random_nums[index]; \
2547 + hash -= key[pos]; \
2551 + * The main random sample hash function.
2553 +static u32 random_sample_hash(void *addr, u32 hash_strength)
2555 + u32 hash = 0xdeadbeef;
2556 + int index, pos, loop = hash_strength;
2557 + u32 *key = (u32 *)addr;
2559 + if (loop > HASH_STRENGTH_FULL)
2560 + loop = HASH_STRENGTH_FULL;
2562 + HASH_FROM_TO(0, loop);
2564 + if (hash_strength > HASH_STRENGTH_FULL) {
2565 + loop = hash_strength - HASH_STRENGTH_FULL;
2566 + HASH_FROM_TO(0, loop);
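+/*
+ * Illustrative example of the sampling above, assuming hash_strength == 3
+ * and random_nums beginning with {7, 42, 103, ...} (made-up values): the
+ * loop expands to
+ *
+ *	hash += key[7];   hash += hash << shiftl; hash ^= hash >> shiftr;
+ *	hash += key[42];  hash += hash << shiftl; hash ^= hash >> shiftr;
+ *	hash += key[103]; hash += hash << shiftl; hash ^= hash >> shiftr;
+ *
+ * so only hash_strength of the page's u32 words are touched and the
+ * per-page cost scales with the current strength, not with the page size.
+ */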
2574 + * It's used when hash strength is adjusted
2576 + * @addr The page's virtual address
2577 + * @from The original hash strength
2578 + * @to The hash strength changed to
2579 + * @hash The hash value generated with the "from" hash strength
2581 + * return the hash value
2583 +static u32 delta_hash(void *addr, int from, int to, u32 hash)
2585 + u32 *key = (u32 *)addr;
2586 + int index, pos; /* make sure they are int type */
2589 + if (from >= HASH_STRENGTH_FULL) {
2590 + from -= HASH_STRENGTH_FULL;
2591 + to -= HASH_STRENGTH_FULL;
2592 + HASH_FROM_TO(from, to);
2593 + } else if (to <= HASH_STRENGTH_FULL) {
2594 + HASH_FROM_TO(from, to);
2596 + HASH_FROM_TO(from, HASH_STRENGTH_FULL);
2597 + HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
2600 + if (from <= HASH_STRENGTH_FULL) {
2601 + HASH_FROM_DOWN_TO(from, to);
2602 + } else if (to >= HASH_STRENGTH_FULL) {
2603 + from -= HASH_STRENGTH_FULL;
2604 + to -= HASH_STRENGTH_FULL;
2605 + HASH_FROM_DOWN_TO(from, to);
2607 + HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
2608 + HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
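+/*
+ * The useful property here (a sketch, for a fixed kmapped page address):
+ *
+ *	h1 = random_sample_hash(addr, s1);
+ *	h2 = delta_hash(addr, s1, s2, h1);
+ *
+ * h2 equals what random_sample_hash(addr, s2) would have returned, as long
+ * as the page content did not change in between, so re-tuning the strength
+ * costs only the difference in samples instead of a full re-hash.
+ */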
2618 +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
2622 + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
2625 + * return 0 if no page has been scanned since last call, 1 otherwise.
2627 +static inline int encode_benefit(void)
2629 + u64 scanned_delta, pos_delta, neg_delta;
2630 + unsigned long base = benefit.base;
2632 + scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
2634 + if (!scanned_delta)
2637 + scanned_delta >>= base;
2638 + pos_delta = rshash_pos >> base;
2639 + neg_delta = rshash_neg >> base;
2641 + if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
2642 + CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
2643 + CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
2644 + benefit.scanned >>= 1;
2645 + benefit.neg >>= 1;
2646 + benefit.pos >>= 1;
2648 + scanned_delta >>= 1;
2653 + benefit.pos += pos_delta;
2654 + benefit.neg += neg_delta;
2655 + benefit.scanned += scanned_delta;
2657 + BUG_ON(!benefit.scanned);
2659 + rshash_pos = rshash_neg = 0;
2660 + uksm_pages_scanned_last = uksm_pages_scanned;
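+/*
+ * A small numeric sketch of the overflow handling above: when any of the
+ * accumulators is about to overflow, all of them (and the pending deltas)
+ * are halved together, which preserves their relative proportions up to
+ * rounding, e.g.
+ *
+ *	pos = 6, neg = 2, scanned = 8  ->  pos = 3, neg = 1, scanned = 4
+ *
+ * still describes the same benefit per scanned page as before.
+ */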
2665 +static inline void reset_benefit(void)
2670 + benefit.scanned = 0;
2673 +static inline void inc_rshash_pos(unsigned long delta)
2675 + if (CAN_OVERFLOW_U64(rshash_pos, delta))
2678 + rshash_pos += delta;
2681 +static inline void inc_rshash_neg(unsigned long delta)
2683 + if (CAN_OVERFLOW_U64(rshash_neg, delta))
2686 + rshash_neg += delta;
2690 +static inline u32 page_hash(struct page *page, unsigned long hash_strength,
2691 + int cost_accounting)
2694 + unsigned long delta;
2696 + void *addr = kmap_atomic(page);
2698 + val = random_sample_hash(addr, hash_strength);
2699 + kunmap_atomic(addr);
2701 + if (cost_accounting) {
2702 + if (HASH_STRENGTH_FULL > hash_strength)
2703 + delta = HASH_STRENGTH_FULL - hash_strength;
2707 + inc_rshash_pos(delta);
2713 +static int memcmp_pages(struct page *page1, struct page *page2,
2714 + int cost_accounting)
2716 + char *addr1, *addr2;
2719 + addr1 = kmap_atomic(page1);
2720 + addr2 = kmap_atomic(page2);
2721 + ret = memcmp(addr1, addr2, PAGE_SIZE);
2722 + kunmap_atomic(addr2);
2723 + kunmap_atomic(addr1);
2725 + if (cost_accounting)
2726 + inc_rshash_neg(memcmp_cost);
2731 +static inline int pages_identical(struct page *page1, struct page *page2)
2733 + return !memcmp_pages(page1, page2, 0);
2736 +static inline int is_page_full_zero(struct page *page)
2741 + addr = kmap_atomic(page);
2742 + ret = is_full_zero(addr, PAGE_SIZE);
2743 + kunmap_atomic(addr);
2748 +static int write_protect_page(struct vm_area_struct *vma, struct page *page,
2749 + pte_t *orig_pte, pte_t *old_pte)
2751 + struct mm_struct *mm = vma->vm_mm;
2752 + unsigned long addr;
2756 + int err = -EFAULT;
2757 + unsigned long mmun_start; /* For mmu_notifiers */
2758 + unsigned long mmun_end; /* For mmu_notifiers */
2760 + addr = page_address_in_vma(page, vma);
2761 + if (addr == -EFAULT)
2764 + BUG_ON(PageTransCompound(page));
2766 + mmun_start = addr;
2767 + mmun_end = addr + PAGE_SIZE;
2768 + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2770 + ptep = page_check_address(page, mm, addr, &ptl, 0);
2777 + if (pte_write(*ptep) || pte_dirty(*ptep)) {
2780 + swapped = PageSwapCache(page);
2781 + flush_cache_page(vma, addr, page_to_pfn(page));
2783 + * Ok this is tricky, when get_user_pages_fast() runs it doesn't
2784 + * take any lock, therefore the check that we are going to make
2785 + * with the pagecount against the mapcount is racy and
2786 + * O_DIRECT can happen right after the check.
2787 + * So we clear the pte and flush the tlb before the check;
2788 + * this assures us that no O_DIRECT can happen after the check
2789 + * or in the middle of the check.
2791 + entry = ptep_clear_flush(vma, addr, ptep);
2793 + * Check that no O_DIRECT or similar I/O is in progress on the
2796 + if (page_mapcount(page) + 1 + swapped != page_count(page)) {
2797 + set_pte_at(mm, addr, ptep, entry);
2800 + if (pte_dirty(entry))
2801 + set_page_dirty(page);
2802 + entry = pte_mkclean(pte_wrprotect(entry));
2803 + set_pte_at_notify(mm, addr, ptep, entry);
2805 + *orig_pte = *ptep;
2809 + pte_unmap_unlock(ptep, ptl);
2811 + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2816 +#define MERGE_ERR_PGERR 1 /* the page is invalid, cannot continue */
2817 +#define MERGE_ERR_COLLI 2 /* there is a collision */
2818 +#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */
2819 +#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */
2823 + * replace_page - replace page in vma by new ksm page
2824 + * @vma: vma that holds the pte pointing to page
2825 + * @page: the page we are replacing by kpage
2826 + * @kpage: the ksm page we replace page by
2827 + * @orig_pte: the original value of the pte
2829 + * Returns 0 on success, MERGE_ERR_PGERR on failure.
2831 +static int replace_page(struct vm_area_struct *vma, struct page *page,
2832 + struct page *kpage, pte_t orig_pte)
2834 + struct mm_struct *mm = vma->vm_mm;
2842 + unsigned long addr;
2843 + int err = MERGE_ERR_PGERR;
2844 + unsigned long mmun_start; /* For mmu_notifiers */
2845 + unsigned long mmun_end; /* For mmu_notifiers */
2847 + addr = page_address_in_vma(page, vma);
2848 + if (addr == -EFAULT)
2851 + pgd = pgd_offset(mm, addr);
2852 + if (!pgd_present(*pgd))
2855 + pud = pud_offset(pgd, addr);
2856 + if (!pud_present(*pud))
2859 + pmd = pmd_offset(pud, addr);
2860 + BUG_ON(pmd_trans_huge(*pmd));
2861 + if (!pmd_present(*pmd))
2864 + mmun_start = addr;
2865 + mmun_end = addr + PAGE_SIZE;
2866 + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2868 + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
2869 + if (!pte_same(*ptep, orig_pte)) {
2870 + pte_unmap_unlock(ptep, ptl);
2874 + flush_cache_page(vma, addr, pte_pfn(*ptep));
2875 + ptep_clear_flush(vma, addr, ptep);
2876 + entry = mk_pte(kpage, vma->vm_page_prot);
2878 + /* special treatment is needed for zero_page */
2879 + if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
2880 + (page_to_pfn(kpage) == zero_pfn))
2881 + entry = pte_mkspecial(entry);
2884 + page_add_anon_rmap(kpage, vma, addr);
2887 + set_pte_at_notify(mm, addr, ptep, entry);
2889 + page_remove_rmap(page);
2890 + if (!page_mapped(page))
2891 + try_to_free_swap(page);
2894 + pte_unmap_unlock(ptep, ptl);
2897 + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2904 + * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value. A
2905 + * zero hash value at HASH_STRENGTH_MAX is used to indicate that its
2906 + * hash_max member has not been calculated.
2908 + * @page The page that needs to be hashed
2909 + * @hash_old The hash value calculated with current hash strength
2911 + * return the new hash value calculated at HASH_STRENGTH_MAX
2913 +static inline u32 page_hash_max(struct page *page, u32 hash_old)
2918 + addr = kmap_atomic(page);
2919 + hash_max = delta_hash(addr, hash_strength,
2920 + HASH_STRENGTH_MAX, hash_old);
2922 + kunmap_atomic(addr);
2927 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2932 + * We compare the hash again, to ensure that it is really a hash collision
2933 + * instead of a mismatch caused by a page write.
2935 +static inline int check_collision(struct rmap_item *rmap_item,
2939 + struct page *page = rmap_item->page;
2941 + /* if this rmap_item has already been hash_maxed, then the collision
2942 + * must have appeared in the second-level rbtree search. In this case we check
2943 + * if its hash_max value has been changed. Otherwise, the collision
2944 + * happened in the first-level rbtree search, so we check against its
2945 + * current hash value.
2947 + if (rmap_item->hash_max) {
2948 + inc_rshash_neg(memcmp_cost);
2949 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2951 + if (rmap_item->hash_max == page_hash_max(page, hash))
2952 + err = MERGE_ERR_COLLI;
2954 + err = MERGE_ERR_CHANGED;
2956 + inc_rshash_neg(memcmp_cost + hash_strength);
2958 + if (page_hash(page, hash_strength, 0) == hash)
2959 + err = MERGE_ERR_COLLI;
2961 + err = MERGE_ERR_CHANGED;
2967 +static struct page *page_trans_compound_anon(struct page *page)
2969 + if (PageTransCompound(page)) {
2970 + struct page *head = compound_head(page);
2972 + * head may actually be split and freed from under
2973 + * us, but that's ok here.
2975 + if (PageAnon(head))
2981 +static int page_trans_compound_anon_split(struct page *page)
2984 + struct page *transhuge_head = page_trans_compound_anon(page);
2985 + if (transhuge_head) {
2986 + /* Get the reference on the head to split it. */
2987 + if (get_page_unless_zero(transhuge_head)) {
2989 + * Recheck we got the reference while the head
2990 + * was still anonymous.
2992 + if (PageAnon(transhuge_head))
2993 + ret = split_huge_page(transhuge_head);
2996 + * Retry later if split_huge_page run
3000 + put_page(transhuge_head);
3002 + /* Retry later if split_huge_page run from under us. */
3009 + * Try to merge a rmap_item.page with a kpage in stable node. kpage must
3010 + * already be a ksm page.
3012 + * @return 0 if the pages were merged, a MERGE_ERR_* code otherwise.
3014 +static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
3015 + struct page *kpage, u32 hash)
3017 + struct vm_area_struct *vma = rmap_item->slot->vma;
3018 + struct mm_struct *mm = vma->vm_mm;
3019 + pte_t orig_pte = __pte(0);
3020 + int err = MERGE_ERR_PGERR;
3021 + struct page *page;
3023 + if (uksm_test_exit(mm))
3026 + page = rmap_item->page;
3028 + if (page == kpage) { /* ksm page forked */
3033 + if (PageTransCompound(page) && page_trans_compound_anon_split(page))
3035 + BUG_ON(PageTransCompound(page));
3037 + if (!PageAnon(page) || !PageKsm(kpage))
3041 + * We need the page lock to read a stable PageSwapCache in
3042 + * write_protect_page(). We use trylock_page() instead of
3043 + * lock_page() because we don't want to wait here - we
3044 + * prefer to continue scanning and merging different pages,
3045 + * then come back to this page when it is unlocked.
3047 + if (!trylock_page(page))
3050 + * If this anonymous page is mapped only here, its pte may need
3051 + * to be write-protected. If it's mapped elsewhere, all of its
3052 + * ptes are necessarily already write-protected. But in either
3053 + * case, we need to lock and check page_count is not raised.
3055 + if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
3056 + if (pages_identical(page, kpage))
3057 + err = replace_page(vma, page, kpage, orig_pte);
3059 + err = check_collision(rmap_item, hash);
3062 + if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
3063 + munlock_vma_page(page);
3064 + if (!PageMlocked(kpage)) {
3065 + unlock_page(page);
3067 + mlock_vma_page(kpage);
3068 + page = kpage; /* for final unlock */
3072 + unlock_page(page);
3080 + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
3081 + * to restore a page mapping that has been changed in try_to_merge_two_pages.
3083 + * @return 0 on success.
3085 +static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
3086 + pte_t orig_pte, pte_t wprt_pte)
3088 + struct mm_struct *mm = vma->vm_mm;
3095 + int err = -EFAULT;
3097 + pgd = pgd_offset(mm, addr);
3098 + if (!pgd_present(*pgd))
3101 + pud = pud_offset(pgd, addr);
3102 + if (!pud_present(*pud))
3105 + pmd = pmd_offset(pud, addr);
3106 + if (!pmd_present(*pmd))
3109 + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
3110 + if (!pte_same(*ptep, wprt_pte)) {
3111 + /* already copied, let it be */
3112 + pte_unmap_unlock(ptep, ptl);
3117 + * Good, still here. While we still hold the ksm page, it cannot
3118 + * return to the free page pool, so there is no way a pte could have been
3119 + * changed to another page and then back to this page. And remember that ksm pages
3120 + * are not reused in do_wp_page(). So it's safe to restore the original
3123 + flush_cache_page(vma, addr, pte_pfn(*ptep));
3124 + ptep_clear_flush(vma, addr, ptep);
3125 + set_pte_at_notify(mm, addr, ptep, orig_pte);
3127 + pte_unmap_unlock(ptep, ptl);
3134 + * try_to_merge_two_pages() - take two identical pages and prepare
3135 + * them to be merged into one page (rmap_item->page)
3137 + * @return 0 if we successfully merged two identical pages into
3138 + * one ksm page. MERGE_ERR_COLLI if it was only a hash collision
3139 + * found during the rbtree search. MERGE_ERR_CHANGED if rmap_item has been
3140 + * changed since it was hashed. MERGE_ERR_PGERR otherwise.
3143 +static int try_to_merge_two_pages(struct rmap_item *rmap_item,
3144 + struct rmap_item *tree_rmap_item,
3147 + pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
3148 + pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
3149 + struct vm_area_struct *vma1 = rmap_item->slot->vma;
3150 + struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
3151 + struct page *page = rmap_item->page;
3152 + struct page *tree_page = tree_rmap_item->page;
3153 + int err = MERGE_ERR_PGERR;
3154 + struct address_space *saved_mapping;
3157 + if (rmap_item->page == tree_rmap_item->page)
3160 + if (PageTransCompound(page) && page_trans_compound_anon_split(page))
3162 + BUG_ON(PageTransCompound(page));
3164 + if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page))
3166 + BUG_ON(PageTransCompound(tree_page));
3168 + if (!PageAnon(page) || !PageAnon(tree_page))
3171 + if (!trylock_page(page))
3175 + if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
3176 + unlock_page(page);
3181 + * While we hold page lock, upgrade page from
3182 + * PageAnon+anon_vma to PageKsm+NULL stable_node:
3183 + * stable_tree_insert() will update stable_node.
3185 + saved_mapping = page->mapping;
3186 + set_page_stable_node(page, NULL);
3187 + mark_page_accessed(page);
3188 + unlock_page(page);
3190 + if (!trylock_page(tree_page))
3193 + if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
3194 + unlock_page(tree_page);
3198 + if (pages_identical(page, tree_page)) {
3199 + err = replace_page(vma2, tree_page, page, wprt_pte2);
3201 + unlock_page(tree_page);
3205 + if ((vma2->vm_flags & VM_LOCKED)) {
3206 + munlock_vma_page(tree_page);
3207 + if (!PageMlocked(page)) {
3208 + unlock_page(tree_page);
3210 + mlock_vma_page(page);
3211 + tree_page = page; /* for final unlock */
3215 + unlock_page(tree_page);
3217 + goto out; /* success */
3220 + if (tree_rmap_item->hash_max &&
3221 + tree_rmap_item->hash_max == rmap_item->hash_max) {
3222 + err = MERGE_ERR_COLLI_MAX;
3223 + } else if (page_hash(page, hash_strength, 0) ==
3224 + page_hash(tree_page, hash_strength, 0)) {
3225 + inc_rshash_neg(memcmp_cost + hash_strength * 2);
3226 + err = MERGE_ERR_COLLI;
3228 + err = MERGE_ERR_CHANGED;
3231 + unlock_page(tree_page);
3236 + if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
3237 + orig_pte1, wprt_pte1))
3238 + page->mapping = saved_mapping;
3240 + unlock_page(page);
3245 +static inline int hash_cmp(u32 new_val, u32 node_val)
3247 + if (new_val > node_val)
3249 + else if (new_val < node_val)
3255 +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
3257 + u32 hash_max = item->hash_max;
3260 + hash_max = page_hash_max(item->page, hash);
3262 + item->hash_max = hash_max;
3271 + * stable_tree_search() - search the stable tree for a page
3273 + * @item: the rmap_item we are comparing with
3274 + * @hash: the hash value of this item->page already calculated
3276 + * @return the page we have found, NULL otherwise. The page returned has
3279 +static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
3281 + struct rb_node *node = root_stable_treep->rb_node;
3282 + struct tree_node *tree_node;
3283 + unsigned long hash_max;
3284 + struct page *page = item->page;
3285 + struct stable_node *stable_node;
3287 + stable_node = page_stable_node(page);
3288 + if (stable_node) {
3289 + /* ksm page forked, that is
3290 + * if (PageKsm(page) && !in_stable_tree(rmap_item))
3291 + * it's actually gotten once outside.
3300 + tree_node = rb_entry(node, struct tree_node, node);
3302 + cmp = hash_cmp(hash, tree_node->hash);
3305 + node = node->rb_left;
3307 + node = node->rb_right;
3315 + if (tree_node->count == 1) {
3316 + stable_node = rb_entry(tree_node->sub_root.rb_node,
3317 + struct stable_node, node);
3318 + BUG_ON(!stable_node);
3320 + goto get_page_out;
3324 + * ok, we have to search the second
3325 + * level subtree, hash the page to a
3328 + node = tree_node->sub_root.rb_node;
3330 + hash_max = rmap_item_hash_max(item, hash);
3335 + stable_node = rb_entry(node, struct stable_node, node);
3337 + cmp = hash_cmp(hash_max, stable_node->hash_max);
3340 + node = node->rb_left;
3342 + node = node->rb_right;
3344 + goto get_page_out;
3350 + page = get_uksm_page(stable_node, 1, 1);
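+/*
+ * A sketch of the two-level lookup used above (and mirrored by
+ * unstable_tree_search_insert() further down):
+ *
+ *	tree_node   = first-level rbtree walk keyed by the sampled hash
+ *	stable_node = second-level walk keyed by hash_max, only entered
+ *	              when the tree_node holds more than one candidate
+ *	page        = get_uksm_page(stable_node, ...)
+ *
+ * so the expensive full-strength hash_max is computed only on first-level
+ * collisions.
+ */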
3354 +static int try_merge_rmap_item(struct rmap_item *item,
3355 + struct page *kpage,
3356 + struct page *tree_page)
3360 + unsigned long addr;
3361 + struct vm_area_struct *vma = item->slot->vma;
3363 + addr = get_rmap_addr(item);
3364 + ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0);
3368 + if (pte_write(*ptep)) {
3369 + /* has changed, abort! */
3370 + pte_unmap_unlock(ptep, ptl);
3374 + get_page(tree_page);
3375 + page_add_anon_rmap(tree_page, vma, addr);
3377 + flush_cache_page(vma, addr, pte_pfn(*ptep));
3378 + ptep_clear_flush(vma, addr, ptep);
3379 + set_pte_at_notify(vma->vm_mm, addr, ptep,
3380 + mk_pte(tree_page, vma->vm_page_prot));
3382 + page_remove_rmap(kpage);
3385 + pte_unmap_unlock(ptep, ptl);
3391 + * try_merge_with_stable() - when two rmap_items need to be inserted
3392 + * into the stable tree and their page was found to be identical to a stable ksm page,
3393 + * this is the last chance we have to merge them into one.
3395 + * @item1: the rmap_item holding the page which we wanted to insert
3396 + * into stable tree.
3397 + * @item2: the other rmap_item we found when unstable tree search
3398 + * @oldpage: the page currently mapped by the two rmap_items
3399 + * @tree_page: the page we found identical in stable tree node
3400 + * @success1: returns whether item1 was successfully merged
3401 + * @success2: returns whether item2 was successfully merged
3403 +static void try_merge_with_stable(struct rmap_item *item1,
3404 + struct rmap_item *item2,
3405 + struct page **kpage,
3406 + struct page *tree_page,
3407 + int *success1, int *success2)
3409 + struct vm_area_struct *vma1 = item1->slot->vma;
3410 + struct vm_area_struct *vma2 = item2->slot->vma;
3414 + if (unlikely(*kpage == tree_page)) {
3415 + /* I don't think this can really happen */
3416 + printk(KERN_WARNING "UKSM: unexpected condition detected in "
3417 + "try_merge_with_stable() -- *kpage == tree_page !\n");
3423 + if (!PageAnon(*kpage) || !PageKsm(*kpage))
3426 + if (!trylock_page(tree_page))
3429 + /* If the oldpage is still ksm and still pointed
3430 + * to in the right place, and still write protected,
3431 + * we are confident it's not changed, no need to
3433 + * Beware, we cannot take nested pte locks,
3436 + if (!try_merge_rmap_item(item1, *kpage, tree_page))
3437 + goto unlock_failed;
3439 + /* ok, now vma2; remember that pte1 is already set */
3440 + if (!try_merge_rmap_item(item2, *kpage, tree_page))
3448 + if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
3449 + (*success2 && vma2->vm_flags & VM_LOCKED)) {
3450 + munlock_vma_page(*kpage);
3451 + if (!PageMlocked(tree_page))
3452 + mlock_vma_page(tree_page);
3456 + * We do not need oldpage any more in the caller, so we can break the lock
3459 + unlock_page(*kpage);
3460 + *kpage = tree_page; /* Get unlocked outside. */
3464 + unlock_page(tree_page);
3469 +static inline void stable_node_hash_max(struct stable_node *node,
3470 + struct page *page, u32 hash)
3472 + u32 hash_max = node->hash_max;
3475 + hash_max = page_hash_max(page, hash);
3476 + node->hash_max = hash_max;
3481 +struct stable_node *new_stable_node(struct tree_node *tree_node,
3482 + struct page *kpage, u32 hash_max)
3484 + struct stable_node *new_stable_node;
3486 + new_stable_node = alloc_stable_node();
3487 + if (!new_stable_node)
3490 + new_stable_node->kpfn = page_to_pfn(kpage);
3491 + new_stable_node->hash_max = hash_max;
3492 + new_stable_node->tree_node = tree_node;
3493 + set_page_stable_node(kpage, new_stable_node);
3495 + return new_stable_node;
3499 +struct stable_node *first_level_insert(struct tree_node *tree_node,
3500 + struct rmap_item *rmap_item,
3501 + struct rmap_item *tree_rmap_item,
3502 + struct page **kpage, u32 hash,
3503 + int *success1, int *success2)
3506 + struct page *tree_page;
3508 + struct stable_node *stable_node, *new_snode;
3509 + struct rb_node *parent = NULL, **new;
3511 + /* this tree node contains no sub-tree yet */
3512 + stable_node = rb_entry(tree_node->sub_root.rb_node,
3513 + struct stable_node, node);
3515 + tree_page = get_uksm_page(stable_node, 1, 0);
3517 + cmp = memcmp_pages(*kpage, tree_page, 1);
3519 + try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
3520 + tree_page, success1, success2);
3521 + put_page(tree_page);
3522 + if (!*success1 && !*success2)
3525 + return stable_node;
3529 + * collision at the first level, try to create a subtree.
3530 + * A new node needs to be created.
3532 + put_page(tree_page);
3534 + stable_node_hash_max(stable_node, tree_page,
3536 + hash_max = rmap_item_hash_max(rmap_item, hash);
3537 + cmp = hash_cmp(hash_max, stable_node->hash_max);
3539 + parent = &stable_node->node;
3541 + new = &parent->rb_left;
3542 + } else if (cmp > 0) {
3543 + new = &parent->rb_right;
3550 + /* the only stable_node was deleted, we reuse its tree_node.
3553 + new = &tree_node->sub_root.rb_node;
3556 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
3560 + rb_link_node(&new_snode->node, parent, new);
3561 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
3562 + tree_node->count++;
3563 + *success1 = *success2 = 1;
3572 +struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
3573 + struct rmap_item *rmap_item,
3574 + struct rmap_item *tree_rmap_item,
3575 + struct page **kpage, u32 hash,
3576 + int *success1, int *success2)
3578 + struct page *tree_page;
3580 + struct stable_node *stable_node, *new_snode;
3581 + struct rb_node *parent, **new;
3585 + new = &tree_node->sub_root.rb_node;
3587 + hash_max = rmap_item_hash_max(rmap_item, hash);
3591 + stable_node = rb_entry(*new, struct stable_node, node);
3593 + cmp = hash_cmp(hash_max, stable_node->hash_max);
3597 + new = &parent->rb_left;
3598 + } else if (cmp > 0) {
3600 + new = &parent->rb_right;
3602 + tree_page = get_uksm_page(stable_node, 1, 0);
3604 + cmp = memcmp_pages(*kpage, tree_page, 1);
3606 + try_merge_with_stable(rmap_item,
3607 + tree_rmap_item, kpage,
3608 + tree_page, success1, success2);
3610 + put_page(tree_page);
3611 + if (!*success1 && !*success2)
3614 + * successfully merged with a stable
3617 + return stable_node;
3619 + put_page(tree_page);
3624 + * the stable node may have been
3625 + * deleted, and the subtree may
3626 + * have been restructured; cannot
3627 + * continue, re-search it.
3629 + if (tree_node->count) {
3632 + /* reuse the tree node*/
3634 + new = &tree_node->sub_root.rb_node;
3640 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
3644 + rb_link_node(&new_snode->node, parent, new);
3645 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
3646 + tree_node->count++;
3647 + *success1 = *success2 = 1;
3657 + * stable_tree_insert() - try to insert a merged page in unstable tree to
3660 + * @kpage: the page that needs to be inserted
3661 + * @hash: the current hash of this page
3662 + * @rmap_item: the rmap_item being scanned
3663 + * @tree_rmap_item: the rmap_item found on unstable tree
3664 + * @success1: return if rmap_item is merged
3665 + * @success2: return if tree_rmap_item is merged
3667 + * @return the stable_node on stable tree if at least one
3668 + * rmap_item is inserted into stable tree, NULL
3671 +static struct stable_node *
3672 +stable_tree_insert(struct page **kpage, u32 hash,
3673 + struct rmap_item *rmap_item,
3674 + struct rmap_item *tree_rmap_item,
3675 + int *success1, int *success2)
3677 + struct rb_node **new = &root_stable_treep->rb_node;
3678 + struct rb_node *parent = NULL;
3679 + struct stable_node *stable_node;
3680 + struct tree_node *tree_node;
3683 + *success1 = *success2 = 0;
3688 + tree_node = rb_entry(*new, struct tree_node, node);
3690 + cmp = hash_cmp(hash, tree_node->hash);
3694 + new = &parent->rb_left;
3695 + } else if (cmp > 0) {
3697 + new = &parent->rb_right;
3703 + if (tree_node->count == 1) {
3704 + stable_node = first_level_insert(tree_node, rmap_item,
3705 + tree_rmap_item, kpage,
3706 + hash, success1, success2);
3708 + stable_node = stable_subtree_insert(tree_node,
3709 + rmap_item, tree_rmap_item, kpage,
3710 + hash, success1, success2);
3714 + /* no tree node found */
3715 + tree_node = alloc_tree_node(stable_tree_node_listp);
3717 + stable_node = NULL;
3721 + stable_node = new_stable_node(tree_node, *kpage, hash_max);
3722 + if (!stable_node) {
3723 + free_tree_node(tree_node);
3727 + tree_node->hash = hash;
3728 + rb_link_node(&tree_node->node, parent, new);
3729 + rb_insert_color(&tree_node->node, root_stable_treep);
3731 + new = &tree_node->sub_root.rb_node;
3733 + rb_link_node(&stable_node->node, parent, new);
3734 + rb_insert_color(&stable_node->node, &tree_node->sub_root);
3735 + tree_node->count++;
3736 + *success1 = *success2 = 1;
3740 + return stable_node;
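+/*
+ * In short, stable_tree_insert() above takes one of three paths:
+ *
+ *	tree_node found, count == 1  -> first_level_insert()
+ *	tree_node found, count  > 1  -> stable_subtree_insert()
+ *	no tree_node for this hash   -> allocate a tree_node plus a new
+ *	                                stable_node and link both in
+ */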
3745 + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
3747 + * @return 0 on success, -EBUSY if unable to lock the mmap_sem,
3748 + * -EINVAL if the page mapping has been changed.
3750 +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
3754 + err = get_mergeable_page_lock_mmap(tree_rmap_item);
3756 + if (err == -EINVAL) {
3757 + /* its page map has been changed, remove it */
3758 + remove_rmap_item_from_tree(tree_rmap_item);
3761 + /* The page is gotten and mmap_sem is locked now. */
3767 + * unstable_tree_search_insert() - search an unstable tree rmap_item with the
3768 + * same hash value. Get its page and trylock the mmap_sem
3771 +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
3775 + struct rb_node **new = &root_unstable_tree.rb_node;
3776 + struct rb_node *parent = NULL;
3777 + struct tree_node *tree_node;
3779 + struct rmap_item *tree_rmap_item;
3784 + tree_node = rb_entry(*new, struct tree_node, node);
3786 + cmp = hash_cmp(hash, tree_node->hash);
3790 + new = &parent->rb_left;
3791 + } else if (cmp > 0) {
3793 + new = &parent->rb_right;
3799 + /* got the tree_node */
3800 + if (tree_node->count == 1) {
3801 + tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
3802 + struct rmap_item, node);
3803 + BUG_ON(!tree_rmap_item);
3805 + goto get_page_out;
3808 + /* well, search the collision subtree */
3809 + new = &tree_node->sub_root.rb_node;
3811 + hash_max = rmap_item_hash_max(rmap_item, hash);
3816 + tree_rmap_item = rb_entry(*new, struct rmap_item,
3819 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3822 + new = &parent->rb_left;
3824 + new = &parent->rb_right;
3826 + goto get_page_out;
3829 + /* alloc a new tree_node */
3830 + tree_node = alloc_tree_node(&unstable_tree_node_list);
3834 + tree_node->hash = hash;
3835 + rb_link_node(&tree_node->node, parent, new);
3836 + rb_insert_color(&tree_node->node, &root_unstable_tree);
3838 + new = &tree_node->sub_root.rb_node;
3841 + /* not found even in the sub-tree */
3842 + rmap_item->tree_node = tree_node;
3843 + rmap_item->address |= UNSTABLE_FLAG;
3844 + rmap_item->hash_round = uksm_hash_round;
3845 + rb_link_node(&rmap_item->node, parent, new);
3846 + rb_insert_color(&rmap_item->node, &tree_node->sub_root);
3848 + uksm_pages_unshared++;
3852 + if (tree_rmap_item->page == rmap_item->page)
3855 + if (get_tree_rmap_item_page(tree_rmap_item))
3858 + return tree_rmap_item;
3861 +static void hold_anon_vma(struct rmap_item *rmap_item,
3862 + struct anon_vma *anon_vma)
3864 + rmap_item->anon_vma = anon_vma;
3865 + get_anon_vma(anon_vma);
3870 + * stable_tree_append() - append a rmap_item to a stable node. Deduplication
3871 + * ratio statistics are updated in this function.
3874 +static void stable_tree_append(struct rmap_item *rmap_item,
3875 + struct stable_node *stable_node, int logdedup)
3877 + struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
3878 + unsigned long key = (unsigned long)rmap_item->slot;
3879 + unsigned long factor = rmap_item->slot->rung->step;
3881 + BUG_ON(!stable_node);
3882 + rmap_item->address |= STABLE_FLAG;
3884 + if (hlist_empty(&stable_node->hlist)) {
3885 + uksm_pages_shared++;
3886 + goto node_vma_new;
3888 + uksm_pages_sharing++;
3891 + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
3892 + if (node_vma->key >= key)
3896 + node_vma->slot->pages_bemerged += factor;
3897 + if (list_empty(&node_vma->slot->dedup_list))
3898 + list_add(&node_vma->slot->dedup_list,
3904 + if (node_vma->key == key) {
3905 + node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
3907 + } else if (node_vma->key > key) {
3908 + node_vma_cont = node_vma;
3913 + /* no same vma already in node, alloc a new node_vma */
3914 + new_node_vma = alloc_node_vma();
3915 + BUG_ON(!new_node_vma);
3916 + new_node_vma->head = stable_node;
3917 + new_node_vma->slot = rmap_item->slot;
3920 + hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
3921 + } else if (node_vma->key != key) {
3922 + if (node_vma->key < key)
3923 + hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist);
3925 + hlist_add_before(&new_node_vma->hlist,
3926 + &node_vma->hlist);
3930 + node_vma = new_node_vma;
3932 +node_vma_ok: /* ok, ready to add to the list */
3933 + rmap_item->head = node_vma;
3934 + hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
3935 + hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
3937 + rmap_item->slot->pages_merged++;
3938 + if (node_vma_cont) {
3939 + node_vma = node_vma_cont;
3940 + hlist_for_each_entry_continue(node_vma, hlist) {
3941 + node_vma->slot->pages_bemerged += factor;
3942 + if (list_empty(&node_vma->slot->dedup_list))
3943 + list_add(&node_vma->slot->dedup_list,
3951 + * We use break_ksm to break COW on a ksm page: it's a stripped down
3953 + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
3956 + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
3957 + * in case the application has unmapped and remapped mm,addr meanwhile.
3958 + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
3959 + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
3961 +static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
3963 + struct page *page;
3968 + page = follow_page(vma, addr, FOLL_GET);
3969 + if (IS_ERR_OR_NULL(page))
3971 + if (PageKsm(page)) {
3972 + ret = handle_mm_fault(vma->vm_mm, vma, addr,
3973 + FAULT_FLAG_WRITE);
3975 + ret = VM_FAULT_WRITE;
3977 + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
3979 + * We must loop because handle_mm_fault() may back out if there's
3980 + * any difficulty e.g. if pte accessed bit gets updated concurrently.
3982 + * VM_FAULT_WRITE is what we have been hoping for: it indicates that
3983 + * COW has been broken, even if the vma does not permit VM_WRITE;
3984 + * but note that a concurrent fault might break PageKsm for us.
3986 + * VM_FAULT_SIGBUS could occur if we race with truncation of the
3987 + * backing file, which also invalidates anonymous pages: that's
3988 + * okay, that truncation will have unmapped the PageKsm for us.
3990 + * VM_FAULT_OOM: at the time of writing (late July 2009), setting
3991 + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
3992 + * current task has TIF_MEMDIE set, and will be OOM killed on return
3993 + * to user; and ksmd, having no mm, would never be chosen for that.
3995 + * But if the mm is in a limited mem_cgroup, then the fault may fail
3996 + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
3997 + * even ksmd can fail in this way - though it's usually breaking ksm
3998 + * just to undo a merge it made a moment before, so unlikely to oom.
4000 + * That's a pity: we might therefore have more kernel pages allocated
4001 + * than we're counting as nodes in the stable tree; but uksm_do_scan
4002 + * will retry to break_cow on each pass, so should recover the page
4003 + * in due course. The important thing is to not let VM_MERGEABLE
4004 + * be cleared while any such pages might remain in the area.
4006 + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
4009 +static void break_cow(struct rmap_item *rmap_item)
4011 + struct vm_area_struct *vma = rmap_item->slot->vma;
4012 + struct mm_struct *mm = vma->vm_mm;
4013 + unsigned long addr = get_rmap_addr(rmap_item);
4015 + if (uksm_test_exit(mm))
4018 + break_ksm(vma, addr);
4024 + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
4025 + * than check every pte of a given vma, the locking doesn't quite work for
4026 + * that - an rmap_item is assigned to the stable tree after inserting ksm
4027 + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
4028 + * rmap_items from parent to child at fork time (so as not to waste time
4029 + * if exit comes before the next scan reaches it).
4031 + * Similarly, although we'd like to remove rmap_items (so updating counts
4032 + * and freeing memory) when unmerging an area, it's easier to leave that
4033 + * to the next pass of ksmd - consider, for example, how ksmd might be
4034 + * in cmp_and_merge_page on one of the rmap_items we would be removing.
4036 +inline int unmerge_uksm_pages(struct vm_area_struct *vma,
4037 + unsigned long start, unsigned long end)
4039 + unsigned long addr;
4042 + for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
4043 + if (uksm_test_exit(vma->vm_mm))
4045 + if (signal_pending(current))
4046 + err = -ERESTARTSYS;
4048 + err = break_ksm(vma, addr);
4053 +static inline void inc_uksm_pages_scanned(void)
4058 + if (uksm_pages_scanned == U64_MAX) {
4061 + delta = uksm_pages_scanned >> pages_scanned_base;
4063 + if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
4064 + pages_scanned_stored >>= 1;
4066 + pages_scanned_base++;
4069 + pages_scanned_stored += delta;
4071 + uksm_pages_scanned = uksm_pages_scanned_last = 0;
4074 + uksm_pages_scanned++;
4077 +static inline int find_zero_page_hash(int strength, u32 hash)
4079 + return (zero_hash_table[strength] == hash);
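+/*
+ * A sketch of the zero-page fast path built on the helper above: the scan
+ * loop already has the page's hash in hand, so
+ *
+ *	if (find_zero_page_hash(hash_strength, hash))
+ *		cmp_and_merge_zero_page(vma, page);
+ *
+ * only pages whose hash matches the precomputed all-zero hash for the
+ * current strength pay for the full is_page_full_zero() check.
+ */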
4083 +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
4085 + struct page *zero_page = empty_uksm_zero_page;
4086 + struct mm_struct *mm = vma->vm_mm;
4087 + pte_t orig_pte = __pte(0);
4088 + int err = -EFAULT;
4090 + if (uksm_test_exit(mm))
4093 + if (PageTransCompound(page) && page_trans_compound_anon_split(page))
4095 + BUG_ON(PageTransCompound(page));
4097 + if (!PageAnon(page))
4100 + if (!trylock_page(page))
4103 + if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
4104 + if (is_page_full_zero(page))
4105 + err = replace_page(vma, page, zero_page, orig_pte);
4108 + unlock_page(page);
4114 + * cmp_and_merge_page() - first see if page can be merged into the stable
4115 + * tree; if not, compare hash to previous and if it's the same, see if page
4116 + * can be inserted into the unstable tree, or merged with a page already there
4117 + * and both transferred to the stable tree.
4119 + * @page: the page that we are searching identical page to.
4120 + * @rmap_item: the reverse mapping into the virtual address of this page
4122 +static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
4124 + struct rmap_item *tree_rmap_item;
4125 + struct page *page;
4126 + struct page *kpage = NULL;
4129 + unsigned int success1, success2;
4130 + struct stable_node *snode;
4132 + struct rb_node *parent = NULL, **new;
4134 + remove_rmap_item_from_tree(rmap_item);
4135 + page = rmap_item->page;
4137 + /* We first start with searching the page inside the stable tree */
4138 + kpage = stable_tree_search(rmap_item, hash);
4140 + err = try_to_merge_with_uksm_page(rmap_item, kpage,
4144 + * The page was successfully merged, add
4145 + * its rmap_item to the stable tree.
4146 + * page lock is needed because it's
4147 + * racing with try_to_unmap_ksm(), etc.
4150 + snode = page_stable_node(kpage);
4151 + stable_tree_append(rmap_item, snode, 1);
4152 + unlock_page(kpage);
4154 + return; /* success */
4159 + * if it's a collision and it has already been searched in the sub-rbtree
4160 + * (hash_max != 0), we want to abort, because if it is
4161 + * successfully merged in the unstable tree, the collision tends to
4164 + if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
4169 + unstable_tree_search_insert(rmap_item, hash);
4170 + if (tree_rmap_item) {
4171 + err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
4173 + * As soon as we merge this page, we want to remove the
4174 + * rmap_item of the page we have merged with from the unstable
4175 + * tree, and insert it instead as new node in the stable tree.
4179 + remove_rmap_item_from_tree(tree_rmap_item);
4181 + snode = stable_tree_insert(&kpage, hash,
4182 + rmap_item, tree_rmap_item,
4183 + &success1, &success2);
4186 + * Do not log dedup for tree item, it's not counted as
4187 + * scanned in this round.
4190 + stable_tree_append(tree_rmap_item, snode, 0);
4193 + * The order of these two stable appends is important:
4194 + * we are scanning rmap_item.
4197 + stable_tree_append(rmap_item, snode, 1);
4200 + * The original kpage may be unlocked inside
4201 + * stable_tree_insert() already. This page
4202 + * should be unlocked before doing
4205 + unlock_page(kpage);
4208 + break_cow(rmap_item);
4211 + break_cow(tree_rmap_item);
4213 + } else if (err == MERGE_ERR_COLLI) {
4214 + BUG_ON(tree_rmap_item->tree_node->count > 1);
4216 + rmap_item_hash_max(tree_rmap_item,
4217 + tree_rmap_item->tree_node->hash);
4219 + hash_max = rmap_item_hash_max(rmap_item, hash);
4220 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
4221 + parent = &tree_rmap_item->node;
4223 + new = &parent->rb_left;
4225 + new = &parent->rb_right;
4229 + rmap_item->tree_node = tree_rmap_item->tree_node;
4230 + rmap_item->address |= UNSTABLE_FLAG;
4231 + rmap_item->hash_round = uksm_hash_round;
4232 + rb_link_node(&rmap_item->node, parent, new);
4233 + rb_insert_color(&rmap_item->node,
4234 + &tree_rmap_item->tree_node->sub_root);
4235 + rmap_item->tree_node->count++;
4238 + * either one of the pages has changed or they collide
4239 + * at the max hash strength; we consider them ill items.
4241 + remove_rmap_item_from_tree(tree_rmap_item);
4244 + put_page(tree_rmap_item->page);
4245 + up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
4252 +static inline unsigned long get_pool_index(struct vma_slot *slot,
4253 + unsigned long index)
4255 + unsigned long pool_index;
4257 + pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
4258 + if (pool_index >= slot->pool_size)
4260 + return pool_index;
4263 +static inline unsigned long index_page_offset(unsigned long index)
4265 + return offset_in_page(sizeof(struct rmap_list_entry *) * index);
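+/*
+ * Roughly, the two helpers above address the rmap_list as a virtual array
+ * split across on-demand pool pages:
+ *
+ *	page  = slot->rmap_list_pool[get_pool_index(slot, index)];
+ *	entry = kmap(page) + index_page_offset(index);
+ *
+ * pages are allocated lazily in get_rmap_list_entry() and released again
+ * once their pool_counts drop to zero.
+ */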
4269 +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
4270 + unsigned long index, int need_alloc)
4272 + unsigned long pool_index;
4273 + struct page *page;
4277 + pool_index = get_pool_index(slot, index);
4278 + if (!slot->rmap_list_pool[pool_index]) {
4282 + page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
4286 + slot->rmap_list_pool[pool_index] = page;
4289 + addr = kmap(slot->rmap_list_pool[pool_index]);
4290 + addr += index_page_offset(index);
4295 +static inline void put_rmap_list_entry(struct vma_slot *slot,
4296 + unsigned long index)
4298 + unsigned long pool_index;
4300 + pool_index = get_pool_index(slot, index);
4301 + BUG_ON(!slot->rmap_list_pool[pool_index]);
4302 + kunmap(slot->rmap_list_pool[pool_index]);
4305 +static inline int entry_is_new(struct rmap_list_entry *entry)
4307 + return !entry->item;
4310 +static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
4311 + unsigned long index)
4313 + return slot->vma->vm_start + (index << PAGE_SHIFT);
4316 +static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
4318 + unsigned long addr;
4320 + if (is_addr(entry->addr))
4321 + addr = get_clean_addr(entry->addr);
4322 + else if (entry->item)
4323 + addr = get_rmap_addr(entry->item);
4330 +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
4332 + if (is_addr(entry->addr))
4335 + return entry->item;
4338 +static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
4339 + unsigned long index)
4341 + unsigned long pool_index;
4343 + pool_index = get_pool_index(slot, index);
4344 + BUG_ON(!slot->rmap_list_pool[pool_index]);
4345 + slot->pool_counts[pool_index]++;
4348 +static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
4349 + unsigned long index)
4351 + unsigned long pool_index;
4353 + pool_index = get_pool_index(slot, index);
4354 + BUG_ON(!slot->rmap_list_pool[pool_index]);
4355 + BUG_ON(!slot->pool_counts[pool_index]);
4356 + slot->pool_counts[pool_index]--;
4359 +static inline int entry_has_rmap(struct rmap_list_entry *entry)
4361 + return !is_addr(entry->addr) && entry->item;
4364 +static inline void swap_entries(struct rmap_list_entry *entry1,
4365 + unsigned long index1,
4366 + struct rmap_list_entry *entry2,
4367 + unsigned long index2)
4369 + struct rmap_list_entry tmp;
4371 + /* swapping two new entries is meaningless */
4372 + BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
4375 + *entry1 = *entry2;
4378 + if (entry_has_rmap(entry1))
4379 + entry1->item->entry_index = index1;
4381 + if (entry_has_rmap(entry2))
4382 + entry2->item->entry_index = index2;
4384 + if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
4385 + inc_rmap_list_pool_count(entry1->item->slot, index1);
4386 + dec_rmap_list_pool_count(entry1->item->slot, index2);
4387 + } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
4388 + inc_rmap_list_pool_count(entry2->item->slot, index2);
4389 + dec_rmap_list_pool_count(entry2->item->slot, index1);
4393 +static inline void free_entry_item(struct rmap_list_entry *entry)
4395 + unsigned long index;
4396 + struct rmap_item *item;
4398 + if (!is_addr(entry->addr)) {
4399 + BUG_ON(!entry->item);
4400 + item = entry->item;
4401 + entry->addr = get_rmap_addr(item);
4402 + set_is_addr(entry->addr);
4403 + index = item->entry_index;
4404 + remove_rmap_item_from_tree(item);
4405 + dec_rmap_list_pool_count(item->slot, index);
4406 + free_rmap_item(item);
4410 +static inline int pool_entry_boundary(unsigned long index)
4412 + unsigned long linear_addr;
4414 + linear_addr = sizeof(struct rmap_list_entry *) * index;
4415 + return index && !offset_in_page(linear_addr);
4418 +static inline void try_free_last_pool(struct vma_slot *slot,
4419 + unsigned long index)
4421 + unsigned long pool_index;
4423 + pool_index = get_pool_index(slot, index);
4424 + if (slot->rmap_list_pool[pool_index] &&
4425 + !slot->pool_counts[pool_index]) {
4426 + __free_page(slot->rmap_list_pool[pool_index]);
4427 + slot->rmap_list_pool[pool_index] = NULL;
4428 + slot->flags |= UKSM_SLOT_NEED_SORT;
4433 +static inline unsigned long vma_item_index(struct vm_area_struct *vma,
4434 + struct rmap_item *item)
4436 + return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
4439 +static int within_same_pool(struct vma_slot *slot,
4440 + unsigned long i, unsigned long j)
4442 + unsigned long pool_i, pool_j;
4444 + pool_i = get_pool_index(slot, i);
4445 + pool_j = get_pool_index(slot, j);
4447 + return (pool_i == pool_j);
4450 +static void sort_rmap_entry_list(struct vma_slot *slot)
4452 + unsigned long i, j;
4453 + struct rmap_list_entry *entry, *swap_entry;
4455 + entry = get_rmap_list_entry(slot, 0, 0);
4456 + for (i = 0; i < slot->pages; ) {
4459 + goto skip_whole_pool;
4461 + if (entry_is_new(entry))
4464 + if (is_addr(entry->addr)) {
4469 + j = vma_item_index(slot->vma, entry->item);
4473 + if (within_same_pool(slot, i, j))
4474 + swap_entry = entry + j - i;
4476 + swap_entry = get_rmap_list_entry(slot, j, 1);
4478 + swap_entries(entry, i, swap_entry, j);
4479 + if (!within_same_pool(slot, i, j))
4480 + put_rmap_list_entry(slot, j);
4484 + i += PAGE_SIZE / sizeof(*entry);
4485 + if (i < slot->pages)
4486 + entry = get_rmap_list_entry(slot, i, 0);
4490 + if (i >= slot->pages - 1 ||
4491 + !within_same_pool(slot, i, i + 1)) {
4492 + put_rmap_list_entry(slot, i);
4493 + if (i + 1 < slot->pages)
4494 + entry = get_rmap_list_entry(slot, i + 1, 0);
4501 + /* free empty pool entries which contain no rmap_item */
4502 + /* CAN be simplified to be based only on pool_counts once known to be bug free !!!!! */
4503 + for (i = 0; i < slot->pool_size; i++) {
4504 + unsigned char has_rmap;
4507 + if (!slot->rmap_list_pool[i])
4511 + addr = kmap(slot->rmap_list_pool[i]);
4513 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
4514 + entry = (struct rmap_list_entry *)addr + j;
4515 + if (is_addr(entry->addr))
4521 + kunmap(slot->rmap_list_pool[i]);
4523 + BUG_ON(slot->pool_counts[i]);
4524 + __free_page(slot->rmap_list_pool[i]);
4525 + slot->rmap_list_pool[i] = NULL;
4529 + slot->flags &= ~UKSM_SLOT_NEED_SORT;
4533 + * vma_fully_scanned() - whether all the pages in this slot have been scanned.
4535 +static inline int vma_fully_scanned(struct vma_slot *slot)
4537 + return slot->pages_scanned == slot->pages;
4541 + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
4542 + * its random permutation. This function also manages the random
4543 + * permutation index bookkeeping.
4545 +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
4547 + unsigned long rand_range, addr, swap_index, scan_index;
4548 + struct rmap_item *item = NULL;
4549 + struct rmap_list_entry *scan_entry, *swap_entry = NULL;
4550 + struct page *page;
4552 + scan_index = swap_index = slot->pages_scanned % slot->pages;
4554 + if (pool_entry_boundary(scan_index))
4555 + try_free_last_pool(slot, scan_index - 1);
4557 + if (vma_fully_scanned(slot)) {
4558 + if (slot->flags & UKSM_SLOT_NEED_SORT)
4559 + slot->flags |= UKSM_SLOT_NEED_RERAND;
4561 + slot->flags &= ~UKSM_SLOT_NEED_RERAND;
4562 + if (slot->flags & UKSM_SLOT_NEED_SORT)
4563 + sort_rmap_entry_list(slot);
4566 + scan_entry = get_rmap_list_entry(slot, scan_index, 1);
4570 + if (entry_is_new(scan_entry)) {
4571 + scan_entry->addr = get_index_orig_addr(slot, scan_index);
4572 + set_is_addr(scan_entry->addr);
4575 + if (slot->flags & UKSM_SLOT_NEED_RERAND) {
4576 + rand_range = slot->pages - scan_index;
4577 + BUG_ON(!rand_range);
4578 + swap_index = scan_index + (prandom_u32() % rand_range);
4581 + if (swap_index != scan_index) {
4582 + swap_entry = get_rmap_list_entry(slot, swap_index, 1);
4583 + if (entry_is_new(swap_entry)) {
4584 + swap_entry->addr = get_index_orig_addr(slot,
4586 + set_is_addr(swap_entry->addr);
4588 + swap_entries(scan_entry, scan_index, swap_entry, swap_index);
4591 + addr = get_entry_address(scan_entry);
4592 + item = get_entry_item(scan_entry);
4593 + BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
4595 + page = follow_page(slot->vma, addr, FOLL_GET);
4596 + if (IS_ERR_OR_NULL(page))
4599 + if (!PageAnon(page) && !page_trans_compound_anon(page))
4602 + /* check whether this is the zero_page pfn or the uksm_zero_page */
4603 + if ((page_to_pfn(page) == zero_pfn)
4604 + || (page_to_pfn(page) == uksm_zero_pfn))
4607 + flush_anon_page(slot->vma, page, addr);
4608 + flush_dcache_page(page);
4611 + *hash = page_hash(page, hash_strength, 1);
4612 + inc_uksm_pages_scanned();
4613 + /* if the page content is all zero, re-map it to the zero page */
4614 + if (find_zero_page_hash(hash_strength, *hash)) {
4615 + if (!cmp_and_merge_zero_page(slot->vma, page)) {
4616 + slot->pages_merged++;
4617 + inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
4618 + dec_mm_counter(slot->mm, MM_ANONPAGES);
4620 + /* For full-zero pages, no need to create rmap item */
4623 + inc_rshash_neg(memcmp_cost / 2);
4628 + item = alloc_rmap_item();
4630 + /* It has already been zeroed */
4631 + item->slot = slot;
4632 + item->address = addr;
4633 + item->entry_index = scan_index;
4634 + scan_entry->item = item;
4635 + inc_rmap_list_pool_count(slot, scan_index);
4640 + BUG_ON(item->slot != slot);
4641 + /* the page may have changed */
4642 + item->page = page;
4643 + put_rmap_list_entry(slot, scan_index);
4645 + put_rmap_list_entry(slot, swap_index);
4652 + /* no page, store addr back and free rmap_item if possible */
4653 + free_entry_item(scan_entry);
4654 + put_rmap_list_entry(slot, scan_index);
4656 + put_rmap_list_entry(slot, swap_index);
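/*
 * A minimal user-space sketch of the scan order implemented above: an
 * incremental Fisher-Yates shuffle, where scan position i draws a swap
 * partner uniformly from [i, pages), so one full pass visits every page
 * exactly once in a fresh random order.  scan_page(), idx[] and rand()
 * (from <stdlib.h>, standing in for prandom_u32()) are illustrative only.
 */
static void shuffle_scan_sketch(unsigned long *idx, unsigned long pages,
				void (*scan_page)(unsigned long))
{
	unsigned long i, j, tmp;

	for (i = 0; i < pages; i++) {
		/* pick a swap partner for position i, like swap_index above */
		j = i + rand() % (pages - i);
		tmp = idx[i];
		idx[i] = idx[j];
		idx[j] = tmp;
		/* visit pages in the permuted order */
		scan_page(idx[i]);
	}
}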
4660 +static inline int in_stable_tree(struct rmap_item *rmap_item)
4662 + return rmap_item->address & STABLE_FLAG;
4666 + * scan_vma_one_page() - scan the next page in a vma_slot. Called with
4667 + * mmap_sem locked.
4669 +static noinline void scan_vma_one_page(struct vma_slot *slot)
4672 + struct mm_struct *mm;
4673 + struct rmap_item *rmap_item = NULL;
4674 + struct vm_area_struct *vma = slot->vma;
4680 + rmap_item = get_next_rmap_item(slot, &hash);
4684 + if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
4687 + cmp_and_merge_page(rmap_item, hash);
4689 + put_page(rmap_item->page);
4691 + slot->pages_scanned++;
4692 + if (slot->fully_scanned_round != fully_scanned_round)
4693 + scanned_virtual_pages++;
4695 + if (vma_fully_scanned(slot))
4696 + slot->fully_scanned_round = fully_scanned_round;
4699 +static inline unsigned long rung_get_pages(struct scan_rung *rung)
4701 + struct slot_tree_node *node;
4703 + if (!rung->vma_root.rnode)
4706 + node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
4708 + return node->size;
4711 +#define RUNG_SAMPLED_MIN 3
4714 +void uksm_calc_rung_step(struct scan_rung *rung,
4715 + unsigned long page_time, unsigned long ratio)
4717 + unsigned long sampled, pages;
4719 + /* will be fully scanned ? */
4720 + if (!rung->cover_msecs) {
4725 + sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
4726 + * ratio / page_time;
4729 + * Before we finish a scan round and its expensive per-round jobs,
4730 + * we need a chance to estimate the per-page time. So the
4731 + * sampled number cannot be too small.
4733 + if (sampled < RUNG_SAMPLED_MIN)
4734 + sampled = RUNG_SAMPLED_MIN;
4736 + pages = rung_get_pages(rung);
4737 + if (likely(pages > sampled))
4738 + rung->step = pages / sampled;
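/*
 * Worked example for the sampling step above (all numbers illustrative,
 * and TIME_RATIO_SCALE assumed to be 1000): with cover_msecs = 500,
 * ratio = 200 and an estimated page_time of 2500ns,
 *	sampled = 500 * (NSEC_PER_MSEC / 1000) * 200 / 2500 = 40000,
 * so a rung holding 10,000,000 pages gets step = 10000000 / 40000 = 250,
 * i.e. roughly one page in every 250 is sampled per cover interval.
 */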
4743 +static inline int step_need_recalc(struct scan_rung *rung)
4745 + unsigned long pages, stepmax;
4747 + pages = rung_get_pages(rung);
4748 + stepmax = pages / RUNG_SAMPLED_MIN;
4750 + return pages && (rung->step > pages ||
4751 + (stepmax && rung->step > stepmax));
4755 +void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
4757 + struct vma_slot *slot;
4760 + rung->flags |= UKSM_RUNG_ROUND_FINISHED;
4762 + if (step_recalc || step_need_recalc(rung)) {
4763 + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4764 + BUG_ON(step_need_recalc(rung));
4767 + slot_iter_index = prandom_u32() % rung->step;
4768 + BUG_ON(!rung->vma_root.rnode);
4769 + slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
4772 + rung->current_scan = slot;
4773 + rung->current_offset = slot_iter_index;
4776 +static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
4778 + return &slot->rung->vma_root;
4782 + * Return non-zero if the scan position was reset.
4784 +static int advance_current_scan(struct scan_rung *rung)
4787 + struct vma_slot *slot, *next = NULL;
4789 + BUG_ON(!rung->vma_root.num);
4791 + slot = rung->current_scan;
4792 + n = (slot->pages - rung->current_offset) % rung->step;
4793 + slot_iter_index = rung->step - n;
4794 + next = sradix_tree_next(&rung->vma_root, slot->snode,
4795 + slot->sindex, slot_iter);
4798 + rung->current_offset = slot_iter_index;
4799 + rung->current_scan = next;
4802 + reset_current_scan(rung, 1, 0);
4807 +static inline void rung_rm_slot(struct vma_slot *slot)
4809 + struct scan_rung *rung = slot->rung;
4810 + struct sradix_tree_root *root;
4812 + if (rung->current_scan == slot)
4813 + advance_current_scan(rung);
4815 + root = slot_get_root(slot);
4816 + sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
4817 + slot->snode = NULL;
4818 + if (step_need_recalc(rung)) {
4819 + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4820 + BUG_ON(step_need_recalc(rung));
4823 + /* In case advance_current_scan looped back to this slot again */
4824 + if (rung->vma_root.num && rung->current_scan == slot)
4825 + reset_current_scan(slot->rung, 1, 0);
4828 +static inline void rung_add_new_slots(struct scan_rung *rung,
4829 + struct vma_slot **slots, unsigned long num)
4832 + struct vma_slot *slot;
4834 + struct sradix_tree_root *root = &rung->vma_root;
4836 + err = sradix_tree_enter(root, (void **)slots, num);
4839 + for (i = 0; i < num; i++) {
4841 + slot->rung = rung;
4842 + BUG_ON(vma_fully_scanned(slot));
4845 + if (rung->vma_root.num == num)
4846 + reset_current_scan(rung, 0, 1);
4849 +static inline int rung_add_one_slot(struct scan_rung *rung,
4850 + struct vma_slot *slot)
4854 + err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
4858 + slot->rung = rung;
4859 + if (rung->vma_root.num == 1)
4860 + reset_current_scan(rung, 0, 1);
4866 + * Return true if the slot is deleted from its rung.
4868 +static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
4870 + struct scan_rung *old_rung = slot->rung;
4873 + if (old_rung == rung)
4876 + rung_rm_slot(slot);
4877 + err = rung_add_one_slot(rung, slot);
4879 + err = rung_add_one_slot(old_rung, slot);
4880 + WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
4886 +static inline int vma_rung_up(struct vma_slot *slot)
4888 + struct scan_rung *rung;
4890 + rung = slot->rung;
4891 + if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
4894 + return vma_rung_enter(slot, rung);
4897 +static inline int vma_rung_down(struct vma_slot *slot)
4899 + struct scan_rung *rung;
4901 + rung = slot->rung;
4902 + if (slot->rung != &uksm_scan_ladder[0])
4905 + return vma_rung_enter(slot, rung);
4909 + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
4911 +static unsigned long cal_dedup_ratio(struct vma_slot *slot)
4913 + unsigned long ret;
4915 + BUG_ON(slot->pages_scanned == slot->last_scanned);
4917 + ret = slot->pages_merged;
4919 + /* Thrashing area filtering */
4920 + if (ret && uksm_thrash_threshold) {
4921 + if (slot->pages_cowed * 100 / slot->pages_merged
4922 + > uksm_thrash_threshold) {
4925 + ret = slot->pages_merged - slot->pages_cowed;
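/*
 * Illustrative numbers for the thrashing filter above: with
 * pages_merged = 100, pages_cowed = 10 and uksm_thrash_threshold = 20,
 * the COW ratio 10 * 100 / 100 = 10 stays below the threshold, so the
 * net gain 100 - 10 = 90 merged pages is what counts towards this
 * slot's dedup ratio.
 */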
4933 + * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot.
4935 +static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
4937 + unsigned long ret;
4938 + unsigned long pages_scanned;
4940 + pages_scanned = slot->pages_scanned;
4941 + if (!pages_scanned) {
4942 + if (uksm_thrash_threshold)
4945 + pages_scanned = slot->pages_scanned;
4948 + ret = slot->pages_bemerged * 100 / pages_scanned;
4950 + /* Thrashing area filtering */
4951 + if (ret && uksm_thrash_threshold) {
4952 + if (slot->pages_cowed * 100 / slot->pages_bemerged
4953 + > uksm_thrash_threshold) {
4956 + ret = slot->pages_bemerged - slot->pages_cowed;
4964 + * stable_node_reinsert() - When the hash_strength has been adjusted, the
4965 + * stable tree needs to be restructured; this is the function re-inserting the
4968 +static inline void stable_node_reinsert(struct stable_node *new_node,
4969 + struct page *page,
4970 + struct rb_root *root_treep,
4971 + struct list_head *tree_node_listp,
4974 + struct rb_node **new = &root_treep->rb_node;
4975 + struct rb_node *parent = NULL;
4976 + struct stable_node *stable_node;
4977 + struct tree_node *tree_node;
4978 + struct page *tree_page;
4984 + tree_node = rb_entry(*new, struct tree_node, node);
4986 + cmp = hash_cmp(hash, tree_node->hash);
4990 + new = &parent->rb_left;
4991 + } else if (cmp > 0) {
4993 + new = &parent->rb_right;
4999 + /* find a stable tree node with same first level hash value */
5000 + stable_node_hash_max(new_node, page, hash);
5001 + if (tree_node->count == 1) {
5002 + stable_node = rb_entry(tree_node->sub_root.rb_node,
5003 + struct stable_node, node);
5004 + tree_page = get_uksm_page(stable_node, 1, 0);
5006 + stable_node_hash_max(stable_node,
5008 + put_page(tree_page);
5010 + /* prepare for stable node insertion */
5012 + cmp = hash_cmp(new_node->hash_max,
5013 + stable_node->hash_max);
5014 + parent = &stable_node->node;
5016 + new = &parent->rb_left;
5018 + new = &parent->rb_right;
5024 + /* the only stable_node was deleted, but the tree node
5025 + * was not deleted.
5027 + goto tree_node_reuse;
5031 + /* well, search the collision subtree */
5032 + new = &tree_node->sub_root.rb_node;
5038 + stable_node = rb_entry(*new, struct stable_node, node);
5040 + cmp = hash_cmp(new_node->hash_max,
5041 + stable_node->hash_max);
5045 + new = &parent->rb_left;
5046 + } else if (cmp > 0) {
5048 + new = &parent->rb_right;
5050 + /* oh, no, still a collision */
5058 + /* no tree node found */
5059 + tree_node = alloc_tree_node(tree_node_listp);
5061 + printk(KERN_ERR "UKSM: memory allocation error!\n");
5064 + tree_node->hash = hash;
5065 + rb_link_node(&tree_node->node, parent, new);
5066 + rb_insert_color(&tree_node->node, root_treep);
5069 + /* prepare for stable node insertion */
5071 + new = &tree_node->sub_root.rb_node;
5075 + rb_link_node(&new_node->node, parent, new);
5076 + rb_insert_color(&new_node->node, &tree_node->sub_root);
5077 + new_node->tree_node = tree_node;
5078 + tree_node->count++;
5082 + /* This can only happen when two nodes have collided
5085 + new_node->tree_node = NULL;
5089 +static inline void free_all_tree_nodes(struct list_head *list)
5091 + struct tree_node *node, *tmp;
5093 + list_for_each_entry_safe(node, tmp, list, all_list) {
5094 + free_tree_node(node);
5099 + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
5100 + * strength to the current hash_strength. It re-structures the whole tree.
5102 +static inline void stable_tree_delta_hash(u32 prev_hash_strength)
5104 + struct stable_node *node, *tmp;
5105 + struct rb_root *root_new_treep;
5106 + struct list_head *new_tree_node_listp;
5108 + stable_tree_index = (stable_tree_index + 1) % 2;
5109 + root_new_treep = &root_stable_tree[stable_tree_index];
5110 + new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
5111 + *root_new_treep = RB_ROOT;
5112 + BUG_ON(!list_empty(new_tree_node_listp));
5115 + * We need to be safe here: the node could be removed by get_uksm_page().
5117 + list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
5119 + struct page *node_page;
5123 + * We are completely re-structuring the stable nodes to a new
5124 + * stable tree. We don't want to touch the old tree unlinks and
5125 + * old tree_nodes. The old tree_nodes will be freed at once.
5127 + node_page = get_uksm_page(node, 0, 0);
5131 + if (node->tree_node) {
5132 + hash = node->tree_node->hash;
5134 + addr = kmap_atomic(node_page);
5136 + hash = delta_hash(addr, prev_hash_strength,
5137 + hash_strength, hash);
5138 + kunmap_atomic(addr);
5141 + * it was not inserted into the rbtree due to a collision in the last
5144 + hash = page_hash(node_page, hash_strength, 0);
5147 + stable_node_reinsert(node, node_page, root_new_treep,
5148 + new_tree_node_listp, hash);
5149 + put_page(node_page);
5152 + root_stable_treep = root_new_treep;
5153 + free_all_tree_nodes(stable_tree_node_listp);
5154 + BUG_ON(!list_empty(stable_tree_node_listp));
5155 + stable_tree_node_listp = new_tree_node_listp;
5158 +static inline void inc_hash_strength(unsigned long delta)
5160 + hash_strength += 1 << delta;
5161 + if (hash_strength > HASH_STRENGTH_MAX)
5162 + hash_strength = HASH_STRENGTH_MAX;
5165 +static inline void dec_hash_strength(unsigned long delta)
5167 + unsigned long change = 1 << delta;
5169 + if (hash_strength <= change + 1)
5170 + hash_strength = 1;
5172 + hash_strength -= change;
5175 +static inline void inc_hash_strength_delta(void)
5177 + hash_strength_delta++;
5178 + if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
5179 + hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
5183 +static inline unsigned long get_current_neg_ratio(void)
5185 + if (!rshash_pos || rshash_neg > rshash_pos)
5188 + return div64_u64(100 * rshash_neg , rshash_pos);
5192 +static inline unsigned long get_current_neg_ratio(void)
5194 + u64 pos = benefit.pos;
5195 + u64 neg = benefit.neg;
5200 + if (!pos || neg > pos)
5203 + if (neg > div64_u64(U64_MAX, 100))
5204 + pos = div64_u64(pos, 100);
5208 + return div64_u64(neg, pos);
5211 +static inline unsigned long get_current_benefit(void)
5213 + u64 pos = benefit.pos;
5214 + u64 neg = benefit.neg;
5215 + u64 scanned = benefit.scanned;
5220 + return div64_u64((pos - neg), scanned);
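/*
 * Illustrative benefit figures: with benefit.pos = 8,000,000,
 * benefit.neg = 2,000,000 and benefit.scanned = 10,000, the
 * per-scanned-page benefit is (8000000 - 2000000) / 10000 = 600.
 * The state machine below only compares such values against each
 * other, so the absolute unit is irrelevant.
 */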
5223 +static inline int judge_rshash_direction(void)
5225 + u64 current_neg_ratio, stable_benefit;
5226 + u64 current_benefit, delta = 0;
5229 + /* Try to probe a value after boot, and also in case the system
5230 + stays still for a long time. */
5231 + if ((fully_scanned_round & 0xFFULL) == 10) {
5236 + current_neg_ratio = get_current_neg_ratio();
5238 + if (current_neg_ratio == 0) {
5239 + rshash_neg_cont_zero++;
5240 + if (rshash_neg_cont_zero > 2)
5245 + rshash_neg_cont_zero = 0;
5247 + if (current_neg_ratio > 90) {
5252 + current_benefit = get_current_benefit();
5253 + stable_benefit = rshash_state.stable_benefit;
5255 + if (!stable_benefit) {
5260 + if (current_benefit > stable_benefit)
5261 + delta = current_benefit - stable_benefit;
5262 + else if (current_benefit < stable_benefit)
5263 + delta = stable_benefit - current_benefit;
5265 + delta = div64_u64(100 * delta , stable_benefit);
5268 + rshash_cont_obscure++;
5269 + if (rshash_cont_obscure > 2)
5276 + rshash_cont_obscure = 0;
5281 + * rshash_adjust() - The main function to control the random sampling state
5282 + * machine for hash strength adapting.
5284 + * return true if hash_strength has changed.
5286 +static inline int rshash_adjust(void)
5288 + unsigned long prev_hash_strength = hash_strength;
5290 + if (!encode_benefit())
5293 + switch (rshash_state.state) {
5294 + case RSHASH_STILL:
5295 + switch (judge_rshash_direction()) {
5297 + if (rshash_state.pre_direct == GO_DOWN)
5298 + hash_strength_delta = 0;
5300 + inc_hash_strength(hash_strength_delta);
5301 + inc_hash_strength_delta();
5302 + rshash_state.stable_benefit = get_current_benefit();
5303 + rshash_state.pre_direct = GO_UP;
5307 + if (rshash_state.pre_direct == GO_UP)
5308 + hash_strength_delta = 0;
5310 + dec_hash_strength(hash_strength_delta);
5311 + inc_hash_strength_delta();
5312 + rshash_state.stable_benefit = get_current_benefit();
5313 + rshash_state.pre_direct = GO_DOWN;
5317 + rshash_state.stable_point = hash_strength;
5318 + rshash_state.turn_point_down = hash_strength;
5319 + rshash_state.turn_point_up = hash_strength;
5320 + rshash_state.turn_benefit_down = get_current_benefit();
5321 + rshash_state.turn_benefit_up = get_current_benefit();
5322 + rshash_state.lookup_window_index = 0;
5323 + rshash_state.state = RSHASH_TRYDOWN;
5324 + dec_hash_strength(hash_strength_delta);
5325 + inc_hash_strength_delta();
5335 + case RSHASH_TRYDOWN:
5336 + if (rshash_state.lookup_window_index++ % 5 == 0)
5337 + rshash_state.below_count = 0;
5339 + if (get_current_benefit() < rshash_state.stable_benefit)
5340 + rshash_state.below_count++;
5341 + else if (get_current_benefit() >
5342 + rshash_state.turn_benefit_down) {
5343 + rshash_state.turn_point_down = hash_strength;
5344 + rshash_state.turn_benefit_down = get_current_benefit();
5347 + if (rshash_state.below_count >= 3 ||
5348 + judge_rshash_direction() == GO_UP ||
5349 + hash_strength == 1) {
5350 + hash_strength = rshash_state.stable_point;
5351 + hash_strength_delta = 0;
5352 + inc_hash_strength(hash_strength_delta);
5353 + inc_hash_strength_delta();
5354 + rshash_state.lookup_window_index = 0;
5355 + rshash_state.state = RSHASH_TRYUP;
5356 + hash_strength_delta = 0;
5358 + dec_hash_strength(hash_strength_delta);
5359 + inc_hash_strength_delta();
5363 + case RSHASH_TRYUP:
5364 + if (rshash_state.lookup_window_index++ % 5 == 0)
5365 + rshash_state.below_count = 0;
5367 + if (get_current_benefit() < rshash_state.turn_benefit_down)
5368 + rshash_state.below_count++;
5369 + else if (get_current_benefit() > rshash_state.turn_benefit_up) {
5370 + rshash_state.turn_point_up = hash_strength;
5371 + rshash_state.turn_benefit_up = get_current_benefit();
5374 + if (rshash_state.below_count >= 3 ||
5375 + judge_rshash_direction() == GO_DOWN ||
5376 + hash_strength == HASH_STRENGTH_MAX) {
5377 + hash_strength = rshash_state.turn_benefit_up >
5378 + rshash_state.turn_benefit_down ?
5379 + rshash_state.turn_point_up :
5380 + rshash_state.turn_point_down;
5382 + rshash_state.state = RSHASH_PRE_STILL;
5384 + inc_hash_strength(hash_strength_delta);
5385 + inc_hash_strength_delta();
5391 + case RSHASH_PRE_STILL:
5392 + rshash_state.stable_benefit = get_current_benefit();
5393 + rshash_state.state = RSHASH_STILL;
5394 + hash_strength_delta = 0;
5400 + /* rshash_neg = rshash_pos = 0; */
5403 + if (prev_hash_strength != hash_strength)
5404 + stable_tree_delta_hash(prev_hash_strength);
5406 + return prev_hash_strength != hash_strength;
5410 + * round_update_ladder() - The main function to do update of all the
5411 + * adjustments whenever a scan round is finished.
5413 +static noinline void round_update_ladder(void)
5416 + unsigned long dedup;
5417 + struct vma_slot *slot, *tmp_slot;
5419 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5420 + uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
5423 + list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
5425 + /* slot may have been removed by rung_rm_slot() when its mm exited */
5426 + if (slot->snode) {
5427 + dedup = cal_dedup_ratio_old(slot);
5428 + if (dedup && dedup >= uksm_abundant_threshold)
5429 + vma_rung_up(slot);
5432 + slot->pages_bemerged = 0;
5433 + slot->pages_cowed = 0;
5435 + list_del_init(&slot->dedup_list);
5439 +static void uksm_del_vma_slot(struct vma_slot *slot)
5442 + struct rmap_list_entry *entry;
5444 + if (slot->snode) {
5446 + * In case it just failed when entering the rung, it's not
5449 + rung_rm_slot(slot);
5452 + if (!list_empty(&slot->dedup_list))
5453 + list_del(&slot->dedup_list);
5455 + if (!slot->rmap_list_pool || !slot->pool_counts) {
5456 + /* In case it OOMed in uksm_vma_enter() */
5460 + for (i = 0; i < slot->pool_size; i++) {
5463 + if (!slot->rmap_list_pool[i])
5466 + addr = kmap(slot->rmap_list_pool[i]);
5467 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
5468 + entry = (struct rmap_list_entry *)addr + j;
5469 + if (is_addr(entry->addr))
5474 + remove_rmap_item_from_tree(entry->item);
5475 + free_rmap_item(entry->item);
5476 + slot->pool_counts[i]--;
5478 + BUG_ON(slot->pool_counts[i]);
5479 + kunmap(slot->rmap_list_pool[i]);
5480 + __free_page(slot->rmap_list_pool[i]);
5482 + kfree(slot->rmap_list_pool);
5483 + kfree(slot->pool_counts);
5486 + slot->rung = NULL;
5487 + BUG_ON(uksm_pages_total < slot->pages);
5488 + if (slot->flags & UKSM_SLOT_IN_UKSM)
5489 + uksm_pages_total -= slot->pages;
5491 + if (slot->fully_scanned_round == fully_scanned_round)
5492 + scanned_virtual_pages -= slot->pages;
5494 + scanned_virtual_pages -= slot->pages_scanned;
5495 + free_vma_slot(slot);
5499 +#define SPIN_LOCK_PERIOD 32
5500 +static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
5501 +static inline void cleanup_vma_slots(void)
5503 + struct vma_slot *slot;
5507 + spin_lock(&vma_slot_list_lock);
5508 + while (!list_empty(&vma_slot_del)) {
5509 + slot = list_entry(vma_slot_del.next,
5510 + struct vma_slot, slot_list);
5511 + list_del(&slot->slot_list);
5512 + cleanup_slots[i++] = slot;
5513 + if (i == SPIN_LOCK_PERIOD) {
5514 + spin_unlock(&vma_slot_list_lock);
5516 + uksm_del_vma_slot(cleanup_slots[i]);
5518 + spin_lock(&vma_slot_list_lock);
5521 + spin_unlock(&vma_slot_list_lock);
5524 + uksm_del_vma_slot(cleanup_slots[i]);
5528 + * exponential moving average formula
5530 +static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
5533 + * For a very high burst, even the ema cannot work well: a falsely very
5534 + * high per-page time estimate feeds back as very high overhead for
5535 + * context switches and rung updates -- this then leads to an even
5536 + * higher per-page time, which may not converge.
5538 + * Instead, we try to approach this value in a binary manner.
5540 + if (curr > last_ema * 10)
5541 + return last_ema * 2;
5543 + return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
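/*
 * Worked example, assuming EMA_ALPHA were 20 (the real value is defined
 * earlier in this patch): ema(300, 100) = (20 * 300 + 80 * 100) / 100 = 140,
 * i.e. the estimate moves 20% of the way towards the new sample.  For a
 * burst such as ema(5000, 100), curr > last_ema * 10 triggers the binary
 * branch above and the estimate only doubles to 200 instead of jumping
 * straight to 1080.
 */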
5547 + * Convert a cpu ratio, given by the user in units of 1/TIME_RATIO_SCALE,
5548 + * to nanoseconds, based on the current uksm_sleep_jiffies.
5550 +static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
5552 + return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
5553 + (TIME_RATIO_SCALE - ratio) * ratio;
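/*
 * Worked example (TIME_RATIO_SCALE assumed to be 1000 and HZ = 250, both
 * illustrative): uksm_sleep_jiffies = 25 is a 100,000us sleep, so for
 * ratio = 200 the per-cycle scan budget is
 *	1000 * 100000 / (1000 - 200) * 200 = 25,000,000 ns = 25ms,
 * i.e. 25ms of scanning for every 100ms of sleeping, which is exactly a
 * 20% cpu share.
 */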
5557 +static inline unsigned long rung_real_ratio(int cpu_time_ratio)
5559 + unsigned long ret;
5561 + BUG_ON(!cpu_time_ratio);
5563 + if (cpu_time_ratio > 0)
5564 + ret = cpu_time_ratio;
5566 + ret = (unsigned long)(-cpu_time_ratio) *
5567 + uksm_max_cpu_percentage / 100UL;
5569 + return ret ? ret : 1;
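/*
 * Illustration of the two cpu_ratio encodings handled above (assuming
 * TIME_RATIO_SCALE = 1000): a positive cpu_ratio such as 200 is an
 * absolute share of 200/1000 = 20% cpu; a negative cpu_ratio such as
 * -250 (displayed as "MAX/4" by the cpu_ratios sysfs file below) means
 * a quarter of uksm_max_cpu_percentage, so with a 20% cap the effective
 * ratio is 250 * 20 / 100 = 50, i.e. 5% cpu.
 */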
5572 +static noinline void uksm_calc_scan_pages(void)
5574 + struct scan_rung *ladder = uksm_scan_ladder;
5575 + unsigned long sleep_usecs, nsecs;
5576 + unsigned long ratio;
5578 + unsigned long per_page;
5580 + if (uksm_ema_page_time > 100000 ||
5581 + (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
5582 + uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
5584 + per_page = uksm_ema_page_time;
5585 + BUG_ON(!per_page);
5588 + * For every 8 eval rounds, we try to probe a uksm_sleep_jiffies value
5589 + * based on saved user input.
5591 + if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
5592 + uksm_sleep_jiffies = uksm_sleep_saved;
5594 + /* We require a rung to scan at least 1 page in a period. */
5596 + ratio = rung_real_ratio(ladder[0].cpu_ratio);
5597 + if (cpu_ratio_to_nsec(ratio) < nsecs) {
5598 + sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
5600 + uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
5603 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5604 + ratio = rung_real_ratio(ladder[i].cpu_ratio);
5605 + ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
5607 + BUG_ON(!ladder[i].pages_to_scan);
5608 + uksm_calc_rung_step(&ladder[i], per_page, ratio);
5613 + * Convert the scan time of this round (ns) to the next expected minimum
5614 + * sleep time (ms), being careful of possible overflows. ratio is taken
5615 + * from rung_real_ratio().
5618 +unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
5620 + scan_time >>= 20; /* to msec level now */
5621 + BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
5623 + return (unsigned int) ((unsigned long) scan_time *
5624 + (TIME_RATIO_SCALE - ratio) / ratio);
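/*
 * Worked example (TIME_RATIO_SCALE assumed to be 1000): a round that
 * consumed about 20ms of cpu time (scan_time = 20,971,520 ns, >> 20
 * gives 20) at ratio = 200 should be followed by at least
 * 20 * (1000 - 200) / 200 = 80ms of sleep, keeping the long-term cpu
 * share at 20 / (20 + 80) = 20%.
 */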
5627 +#define __round_mask(x, y) ((__typeof__(x))((y)-1))
5628 +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
5630 +static inline unsigned long vma_pool_size(struct vma_slot *slot)
5632 + return round_up(sizeof(struct rmap_list_entry) * slot->pages,
5633 + PAGE_SIZE) >> PAGE_SHIFT;
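/*
 * Example of the pool sizing above: with 4K pages and
 * sizeof(struct rmap_list_entry) == 16 (the size is assumed purely for
 * illustration), a 100,000-page VMA needs 1,600,000 bytes of entry
 * storage, which rounds up to 391 pool pages; uksm_vma_enter() below
 * then kzallocs the array of 391 struct page pointers for it.
 */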
5636 +static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
5638 + struct scan_rung *rung;
5639 + unsigned long pool_size, i;
5640 + struct vma_slot *slot;
5643 + rung = &uksm_scan_ladder[0];
5646 + for (i = 0; i < num; i++) {
5649 + pool_size = vma_pool_size(slot);
5650 + slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
5651 + pool_size, GFP_KERNEL);
5652 + if (!slot->rmap_list_pool)
5655 + slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size,
5657 + if (!slot->pool_counts) {
5658 + kfree(slot->rmap_list_pool);
5662 + slot->pool_size = pool_size;
5663 + BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
5664 + slot->flags |= UKSM_SLOT_IN_UKSM;
5665 + uksm_pages_total += slot->pages;
5669 + rung_add_new_slots(rung, slots, i);
5674 +static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
5676 +static void uksm_enter_all_slots(void)
5678 + struct vma_slot *slot;
5679 + unsigned long index;
5680 + struct list_head empty_vma_list;
5685 + INIT_LIST_HEAD(&empty_vma_list);
5687 + spin_lock(&vma_slot_list_lock);
5688 + while (!list_empty(&vma_slot_new)) {
5689 + slot = list_entry(vma_slot_new.next,
5690 + struct vma_slot, slot_list);
5692 + if (!slot->vma->anon_vma) {
5693 + list_move(&slot->slot_list, &empty_vma_list);
5694 + } else if (vma_can_enter(slot->vma)) {
5695 + batch_slots[index++] = slot;
5696 + list_del_init(&slot->slot_list);
5698 + list_move(&slot->slot_list, &vma_slot_noadd);
5701 + if (++i == SPIN_LOCK_PERIOD ||
5702 + (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
5703 + spin_unlock(&vma_slot_list_lock);
5705 + if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
5706 + uksm_vma_enter(batch_slots, index);
5711 + spin_lock(&vma_slot_list_lock);
5715 + list_splice(&empty_vma_list, &vma_slot_new);
5717 + spin_unlock(&vma_slot_list_lock);
5720 + uksm_vma_enter(batch_slots, index);
5724 +static inline int rung_round_finished(struct scan_rung *rung)
5726 + return rung->flags & UKSM_RUNG_ROUND_FINISHED;
5729 +static inline void judge_slot(struct vma_slot *slot)
5731 + struct scan_rung *rung = slot->rung;
5732 + unsigned long dedup;
5735 + dedup = cal_dedup_ratio(slot);
5736 + if (vma_fully_scanned(slot) && uksm_thrash_threshold)
5737 + deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
5738 + else if (dedup && dedup >= uksm_abundant_threshold)
5739 + deleted = vma_rung_up(slot);
5741 + deleted = vma_rung_down(slot);
5743 + slot->pages_merged = 0;
5744 + slot->pages_cowed = 0;
5746 + if (vma_fully_scanned(slot))
5747 + slot->pages_scanned = 0;
5749 + slot->last_scanned = slot->pages_scanned;
5751 + /* If it was deleted above, then the rung was already advanced. */
5753 + advance_current_scan(rung);
5757 +static inline int hash_round_finished(void)
5759 + if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
5760 + scanned_virtual_pages = 0;
5761 + if (uksm_pages_scanned)
5762 + fully_scanned_round++;
5770 +#define UKSM_MMSEM_BATCH 5
5771 +#define BUSY_RETRY 100
5774 + * uksm_do_scan() - the main worker function.
5776 +static noinline void uksm_do_scan(void)
5778 + struct vma_slot *slot, *iter;
5779 + struct mm_struct *busy_mm;
5780 + unsigned char round_finished, all_rungs_emtpy;
5781 + int i, err, mmsem_batch;
5782 + unsigned long pcost;
5783 + long long delta_exec;
5784 + unsigned long vpages, max_cpu_ratio;
5785 + unsigned long long start_time, end_time, scan_time;
5786 + unsigned int expected_jiffies;
5792 + start_time = task_sched_runtime(current);
5793 + max_cpu_ratio = 0;
5796 + for (i = 0; i < SCAN_LADDER_SIZE;) {
5797 + struct scan_rung *rung = &uksm_scan_ladder[i];
5798 + unsigned long ratio;
5801 + if (!rung->pages_to_scan) {
5806 + if (!rung->vma_root.num) {
5807 + rung->pages_to_scan = 0;
5812 + ratio = rung_real_ratio(rung->cpu_ratio);
5813 + if (ratio > max_cpu_ratio)
5814 + max_cpu_ratio = ratio;
5816 + busy_retry = BUSY_RETRY;
5818 + * Do not consider rung_round_finished() here, just use up the
5819 + * rung->pages_to_scan quota.
5821 + while (rung->pages_to_scan && rung->vma_root.num &&
5822 + likely(!freezing(current))) {
5825 + slot = rung->current_scan;
5827 + BUG_ON(vma_fully_scanned(slot));
5829 + if (mmsem_batch) {
5832 + err = try_down_read_slot_mmap_sem(slot);
5835 + if (err == -ENOENT) {
5837 + rung_rm_slot(slot);
5841 + busy_mm = slot->mm;
5843 + if (err == -EBUSY) {
5844 + /* skip other vmas on the same mm */
5846 + reset = advance_current_scan(rung);
5847 + iter = rung->current_scan;
5849 + if (iter->vma->vm_mm != busy_mm ||
5850 + !busy_retry || reset)
5854 + if (iter->vma->vm_mm != busy_mm) {
5857 + /* scan round finished */
5862 + BUG_ON(!vma_can_enter(slot->vma));
5863 + if (uksm_test_exit(slot->vma->vm_mm)) {
5865 + up_read(&slot->vma->vm_mm->mmap_sem);
5872 + mmsem_batch = UKSM_MMSEM_BATCH;
5874 + /* OK, we have taken the mmap_sem, ready to scan */
5875 + scan_vma_one_page(slot);
5876 + rung->pages_to_scan--;
5879 + if (rung->current_offset + rung->step > slot->pages - 1
5880 + || vma_fully_scanned(slot)) {
5881 + up_read(&slot->vma->vm_mm->mmap_sem);
5885 + rung->current_offset += rung->step;
5887 + up_read(&slot->vma->vm_mm->mmap_sem);
5890 + busy_retry = BUSY_RETRY;
5894 + if (mmsem_batch) {
5895 + up_read(&slot->vma->vm_mm->mmap_sem);
5899 + if (freezing(current))
5904 + end_time = task_sched_runtime(current);
5905 + delta_exec = end_time - start_time;
5907 + if (freezing(current))
5910 + cleanup_vma_slots();
5911 + uksm_enter_all_slots();
5913 + round_finished = 1;
5914 + all_rungs_emtpy = 1;
5915 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5916 + struct scan_rung *rung = &uksm_scan_ladder[i];
5918 + if (rung->vma_root.num) {
5919 + all_rungs_emtpy = 0;
5920 + if (!rung_round_finished(rung))
5921 + round_finished = 0;
5925 + if (all_rungs_emtpy)
5926 + round_finished = 0;
5928 + if (round_finished) {
5929 + round_update_ladder();
5930 + uksm_eval_round++;
5932 + if (hash_round_finished() && rshash_adjust()) {
5933 + /* Reset the unstable root iff hash strength changed */
5934 + uksm_hash_round++;
5935 + root_unstable_tree = RB_ROOT;
5936 + free_all_tree_nodes(&unstable_tree_node_list);
5940 + * A number of pages can hang around indefinitely on per-cpu
5941 + * pagevecs, raised page count preventing write_protect_page
5942 + * from merging them. Though it doesn't really matter much,
5943 + * it is puzzling to see some stuck in pages_volatile until
5944 + * other activity jostles them out, and they also prevented
5945 + * LTP's KSM test from succeeding deterministically; so drain
5946 + * them here (here rather than on entry to uksm_do_scan(),
5947 + * so we don't IPI too often when pages_to_scan is set low).
5949 + lru_add_drain_all();
5953 + if (vpages && delta_exec > 0) {
5954 + pcost = (unsigned long) delta_exec / vpages;
5955 + if (likely(uksm_ema_page_time))
5956 + uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
5958 + uksm_ema_page_time = pcost;
5961 + uksm_calc_scan_pages();
5962 + uksm_sleep_real = uksm_sleep_jiffies;
5963 + /* in case of radical cpu bursts, apply the upper bound */
5964 + end_time = task_sched_runtime(current);
5965 + if (max_cpu_ratio && end_time > start_time) {
5966 + scan_time = end_time - start_time;
5967 + expected_jiffies = msecs_to_jiffies(
5968 + scan_time_to_sleep(scan_time, max_cpu_ratio));
5970 + if (expected_jiffies > uksm_sleep_real)
5971 + uksm_sleep_real = expected_jiffies;
5973 + /* We have a 1 second upper bound for responsiveness. */
5974 + if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
5975 + uksm_sleep_real = msecs_to_jiffies(1000);
5981 +static int ksmd_should_run(void)
5983 + return uksm_run & UKSM_RUN_MERGE;
5986 +static int uksm_scan_thread(void *nothing)
5989 + set_user_nice(current, 5);
5991 + while (!kthread_should_stop()) {
5992 + mutex_lock(&uksm_thread_mutex);
5993 + if (ksmd_should_run()) {
5996 + mutex_unlock(&uksm_thread_mutex);
6000 + if (ksmd_should_run()) {
6001 + schedule_timeout_interruptible(uksm_sleep_real);
6002 + uksm_sleep_times++;
6004 + wait_event_freezable(uksm_thread_wait,
6005 + ksmd_should_run() || kthread_should_stop());
6011 +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
6013 + struct stable_node *stable_node;
6014 + struct node_vma *node_vma;
6015 + struct rmap_item *rmap_item;
6016 + int ret = SWAP_AGAIN;
6017 + int search_new_forks = 0;
6018 + unsigned long address;
6020 + VM_BUG_ON_PAGE(!PageKsm(page), page);
6021 + VM_BUG_ON_PAGE(!PageLocked(page), page);
6023 + stable_node = page_stable_node(page);
6027 + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
6028 + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
6029 + struct anon_vma *anon_vma = rmap_item->anon_vma;
6030 + struct anon_vma_chain *vmac;
6031 + struct vm_area_struct *vma;
6033 + anon_vma_lock_read(anon_vma);
6034 + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
6037 + address = get_rmap_addr(rmap_item);
6039 + if (address < vma->vm_start ||
6040 + address >= vma->vm_end)
6043 + if ((rmap_item->slot->vma == vma) ==
6047 + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
6050 + ret = rwc->rmap_one(page, vma, address, rwc->arg);
6051 + if (ret != SWAP_AGAIN) {
6052 + anon_vma_unlock_read(anon_vma);
6056 + if (rwc->done && rwc->done(page)) {
6057 + anon_vma_unlock_read(anon_vma);
6061 + anon_vma_unlock_read(anon_vma);
6064 + if (!search_new_forks++)
6070 +#ifdef CONFIG_MIGRATION
6071 +/* Common ksm interface but may be specific to uksm */
6072 +void ksm_migrate_page(struct page *newpage, struct page *oldpage)
6074 + struct stable_node *stable_node;
6076 + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6077 + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6078 + VM_BUG_ON(newpage->mapping != oldpage->mapping);
6080 + stable_node = page_stable_node(newpage);
6081 + if (stable_node) {
6082 + VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
6083 + stable_node->kpfn = page_to_pfn(newpage);
6086 +#endif /* CONFIG_MIGRATION */
6088 +#ifdef CONFIG_MEMORY_HOTREMOVE
6089 +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
6090 + unsigned long end_pfn)
6092 + struct rb_node *node;
6094 + for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
6095 + struct stable_node *stable_node;
6097 + stable_node = rb_entry(node, struct stable_node, node);
6098 + if (stable_node->kpfn >= start_pfn &&
6099 + stable_node->kpfn < end_pfn)
6100 + return stable_node;
6105 +static int uksm_memory_callback(struct notifier_block *self,
6106 + unsigned long action, void *arg)
6108 + struct memory_notify *mn = arg;
6109 + struct stable_node *stable_node;
6112 + case MEM_GOING_OFFLINE:
6114 + * Keep it very simple for now: just lock out ksmd and
6115 + * MADV_UNMERGEABLE while any memory is going offline.
6116 + * mutex_lock_nested() is necessary because lockdep was alarmed
6117 + * that here we take uksm_thread_mutex inside notifier chain
6118 + * mutex, and later take notifier chain mutex inside
6119 + * uksm_thread_mutex to unlock it. But that's safe because both
6120 + * are inside mem_hotplug_mutex.
6122 + mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
6127 + * Most of the work is done by page migration; but there might
6128 + * be a few stable_nodes left over, still pointing to struct
6129 + * pages which have been offlined: prune those from the tree.
6131 + while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
6132 + mn->start_pfn + mn->nr_pages)) != NULL)
6133 + remove_node_from_stable_tree(stable_node, 1, 1);
6136 + case MEM_CANCEL_OFFLINE:
6137 + mutex_unlock(&uksm_thread_mutex);
6142 +#endif /* CONFIG_MEMORY_HOTREMOVE */
6144 +#ifdef CONFIG_SYSFS
6146 + * This all compiles without CONFIG_SYSFS, but is a waste of space.
6149 +#define UKSM_ATTR_RO(_name) \
6150 + static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
6151 +#define UKSM_ATTR(_name) \
6152 + static struct kobj_attribute _name##_attr = \
6153 + __ATTR(_name, 0644, _name##_show, _name##_store)
6155 +static ssize_t max_cpu_percentage_show(struct kobject *kobj,
6156 + struct kobj_attribute *attr, char *buf)
6158 + return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
6161 +static ssize_t max_cpu_percentage_store(struct kobject *kobj,
6162 + struct kobj_attribute *attr,
6163 + const char *buf, size_t count)
6165 + unsigned long max_cpu_percentage;
6168 + err = kstrtoul(buf, 10, &max_cpu_percentage);
6169 + if (err || max_cpu_percentage > 100)
6172 + if (max_cpu_percentage == 100)
6173 + max_cpu_percentage = 99;
6174 + else if (max_cpu_percentage < 10)
6175 + max_cpu_percentage = 10;
6177 + uksm_max_cpu_percentage = max_cpu_percentage;
6181 +UKSM_ATTR(max_cpu_percentage);
6183 +static ssize_t sleep_millisecs_show(struct kobject *kobj,
6184 + struct kobj_attribute *attr, char *buf)
6186 + return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
6189 +static ssize_t sleep_millisecs_store(struct kobject *kobj,
6190 + struct kobj_attribute *attr,
6191 + const char *buf, size_t count)
6193 + unsigned long msecs;
6196 + err = kstrtoul(buf, 10, &msecs);
6197 + if (err || msecs > MSEC_PER_SEC)
6200 + uksm_sleep_jiffies = msecs_to_jiffies(msecs);
6201 + uksm_sleep_saved = uksm_sleep_jiffies;
6205 +UKSM_ATTR(sleep_millisecs);
6208 +static ssize_t cpu_governor_show(struct kobject *kobj,
6209 + struct kobj_attribute *attr, char *buf)
6211 + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6215 + for (i = 0; i < n ; i++) {
6216 + if (uksm_cpu_governor == i)
6219 + strcat(buf, uksm_cpu_governor_str[i]);
6221 + if (uksm_cpu_governor == i)
6226 + strcat(buf, "\n");
6228 + return strlen(buf);
6231 +static inline void init_performance_values(void)
6234 + struct scan_rung *rung;
6235 + struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
6238 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6239 + rung = uksm_scan_ladder + i;
6240 + rung->cpu_ratio = preset->cpu_ratio[i];
6241 + rung->cover_msecs = preset->cover_msecs[i];
6244 + uksm_max_cpu_percentage = preset->max_cpu;
6247 +static ssize_t cpu_governor_store(struct kobject *kobj,
6248 + struct kobj_attribute *attr,
6249 + const char *buf, size_t count)
6251 + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6253 + for (n--; n >=0 ; n--) {
6254 + if (!strncmp(buf, uksm_cpu_governor_str[n],
6255 + strlen(uksm_cpu_governor_str[n])))
6262 + uksm_cpu_governor = n;
6264 + init_performance_values();
6268 +UKSM_ATTR(cpu_governor);
6270 +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
6273 + return sprintf(buf, "%u\n", uksm_run);
6276 +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
6277 + const char *buf, size_t count)
6280 + unsigned long flags;
6282 + err = kstrtoul(buf, 10, &flags);
6283 + if (err || flags > UINT_MAX)
6285 + if (flags > UKSM_RUN_MERGE)
6288 + mutex_lock(&uksm_thread_mutex);
6289 + if (uksm_run != flags) {
6292 + mutex_unlock(&uksm_thread_mutex);
6294 + if (flags & UKSM_RUN_MERGE)
6295 + wake_up_interruptible(&uksm_thread_wait);
6301 +static ssize_t abundant_threshold_show(struct kobject *kobj,
6302 + struct kobj_attribute *attr, char *buf)
6304 + return sprintf(buf, "%u\n", uksm_abundant_threshold);
6307 +static ssize_t abundant_threshold_store(struct kobject *kobj,
6308 + struct kobj_attribute *attr,
6309 + const char *buf, size_t count)
6312 + unsigned long flags;
6314 + err = kstrtoul(buf, 10, &flags);
6315 + if (err || flags > 99)
6318 + uksm_abundant_threshold = flags;
6322 +UKSM_ATTR(abundant_threshold);
6324 +static ssize_t thrash_threshold_show(struct kobject *kobj,
6325 + struct kobj_attribute *attr, char *buf)
6327 + return sprintf(buf, "%u\n", uksm_thrash_threshold);
6330 +static ssize_t thrash_threshold_store(struct kobject *kobj,
6331 + struct kobj_attribute *attr,
6332 + const char *buf, size_t count)
6335 + unsigned long flags;
6337 + err = kstrtoul(buf, 10, &flags);
6338 + if (err || flags > 99)
6341 + uksm_thrash_threshold = flags;
6345 +UKSM_ATTR(thrash_threshold);
6347 +static ssize_t cpu_ratios_show(struct kobject *kobj,
6348 + struct kobj_attribute *attr, char *buf)
6351 + struct scan_rung *rung;
6354 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6355 + rung = &uksm_scan_ladder[i];
6357 + if (rung->cpu_ratio > 0)
6358 + size = sprintf(p, "%d ", rung->cpu_ratio);
6360 + size = sprintf(p, "MAX/%d ",
6361 + TIME_RATIO_SCALE / -rung->cpu_ratio);
6372 +static ssize_t cpu_ratios_store(struct kobject *kobj,
6373 + struct kobj_attribute *attr,
6374 + const char *buf, size_t count)
6376 + int i, cpuratios[SCAN_LADDER_SIZE], err;
6377 + unsigned long value;
6378 + struct scan_rung *rung;
6379 + char *p, *end = NULL;
6381 + p = kzalloc(count, GFP_KERNEL);
6385 + memcpy(p, buf, count);
6387 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6388 + if (i != SCAN_LADDER_SIZE -1) {
6389 + end = strchr(p, ' ');
6396 + if (strstr(p, "MAX/")) {
6397 + p = strchr(p, '/') + 1;
6398 + err = kstrtoul(p, 10, &value);
6399 + if (err || value > TIME_RATIO_SCALE || !value)
6402 + cpuratios[i] = - (int) (TIME_RATIO_SCALE / value);
6404 + err = kstrtoul(p, 10, &value);
6405 + if (err || value > TIME_RATIO_SCALE || !value)
6408 + cpuratios[i] = value;
6414 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6415 + rung = &uksm_scan_ladder[i];
6417 + rung->cpu_ratio = cpuratios[i];
6422 +UKSM_ATTR(cpu_ratios);
6424 +static ssize_t eval_intervals_show(struct kobject *kobj,
6425 + struct kobj_attribute *attr, char *buf)
6428 + struct scan_rung *rung;
6431 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6432 + rung = &uksm_scan_ladder[i];
6433 + size = sprintf(p, "%u ", rung->cover_msecs);
6443 +static ssize_t eval_intervals_store(struct kobject *kobj,
6444 + struct kobj_attribute *attr,
6445 + const char *buf, size_t count)
6448 + unsigned long values[SCAN_LADDER_SIZE];
6449 + struct scan_rung *rung;
6450 + char *p, *end = NULL;
6452 + p = kzalloc(count, GFP_KERNEL);
6456 + memcpy(p, buf, count);
6458 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6459 + if (i != SCAN_LADDER_SIZE -1) {
6460 + end = strchr(p, ' ');
6467 + err = kstrtoul(p, 10, &values[i]);
6474 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6475 + rung = &uksm_scan_ladder[i];
6477 + rung->cover_msecs = values[i];
6482 +UKSM_ATTR(eval_intervals);
6484 +static ssize_t ema_per_page_time_show(struct kobject *kobj,
6485 + struct kobj_attribute *attr, char *buf)
6487 + return sprintf(buf, "%lu\n", uksm_ema_page_time);
6489 +UKSM_ATTR_RO(ema_per_page_time);
6491 +static ssize_t pages_shared_show(struct kobject *kobj,
6492 + struct kobj_attribute *attr, char *buf)
6494 + return sprintf(buf, "%lu\n", uksm_pages_shared);
6496 +UKSM_ATTR_RO(pages_shared);
6498 +static ssize_t pages_sharing_show(struct kobject *kobj,
6499 + struct kobj_attribute *attr, char *buf)
6501 + return sprintf(buf, "%lu\n", uksm_pages_sharing);
6503 +UKSM_ATTR_RO(pages_sharing);
6505 +static ssize_t pages_unshared_show(struct kobject *kobj,
6506 + struct kobj_attribute *attr, char *buf)
6508 + return sprintf(buf, "%lu\n", uksm_pages_unshared);
6510 +UKSM_ATTR_RO(pages_unshared);
6512 +static ssize_t full_scans_show(struct kobject *kobj,
6513 + struct kobj_attribute *attr, char *buf)
6515 + return sprintf(buf, "%llu\n", fully_scanned_round);
6517 +UKSM_ATTR_RO(full_scans);
6519 +static ssize_t pages_scanned_show(struct kobject *kobj,
6520 + struct kobj_attribute *attr, char *buf)
6522 + unsigned long base = 0;
6525 + if (pages_scanned_stored) {
6526 + base = pages_scanned_base;
6527 + ret = pages_scanned_stored;
6528 + delta = uksm_pages_scanned >> base;
6529 + if (CAN_OVERFLOW_U64(ret, delta)) {
6536 + ret = uksm_pages_scanned;
6539 + while (ret > ULONG_MAX) {
6545 + return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
6547 + return sprintf(buf, "%lu\n", (unsigned long)ret);
6549 +UKSM_ATTR_RO(pages_scanned);
6551 +static ssize_t hash_strength_show(struct kobject *kobj,
6552 + struct kobj_attribute *attr, char *buf)
6554 + return sprintf(buf, "%lu\n", hash_strength);
6556 +UKSM_ATTR_RO(hash_strength);
6558 +static ssize_t sleep_times_show(struct kobject *kobj,
6559 + struct kobj_attribute *attr, char *buf)
6561 + return sprintf(buf, "%llu\n", uksm_sleep_times);
6563 +UKSM_ATTR_RO(sleep_times);
6566 +static struct attribute *uksm_attrs[] = {
6567 + &max_cpu_percentage_attr.attr,
6568 + &sleep_millisecs_attr.attr,
6569 + &cpu_governor_attr.attr,
6571 + &ema_per_page_time_attr.attr,
6572 + &pages_shared_attr.attr,
6573 + &pages_sharing_attr.attr,
6574 + &pages_unshared_attr.attr,
6575 + &full_scans_attr.attr,
6576 + &pages_scanned_attr.attr,
6577 + &hash_strength_attr.attr,
6578 + &sleep_times_attr.attr,
6579 + &thrash_threshold_attr.attr,
6580 + &abundant_threshold_attr.attr,
6581 + &cpu_ratios_attr.attr,
6582 + &eval_intervals_attr.attr,
6586 +static struct attribute_group uksm_attr_group = {
6587 + .attrs = uksm_attrs,
6590 +#endif /* CONFIG_SYSFS */
6592 +static inline void init_scan_ladder(void)
6595 + struct scan_rung *rung;
6597 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6598 + rung = uksm_scan_ladder + i;
6599 + slot_tree_init_root(&rung->vma_root);
6602 + init_performance_values();
6603 + uksm_calc_scan_pages();
6606 +static inline int cal_positive_negative_costs(void)
6608 + struct page *p1, *p2;
6609 + unsigned char *addr1, *addr2;
6610 + unsigned long i, time_start, hash_cost;
6611 + unsigned long loopnum = 0;
6613 + /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
6614 + volatile u32 hash;
6617 + p1 = alloc_page(GFP_KERNEL);
6621 + p2 = alloc_page(GFP_KERNEL);
6625 + addr1 = kmap_atomic(p1);
6626 + addr2 = kmap_atomic(p2);
6627 + memset(addr1, prandom_u32(), PAGE_SIZE);
6628 + memcpy(addr2, addr1, PAGE_SIZE);
6630 + /* make sure that the two pages differ in the last byte */
6631 + addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
6632 + kunmap_atomic(addr2);
6633 + kunmap_atomic(addr1);
6635 + time_start = jiffies;
6636 + while (jiffies - time_start < 100) {
6637 + for (i = 0; i < 100; i++)
6638 + hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
6641 + hash_cost = (jiffies - time_start);
6643 + time_start = jiffies;
6644 + for (i = 0; i < loopnum; i++)
6645 + ret = pages_identical(p1, p2);
6646 + memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
6647 + memcmp_cost /= hash_cost;
6648 + printk(KERN_INFO "UKSM: relative memcmp_cost = %lu "
6649 + "hash=%u cmp_ret=%d.\n",
6650 + memcmp_cost, hash, ret);
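/*
 * Illustrative calibration outcome (numbers invented for the example):
 * if the hashing loop above took 100 jiffies while the same number of
 * pages_identical() calls took 25 jiffies, then
 *	memcmp_cost = HASH_STRENGTH_FULL * 25 / 100,
 * i.e. one full page compare is weighed like hashing a quarter of the
 * page's sample points when the pos/neg benefit is accounted (see
 * inc_rshash_neg(memcmp_cost / 2) above).
 */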
6657 +static int init_zeropage_hash_table(void)
6659 + struct page *page;
6663 + page = alloc_page(GFP_KERNEL);
6667 + addr = kmap_atomic(page);
6668 + memset(addr, 0, PAGE_SIZE);
6669 + kunmap_atomic(addr);
6671 + zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
6673 + if (!zero_hash_table)
6676 + for (i = 0; i < HASH_STRENGTH_MAX; i++)
6677 + zero_hash_table[i] = page_hash(page, i, 0);
6679 + __free_page(page);
6684 +static inline int init_random_sampling(void)
6687 + random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
6691 + for (i = 0; i < HASH_STRENGTH_FULL; i++)
6692 + random_nums[i] = i;
6694 + for (i = 0; i < HASH_STRENGTH_FULL; i++) {
6695 + unsigned long rand_range, swap_index, tmp;
6697 + rand_range = HASH_STRENGTH_FULL - i;
6698 + swap_index = i + prandom_u32() % rand_range;
6699 + tmp = random_nums[i];
6700 + random_nums[i] = random_nums[swap_index];
6701 + random_nums[swap_index] = tmp;
6704 + rshash_state.state = RSHASH_NEW;
6705 + rshash_state.below_count = 0;
6706 + rshash_state.lookup_window_index = 0;
6708 + return cal_positive_negative_costs();
6711 +static int __init uksm_slab_init(void)
6713 + rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
6714 + if (!rmap_item_cache)
6717 + stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
6718 + if (!stable_node_cache)
6721 + node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
6722 + if (!node_vma_cache)
6725 + vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
6726 + if (!vma_slot_cache)
6729 + tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
6730 + if (!tree_node_cache)
6736 + kmem_cache_destroy(vma_slot_cache);
6738 + kmem_cache_destroy(node_vma_cache);
6740 + kmem_cache_destroy(stable_node_cache);
6742 + kmem_cache_destroy(rmap_item_cache);
6747 +static void __init uksm_slab_free(void)
6749 + kmem_cache_destroy(stable_node_cache);
6750 + kmem_cache_destroy(rmap_item_cache);
6751 + kmem_cache_destroy(node_vma_cache);
6752 + kmem_cache_destroy(vma_slot_cache);
6753 + kmem_cache_destroy(tree_node_cache);
6756 +/* Common interface with ksm, but behaves differently from it. */
6757 +int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
6758 + unsigned long end, int advice, unsigned long *vm_flags)
6763 + case MADV_MERGEABLE:
6764 + return 0; /* just ignore the advice */
6766 + case MADV_UNMERGEABLE:
6767 + if (!(*vm_flags & VM_MERGEABLE))
6768 + return 0; /* just ignore the advice */
6770 + if (vma->anon_vma) {
6771 + err = unmerge_uksm_pages(vma, start, end);
6776 + uksm_remove_vma(vma);
6777 + *vm_flags &= ~VM_MERGEABLE;
6784 +/* Common interface to ksm, actually the same. */
6785 +struct page *ksm_might_need_to_copy(struct page *page,
6786 + struct vm_area_struct *vma, unsigned long address)
6788 + struct anon_vma *anon_vma = page_anon_vma(page);
6789 + struct page *new_page;
6791 + if (PageKsm(page)) {
6792 + if (page_stable_node(page))
6793 + return page; /* no need to copy it */
6794 + } else if (!anon_vma) {
6795 + return page; /* no need to copy it */
6796 + } else if (anon_vma->root == vma->anon_vma->root &&
6797 + page->index == linear_page_index(vma, address)) {
6798 + return page; /* still no need to copy it */
6800 + if (!PageUptodate(page))
6801 + return page; /* let do_swap_page report the error */
6803 + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
6805 + copy_user_highpage(new_page, page, address, vma);
6807 + SetPageDirty(new_page);
6808 + __SetPageUptodate(new_page);
6809 + __set_page_locked(new_page);
6815 +static int __init uksm_init(void)
6817 + struct task_struct *uksm_thread;
6820 + uksm_sleep_jiffies = msecs_to_jiffies(100);
6821 + uksm_sleep_saved = uksm_sleep_jiffies;
6824 + init_scan_ladder();
6827 + err = init_random_sampling();
6831 + err = uksm_slab_init();
6835 + err = init_zeropage_hash_table();
6839 + uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
6840 + if (IS_ERR(uksm_thread)) {
6841 + printk(KERN_ERR "uksm: creating kthread failed\n");
6842 + err = PTR_ERR(uksm_thread);
6846 +#ifdef CONFIG_SYSFS
6847 + err = sysfs_create_group(mm_kobj, &uksm_attr_group);
6849 + printk(KERN_ERR "uksm: register sysfs failed\n");
6850 + kthread_stop(uksm_thread);
6854 + uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */
6856 +#endif /* CONFIG_SYSFS */
6858 +#ifdef CONFIG_MEMORY_HOTREMOVE
6860 + * Choose a high priority since the callback takes uksm_thread_mutex:
6861 + * later callbacks could only be taking locks which nest within that.
6863 + hotplug_memory_notifier(uksm_memory_callback, 100);
6868 + kfree(zero_hash_table);
6872 + kfree(random_nums);
6874 + kfree(uksm_scan_ladder);
6879 +subsys_initcall(ksm_init);
6881 +late_initcall(uksm_init);
6884 diff --git a/mm/vmstat.c b/mm/vmstat.c
6885 index 1b12d39..24b803f 100644
6888 @@ -795,6 +795,9 @@ const char * const vmstat_text[] = {
6889 "nr_anon_transparent_hugepages",
6893 + "nr_uksm_zero_pages",
6895 /* enum writeback_stat_item counters */
6896 "nr_dirty_threshold",
6897 "nr_dirty_background_threshold",