uksm-0.1.2.3-for-v3.18.patch (applies to kernels up to 3.18.37)
1 diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
2 index 081c497..75bde3d 100644
3 --- a/Documentation/vm/00-INDEX
4 +++ b/Documentation/vm/00-INDEX
5 @@ -16,6 +16,8 @@ hwpoison.txt
6         - explains what hwpoison is
7  ksm.txt
8         - how to use the Kernel Samepage Merging feature.
9 +uksm.txt
10 +       - Introduction to Ultra KSM
11  numa
12         - information about NUMA specific code in the Linux vm.
13  numa_memory_policy.txt
14 diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
15 new file mode 100644
16 index 0000000..08bd645
17 --- /dev/null
18 +++ b/Documentation/vm/uksm.txt
19 @@ -0,0 +1,58 @@
20 +The Ultra Kernel Samepage Merging feature
21 +----------------------------------------------
22 +/*
23 + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
24 + *
25 + * This is an improvement upon KSM. Some basic data structures and routines
26 + * are borrowed from ksm.c .
27 + *
28 + * Its new features:
29 + * 1. Full system scan:
30 + *      It automatically scans all user processes' anonymous VMAs. Kernel-user
31 + *      interaction to submit a memory area to KSM is no longer needed.
32 + *
33 + * 2. Rich area detection:
34 + *      It automatically detects rich areas containing abundant duplicated
35 + *      pages. Rich areas are given a full scan speed. Poor areas are
36 + *      sampled at a reasonable speed with very low CPU consumption.
37 + *
38 + * 3. Ultra Per-page scan speed improvement:
39 + *      A new hash algorithm is proposed. As a result, on a machine with a
40 + *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
41 + *      can scan memory areas that do not contain duplicated pages at a speed of
42 + *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
43 + *      477MB/sec ~ 923MB/sec.
44 + *
45 + * 4. Thrashing area avoidance:
46 + *      A thrashing area (a VMA that has frequent KSM page break-outs) can be
47 + *      filtered out. My benchmark shows it's more efficient than KSM's per-page
48 + *      hash-value-based volatile page detection.
49 + *
50 + *
51 + * 5. Misc changes upon KSM:
52 + *      * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
53 + *        comparison. It's much faster than the default C version on x86.
54 + *      * rmap_item now has a struct page *page member to loosely cache an
55 + *        address-->page mapping, which avoids many time-costly calls to
56 + *        follow_page().
57 + *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
58 + *      * try_to_merge_two_pages() can now revert a pte if it fails. No break_
59 + *        ksm is needed for this case.
60 + *
61 + * 6. Full Zero Page consideration (contributed by Figo Zhang)
62 + *    Now uksmd considers full zero pages as special pages and merges them into
63 + *    a special unswappable uksm zero page.
64 + */
65 +
66 +ChangeLog:
67 +
68 +2012-05-05 The creation of this Doc
69 +2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
70 +2012-05-28 UKSM 0.1.1.2 bug fix release
71 +2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
72 +2012-07-02 UKSM 0.1.2-beta2
73 +2012-07-10 UKSM 0.1.2-beta3
74 +2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
75 +2012-10-13 UKSM 0.1.2.1 Bug fixes.
76 +2012-12-31 UKSM 0.1.2.2 Minor bug fixes
77 +2014-07-02 UKSM 0.1.2.3 Fix a "__this_cpu_read() in preemptible" bug
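
For reference, the full-zero-page handling described in feature 6 hinges on a test for pages that are entirely zero. The patch adds x86 assembly versions of this test in mm/uksm.c further below; its portable C fallback is equivalent to the following minimal sketch, which is illustrative only and not part of the patch itself (the name buffer_is_full_zero is hypothetical):

#include <stddef.h>

static int buffer_is_full_zero(const void *s1, size_t len)
{
	const unsigned long *src = s1;
	size_t i;

	/* len is assumed to be a multiple of sizeof(unsigned long), e.g. a page size */
	len /= sizeof(*src);

	for (i = 0; i < len; i++) {
		if (src[i])
			return 0;	/* found a non-zero word */
	}

	return 1;			/* every word was zero */
}
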
78 diff --git a/fs/exec.c b/fs/exec.c
79 index 7302b75..84b0df5 100644
80 --- a/fs/exec.c
81 +++ b/fs/exec.c
82 @@ -19,7 +19,7 @@
83   * current->executable is only used by the procfs.  This allows a dispatch
84   * table to check for several different types  of binary formats.  We keep
85   * trying until we recognize the file or we run out of supported binary
86 - * formats. 
87 + * formats.
88   */
89  
90  #include <linux/slab.h>
91 @@ -56,6 +56,7 @@
92  #include <linux/pipe_fs_i.h>
93  #include <linux/oom.h>
94  #include <linux/compat.h>
95 +#include <linux/ksm.h>
96  
97  #include <asm/uaccess.h>
98  #include <asm/mmu_context.h>
99 @@ -1128,6 +1129,7 @@ void setup_new_exec(struct linux_binprm * bprm)
100         /* An exec changes our domain. We are no longer part of the thread
101            group */
102         current->self_exec_id++;
103 +
104         flush_signal_handlers(current, 0);
105         do_close_on_exec(current->files);
106  }
107 diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
108 index aa1eee0..5a9149c 100644
109 --- a/fs/proc/meminfo.c
110 +++ b/fs/proc/meminfo.c
111 @@ -121,6 +121,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
112                 "SUnreclaim:     %8lu kB\n"
113                 "KernelStack:    %8lu kB\n"
114                 "PageTables:     %8lu kB\n"
115 +#ifdef CONFIG_UKSM
116 +               "KsmZeroPages:   %8lu kB\n"
117 +#endif
118  #ifdef CONFIG_QUICKLIST
119                 "Quicklists:     %8lu kB\n"
120  #endif
121 @@ -175,6 +178,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
122                 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
123                 global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
124                 K(global_page_state(NR_PAGETABLE)),
125 +#ifdef CONFIG_UKSM
126 +               K(global_page_state(NR_UKSM_ZERO_PAGES)),
127 +#endif
128  #ifdef CONFIG_QUICKLIST
129                 K(quicklist_total_size()),
130  #endif
131 diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
132 index 752e30d..1e7c826 100644
133 --- a/include/asm-generic/pgtable.h
134 +++ b/include/asm-generic/pgtable.h
135 @@ -537,12 +537,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
136                         unsigned long size);
137  #endif
138  
139 +#ifdef CONFIG_UKSM
140 +static inline int is_uksm_zero_pfn(unsigned long pfn)
141 +{
142 +       extern unsigned long uksm_zero_pfn;
143 +        return pfn == uksm_zero_pfn;
144 +}
145 +#else
146 +static inline int is_uksm_zero_pfn(unsigned long pfn)
147 +{
148 +        return 0;
149 +}
150 +#endif
151 +
152  #ifdef __HAVE_COLOR_ZERO_PAGE
153  static inline int is_zero_pfn(unsigned long pfn)
154  {
155         extern unsigned long zero_pfn;
156         unsigned long offset_from_zero_pfn = pfn - zero_pfn;
157 -       return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
158 +       return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn);
159  }
160  
161  #define my_zero_pfn(addr)      page_to_pfn(ZERO_PAGE(addr))
162 @@ -551,7 +564,7 @@ static inline int is_zero_pfn(unsigned long pfn)
163  static inline int is_zero_pfn(unsigned long pfn)
164  {
165         extern unsigned long zero_pfn;
166 -       return pfn == zero_pfn;
167 +       return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
168  }
169  
170  static inline unsigned long my_zero_pfn(unsigned long addr)
171 diff --git a/include/linux/ksm.h b/include/linux/ksm.h
172 index 3be6bb1..51557d1 100644
173 --- a/include/linux/ksm.h
174 +++ b/include/linux/ksm.h
175 @@ -19,21 +19,6 @@ struct mem_cgroup;
176  #ifdef CONFIG_KSM
177  int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
178                 unsigned long end, int advice, unsigned long *vm_flags);
179 -int __ksm_enter(struct mm_struct *mm);
180 -void __ksm_exit(struct mm_struct *mm);
181 -
182 -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
183 -{
184 -       if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
185 -               return __ksm_enter(mm);
186 -       return 0;
187 -}
188 -
189 -static inline void ksm_exit(struct mm_struct *mm)
190 -{
191 -       if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
192 -               __ksm_exit(mm);
193 -}
194  
195  /*
196   * A KSM page is one of those write-protected "shared pages" or "merged pages"
197 @@ -76,6 +61,33 @@ struct page *ksm_might_need_to_copy(struct page *page,
198  int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
199  void ksm_migrate_page(struct page *newpage, struct page *oldpage);
200  
201 +#ifdef CONFIG_KSM_LEGACY
202 +int __ksm_enter(struct mm_struct *mm);
203 +void __ksm_exit(struct mm_struct *mm);
204 +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
205 +{
206 +       if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
207 +               return __ksm_enter(mm);
208 +       return 0;
209 +}
210 +
211 +static inline void ksm_exit(struct mm_struct *mm)
212 +{
213 +       if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
214 +               __ksm_exit(mm);
215 +}
216 +
217 +#elif defined(CONFIG_UKSM)
218 +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
219 +{
220 +       return 0;
221 +}
222 +
223 +static inline void ksm_exit(struct mm_struct *mm)
224 +{
225 +}
226 +#endif /* !CONFIG_UKSM */
227 +
228  #else  /* !CONFIG_KSM */
229  
230  static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
231 @@ -123,4 +135,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
232  #endif /* CONFIG_MMU */
233  #endif /* !CONFIG_KSM */
234  
235 +#include <linux/uksm.h>
236 +
237  #endif /* __LINUX_KSM_H */
238 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
239 index 6e0b286..ebd6243 100644
240 --- a/include/linux/mm_types.h
241 +++ b/include/linux/mm_types.h
242 @@ -308,6 +308,9 @@ struct vm_area_struct {
243  #ifdef CONFIG_NUMA
244         struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
245  #endif
246 +#ifdef CONFIG_UKSM
247 +       struct vma_slot *uksm_vma_slot;
248 +#endif
249  };
250  
251  struct core_thread {
252 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
253 index ffe66e3..355f0e9 100644
254 --- a/include/linux/mmzone.h
255 +++ b/include/linux/mmzone.h
256 @@ -157,6 +157,9 @@ enum zone_stat_item {
257         WORKINGSET_NODERECLAIM,
258         NR_ANON_TRANSPARENT_HUGEPAGES,
259         NR_FREE_CMA_PAGES,
260 +#ifdef CONFIG_UKSM
261 +       NR_UKSM_ZERO_PAGES,
262 +#endif
263         NR_VM_ZONE_STAT_ITEMS };
264  
265  /*
266 @@ -865,7 +868,7 @@ static inline int is_highmem_idx(enum zone_type idx)
267  }
268  
269  /**
270 - * is_highmem - helper function to quickly check if a struct zone is a 
271 + * is_highmem - helper function to quickly check if a struct zone is a
272   *              highmem zone or not.  This is an attempt to keep references
273   *              to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
274   * @zone - pointer to struct zone variable
275 diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h
276 new file mode 100644
277 index 0000000..6780fdb
278 --- /dev/null
279 +++ b/include/linux/sradix-tree.h
280 @@ -0,0 +1,77 @@
281 +#ifndef _LINUX_SRADIX_TREE_H
282 +#define _LINUX_SRADIX_TREE_H
283 +
284 +
285 +#define INIT_SRADIX_TREE(root, mask)                                   \
286 +do {                                                                   \
287 +       (root)->height = 0;                                             \
288 +       (root)->gfp_mask = (mask);                                      \
289 +       (root)->rnode = NULL;                                           \
290 +} while (0)
291 +
292 +#define ULONG_BITS     (sizeof(unsigned long) * 8)
293 +#define SRADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
294 +//#define SRADIX_TREE_MAP_SHIFT        6
295 +//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT)
296 +//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1)
297 +
298 +struct sradix_tree_node {
299 +       unsigned int    height;         /* Height from the bottom */
300 +       unsigned int    count;          
301 +       unsigned int    fulls;          /* Number of full sublevel trees */ 
302 +       struct sradix_tree_node *parent;
303 +       void *stores[0];
304 +};
305 +
306 +/* A simple radix tree implementation */
307 +struct sradix_tree_root {
308 +        unsigned int            height;
309 +        struct sradix_tree_node *rnode;
310 +
311 +       /* The node last found to have available empty stores in its sublevels */
312 +        struct sradix_tree_node *enter_node;
313 +       unsigned int shift;
314 +       unsigned int stores_size;
315 +       unsigned int mask;
316 +       unsigned long min;      /* The first hole index */
317 +       unsigned long num;
318 +       //unsigned long *height_to_maxindex;
319 +
320 +       /* How the node is allocated and freed. */
321 +       struct sradix_tree_node *(*alloc)(void); 
322 +       void (*free)(struct sradix_tree_node *node);
323 +
324 +       /* Callbacks invoked when a node or item is added or removed */
325 +       void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child);
326 +       void (*assign)(struct sradix_tree_node *node, unsigned index, void *item);
327 +       void (*rm)(struct sradix_tree_node *node, unsigned offset);
328 +};
329 +
330 +struct sradix_tree_path {
331 +       struct sradix_tree_node *node;
332 +       int offset;
333 +};
334 +
335 +static inline 
336 +void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift)
337 +{
338 +       root->height = 0;
339 +       root->rnode = NULL;
340 +       root->shift = shift;
341 +       root->stores_size = 1UL << shift;
342 +       root->mask = root->stores_size - 1;
343 +}
344 +
345 +
346 +extern void *sradix_tree_next(struct sradix_tree_root *root,
347 +                      struct sradix_tree_node *node, unsigned long index,
348 +                      int (*iter)(void *, unsigned long));
349 +
350 +extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num);
351 +
352 +extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, 
353 +                       struct sradix_tree_node *node, unsigned long index);
354 +
355 +extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index);
356 +
357 +#endif /* _LINUX_SRADIX_TREE_H */
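
To make the callback-driven API above concrete, here is a minimal usage sketch. It is illustrative only and not part of the patch; the names MY_SHIFT, my_node, my_alloc, my_free, my_root and my_init are hypothetical. It mirrors how mm/uksm.c wires up its slot tree later in this patch: the caller supplies nodes whose void * store array immediately follows the embedded sradix_tree_node, installs the alloc/free hooks, and then inserts and looks up items.

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/sradix-tree.h>

#define MY_SHIFT 8	/* each node carries 1 << MY_SHIFT stores */

struct my_node {
	struct sradix_tree_node snode;
	void *stores[1UL << MY_SHIFT];	/* backs snode.stores[] */
};

static struct sradix_tree_node *my_alloc(void)
{
	struct my_node *n = kzalloc(sizeof(*n), GFP_KERNEL);

	return n ? &n->snode : NULL;
}

static void my_free(struct sradix_tree_node *node)
{
	kfree(container_of(node, struct my_node, snode));
}

static struct sradix_tree_root my_root;
static int my_payload;

static int __init my_init(void)
{
	void *item = &my_payload;

	init_sradix_tree_root(&my_root, MY_SHIFT);
	my_root.alloc = my_alloc;
	my_root.free = my_free;
	/* extend/assign/rm are optional bookkeeping callbacks */

	/* Place one item at the first free index (0 here), then look it up. */
	if (sradix_tree_enter(&my_root, &item, 1))
		return -ENOMEM;

	return sradix_tree_lookup(&my_root, 0) == item ? 0 : -EINVAL;
}
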
358 diff --git a/include/linux/uksm.h b/include/linux/uksm.h
359 new file mode 100644
360 index 0000000..a644bca
361 --- /dev/null
362 +++ b/include/linux/uksm.h
363 @@ -0,0 +1,146 @@
364 +#ifndef __LINUX_UKSM_H
365 +#define __LINUX_UKSM_H
366 +/*
367 + * Memory merging support.
368 + *
369 + * This code enables dynamic sharing of identical pages found in different
370 + * memory areas, even if they are not shared by fork().
371 + */
372 +
373 +/* if !CONFIG_UKSM this file should not be compiled at all. */
374 +#ifdef CONFIG_UKSM
375 +
376 +#include <linux/bitops.h>
377 +#include <linux/mm.h>
378 +#include <linux/pagemap.h>
379 +#include <linux/rmap.h>
380 +#include <linux/sched.h>
381 +
382 +extern unsigned long zero_pfn __read_mostly;
383 +extern unsigned long uksm_zero_pfn __read_mostly;
384 +extern struct page *empty_uksm_zero_page;
385 +
386 +/* must be done before the vma is linked to the mm */
387 +extern void uksm_vma_add_new(struct vm_area_struct *vma);
388 +extern void uksm_remove_vma(struct vm_area_struct *vma);
389 +
390 +#define UKSM_SLOT_NEED_SORT    (1 << 0)
391 +#define UKSM_SLOT_NEED_RERAND  (1 << 1)
392 +#define UKSM_SLOT_SCANNED      (1 << 2) /* It's scanned in this round */
393 +#define UKSM_SLOT_FUL_SCANNED  (1 << 3)
394 +#define UKSM_SLOT_IN_UKSM      (1 << 4)
395 +
396 +struct vma_slot {
397 +       struct sradix_tree_node *snode;
398 +       unsigned long sindex;
399 +
400 +       struct list_head slot_list;
401 +       unsigned long fully_scanned_round;
402 +       unsigned long dedup_num;
403 +       unsigned long pages_scanned;
404 +       unsigned long last_scanned;
405 +       unsigned long pages_to_scan;
406 +       struct scan_rung *rung;
407 +       struct page **rmap_list_pool;
408 +       unsigned int *pool_counts;
409 +       unsigned long pool_size;
410 +       struct vm_area_struct *vma;
411 +       struct mm_struct *mm;
412 +       unsigned long ctime_j;
413 +       unsigned long pages;
414 +       unsigned long flags;
415 +       unsigned long pages_cowed; /* pages cowed this round */
416 +       unsigned long pages_merged; /* pages merged this round */
417 +       unsigned long pages_bemerged;
418 +
419 +       /* when it has page merged in this eval round */
420 +       struct list_head dedup_list;
421 +};
422 +
423 +static inline void uksm_unmap_zero_page(pte_t pte)
424 +{
425 +       if (pte_pfn(pte) == uksm_zero_pfn)
426 +               __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
427 +}
428 +
429 +static inline void uksm_map_zero_page(pte_t pte)
430 +{
431 +       if (pte_pfn(pte) == uksm_zero_pfn)
432 +               __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
433 +}
434 +
435 +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
436 +{
437 +       if (vma->uksm_vma_slot && PageKsm(page))
438 +               vma->uksm_vma_slot->pages_cowed++;
439 +}
440 +
441 +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
442 +{
443 +       if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
444 +               vma->uksm_vma_slot->pages_cowed++;
445 +}
446 +
447 +static inline int uksm_flags_can_scan(unsigned long vm_flags)
448 +{
449 +#ifndef VM_SAO
450 +#define VM_SAO 0
451 +#endif
452 +       return !(vm_flags & (VM_PFNMAP | VM_IO  | VM_DONTEXPAND |
453 +                            VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP |
454 +                            VM_SHARED  | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN | VM_SAO));
455 +}
456 +
457 +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
458 +{
459 +       if (uksm_flags_can_scan(*vm_flags_p))
460 +               *vm_flags_p |= VM_MERGEABLE;
461 +}
462 +
463 +/*
464 + * Just a BUG_ON wrapper for places where the uksm zero page must not appear.
465 + * TODO: it will be removed when the uksm zero page patch is stable enough.
466 + */
467 +static inline void uksm_bugon_zeropage(pte_t pte)
468 +{
469 +       BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
470 +}
471 +#else
472 +static inline void uksm_vma_add_new(struct vm_area_struct *vma)
473 +{
474 +}
475 +
476 +static inline void uksm_remove_vma(struct vm_area_struct *vma)
477 +{
478 +}
479 +
480 +static inline void uksm_unmap_zero_page(pte_t pte)
481 +{
482 +}
483 +
484 +static inline void uksm_map_zero_page(pte_t pte)
485 +{
486 +}
487 +
488 +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
489 +{
490 +}
491 +
492 +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
493 +{
494 +}
495 +
496 +static inline int uksm_flags_can_scan(unsigned long vm_flags)
497 +{
498 +       return 0;
499 +}
500 +
501 +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
502 +{
503 +}
504 +
505 +static inline void uksm_bugon_zeropage(pte_t pte)
506 +{
507 +}
508 +#endif /* !CONFIG_UKSM */
509 +#endif /* __LINUX_UKSM_H */
510 diff --git a/kernel/fork.c b/kernel/fork.c
511 index 9b7d746..73ad90d 100644
512 --- a/kernel/fork.c
513 +++ b/kernel/fork.c
514 @@ -412,7 +412,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
515                                 goto fail_nomem;
516                         charge = len;
517                 }
518 -               tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
519 +               tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
520                 if (!tmp)
521                         goto fail_nomem;
522                 *tmp = *mpnt;
523 @@ -467,7 +467,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
524                 __vma_link_rb(mm, tmp, rb_link, rb_parent);
525                 rb_link = &tmp->vm_rb.rb_right;
526                 rb_parent = &tmp->vm_rb;
527 -
528 +               uksm_vma_add_new(tmp);
529                 mm->map_count++;
530                 retval = copy_page_range(mm, oldmm, mpnt);
531  
532 diff --git a/lib/Makefile b/lib/Makefile
533 index 0211d2b..426536f 100644
534 --- a/lib/Makefile
535 +++ b/lib/Makefile
536 @@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
537  endif
538  
539  lib-y := ctype.o string.o vsprintf.o cmdline.o \
540 -        rbtree.o radix-tree.o dump_stack.o timerqueue.o\
541 +        rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\
542          idr.o int_sqrt.o extable.o \
543          sha1.o md5.o irq_regs.o argv_split.o \
544          proportions.o flex_proportions.o ratelimit.o show_mem.o \
545 diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c
546 new file mode 100644
547 index 0000000..8d06329
548 --- /dev/null
549 +++ b/lib/sradix-tree.c
550 @@ -0,0 +1,476 @@
551 +#include <linux/errno.h>
552 +#include <linux/mm.h>
553 +#include <linux/mman.h>
554 +#include <linux/spinlock.h>
555 +#include <linux/slab.h>
556 +#include <linux/gcd.h>
557 +#include <linux/sradix-tree.h>
558 +
559 +static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node)
560 +{
561 +       return node->fulls == root->stores_size || 
562 +               (node->height == 1 && node->count == root->stores_size);
563 +}
564 +
565 +/*
566 + *     Extend a sradix tree so it can store key @index.
567 + */
568 +static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
569 +{
570 +       struct sradix_tree_node *node;
571 +       unsigned int height;
572 +
573 +       if (unlikely(root->rnode == NULL)) {
574 +               if (!(node = root->alloc()))
575 +                       return -ENOMEM;
576 +
577 +               node->height = 1;
578 +               root->rnode = node;
579 +               root->height = 1;
580 +       }
581 +
582 +       /* Figure out what the height should be.  */
583 +       height = root->height;
584 +       index >>= root->shift * height;
585 +
586 +       while (index) {
587 +               index >>= root->shift;
588 +               height++;
589 +       }
590 +
591 +       while (height > root->height) {
592 +               unsigned int newheight;
593 +               if (!(node = root->alloc()))
594 +                       return -ENOMEM;
595 +
596 +               /* Increase the height.  */
597 +               node->stores[0] = root->rnode;
598 +               root->rnode->parent = node;
599 +               if (root->extend)
600 +                       root->extend(node, root->rnode);
601 +
602 +               newheight = root->height + 1;
603 +               node->height = newheight;
604 +               node->count = 1;
605 +               if (sradix_node_full(root, root->rnode))
606 +                       node->fulls = 1;
607 +
608 +               root->rnode = node;
609 +               root->height = newheight;
610 +       }
611 +
612 +       return 0;
613 +}
614 +
615 +/*
616 + * Search for the next item after the current node that is not NULL
617 + * and satisfies the iter() callback.
618 + */
619 +void *sradix_tree_next(struct sradix_tree_root *root,
620 +                      struct sradix_tree_node *node, unsigned long index,
621 +                      int (*iter)(void *item, unsigned long height))
622 +{
623 +       unsigned long offset;
624 +       void *item;
625 +
626 +       if (unlikely(node == NULL)) {
627 +               node = root->rnode;
628 +               for (offset = 0; offset < root->stores_size; offset++) {
629 +                       item = node->stores[offset];
630 +                       if (item && (!iter || iter(item, node->height)))
631 +                               break;
632 +               }
633 +
634 +               if (unlikely(offset >= root->stores_size))
635 +                       return NULL;
636 +
637 +               if (node->height == 1)
638 +                       return item;
639 +               else
640 +                       goto go_down;
641 +       }
642 +
643 +       while (node) {
644 +               offset = (index & root->mask) + 1;                                      
645 +               for (;offset < root->stores_size; offset++) {
646 +                       item = node->stores[offset];
647 +                       if (item && (!iter || iter(item, node->height)))
648 +                               break;
649 +               }
650 +
651 +               if (offset < root->stores_size)
652 +                       break;
653 +
654 +               node = node->parent;
655 +               index >>= root->shift;
656 +       }
657 +
658 +       if (!node)
659 +               return NULL;
660 +
661 +       while (node->height > 1) {
662 +go_down:
663 +               node = item;
664 +               for (offset = 0; offset < root->stores_size; offset++) {
665 +                       item = node->stores[offset];
666 +                       if (item && (!iter || iter(item, node->height)))
667 +                               break;
668 +               }
669 +
670 +               if (unlikely(offset >= root->stores_size))
671 +                       return NULL;
672 +       }
673 +
674 +       BUG_ON(offset > root->stores_size);
675 +
676 +       return item;
677 +}
678 +
679 +/*
680 + * Blindly insert the item into the tree. Typically, we reuse the
681 + * first empty store.
682 + */
683 +int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
684 +{
685 +       unsigned long index;
686 +       unsigned int height;
687 +       struct sradix_tree_node *node, *tmp = NULL;
688 +       int offset, offset_saved;
689 +       void **store = NULL;
690 +       int error, i, j, shift;
691 +
692 +go_on:
693 +       index = root->min;
694 +
695 +       if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
696 +               node = root->enter_node;
697 +               BUG_ON((index >> (root->shift * root->height)));
698 +       } else {
699 +               node = root->rnode;
700 +               if (node == NULL || (index >> (root->shift * root->height))
701 +                   || sradix_node_full(root, node)) {
702 +                       error = sradix_tree_extend(root, index);
703 +                       if (error)
704 +                               return error;
705 +
706 +                       node = root->rnode;
707 +               }
708 +       }
709 +
710 +
711 +       height = node->height;
712 +       shift = (height - 1) * root->shift;
713 +       offset = (index >> shift) & root->mask;
714 +       while (shift > 0) {
715 +               offset_saved = offset;
716 +               for (; offset < root->stores_size; offset++) {
717 +                       store = &node->stores[offset];
718 +                       tmp = *store;
719 +
720 +                       if (!tmp || !sradix_node_full(root, tmp))
721 +                               break;
722 +               }
723 +               BUG_ON(offset >= root->stores_size);
724 +
725 +               if (offset != offset_saved) {
726 +                       index += (offset - offset_saved) << shift;
727 +                       index &= ~((1UL << shift) - 1);
728 +               }
729 +
730 +               if (!tmp) {
731 +                       if (!(tmp = root->alloc()))
732 +                               return -ENOMEM;
733 +
734 +                       tmp->height = shift / root->shift;
735 +                       *store = tmp;
736 +                       tmp->parent = node;
737 +                       node->count++;
738 +//                     if (root->extend)
739 +//                             root->extend(node, tmp);
740 +               }
741 +
742 +               node = tmp;
743 +               shift -= root->shift;
744 +               offset = (index >> shift) & root->mask;
745 +       }
746 +
747 +       BUG_ON(node->height != 1);
748 +
749 +
750 +       store = &node->stores[offset];
751 +       for (i = 0, j = 0;
752 +             j < root->stores_size - node->count && 
753 +             i < root->stores_size - offset && j < num; i++) {
754 +               if (!store[i]) {
755 +                       store[i] = item[j];
756 +                       if (root->assign)
757 +                               root->assign(node, index + i, item[j]);
758 +                       j++;
759 +               }
760 +       }
761 +
762 +       node->count += j;
763 +       root->num += j;
764 +       num -= j;
765 +
766 +       while (sradix_node_full(root, node)) {
767 +               node = node->parent;
768 +               if (!node)
769 +                       break;
770 +
771 +               node->fulls++;
772 +       }
773 +
774 +       if (unlikely(!node)) {
775 +               /* All nodes are full */
776 +               root->min = 1 << (root->height * root->shift);
777 +               root->enter_node = NULL;
778 +       } else {
779 +               root->min = index + i - 1;
780 +               root->min |= (1UL << (node->height - 1)) - 1;
781 +               root->min++;
782 +               root->enter_node = node;
783 +       }
784 +
785 +       if (num) {
786 +               item += j;
787 +               goto go_on;
788 +       }
789 +
790 +       return 0;
791 +}
792 +
793 +
794 +/**
795 + *     sradix_tree_shrink - shrink the height of a sradix tree to the minimum
796 + *     @root: sradix tree root
797 + *
798 + */
799 +static inline void sradix_tree_shrink(struct sradix_tree_root *root)
800 +{
801 +       /* try to shrink tree height */
802 +       while (root->height > 1) {
803 +               struct sradix_tree_node *to_free = root->rnode;
804 +
805 +               /*
806 +                * If the candidate node has more than one child, or its child
807 +                * is not at the leftmost store, we cannot shrink.
808 +                */
809 +               if (to_free->count != 1 || !to_free->stores[0])
810 +                       break;
811 +
812 +               root->rnode = to_free->stores[0];
813 +               root->rnode->parent = NULL;
814 +               root->height--;
815 +               if (unlikely(root->enter_node == to_free)) {
816 +                       root->enter_node = NULL;
817 +               }
818 +               root->free(to_free);
819 +       }
820 +}
821 +
822 +/*
823 + * Delete the item at the known leaf node and index.
824 + */
825 +void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, 
826 +                                 struct sradix_tree_node *node, unsigned long index)
827 +{
828 +       unsigned int offset;
829 +       struct sradix_tree_node *start, *end;
830 +
831 +       BUG_ON(node->height != 1);
832 +
833 +       start = node;
834 +       while (node && !(--node->count))
835 +               node = node->parent;
836 +
837 +       end = node;
838 +       if (!node) {
839 +               root->rnode = NULL;
840 +               root->height = 0;
841 +               root->min = 0;
842 +               root->num = 0;
843 +               root->enter_node = NULL;
844 +       } else {
845 +               offset = (index >> (root->shift * (node->height - 1))) & root->mask;
846 +               if (root->rm)
847 +                       root->rm(node, offset);
848 +               node->stores[offset] = NULL;
849 +               root->num--;
850 +               if (root->min > index) {
851 +                       root->min = index;
852 +                       root->enter_node = node;
853 +               }
854 +       }
855 +
856 +       if (start != end) {
857 +               do {
858 +                       node = start;
859 +                       start = start->parent;
860 +                       if (unlikely(root->enter_node == node))
861 +                               root->enter_node = end;
862 +                       root->free(node);
863 +               } while (start != end);
864 +
865 +               /*
866 +                * Note that shrink may free "end", so enter_node still needs to
867 +                * be checked inside.
868 +                */
869 +               sradix_tree_shrink(root);
870 +       } else if (node->count == root->stores_size - 1) {
871 +               /* It WAS a full leaf node. Update the ancestors */
872 +               node = node->parent;
873 +               while (node) {
874 +                       node->fulls--;
875 +                       if (node->fulls != root->stores_size - 1)
876 +                               break;
877 +
878 +                       node = node->parent;
879 +               }
880 +       }
881 +}
882 +
883 +void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
884 +{
885 +       unsigned int height, offset;
886 +       struct sradix_tree_node *node;
887 +       int shift;
888 +
889 +       node = root->rnode;
890 +       if (node == NULL || (index >> (root->shift * root->height)))
891 +               return NULL;
892 +
893 +       height = root->height;
894 +       shift = (height - 1) * root->shift;
895 +
896 +       do {
897 +               offset = (index >> shift) & root->mask;
898 +               node = node->stores[offset];
899 +               if (!node)
900 +                       return NULL;
901 +
902 +               shift -= root->shift;
903 +       } while (shift >= 0);
904 +
905 +       return node;
906 +}
907 +
908 +/*
909 + * Return the item if it exists, otherwise create it in place
910 + * and return the created item.
911 + */
912 +void *sradix_tree_lookup_create(struct sradix_tree_root *root, 
913 +                       unsigned long index, void *(*item_alloc)(void))
914 +{
915 +       unsigned int height, offset;
916 +       struct sradix_tree_node *node, *tmp;
917 +       void *item;
918 +       int shift, error;
919 +
920 +       if (root->rnode == NULL || (index >> (root->shift * root->height))) {
921 +               if (item_alloc) {
922 +                       error = sradix_tree_extend(root, index);
923 +                       if (error)
924 +                               return NULL;
925 +               } else {
926 +                       return NULL;
927 +               }
928 +       }
929 +
930 +       node = root->rnode;
931 +       height = root->height;
932 +       shift = (height - 1) * root->shift;
933 +
934 +       do {
935 +               offset = (index >> shift) & root->mask;
936 +               if (!node->stores[offset]) {
937 +                       if (!(tmp = root->alloc()))
938 +                               return NULL;
939 +
940 +                       tmp->height = shift / root->shift;
941 +                       node->stores[offset] = tmp;
942 +                       tmp->parent = node;
943 +                       node->count++;
944 +                       node = tmp;
945 +               } else {
946 +                       node = node->stores[offset];
947 +               }
948 +
949 +               shift -= root->shift;
950 +       } while (shift > 0);
951 +
952 +       BUG_ON(node->height != 1);
953 +       offset = index & root->mask;
954 +       if (node->stores[offset]) {
955 +               return node->stores[offset];
956 +       } else if (item_alloc) {
957 +               if (!(item = item_alloc()))
958 +                       return NULL;
959 +
960 +               node->stores[offset] = item;
961 +
962 +               /*
963 +                * NOTE: we do NOT call root->assign here, since this item was
964 +                * newly created by us and has no meaning yet. The caller can
965 +                * invoke it if necessary.
966 +                */
967 +
968 +               node->count++;
969 +               root->num++;
970 +
971 +               while (sradix_node_full(root, node)) {
972 +                       node = node->parent;
973 +                       if (!node)
974 +                               break;
975 +
976 +                       node->fulls++;
977 +               }
978 +
979 +               if (unlikely(!node)) {
980 +                       /* All nodes are full */
981 +                       root->min = 1 << (root->height * root->shift);
982 +               } else {
983 +                       if (root->min == index) {
984 +                               root->min |= (1UL << (node->height - 1)) - 1;
985 +                               root->min++;
986 +                               root->enter_node = node;
987 +                       }
988 +               }
989 +
990 +               return item;
991 +       } else {
992 +               return NULL;
993 +       }
994 +
995 +}
996 +
997 +int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
998 +{
999 +       unsigned int height, offset;
1000 +       struct sradix_tree_node *node;
1001 +       int shift;
1002 +
1003 +       node = root->rnode;
1004 +       if (node == NULL || (index >> (root->shift * root->height)))
1005 +               return -ENOENT;
1006 +
1007 +       height = root->height;
1008 +       shift = (height - 1) * root->shift;
1009 +
1010 +       do {
1011 +               offset = (index >> shift) & root->mask;
1012 +               node = node->stores[offset];
1013 +               if (!node)
1014 +                       return -ENOENT;
1015 +
1016 +               shift -= root->shift;
1017 +       } while (shift > 0);
1018 +
1019 +       offset = index & root->mask;
1020 +       if (!node->stores[offset])
1021 +               return -ENOENT;
1022 +
1023 +       sradix_tree_delete_from_leaf(root, node, index);
1024 +
1025 +       return 0;
1026 +}
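
One pattern the insertion/removal code above supports but does not spell out: by installing an assign callback, the user can record for each item which leaf node and tree index it was placed at, so it can later be removed with sradix_tree_delete_from_leaf() without any lookup. That is exactly what the vma_slot/slot tree code later in this patch does with its snode/sindex fields. A minimal sketch of the pattern follows; it is illustrative only, and my_item, my_assign and my_remove are hypothetical names.

#include <linux/sradix-tree.h>

struct my_item {
	struct sradix_tree_node *snode;	/* leaf node this item lives in */
	unsigned long sindex;		/* index of the item in the tree */
	/* payload fields would follow */
};

/* Invoked by sradix_tree_enter() for every item it places. */
static void my_assign(struct sradix_tree_node *node, unsigned index, void *item)
{
	struct my_item *it = item;

	it->snode = node;
	it->sindex = index;
}

/* Removal then needs no tree walk. */
static void my_remove(struct sradix_tree_root *root, struct my_item *it)
{
	sradix_tree_delete_from_leaf(root, it->snode, it->sindex);
}

/* Setup: after init_sradix_tree_root(), set root->assign = my_assign. */
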
1027 diff --git a/mm/Kconfig b/mm/Kconfig
1028 index 1d1ae6b..511dde6 100644
1029 --- a/mm/Kconfig
1030 +++ b/mm/Kconfig
1031 @@ -339,6 +339,32 @@ config KSM
1032           See Documentation/vm/ksm.txt for more information: KSM is inactive
1033           until a program has madvised that an area is MADV_MERGEABLE, and
1034           root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
1035 +choice
1036 +       prompt "Choose UKSM/KSM strategy"
1037 +       default UKSM
1038 +       depends on KSM
1039 +       help
1040 +         This option allows you to select a UKSM/KSM strategy.
1041 +
1042 +config UKSM
1043 +       bool "Ultra-KSM for page merging"
1044 +       depends on KSM
1045 +       help
1046 +       UKSM is inspired by the Linux kernel project KSM (Kernel Samepage
1047 +       Merging), but with a fundamentally rewritten core algorithm. With
1048 +       an advanced algorithm, UKSM can now transparently scan all anonymously
1049 +       mapped user space applications with significantly improved scan speed
1050 +       and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from
1051 +       UKSM. UKSM now has its first stable release and its first real-world enterprise user.
1052 +       For more information, please go to its project page
1053 +       (www.kerneldedup.org).
1054 +
1055 +config KSM_LEGACY
1056 +       bool "Legacy KSM implementation"
1057 +       depends on KSM
1058 +       help
1059 +       The legacy KSM implementation from Red Hat.
1060 +endchoice
1061  
1062  config DEFAULT_MMAP_MIN_ADDR
1063          int "Low address space to protect from user allocation"
1064 diff --git a/mm/Makefile b/mm/Makefile
1065 index 8405eb0..7689f0c 100644
1066 --- a/mm/Makefile
1067 +++ b/mm/Makefile
1068 @@ -44,7 +44,8 @@ obj-$(CONFIG_SPARSEMEM)       += sparse.o
1069  obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
1070  obj-$(CONFIG_SLOB) += slob.o
1071  obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
1072 -obj-$(CONFIG_KSM) += ksm.o
1073 +obj-$(CONFIG_KSM_LEGACY) += ksm.o
1074 +obj-$(CONFIG_UKSM) += uksm.o
1075  obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
1076  obj-$(CONFIG_SLAB) += slab.o
1077  obj-$(CONFIG_SLUB) += slub.o
1078 diff --git a/mm/memory.c b/mm/memory.c
1079 index d5f2ae9..86b5d09 100644
1080 --- a/mm/memory.c
1081 +++ b/mm/memory.c
1082 @@ -120,6 +120,28 @@ unsigned long highest_memmap_pfn __read_mostly;
1083  
1084  EXPORT_SYMBOL(zero_pfn);
1085  
1086 +#ifdef CONFIG_UKSM
1087 +unsigned long uksm_zero_pfn __read_mostly;
1088 +EXPORT_SYMBOL_GPL(uksm_zero_pfn);
1089 +struct page *empty_uksm_zero_page;
1090 +
1091 +static int __init setup_uksm_zero_page(void)
1092 +{
1093 +       unsigned long addr;
1094 +       addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0);
1095 +       if (!addr)
1096 +               panic("Oh boy, that early out of memory?");
1097 +
1098 +       empty_uksm_zero_page = virt_to_page((void *) addr);
1099 +       SetPageReserved(empty_uksm_zero_page);
1100 +
1101 +       uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
1102 +
1103 +       return 0;
1104 +}
1105 +core_initcall(setup_uksm_zero_page);
1106 +#endif
1107 +
1108  /*
1109   * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
1110   */
1111 @@ -131,6 +153,7 @@ static int __init init_zero_pfn(void)
1112  core_initcall(init_zero_pfn);
1113  
1114  
1115 +
1116  #if defined(SPLIT_RSS_COUNTING)
1117  
1118  void sync_mm_rss(struct mm_struct *mm)
1119 @@ -878,6 +901,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1120                         rss[MM_ANONPAGES]++;
1121                 else
1122                         rss[MM_FILEPAGES]++;
1123 +
1124 +               /* Should return NULL in vm_normal_page() */
1125 +               uksm_bugon_zeropage(pte);
1126 +       } else {
1127 +               uksm_map_zero_page(pte);
1128         }
1129  
1130  out_set_pte:
1131 @@ -1120,8 +1148,10 @@ again:
1132                         ptent = ptep_get_and_clear_full(mm, addr, pte,
1133                                                         tlb->fullmm);
1134                         tlb_remove_tlb_entry(tlb, pte, addr);
1135 -                       if (unlikely(!page))
1136 +                       if (unlikely(!page)) {
1137 +                               uksm_unmap_zero_page(ptent);
1138                                 continue;
1139 +                       }
1140                         if (unlikely(details) && details->nonlinear_vma
1141                             && linear_page_index(details->nonlinear_vma,
1142                                                 addr) != page->index) {
1143 @@ -1983,8 +2013,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
1144                         clear_page(kaddr);
1145                 kunmap_atomic(kaddr);
1146                 flush_dcache_page(dst);
1147 -       } else
1148 +       } else {
1149                 copy_user_highpage(dst, src, va, vma);
1150 +               uksm_cow_page(vma, src);
1151 +       }
1152  }
1153  
1154  /*
1155 @@ -2198,6 +2230,7 @@ gotten:
1156                 new_page = alloc_zeroed_user_highpage_movable(vma, address);
1157                 if (!new_page)
1158                         goto oom;
1159 +               uksm_cow_pte(vma, orig_pte);
1160         } else {
1161                 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1162                 if (!new_page)
1163 @@ -2223,8 +2256,11 @@ gotten:
1164                                 dec_mm_counter_fast(mm, MM_FILEPAGES);
1165                                 inc_mm_counter_fast(mm, MM_ANONPAGES);
1166                         }
1167 -               } else
1168 +                       uksm_bugon_zeropage(orig_pte);
1169 +               } else {
1170 +                       uksm_unmap_zero_page(orig_pte);
1171                         inc_mm_counter_fast(mm, MM_ANONPAGES);
1172 +               }
1173                 flush_cache_page(vma, address, pte_pfn(orig_pte));
1174                 entry = mk_pte(new_page, vma->vm_page_prot);
1175                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1176 diff --git a/mm/mmap.c b/mm/mmap.c
1177 index ae91989..844f366 100644
1178 --- a/mm/mmap.c
1179 +++ b/mm/mmap.c
1180 @@ -41,6 +41,7 @@
1181  #include <linux/notifier.h>
1182  #include <linux/memory.h>
1183  #include <linux/printk.h>
1184 +#include <linux/ksm.h>
1185  
1186  #include <asm/uaccess.h>
1187  #include <asm/cacheflush.h>
1188 @@ -279,6 +280,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
1189         if (vma->vm_file)
1190                 fput(vma->vm_file);
1191         mpol_put(vma_policy(vma));
1192 +       uksm_remove_vma(vma);
1193         kmem_cache_free(vm_area_cachep, vma);
1194         return next;
1195  }
1196 @@ -739,9 +741,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
1197         long adjust_next = 0;
1198         int remove_next = 0;
1199  
1200 +/*
1201 + * To avoid deadlock, uksm_remove_vma() must be done before any spin_lock
1202 + * is acquired.
1203 + */
1204 +       uksm_remove_vma(vma);
1205 +
1206         if (next && !insert) {
1207                 struct vm_area_struct *exporter = NULL;
1208  
1209 +               uksm_remove_vma(next);
1210                 if (end >= next->vm_end) {
1211                         /*
1212                          * vma expands, overlapping all the next, and
1213 @@ -838,6 +847,7 @@ again:                      remove_next = 1 + (end > next->vm_end);
1214                 end_changed = true;
1215         }
1216         vma->vm_pgoff = pgoff;
1217 +
1218         if (adjust_next) {
1219                 next->vm_start += adjust_next << PAGE_SHIFT;
1220                 next->vm_pgoff += adjust_next;
1221 @@ -908,16 +918,22 @@ again:                    remove_next = 1 + (end > next->vm_end);
1222                  * up the code too much to do both in one go.
1223                  */
1224                 next = vma->vm_next;
1225 -               if (remove_next == 2)
1226 +               if (remove_next == 2) {
1227 +                       uksm_remove_vma(next);
1228                         goto again;
1229 -               else if (next)
1230 +               } else if (next) {
1231                         vma_gap_update(next);
1232 -               else
1233 +               } else {
1234                         mm->highest_vm_end = end;
1235 +               }
1236 +       } else {
1237 +               if (next && !insert)
1238 +                       uksm_vma_add_new(next);
1239         }
1240         if (insert && file)
1241                 uprobe_mmap(insert);
1242  
1243 +       uksm_vma_add_new(vma);
1244         validate_mm(mm);
1245  
1246         return 0;
1247 @@ -1310,6 +1326,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1248         vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
1249                         mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1250  
1251 +       /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */
1252 +       uksm_vm_flags_mod(&vm_flags);
1253 +
1254         if (flags & MAP_LOCKED)
1255                 if (!can_do_mlock())
1256                         return -EPERM;
1257 @@ -1651,6 +1670,7 @@ munmap_back:
1258                         allow_write_access(file);
1259         }
1260         file = vma->vm_file;
1261 +       uksm_vma_add_new(vma);
1262  out:
1263         perf_event_mmap(vma);
1264  
1265 @@ -1692,6 +1712,7 @@ allow_write_and_free_vma:
1266         if (vm_flags & VM_DENYWRITE)
1267                 allow_write_access(file);
1268  free_vma:
1269 +       uksm_remove_vma(vma);
1270         kmem_cache_free(vm_area_cachep, vma);
1271  unacct_error:
1272         if (charged)
1273 @@ -2488,6 +2509,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1274         else
1275                 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1276  
1277 +       uksm_vma_add_new(new);
1278 +
1279         /* Success. */
1280         if (!err)
1281                 return 0;
1282 @@ -2654,6 +2677,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
1283                 return addr;
1284  
1285         flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1286 +       uksm_vm_flags_mod(&flags);
1287  
1288         error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
1289         if (error & ~PAGE_MASK)
1290 @@ -2712,6 +2736,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
1291         vma->vm_flags = flags;
1292         vma->vm_page_prot = vm_get_page_prot(flags);
1293         vma_link(mm, vma, prev, rb_link, rb_parent);
1294 +       uksm_vma_add_new(vma);
1295  out:
1296         perf_event_mmap(vma);
1297         mm->total_vm += len >> PAGE_SHIFT;
1298 @@ -2747,6 +2772,12 @@ void exit_mmap(struct mm_struct *mm)
1299         /* mm's last user has gone, and its about to be pulled down */
1300         mmu_notifier_release(mm);
1301  
1302 +       /*
1303 +        * Taking write lock on mmap_sem does not harm others,
1304 +        * but it's crucial for uksm to avoid races.
1305 +        */
1306 +       down_write(&mm->mmap_sem);
1307 +
1308         if (mm->locked_vm) {
1309                 vma = mm->mmap;
1310                 while (vma) {
1311 @@ -2783,6 +2814,11 @@ void exit_mmap(struct mm_struct *mm)
1312         }
1313         vm_unacct_memory(nr_accounted);
1314  
1315 +       mm->mmap = NULL;
1316 +       mm->mm_rb = RB_ROOT;
1317 +       vmacache_invalidate(mm);
1318 +       up_write(&mm->mmap_sem);
1319 +
1320         WARN_ON(atomic_long_read(&mm->nr_ptes) >
1321                         (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
1322  }
1323 @@ -2891,6 +2927,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
1324                                 new_vma->vm_ops->open(new_vma);
1325                         vma_link(mm, new_vma, prev, rb_link, rb_parent);
1326                         *need_rmap_locks = false;
1327 +                       uksm_vma_add_new(new_vma);
1328                 }
1329         }
1330         return new_vma;
1331 @@ -3004,10 +3041,10 @@ static struct vm_area_struct *__install_special_mapping(
1332         ret = insert_vm_struct(mm, vma);
1333         if (ret)
1334                 goto out;
1335 -
1336         mm->total_vm += len >> PAGE_SHIFT;
1337  
1338         perf_event_mmap(vma);
1339 +       uksm_vma_add_new(vma);
1340  
1341         return vma;
1342  
1343 diff --git a/mm/rmap.c b/mm/rmap.c
1344 index 3e4c721..d39d8a3 100644
1345 --- a/mm/rmap.c
1346 +++ b/mm/rmap.c
1347 @@ -908,9 +908,9 @@ void page_move_anon_rmap(struct page *page,
1348  
1349  /**
1350   * __page_set_anon_rmap - set up new anonymous rmap
1351 - * @page:      Page to add to rmap     
1352 + * @page:      Page to add to rmap
1353   * @vma:       VM area to add page to.
1354 - * @address:   User virtual address of the mapping     
1355 + * @address:   User virtual address of the mapping
1356   * @exclusive: the page is exclusively owned by the current process
1357   */
1358  static void __page_set_anon_rmap(struct page *page,
1359 diff --git a/mm/uksm.c b/mm/uksm.c
1360 new file mode 100644
1361 index 0000000..c76fcfc
1362 --- /dev/null
1363 +++ b/mm/uksm.c
1364 @@ -0,0 +1,5519 @@
1365 +/*
1366 + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
1367 + *
1368 + * This is an improvement upon KSM. Some basic data structures and routines
1369 + * are borrowed from ksm.c .
1370 + *
1371 + * Its new features:
1372 + * 1. Full system scan:
1373 + *      It automatically scans all user processes' anonymous VMAs. Kernel-user
1374 + *      interaction to submit a memory area to KSM is no longer needed.
1375 + *
1376 + * 2. Rich area detection:
1377 + *      It automatically detects rich areas containing abundant duplicated
1378 + *      pages. Rich areas are given a full scan speed. Poor areas are
1379 + *      sampled at a reasonable speed with very low CPU consumption.
1380 + *
1381 + * 3. Ultra Per-page scan speed improvement:
1382 + *      A new hash algorithm is proposed. As a result, on a machine with a
1383 + *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
1384 + *      can scan memory areas that do not contain duplicated pages at a speed of
1385 + *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
1386 + *      477MB/sec ~ 923MB/sec.
1387 + *
1388 + * 4. Thrashing area avoidance:
1389 + *      A thrashing area (a VMA that has frequent KSM page break-outs) can be
1390 + *      filtered out. My benchmark shows it's more efficient than KSM's per-page
1391 + *      hash-value-based volatile page detection.
1392 + *
1393 + *
1394 + * 5. Misc changes upon KSM:
1395 + *      * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
1396 + *        comparison. It's much faster than the default C version on x86.
1397 + *      * rmap_item now has a struct page *page member to loosely cache an
1398 + *        address-->page mapping, which avoids many time-costly calls to
1399 + *        follow_page().
1400 + *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
1401 + *      * try_to_merge_two_pages() can now revert a pte if it fails. No break_
1402 + *        ksm is needed for this case.
1403 + *
1404 + * 6. Full Zero Page consideration (contributed by Figo Zhang)
1405 + *    Now uksmd considers full zero pages as special pages and merges them into
1406 + *    a special unswappable uksm zero page.
1407 + */
1408 +
1409 +#include <linux/errno.h>
1410 +#include <linux/mm.h>
1411 +#include <linux/fs.h>
1412 +#include <linux/mman.h>
1413 +#include <linux/sched.h>
1414 +#include <linux/rwsem.h>
1415 +#include <linux/pagemap.h>
1416 +#include <linux/rmap.h>
1417 +#include <linux/spinlock.h>
1418 +#include <linux/jhash.h>
1419 +#include <linux/delay.h>
1420 +#include <linux/kthread.h>
1421 +#include <linux/wait.h>
1422 +#include <linux/slab.h>
1423 +#include <linux/rbtree.h>
1424 +#include <linux/memory.h>
1425 +#include <linux/mmu_notifier.h>
1426 +#include <linux/swap.h>
1427 +#include <linux/ksm.h>
1428 +#include <linux/crypto.h>
1429 +#include <linux/scatterlist.h>
1430 +#include <crypto/hash.h>
1431 +#include <linux/random.h>
1432 +#include <linux/math64.h>
1433 +#include <linux/gcd.h>
1434 +#include <linux/freezer.h>
1435 +#include <linux/sradix-tree.h>
1436 +
1437 +#include <asm/tlbflush.h>
1438 +#include "internal.h"
1439 +
1440 +#ifdef CONFIG_X86
1441 +#undef memcmp
1442 +
1443 +#ifdef CONFIG_X86_32
1444 +#define memcmp memcmpx86_32
1445 +/*
1446 + * Compare 4-byte-aligned addresses s1 and s2, with length n
1447 + */
1448 +int memcmpx86_32(void *s1, void *s2, size_t n)
1449 +{
1450 +       size_t num = n / 4;
1451 +       register int res;
1452 +
1453 +       __asm__ __volatile__
1454 +       (
1455 +        "testl %3,%3\n\t"
1456 +        "repe; cmpsd\n\t"
1457 +        "je        1f\n\t"
1458 +        "sbbl      %0,%0\n\t"
1459 +        "orl       $1,%0\n"
1460 +        "1:"
1461 +        : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1462 +        : "0" (0)
1463 +        : "cc");
1464 +
1465 +       return res;
1466 +}
1467 +
1468 +/*
1469 + * Check whether the page is all zeros
1470 + */
1471 +static int is_full_zero(const void *s1, size_t len)
1472 +{
1473 +       unsigned char same;
1474 +
1475 +       len /= 4;
1476 +
1477 +       __asm__ __volatile__
1478 +       ("repe; scasl;"
1479 +        "sete %0"
1480 +        : "=qm" (same), "+D" (s1), "+c" (len)
1481 +        : "a" (0)
1482 +        : "cc");
1483 +
1484 +       return same;
1485 +}
1486 +
1487 +
1488 +#elif defined(CONFIG_X86_64)
1489 +#define memcmp memcmpx86_64
1490 +/*
1491 + * Compare 8-byte-aligned addresses s1 and s2, with length n
1492 + */
1493 +int memcmpx86_64(void *s1, void *s2, size_t n)
1494 +{
1495 +       size_t num = n / 8;
1496 +       register int res;
1497 +
1498 +       __asm__ __volatile__
1499 +       (
1500 +        "testq %q3,%q3\n\t"
1501 +        "repe; cmpsq\n\t"
1502 +        "je        1f\n\t"
1503 +        "sbbq      %q0,%q0\n\t"
1504 +        "orq       $1,%q0\n"
1505 +        "1:"
1506 +        : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1507 +        : "0" (0)
1508 +        : "cc");
1509 +
1510 +       return res;
1511 +}
1512 +
1513 +static int is_full_zero(const void *s1, size_t len)
1514 +{
1515 +       unsigned char same;
1516 +
1517 +       len /= 8;
1518 +
1519 +       __asm__ __volatile__
1520 +       ("repe; scasq;"
1521 +        "sete %0"
1522 +        : "=qm" (same), "+D" (s1), "+c" (len)
1523 +        : "a" (0)
1524 +        : "cc");
1525 +
1526 +       return same;
1527 +}
1528 +
1529 +#endif
1530 +#else
1531 +static int is_full_zero(const void *s1, size_t len)
1532 +{
1533 +       unsigned long *src = s1;
1534 +       int i;
1535 +
1536 +       len /= sizeof(*src);
1537 +
1538 +       for (i = 0; i < len; i++) {
1539 +               if (src[i])
1540 +                       return 0;
1541 +       }
1542 +
1543 +       return 1;
1544 +}
1545 +#endif
1546 +
1547 +#define UKSM_RUNG_ROUND_FINISHED  (1 << 0)
1548 +#define TIME_RATIO_SCALE       10000
1549 +
1550 +#define SLOT_TREE_NODE_SHIFT   8
1551 +#define SLOT_TREE_NODE_STORE_SIZE      (1UL << SLOT_TREE_NODE_SHIFT)
1552 +struct slot_tree_node {
1553 +       unsigned long size;
1554 +       struct sradix_tree_node snode;
1555 +       void *stores[SLOT_TREE_NODE_STORE_SIZE];
1556 +};
1557 +
1558 +static struct kmem_cache *slot_tree_node_cachep;
1559 +
1560 +static struct sradix_tree_node *slot_tree_node_alloc(void)
1561 +{
1562 +       struct slot_tree_node *p;
1563 +       p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL);
1564 +       if (!p)
1565 +               return NULL;
1566 +
1567 +       return &p->snode;
1568 +}
1569 +
1570 +static void slot_tree_node_free(struct sradix_tree_node *node)
1571 +{
1572 +       struct slot_tree_node *p;
1573 +
1574 +       p = container_of(node, struct slot_tree_node, snode);
1575 +       kmem_cache_free(slot_tree_node_cachep, p);
1576 +}
1577 +
1578 +static void slot_tree_node_extend(struct sradix_tree_node *parent,
1579 +                                 struct sradix_tree_node *child)
1580 +{
1581 +       struct slot_tree_node *p, *c;
1582 +
1583 +       p = container_of(parent, struct slot_tree_node, snode);
1584 +       c = container_of(child, struct slot_tree_node, snode);
1585 +
1586 +       p->size += c->size;
1587 +}
1588 +
1589 +void slot_tree_node_assign(struct sradix_tree_node *node,
1590 +                          unsigned index, void *item)
1591 +{
1592 +       struct vma_slot *slot = item;
1593 +       struct slot_tree_node *cur;
1594 +
1595 +       slot->snode = node;
1596 +       slot->sindex = index;
1597 +
1598 +       while (node) {
1599 +               cur = container_of(node, struct slot_tree_node, snode);
1600 +               cur->size += slot->pages;
1601 +               node = node->parent;
1602 +       }
1603 +}
1604 +
1605 +void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset)
1606 +{
1607 +       struct vma_slot *slot;
1608 +       struct slot_tree_node *cur;
1609 +       unsigned long pages;
1610 +
1611 +       if (node->height == 1) {
1612 +               slot = node->stores[offset];
1613 +               pages = slot->pages;
1614 +       } else {
1615 +               cur = container_of(node->stores[offset],
1616 +                                  struct slot_tree_node, snode);
1617 +               pages = cur->size;
1618 +       }
1619 +
1620 +       while (node) {
1621 +               cur = container_of(node, struct slot_tree_node, snode);
1622 +               cur->size -= pages;
1623 +               node = node->parent;
1624 +       }
1625 +}
1626 +
1627 +unsigned long slot_iter_index;
1628 +int slot_iter(void *item,  unsigned long height)
1629 +{
1630 +       struct slot_tree_node *node;
1631 +       struct vma_slot *slot;
1632 +
1633 +       if (height == 1) {
1634 +               slot = item;
1635 +               if (slot_iter_index < slot->pages) {
1636 +                       /*in this one*/
1637 +                       return 1;
1638 +               } else {
1639 +                       slot_iter_index -= slot->pages;
1640 +                       return 0;
1641 +               }
1642 +
1643 +       } else {
1644 +               node = container_of(item, struct slot_tree_node, snode);
1645 +               if (slot_iter_index < node->size) {
1646 +                       /*in this one*/
1647 +                       return 1;
1648 +               } else {
1649 +                       slot_iter_index -= node->size;
1650 +                       return 0;
1651 +               }
1652 +       }
1653 +}
1654 +
1655 +
1656 +static inline void slot_tree_init_root(struct sradix_tree_root *root)
1657 +{
1658 +       init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
1659 +       root->alloc = slot_tree_node_alloc;
1660 +       root->free = slot_tree_node_free;
1661 +       root->extend = slot_tree_node_extend;
1662 +       root->assign = slot_tree_node_assign;
1663 +       root->rm = slot_tree_node_rm;
1664 +}
1665 +
1666 +void slot_tree_init(void)
1667 +{
1668 +       slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
1669 +                               sizeof(struct slot_tree_node), 0,
1670 +                               SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
1671 +                               NULL);
1672 +}
1673 +
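Editor's note: the sradix-tree callbacks above keep, in every slot_tree_node, the total number of pages stored beneath it, and slot_iter() consumes slot_iter_index level by level to find the slot owning a given page index. The following user-space sketch is not part of the patch (struct demo_slot and demo_pick() are hypothetical names); it only illustrates the same subtract-and-descend arithmetic on a flat array of slots.

	/* Illustration only: resolve a global page index to (slot, offset)
	 * by subtracting slot sizes, as slot_iter() does per tree level. */
	struct demo_slot { unsigned long pages; };

	static int demo_pick(struct demo_slot *slots, int nslots,
			     unsigned long index, unsigned long *offset)
	{
		int i;

		for (i = 0; i < nslots; i++) {
			if (index < slots[i].pages) {
				*offset = index;	/* page lives in slot i */
				return i;
			}
			index -= slots[i].pages;	/* skip this slot's pages */
		}
		return -1;				/* index out of range */
	}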
1674 +
1675 +/* Each rung of this ladder is a list of VMAs having the same scan ratio */
1676 +struct scan_rung {
1677 +       //struct list_head scanned_list;
1678 +       struct sradix_tree_root vma_root;
1679 +       struct sradix_tree_root vma_root2;
1680 +
1681 +       struct vma_slot *current_scan;
1682 +       unsigned long current_offset;
1683 +
1684 +       /*
1685 +        * The initial value for current_offset; it should loop over
1686 +        * [0 ~ step - 1] so that every slot gets its chance to be scanned.
1687 +        */
1688 +       unsigned long offset_init;
1689 +       unsigned long step; /* dynamic step for current_offset */
1690 +       unsigned int flags;
1691 +       unsigned long pages_to_scan;
1692 +       //unsigned long fully_scanned_slots;
1693 +       /*
1694 +        * A little bit tricky - if cpu_ratio > 0, then the value is the
1695 +        * cpu time ratio it can spend in rung_i for every scan
1696 +        * period. If < 0, then it is the cpu time ratio relative to the
1697 +        * max cpu percentage the user specified. Both are in units of
1698 +        * 1/TIME_RATIO_SCALE.
1699 +        */
1700 +       int cpu_ratio;
1701 +
1702 +       /*
1703 +        * How long it will take for all slots in this rung to be fully
1704 +        * scanned? If it's zero, we don't care about the cover time:
1705 +        * it's fully scanned.
1706 +        */
1707 +       unsigned int cover_msecs;
1708 +       //unsigned long vma_num;
1709 +       //unsigned long pages; /* Sum of all slot's pages in rung */
1710 +};
1711 +
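As a sketch only (the helper below is hypothetical and not part of the patch), the cpu_ratio encoding described in the struct comment could be turned into an absolute share of one CPU, in 1/TIME_RATIO_SCALE units, like this:

	/* Illustration: effective CPU share of a rung in 1/TIME_RATIO_SCALE units. */
	static unsigned int demo_rung_cpu_share(int cpu_ratio, unsigned int max_cpu_percent)
	{
		if (cpu_ratio >= 0)
			return cpu_ratio;	/* already an absolute ratio */
		/* negative: a fraction of the user-specified max percentage */
		return (unsigned int)(-cpu_ratio) * max_cpu_percent / 100;
	}

For example, with cpu_ratio == -2500 under a 95% cap (values that appear in the first preset further below), the result is 2375, i.e. roughly 23.75% of one CPU.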
1712 +/**
1713 + * node of either the stable or unstable rbtree
1714 + *
1715 + */
1716 +struct tree_node {
1717 +       struct rb_node node; /* link in the main (un)stable rbtree */
1718 +       struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
1719 +       u32 hash;
1720 +       unsigned long count; /* TODO: merged with sub_root */
1721 +       struct list_head all_list; /* all tree nodes in stable/unstable tree */
1722 +};
1723 +
1724 +/**
1725 + * struct stable_node - node of the stable rbtree
1726 + * @node: rb node of this ksm page in the stable tree
1727 + * @hlist: hlist head of rmap_items using this ksm page
1728 + * @kpfn: page frame number of this ksm page
1729 + */
1730 +struct stable_node {
1731 +       struct rb_node node; /* link in sub-rbtree */
1732 +       struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */
1733 +       struct hlist_head hlist;
1734 +       unsigned long kpfn;
1735 +       u32 hash_max; /* if ==0 then it's not been calculated yet */
1736 +       struct list_head all_list; /* in a list for all stable nodes */
1737 +};
1738 +
1739 +/**
1740 + * struct node_vma - group rmap_items linked in a same stable
1741 + * node together.
1742 + */
1743 +struct node_vma {
1744 +       union {
1745 +               struct vma_slot *slot;
1746 +               unsigned long key;  /* slot is used as key sorted on hlist */
1747 +       };
1748 +       struct hlist_node hlist;
1749 +       struct hlist_head rmap_hlist;
1750 +       struct stable_node *head;
1751 +};
1752 +
1753 +/**
1754 + * struct rmap_item - reverse mapping item for virtual addresses
1755 + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
1756 + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
1757 + * @mm: the memory structure this rmap_item is pointing into
1758 + * @address: the virtual address this rmap_item tracks (+ flags in low bits)
1759 + * @node: rb node of this rmap_item in the unstable tree
1760 + * @head: pointer to stable_node heading this list in the stable tree
1761 + * @hlist: link into hlist of rmap_items hanging off that stable_node
1762 + */
1763 +struct rmap_item {
1764 +       struct vma_slot *slot;
1765 +       struct page *page;
1766 +       unsigned long address;  /* + low bits used for flags below */
1767 +       unsigned long hash_round;
1768 +       unsigned long entry_index;
1769 +       union {
1770 +               struct {/* when in unstable tree */
1771 +                       struct rb_node node;
1772 +                       struct tree_node *tree_node;
1773 +                       u32 hash_max;
1774 +               };
1775 +               struct { /* when in stable tree */
1776 +                       struct node_vma *head;
1777 +                       struct hlist_node hlist;
1778 +                       struct anon_vma *anon_vma;
1779 +               };
1780 +       };
1781 +} __attribute__((aligned(4)));
1782 +
1783 +struct rmap_list_entry {
1784 +       union {
1785 +               struct rmap_item *item;
1786 +               unsigned long addr;
1787 +       };
1788 +       /* lowest bit is used for is_addr tag */
1789 +} __attribute__((aligned(4))); /* 4-byte aligned to fit into pages */
1790 +
1791 +
1792 +/* Basic data structure definition ends */
1793 +
1794 +
1795 +/*
1796 + * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
1797 + * The flags use the low bits of rmap_item.address
1798 + */
1799 +#define UNSTABLE_FLAG  0x1
1800 +#define STABLE_FLAG    0x2
1801 +#define get_rmap_addr(x)       ((x)->address & PAGE_MASK)
1802 +
1803 +/*
1804 + * rmap_list_entry helpers
1805 + */
1806 +#define IS_ADDR_FLAG   1
1807 +#define is_addr(ptr)           ((unsigned long)(ptr) & IS_ADDR_FLAG)
1808 +#define set_is_addr(ptr)       ((ptr) |= IS_ADDR_FLAG)
1809 +#define get_clean_addr(ptr)    (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
1810 +
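A tiny sketch of the low-bit tagging used by these helpers (illustration only; the demo_* names are not part of the patch): a page-aligned address always has bit 0 clear, so that bit can mark "this word stores an address, not an rmap_item pointer".

	/* Illustration: tag and untag an address stored in a rmap_list_entry word. */
	static inline unsigned long demo_tag_addr(unsigned long addr)
	{
		set_is_addr(addr);			/* sets IS_ADDR_FLAG (bit 0) */
		return addr;
	}

	static inline unsigned long demo_untag_addr(unsigned long tagged)
	{
		return is_addr(tagged) ? get_clean_addr(tagged) : tagged;
	}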
1811 +
1812 +/*
1813 + * High speed caches for frequently allocated and freed structs
1814 + */
1815 +static struct kmem_cache *rmap_item_cache;
1816 +static struct kmem_cache *stable_node_cache;
1817 +static struct kmem_cache *node_vma_cache;
1818 +static struct kmem_cache *vma_slot_cache;
1819 +static struct kmem_cache *tree_node_cache;
1820 +#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
1821 +               sizeof(struct __struct), __alignof__(struct __struct),\
1822 +               (__flags), NULL)
1823 +
1824 +/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
1825 +#define SCAN_LADDER_SIZE 4
1826 +static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
1827 +
1828 +/* The evaluation rounds uksmd has finished */
1829 +static unsigned long long uksm_eval_round = 1;
1830 +
1831 +/*
1832 + * We add 1 to this variable when we decide that the whole
1833 + * unstable tree should be rebuilt.
1834 + */
1835 +static unsigned long uksm_hash_round = 1;
1836 +
1837 +/*
1838 + * How many times the whole memory is scanned.
1839 + */
1840 +static unsigned long long fully_scanned_round = 1;
1841 +
1842 +/* The total number of virtual pages of all vma slots */
1843 +static u64 uksm_pages_total;
1844 +
1845 +/* The number of pages that have been scanned since startup */
1846 +static u64 uksm_pages_scanned;
1847 +
1848 +static u64 scanned_virtual_pages;
1849 +
1850 +/* The number of pages that have been scanned since the last encode_benefit call */
1851 +static u64 uksm_pages_scanned_last;
1852 +
1853 +/* If the scanned number is too large, we encode it here */
1854 +static u64 pages_scanned_stored;
1855 +
1856 +static unsigned long pages_scanned_base;
1857 +
1858 +/* The number of nodes in the stable tree */
1859 +static unsigned long uksm_pages_shared;
1860 +
1861 +/* The number of page slots additionally sharing those nodes */
1862 +static unsigned long uksm_pages_sharing;
1863 +
1864 +/* The number of nodes in the unstable tree */
1865 +static unsigned long uksm_pages_unshared;
1866 +
1867 +/*
1868 + * Milliseconds ksmd should sleep between scans,
1869 + * >= 100ms to be consistent with
1870 + * scan_time_to_sleep_msec()
1871 + */
1872 +static unsigned int uksm_sleep_jiffies;
1873 +
1874 +/* The real value for the uksmd next sleep */
1875 +static unsigned int uksm_sleep_real;
1876 +
1877 +/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
1878 +static unsigned int uksm_sleep_saved;
1879 +
1880 +/* Max percentage of cpu utilization ksmd can take to scan in one batch */
1881 +static unsigned int uksm_max_cpu_percentage;
1882 +
1883 +static int uksm_cpu_governor;
1884 +
1885 +static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
1886 +
1887 +struct uksm_cpu_preset_s {
1888 +       int cpu_ratio[SCAN_LADDER_SIZE];
1889 +       unsigned int cover_msecs[SCAN_LADDER_SIZE];
1890 +       unsigned int max_cpu; /* percentage */
1891 +};
1892 +
1893 +struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
1894 +       { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
1895 +       { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
1896 +       { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
1897 +       { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
1898 +};
1899 +
1900 +/* The default value for uksm_ema_page_time if it's not initialized */
1901 +#define UKSM_PAGE_TIME_DEFAULT 500
1902 +
1903 +/* Cost to scan one page, as an exponential moving average, in nsecs */
1904 +static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
1905 +
1906 +/* The exponential moving average alpha weight, in percentage. */
1907 +#define EMA_ALPHA      20
1908 +
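As a sketch only (assuming the conventional EMA update formula; the helper name is hypothetical and not part of the patch), folding a new per-page scan-time sample into uksm_ema_page_time with EMA_ALPHA percent weight would look like:

	/* Illustration: exponential moving average with EMA_ALPHA percent weight. */
	static inline unsigned long demo_ema_update(unsigned long old_ema_ns,
						    unsigned long sample_ns)
	{
		return (EMA_ALPHA * sample_ns + (100 - EMA_ALPHA) * old_ema_ns) / 100;
	}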
1909 +/*
1910 + * The threshold used to filter out thrashing areas.
1911 + * If it == 0, filtering is disabled; otherwise it is the percentage upper bound
1912 + * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
1913 + * will be considered to have a zero duplication ratio.
1914 + */
1915 +static unsigned int uksm_thrash_threshold = 50;
1916 +
1917 +/* How much dedup ratio is considered to be abundant */
1918 +static unsigned int uksm_abundant_threshold = 10;
1919 +
1920 +/* All slots having merged pages in this eval round. */
1921 +struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
1922 +
1923 +/* How many times the ksmd has slept since startup */
1924 +static unsigned long long uksm_sleep_times;
1925 +
1926 +#define UKSM_RUN_STOP  0
1927 +#define UKSM_RUN_MERGE 1
1928 +static unsigned int uksm_run = 1;
1929 +
1930 +static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
1931 +static DEFINE_MUTEX(uksm_thread_mutex);
1932 +
1933 +/*
1934 + * List vma_slot_new is for newly created vma_slot waiting to be added by
1935 + * ksmd. If one cannot be added (e.g. because it is too small), it is moved to
1936 + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
1937 + * VMA has been removed/freed.
1938 + */
1939 +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
1940 +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
1941 +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
1942 +static DEFINE_SPINLOCK(vma_slot_list_lock);
1943 +
1944 +/* The unstable tree heads */
1945 +static struct rb_root root_unstable_tree = RB_ROOT;
1946 +
1947 +/*
1948 + * All tree_nodes are in a list to be freed at once when unstable tree is
1949 + * freed after each scan round.
1950 + */
1951 +static struct list_head unstable_tree_node_list =
1952 +                               LIST_HEAD_INIT(unstable_tree_node_list);
1953 +
1954 +/* List contains all stable nodes */
1955 +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
1956 +
1957 +/*
1958 + * When the hash strength is changed, the stable tree must be delta_hashed and
1959 + * re-structured. We use the two sets of structs below to speed up the
1960 + * re-structuring of the stable tree.
1961 + */
1962 +static struct list_head
1963 +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
1964 +                           LIST_HEAD_INIT(stable_tree_node_list[1])};
1965 +
1966 +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
1967 +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
1968 +static struct rb_root *root_stable_treep = &root_stable_tree[0];
1969 +static unsigned long stable_tree_index;
1970 +
1971 +/* The hash strength needed to hash a full page */
1972 +#define HASH_STRENGTH_FULL             (PAGE_SIZE / sizeof(u32))
1973 +
1974 +/* The hash strength needed for loop-back hashing */
1975 +#define HASH_STRENGTH_MAX              (HASH_STRENGTH_FULL + 10)
1976 +
1977 +/* The random offsets in a page */
1978 +static u32 *random_nums;
1979 +
1980 +/* The hash strength */
1981 +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
1982 +
1983 +/* The delta value each time the hash strength increases or decreases */
1984 +static unsigned long hash_strength_delta;
1985 +#define HASH_STRENGTH_DELTA_MAX        5
1986 +
1987 +/* The time we have saved due to random_sample_hash */
1988 +static u64 rshash_pos;
1989 +
1990 +/* The time we have wasted due to hash collision */
1991 +static u64 rshash_neg;
1992 +
1993 +struct uksm_benefit {
1994 +       u64 pos;
1995 +       u64 neg;
1996 +       u64 scanned;
1997 +       unsigned long base;
1998 +} benefit;
1999 +
2000 +/*
2001 + * The relative cost of memcmp, compared to 1 time unit of random sample
2002 + * hash, this value is tested when ksm module is initialized
2003 + */
2004 +static unsigned long memcmp_cost;
2005 +
2006 +static unsigned long  rshash_neg_cont_zero;
2007 +static unsigned long  rshash_cont_obscure;
2008 +
2009 +/* The possible states of hash strength adjustment heuristic */
2010 +enum rshash_states {
2011 +               RSHASH_STILL,
2012 +               RSHASH_TRYUP,
2013 +               RSHASH_TRYDOWN,
2014 +               RSHASH_NEW,
2015 +               RSHASH_PRE_STILL,
2016 +};
2017 +
2018 +/* The possible direction we are about to adjust hash strength */
2019 +enum rshash_direct {
2020 +       GO_UP,
2021 +       GO_DOWN,
2022 +       OBSCURE,
2023 +       STILL,
2024 +};
2025 +
2026 +/* random sampling hash state machine */
2027 +static struct {
2028 +       enum rshash_states state;
2029 +       enum rshash_direct pre_direct;
2030 +       u8 below_count;
2031 +       /* Keep a lookup window of size 5; if above_count/below_count > 3
2032 +        * in this window, we stop trying.
2033 +        */
2034 +       u8 lookup_window_index;
2035 +       u64 stable_benefit;
2036 +       unsigned long turn_point_down;
2037 +       unsigned long turn_benefit_down;
2038 +       unsigned long turn_point_up;
2039 +       unsigned long turn_benefit_up;
2040 +       unsigned long stable_point;
2041 +} rshash_state;
2042 +
2043 +/* zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX] */
2044 +static u32 *zero_hash_table;
2045 +
2046 +static inline struct node_vma *alloc_node_vma(void)
2047 +{
2048 +       struct node_vma *node_vma;
2049 +       node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
2050 +       if (node_vma) {
2051 +               INIT_HLIST_HEAD(&node_vma->rmap_hlist);
2052 +               INIT_HLIST_NODE(&node_vma->hlist);
2053 +       }
2054 +       return node_vma;
2055 +}
2056 +
2057 +static inline void free_node_vma(struct node_vma *node_vma)
2058 +{
2059 +       kmem_cache_free(node_vma_cache, node_vma);
2060 +}
2061 +
2062 +
2063 +static inline struct vma_slot *alloc_vma_slot(void)
2064 +{
2065 +       struct vma_slot *slot;
2066 +
2067 +       /*
2068 +        * In case uksm has not been initialized yet.
2069 +        * We may need to reconsider the call site of uksm_init() in the future.
2070 +        */
2071 +       if (!vma_slot_cache)
2072 +               return NULL;
2073 +
2074 +       slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
2075 +       if (slot) {
2076 +               INIT_LIST_HEAD(&slot->slot_list);
2077 +               INIT_LIST_HEAD(&slot->dedup_list);
2078 +               slot->flags |= UKSM_SLOT_NEED_RERAND;
2079 +       }
2080 +       return slot;
2081 +}
2082 +
2083 +static inline void free_vma_slot(struct vma_slot *vma_slot)
2084 +{
2085 +       kmem_cache_free(vma_slot_cache, vma_slot);
2086 +}
2087 +
2088 +
2089 +
2090 +static inline struct rmap_item *alloc_rmap_item(void)
2091 +{
2092 +       struct rmap_item *rmap_item;
2093 +
2094 +       rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
2095 +       if (rmap_item) {
2096 +               /* BUG if the lowest bit is not clear; it is reserved for flag use */
2097 +               BUG_ON(is_addr(rmap_item));
2098 +       }
2099 +       return rmap_item;
2100 +}
2101 +
2102 +static inline void free_rmap_item(struct rmap_item *rmap_item)
2103 +{
2104 +       rmap_item->slot = NULL; /* debug safety */
2105 +       kmem_cache_free(rmap_item_cache, rmap_item);
2106 +}
2107 +
2108 +static inline struct stable_node *alloc_stable_node(void)
2109 +{
2110 +       struct stable_node *node;
2111 +       node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
2112 +       if (!node)
2113 +               return NULL;
2114 +
2115 +       INIT_HLIST_HEAD(&node->hlist);
2116 +       list_add(&node->all_list, &stable_node_list);
2117 +       return node;
2118 +}
2119 +
2120 +static inline void free_stable_node(struct stable_node *stable_node)
2121 +{
2122 +       list_del(&stable_node->all_list);
2123 +       kmem_cache_free(stable_node_cache, stable_node);
2124 +}
2125 +
2126 +static inline struct tree_node *alloc_tree_node(struct list_head *list)
2127 +{
2128 +       struct tree_node *node;
2129 +       node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
2130 +       if (!node)
2131 +               return NULL;
2132 +
2133 +       list_add(&node->all_list, list);
2134 +       return node;
2135 +}
2136 +
2137 +static inline void free_tree_node(struct tree_node *node)
2138 +{
2139 +       list_del(&node->all_list);
2140 +       kmem_cache_free(tree_node_cache, node);
2141 +}
2142 +
2143 +static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
2144 +{
2145 +       struct anon_vma *anon_vma = rmap_item->anon_vma;
2146 +
2147 +       put_anon_vma(anon_vma);
2148 +}
2149 +
2150 +
2151 +/**
2152 + * Remove a stable node from stable_tree, may unlink from its tree_node and
2153 + * may remove its parent tree_node if no other stable node is pending.
2154 + *
2155 + * @stable_node        The node to be removed
2156 + * @unlink_rb          Will this node be unlinked from the rbtree?
2157 + * @remove_tree_node   Will its tree_node be removed if empty?
2158 + */
2159 +static void remove_node_from_stable_tree(struct stable_node *stable_node,
2160 +                                        int unlink_rb,  int remove_tree_node)
2161 +{
2162 +       struct node_vma *node_vma;
2163 +       struct rmap_item *rmap_item;
2164 +       struct hlist_node *n;
2165 +
2166 +       if (!hlist_empty(&stable_node->hlist)) {
2167 +               hlist_for_each_entry_safe(node_vma, n,
2168 +                                         &stable_node->hlist, hlist) {
2169 +                       hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
2170 +                               uksm_pages_sharing--;
2171 +
2172 +                               uksm_drop_anon_vma(rmap_item);
2173 +                               rmap_item->address &= PAGE_MASK;
2174 +                       }
2175 +                       free_node_vma(node_vma);
2176 +                       cond_resched();
2177 +               }
2178 +
2179 +               /* the last one is counted as shared */
2180 +               uksm_pages_shared--;
2181 +               uksm_pages_sharing++;
2182 +       }
2183 +
2184 +       if (stable_node->tree_node && unlink_rb) {
2185 +               rb_erase(&stable_node->node,
2186 +                        &stable_node->tree_node->sub_root);
2187 +
2188 +               if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
2189 +                   remove_tree_node) {
2190 +                       rb_erase(&stable_node->tree_node->node,
2191 +                                root_stable_treep);
2192 +                       free_tree_node(stable_node->tree_node);
2193 +               } else {
2194 +                       stable_node->tree_node->count--;
2195 +               }
2196 +       }
2197 +
2198 +       free_stable_node(stable_node);
2199 +}
2200 +
2201 +
2202 +/*
2203 + * get_uksm_page: checks if the page indicated by the stable node
2204 + * is still its ksm page, despite having held no reference to it.
2205 + * In which case we can trust the content of the page, and it
2206 + * returns the gotten page; but if the page has now been zapped,
2207 + * remove the stale node from the stable tree and return NULL.
2208 + *
2209 + * You would expect the stable_node to hold a reference to the ksm page.
2210 + * But if it increments the page's count, swapping out has to wait for
2211 + * ksmd to come around again before it can free the page, which may take
2212 + * seconds or even minutes: much too unresponsive.  So instead we use a
2213 + * "keyhole reference": access to the ksm page from the stable node peeps
2214 + * out through its keyhole to see if that page still holds the right key,
2215 + * pointing back to this stable node.  This relies on freeing a PageAnon
2216 + * page to reset its page->mapping to NULL, and relies on no other use of
2217 + * a page to put something that might look like our key in page->mapping.
2218 + *
2219 + * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
2220 + * but this is different - made simpler by uksm_thread_mutex being held, but
2221 + * interesting for assuming that no other use of the struct page could ever
2222 + * put our expected_mapping into page->mapping (or a field of the union which
2223 + * coincides with page->mapping).  The RCU calls are not for KSM at all, but
2224 + * to keep the page_count protocol described with page_cache_get_speculative.
2225 + *
2226 + * Note: it is possible that get_uksm_page() will return NULL one moment,
2227 + * then page the next, if the page is in between page_freeze_refs() and
2228 + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
2229 + * is on its way to being freed; but it is an anomaly to bear in mind.
2230 + *
2231 + * @unlink_rb:                 if the removal of this node will firstly unlink from
2232 + * its rbtree. stable_node_reinsert will prevent this when restructuring the
2233 + * node from its old tree.
2234 + *
2235 + * @remove_tree_node:  if this is the last one of its tree_node, will the
2236 + * tree_node be freed ? If we are inserting stable node, this tree_node may
2237 + * be reused, so don't free it.
2238 + */
2239 +static struct page *get_uksm_page(struct stable_node *stable_node,
2240 +                                int unlink_rb, int remove_tree_node)
2241 +{
2242 +       struct page *page;
2243 +       void *expected_mapping;
2244 +
2245 +       page = pfn_to_page(stable_node->kpfn);
2246 +       expected_mapping = (void *)stable_node +
2247 +                               (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
2248 +       rcu_read_lock();
2249 +       if (page->mapping != expected_mapping)
2250 +               goto stale;
2251 +       if (!get_page_unless_zero(page))
2252 +               goto stale;
2253 +       if (page->mapping != expected_mapping) {
2254 +               put_page(page);
2255 +               goto stale;
2256 +       }
2257 +       rcu_read_unlock();
2258 +       return page;
2259 +stale:
2260 +       rcu_read_unlock();
2261 +       remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
2262 +
2263 +       return NULL;
2264 +}
2265 +
2266 +/*
2267 + * Removing rmap_item from stable or unstable tree.
2268 + * This function will clean the information from the stable/unstable tree.
2269 + */
2270 +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
2271 +{
2272 +       if (rmap_item->address & STABLE_FLAG) {
2273 +               struct stable_node *stable_node;
2274 +               struct node_vma *node_vma;
2275 +               struct page *page;
2276 +
2277 +               node_vma = rmap_item->head;
2278 +               stable_node = node_vma->head;
2279 +               page = get_uksm_page(stable_node, 1, 1);
2280 +               if (!page)
2281 +                       goto out;
2282 +
2283 +               /*
2284 +                * page lock is needed because it's racing with
2285 +                * try_to_unmap_ksm(), etc.
2286 +                */
2287 +               lock_page(page);
2288 +               hlist_del(&rmap_item->hlist);
2289 +
2290 +               if (hlist_empty(&node_vma->rmap_hlist)) {
2291 +                       hlist_del(&node_vma->hlist);
2292 +                       free_node_vma(node_vma);
2293 +               }
2294 +               unlock_page(page);
2295 +
2296 +               put_page(page);
2297 +               if (hlist_empty(&stable_node->hlist)) {
2298 +                       /* do NOT call remove_node_from_stable_tree() here,
2299 +                        * it's possible for a forked rmap_item to not be in
2300 +                        * the stable tree while the in-tree rmap_items were
2301 +                        * deleted.
2302 +                        */
2303 +                       uksm_pages_shared--;
2304 +               } else
2305 +                       uksm_pages_sharing--;
2306 +
2307 +
2308 +               uksm_drop_anon_vma(rmap_item);
2309 +       } else if (rmap_item->address & UNSTABLE_FLAG) {
2310 +               if (rmap_item->hash_round == uksm_hash_round) {
2311 +
2312 +                       rb_erase(&rmap_item->node,
2313 +                                &rmap_item->tree_node->sub_root);
2314 +                       if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
2315 +                               rb_erase(&rmap_item->tree_node->node,
2316 +                                        &root_unstable_tree);
2317 +
2318 +                               free_tree_node(rmap_item->tree_node);
2319 +                       } else
2320 +                               rmap_item->tree_node->count--;
2321 +               }
2322 +               uksm_pages_unshared--;
2323 +       }
2324 +
2325 +       rmap_item->address &= PAGE_MASK;
2326 +       rmap_item->hash_max = 0;
2327 +
2328 +out:
2329 +       cond_resched();         /* we're called from many long loops */
2330 +}
2331 +
2332 +static inline int slot_in_uksm(struct vma_slot *slot)
2333 +{
2334 +       return list_empty(&slot->slot_list);
2335 +}
2336 +
2337 +/*
2338 + * Test if the mm is exiting
2339 + */
2340 +static inline bool uksm_test_exit(struct mm_struct *mm)
2341 +{
2342 +       return atomic_read(&mm->mm_users) == 0;
2343 +}
2344 +
2345 +/**
2346 + * Need to do two things:
2347 + * 1. check if slot was moved to del list
2348 + * 2. make sure the mmap_sem is manipulated under valid vma.
2349 + *
2350 + * My concern here is that in some cases, this may make
2351 + * vma_slot_list_lock() waiters to serialized further by some
2352 + * sem->wait_lock, can this really be expensive?
2353 + *
2354 + *
2355 + * @return
2356 + * 0: if successfully locked mmap_sem
2357 + * -ENOENT: this slot was moved to del list
2358 + * -EBUSY: vma lock failed
2359 + */
2360 +static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
2361 +{
2362 +       struct vm_area_struct *vma;
2363 +       struct mm_struct *mm;
2364 +       struct rw_semaphore *sem;
2365 +
2366 +       spin_lock(&vma_slot_list_lock);
2367 +
2368 +       /* The slot_list was removed and re-initialized from the new list when it
2369 +        * entered uksm. If it is not empty now, then it must have been moved to the del list.
2370 +        */
2371 +       if (!slot_in_uksm(slot)) {
2372 +               spin_unlock(&vma_slot_list_lock);
2373 +               return -ENOENT;
2374 +       }
2375 +
2376 +       BUG_ON(slot->pages != vma_pages(slot->vma));
2377 +       /* Ok, vma still valid */
2378 +       vma = slot->vma;
2379 +       mm = vma->vm_mm;
2380 +       sem = &mm->mmap_sem;
2381 +
2382 +       if (uksm_test_exit(mm)) {
2383 +               spin_unlock(&vma_slot_list_lock);
2384 +               return -ENOENT;
2385 +       }
2386 +
2387 +       if (down_read_trylock(sem)) {
2388 +               spin_unlock(&vma_slot_list_lock);
2389 +               return 0;
2390 +       }
2391 +
2392 +       spin_unlock(&vma_slot_list_lock);
2393 +       return -EBUSY;
2394 +}
2395 +
2396 +static inline unsigned long
2397 +vma_page_address(struct page *page, struct vm_area_struct *vma)
2398 +{
2399 +       pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
2400 +       unsigned long address;
2401 +
2402 +       address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
2403 +       if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
2404 +               /* page should be within @vma mapping range */
2405 +               return -EFAULT;
2406 +       }
2407 +       return address;
2408 +}
2409 +
2410 +
2411 +/* return 0 on success with the item's mmap_sem locked */
2412 +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
2413 +{
2414 +       struct mm_struct *mm;
2415 +       struct vma_slot *slot = item->slot;
2416 +       int err = -EINVAL;
2417 +
2418 +       struct page *page;
2419 +
2420 +       /*
2421 +        * try_down_read_slot_mmap_sem() returns non-zero if the slot
2422 +        * has been removed by uksm_remove_vma().
2423 +        */
2424 +       if (try_down_read_slot_mmap_sem(slot))
2425 +               return -EBUSY;
2426 +
2427 +       mm = slot->vma->vm_mm;
2428 +
2429 +       if (uksm_test_exit(mm))
2430 +               goto failout_up;
2431 +
2432 +       page = item->page;
2433 +       rcu_read_lock();
2434 +       if (!get_page_unless_zero(page)) {
2435 +               rcu_read_unlock();
2436 +               goto failout_up;
2437 +       }
2438 +
2439 +       /* No need to consider huge page here. */
2440 +       if (item->slot->vma->anon_vma != page_anon_vma(page) ||
2441 +           vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
2442 +               /*
2443 +                * TODO:
2444 +                * should we release this item because of its stale page
2445 +                * mapping?
2446 +                */
2447 +               put_page(page);
2448 +               rcu_read_unlock();
2449 +               goto failout_up;
2450 +       }
2451 +       rcu_read_unlock();
2452 +       return 0;
2453 +
2454 +failout_up:
2455 +       up_read(&mm->mmap_sem);
2456 +       return err;
2457 +}
2458 +
2459 +/*
2460 + * What kind of VMA is considered ?
2461 + */
2462 +static inline int vma_can_enter(struct vm_area_struct *vma)
2463 +{
2464 +       return uksm_flags_can_scan(vma->vm_flags);
2465 +}
2466 +
2467 +/*
2468 + * Called whenever a fresh new vma is created. A new vma_slot
2469 + * is created and inserted into a global list. Must be called
2470 + * after the vma is inserted into its mm.
2471 + */
2472 +void uksm_vma_add_new(struct vm_area_struct *vma)
2473 +{
2474 +       struct vma_slot *slot;
2475 +
2476 +       if (!vma_can_enter(vma)) {
2477 +               vma->uksm_vma_slot = NULL;
2478 +               return;
2479 +       }
2480 +
2481 +       slot = alloc_vma_slot();
2482 +       if (!slot) {
2483 +               vma->uksm_vma_slot = NULL;
2484 +               return;
2485 +       }
2486 +
2487 +       vma->uksm_vma_slot = slot;
2488 +       vma->vm_flags |= VM_MERGEABLE;
2489 +       slot->vma = vma;
2490 +       slot->mm = vma->vm_mm;
2491 +       slot->ctime_j = jiffies;
2492 +       slot->pages = vma_pages(vma);
2493 +       spin_lock(&vma_slot_list_lock);
2494 +       list_add_tail(&slot->slot_list, &vma_slot_new);
2495 +       spin_unlock(&vma_slot_list_lock);
2496 +}
2497 +
2498 +/*
2499 + * Called after vma is unlinked from its mm
2500 + */
2501 +void uksm_remove_vma(struct vm_area_struct *vma)
2502 +{
2503 +       struct vma_slot *slot;
2504 +
2505 +       if (!vma->uksm_vma_slot)
2506 +               return;
2507 +
2508 +       slot = vma->uksm_vma_slot;
2509 +       spin_lock(&vma_slot_list_lock);
2510 +       if (slot_in_uksm(slot)) {
2511 +               /**
2512 +                * This slot has been added by ksmd, so move to the del list
2513 +                * waiting for ksmd to free it.
2514 +                */
2515 +               list_add_tail(&slot->slot_list, &vma_slot_del);
2516 +       } else {
2517 +               /**
2518 +                * It's still on new list. It's ok to free slot directly.
2519 +                */
2520 +               list_del(&slot->slot_list);
2521 +               free_vma_slot(slot);
2522 +       }
2523 +       spin_unlock(&vma_slot_list_lock);
2524 +       vma->uksm_vma_slot = NULL;
2525 +}
2526 +
2527 +/*   32/3 < they < 32/2 */
2528 +#define shiftl 8
2529 +#define shiftr 12
2530 +
2531 +#define HASH_FROM_TO(from, to)                                 \
2532 +for (index = from; index < to; index++) {              \
2533 +       pos = random_nums[index];                       \
2534 +       hash += key[pos];                               \
2535 +       hash += (hash << shiftl);                       \
2536 +       hash ^= (hash >> shiftr);                       \
2537 +}
2538 +
2539 +
2540 +#define HASH_FROM_DOWN_TO(from, to)                    \
2541 +for (index = from - 1; index >= to; index--) {         \
2542 +       hash ^= (hash >> shiftr);                       \
2543 +       hash ^= (hash >> (shiftr*2));                   \
2544 +       hash -= (hash << shiftl);                       \
2545 +       hash += (hash << (shiftl*2));                   \
2546 +       pos = random_nums[index];                       \
2547 +       hash -= key[pos];                               \
2548 +}
2549 +
2550 +/*
2551 + * The main random sample hash function.
2552 + */
2553 +static u32 random_sample_hash(void *addr, u32 hash_strength)
2554 +{
2555 +       u32 hash = 0xdeadbeef;
2556 +       int index, pos, loop = hash_strength;
2557 +       u32 *key = (u32 *)addr;
2558 +
2559 +       if (loop > HASH_STRENGTH_FULL)
2560 +               loop = HASH_STRENGTH_FULL;
2561 +
2562 +       HASH_FROM_TO(0, loop);
2563 +
2564 +       if (hash_strength > HASH_STRENGTH_FULL) {
2565 +               loop = hash_strength - HASH_STRENGTH_FULL;
2566 +               HASH_FROM_TO(0, loop);
2567 +       }
2568 +
2569 +       return hash;
2570 +}
2571 +
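The function above only hashes hash_strength randomly chosen 32-bit words of the page instead of the whole page. A minimal user-space sketch of the same idea follows (not part of the patch; names are hypothetical, and it assumes offsets have been precomputed like random_nums[] and that the buffer holds PAGE_SIZE / 4 words):

	/* Illustration: hash 'strength' sampled words with the same mixing steps. */
	static unsigned int demo_sample_hash(const unsigned int *page_words,
					     const unsigned int *offsets,
					     unsigned int strength)
	{
		unsigned int hash = 0xdeadbeef;
		unsigned int i;

		for (i = 0; i < strength; i++) {
			hash += page_words[offsets[i]];	/* sample one random word */
			hash += hash << 8;		/* shiftl mixing step */
			hash ^= hash >> 12;		/* shiftr mixing step */
		}
		return hash;
	}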
2572 +
2573 +/**
2574 + * It's used when hash strength is adjusted
2575 + *
2576 + * @addr The page's virtual address
2577 + * @from The original hash strength
2578 + * @to   The hash strength changed to
2579 + * @hash The hash value generated with "from" hash value
2580 + *
2581 + * return the hash value
2582 + */
2583 +static u32 delta_hash(void *addr, int from, int to, u32 hash)
2584 +{
2585 +       u32 *key = (u32 *)addr;
2586 +       int index, pos; /* make sure they are int type */
2587 +
2588 +       if (to > from) {
2589 +               if (from >= HASH_STRENGTH_FULL) {
2590 +                       from -= HASH_STRENGTH_FULL;
2591 +                       to -= HASH_STRENGTH_FULL;
2592 +                       HASH_FROM_TO(from, to);
2593 +               } else if (to <= HASH_STRENGTH_FULL) {
2594 +                       HASH_FROM_TO(from, to);
2595 +               } else {
2596 +                       HASH_FROM_TO(from, HASH_STRENGTH_FULL);
2597 +                       HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
2598 +               }
2599 +       } else {
2600 +               if (from <= HASH_STRENGTH_FULL) {
2601 +                       HASH_FROM_DOWN_TO(from, to);
2602 +               } else if (to >= HASH_STRENGTH_FULL) {
2603 +                       from -= HASH_STRENGTH_FULL;
2604 +                       to -= HASH_STRENGTH_FULL;
2605 +                       HASH_FROM_DOWN_TO(from, to);
2606 +               } else {
2607 +                       HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
2608 +                       HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
2609 +               }
2610 +       }
2611 +
2612 +       return hash;
2613 +}
2614 +
2615 +
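A sketch of the property delta_hash() relies on (illustrative strengths 64 and 128, both within HASH_STRENGTH_FULL; the demo_* helper is not part of the patch): the incremental update simply continues the same fold that random_sample_hash() performs from scratch.

	/* Illustration only: growing the strength incrementally should match a
	 * full re-hash at the new strength (for strengths <= HASH_STRENGTH_FULL). */
	static int demo_delta_hash_matches(void *addr)
	{
		u32 h_64  = random_sample_hash(addr, 64);
		u32 h_128 = delta_hash(addr, 64, 128, h_64);

		return h_128 == random_sample_hash(addr, 128);
	}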
2616 +
2617 +
2618 +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
2619 +
2620 +/**
2621 + *
2622 + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
2623 + * has finished.
2624 + *
2625 + * return 0 if no page has been scanned since last call, 1 otherwise.
2626 + */
2627 +static inline int encode_benefit(void)
2628 +{
2629 +       u64 scanned_delta, pos_delta, neg_delta;
2630 +       unsigned long base = benefit.base;
2631 +
2632 +       scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
2633 +
2634 +       if (!scanned_delta)
2635 +               return 0;
2636 +
2637 +       scanned_delta >>= base;
2638 +       pos_delta = rshash_pos >> base;
2639 +       neg_delta = rshash_neg >> base;
2640 +
2641 +       if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
2642 +           CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
2643 +           CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
2644 +               benefit.scanned >>= 1;
2645 +               benefit.neg >>= 1;
2646 +               benefit.pos >>= 1;
2647 +               benefit.base++;
2648 +               scanned_delta >>= 1;
2649 +               pos_delta >>= 1;
2650 +               neg_delta >>= 1;
2651 +       }
2652 +
2653 +       benefit.pos += pos_delta;
2654 +       benefit.neg += neg_delta;
2655 +       benefit.scanned += scanned_delta;
2656 +
2657 +       BUG_ON(!benefit.scanned);
2658 +
2659 +       rshash_pos = rshash_neg = 0;
2660 +       uksm_pages_scanned_last = uksm_pages_scanned;
2661 +
2662 +       return 1;
2663 +}
2664 +
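Because pos, neg and scanned are always rescaled together, benefit.base acts as a common scale factor: ratios between the three counters can be computed on the encoded values directly, and an absolute count can be approximated by shifting back. A tiny sketch (hypothetical helper, not part of the patch):

	/* Illustration: decode an encoded benefit counter to its approximate
	 * original magnitude; all three counters share the same 'base'. */
	static inline u64 demo_benefit_abs(u64 encoded, unsigned long base)
	{
		return encoded << base;
	}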
2665 +static inline void reset_benefit(void)
2666 +{
2667 +       benefit.pos = 0;
2668 +       benefit.neg = 0;
2669 +       benefit.base = 0;
2670 +       benefit.scanned = 0;
2671 +}
2672 +
2673 +static inline void inc_rshash_pos(unsigned long delta)
2674 +{
2675 +       if (CAN_OVERFLOW_U64(rshash_pos, delta))
2676 +               encode_benefit();
2677 +
2678 +       rshash_pos += delta;
2679 +}
2680 +
2681 +static inline void inc_rshash_neg(unsigned long delta)
2682 +{
2683 +       if (CAN_OVERFLOW_U64(rshash_neg, delta))
2684 +               encode_benefit();
2685 +
2686 +       rshash_neg += delta;
2687 +}
2688 +
2689 +
2690 +static inline u32 page_hash(struct page *page, unsigned long hash_strength,
2691 +                           int cost_accounting)
2692 +{
2693 +       u32 val;
2694 +       unsigned long delta;
2695 +
2696 +       void *addr = kmap_atomic(page);
2697 +
2698 +       val = random_sample_hash(addr, hash_strength);
2699 +       kunmap_atomic(addr);
2700 +
2701 +       if (cost_accounting) {
2702 +               if (HASH_STRENGTH_FULL > hash_strength)
2703 +                       delta = HASH_STRENGTH_FULL - hash_strength;
2704 +               else
2705 +                       delta = 0;
2706 +
2707 +               inc_rshash_pos(delta);
2708 +       }
2709 +
2710 +       return val;
2711 +}
2712 +
2713 +static int memcmp_pages(struct page *page1, struct page *page2,
2714 +                       int cost_accounting)
2715 +{
2716 +       char *addr1, *addr2;
2717 +       int ret;
2718 +
2719 +       addr1 = kmap_atomic(page1);
2720 +       addr2 = kmap_atomic(page2);
2721 +       ret = memcmp(addr1, addr2, PAGE_SIZE);
2722 +       kunmap_atomic(addr2);
2723 +       kunmap_atomic(addr1);
2724 +
2725 +       if (cost_accounting)
2726 +               inc_rshash_neg(memcmp_cost);
2727 +
2728 +       return ret;
2729 +}
2730 +
2731 +static inline int pages_identical(struct page *page1, struct page *page2)
2732 +{
2733 +       return !memcmp_pages(page1, page2, 0);
2734 +}
2735 +
2736 +static inline int is_page_full_zero(struct page *page)
2737 +{
2738 +       char *addr;
2739 +       int ret;
2740 +
2741 +       addr = kmap_atomic(page);
2742 +       ret = is_full_zero(addr, PAGE_SIZE);
2743 +       kunmap_atomic(addr);
2744 +
2745 +       return ret;
2746 +}
2747 +
2748 +static int write_protect_page(struct vm_area_struct *vma, struct page *page,
2749 +                             pte_t *orig_pte, pte_t *old_pte)
2750 +{
2751 +       struct mm_struct *mm = vma->vm_mm;
2752 +       unsigned long addr;
2753 +       pte_t *ptep;
2754 +       spinlock_t *ptl;
2755 +       int swapped;
2756 +       int err = -EFAULT;
2757 +       unsigned long mmun_start;       /* For mmu_notifiers */
2758 +       unsigned long mmun_end;         /* For mmu_notifiers */
2759 +
2760 +       addr = page_address_in_vma(page, vma);
2761 +       if (addr == -EFAULT)
2762 +               goto out;
2763 +
2764 +       BUG_ON(PageTransCompound(page));
2765 +
2766 +       mmun_start = addr;
2767 +       mmun_end   = addr + PAGE_SIZE;
2768 +       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2769 +
2770 +       ptep = page_check_address(page, mm, addr, &ptl, 0);
2771 +       if (!ptep)
2772 +               goto out_mn;
2773 +
2774 +       if (old_pte)
2775 +               *old_pte = *ptep;
2776 +
2777 +       if (pte_write(*ptep) || pte_dirty(*ptep)) {
2778 +               pte_t entry;
2779 +
2780 +               swapped = PageSwapCache(page);
2781 +               flush_cache_page(vma, addr, page_to_pfn(page));
2782 +               /*
2783 +                * Ok, this is tricky: when get_user_pages_fast() runs it doesn't
2784 +                * take any lock, therefore the check that we are going to make
2785 +                * with the page count against the map count is racy and
2786 +                * O_DIRECT can happen right after the check.
2787 +                * So we clear the pte and flush the tlb before the check;
2788 +                * this assures us that no O_DIRECT can happen after the check
2789 +                * or in the middle of the check.
2790 +                */
2791 +               entry = ptep_clear_flush(vma, addr, ptep);
2792 +               /*
2793 +                * Check that no O_DIRECT or similar I/O is in progress on the
2794 +                * page
2795 +                */
2796 +               if (page_mapcount(page) + 1 + swapped != page_count(page)) {
2797 +                       set_pte_at(mm, addr, ptep, entry);
2798 +                       goto out_unlock;
2799 +               }
2800 +               if (pte_dirty(entry))
2801 +                       set_page_dirty(page);
2802 +               entry = pte_mkclean(pte_wrprotect(entry));
2803 +               set_pte_at_notify(mm, addr, ptep, entry);
2804 +       }
2805 +       *orig_pte = *ptep;
2806 +       err = 0;
2807 +
2808 +out_unlock:
2809 +       pte_unmap_unlock(ptep, ptl);
2810 +out_mn:
2811 +       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2812 +out:
2813 +       return err;
2814 +}
2815 +
2816 +#define MERGE_ERR_PGERR                1 /* the page is invalid, cannot continue */
2817 +#define MERGE_ERR_COLLI                2 /* there is a collision */
2818 +#define MERGE_ERR_COLLI_MAX    3 /* collision at the max hash strength */
2819 +#define MERGE_ERR_CHANGED      4 /* the page has changed since last hash */
2820 +
2821 +
2822 +/**
2823 + * replace_page - replace page in vma by new ksm page
2824 + * @vma:      vma that holds the pte pointing to page
2825 + * @page:     the page we are replacing by kpage
2826 + * @kpage:    the ksm page we replace page by
2827 + * @orig_pte: the original value of the pte
2828 + *
2829 + * Returns 0 on success, MERGE_ERR_PGERR on failure.
2830 + */
2831 +static int replace_page(struct vm_area_struct *vma, struct page *page,
2832 +                       struct page *kpage, pte_t orig_pte)
2833 +{
2834 +       struct mm_struct *mm = vma->vm_mm;
2835 +       pgd_t *pgd;
2836 +       pud_t *pud;
2837 +       pmd_t *pmd;
2838 +       pte_t *ptep;
2839 +       spinlock_t *ptl;
2840 +       pte_t entry;
2841 +
2842 +       unsigned long addr;
2843 +       int err = MERGE_ERR_PGERR;
2844 +       unsigned long mmun_start;       /* For mmu_notifiers */
2845 +       unsigned long mmun_end;         /* For mmu_notifiers */
2846 +
2847 +       addr = page_address_in_vma(page, vma);
2848 +       if (addr == -EFAULT)
2849 +               goto out;
2850 +
2851 +       pgd = pgd_offset(mm, addr);
2852 +       if (!pgd_present(*pgd))
2853 +               goto out;
2854 +
2855 +       pud = pud_offset(pgd, addr);
2856 +       if (!pud_present(*pud))
2857 +               goto out;
2858 +
2859 +       pmd = pmd_offset(pud, addr);
2860 +       BUG_ON(pmd_trans_huge(*pmd));
2861 +       if (!pmd_present(*pmd))
2862 +               goto out;
2863 +
2864 +       mmun_start = addr;
2865 +       mmun_end   = addr + PAGE_SIZE;
2866 +       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2867 +
2868 +       ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
2869 +       if (!pte_same(*ptep, orig_pte)) {
2870 +               pte_unmap_unlock(ptep, ptl);
2871 +               goto out_mn;
2872 +       }
2873 +
2874 +       flush_cache_page(vma, addr, pte_pfn(*ptep));
2875 +       ptep_clear_flush(vma, addr, ptep);
2876 +       entry = mk_pte(kpage, vma->vm_page_prot);
2877 +
2878 +       /* special treatment is needed for zero_page */
2879 +       if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
2880 +                               (page_to_pfn(kpage) == zero_pfn))
2881 +               entry = pte_mkspecial(entry);
2882 +       else {
2883 +               get_page(kpage);
2884 +               page_add_anon_rmap(kpage, vma, addr);
2885 +       }
2886 +
2887 +       set_pte_at_notify(mm, addr, ptep, entry);
2888 +
2889 +       page_remove_rmap(page);
2890 +       if (!page_mapped(page))
2891 +               try_to_free_swap(page);
2892 +       put_page(page);
2893 +
2894 +       pte_unmap_unlock(ptep, ptl);
2895 +       err = 0;
2896 +out_mn:
2897 +       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2898 +out:
2899 +       return err;
2900 +}
2901 +
2902 +
2903 +/**
2904 + *  Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value. The
2905 + *  zero hash value at HASH_STRENGTH_MAX is used to indicate that its
2906 + *  hash_max member has not been calculated.
2907 + *
2908 + * @page The page needs to be hashed
2909 + * @hash_old The hash value calculated with current hash strength
2910 + *
2911 + * return the new hash value calculated at HASH_STRENGTH_MAX
2912 + */
2913 +static inline u32 page_hash_max(struct page *page, u32 hash_old)
2914 +{
2915 +       u32 hash_max = 0;
2916 +       void *addr;
2917 +
2918 +       addr = kmap_atomic(page);
2919 +       hash_max = delta_hash(addr, hash_strength,
2920 +                             HASH_STRENGTH_MAX, hash_old);
2921 +
2922 +       kunmap_atomic(addr);
2923 +
2924 +       if (!hash_max)
2925 +               hash_max = 1;
2926 +
2927 +       inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2928 +       return hash_max;
2929 +}
2930 +
2931 +/*
2932 + * We compare the hash again, to ensure that it is really a hash collision
2933 + * instead of being caused by page write.
2934 + */
2935 +static inline int check_collision(struct rmap_item *rmap_item,
2936 +                                 u32 hash)
2937 +{
2938 +       int err;
2939 +       struct page *page = rmap_item->page;
2940 +
2941 +       /* if this rmap_item has already been hash_maxed, then the collision
2942 +        * must appear in the second-level rbtree search. In this case we check
2943 +        * if its hash_max value has been changed. Otherwise, the collision
2944 +        * happens in the first-level rbtree search, so we check against its
2945 +        * current hash value.
2946 +        */
2947 +       if (rmap_item->hash_max) {
2948 +               inc_rshash_neg(memcmp_cost);
2949 +               inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2950 +
2951 +               if (rmap_item->hash_max == page_hash_max(page, hash))
2952 +                       err = MERGE_ERR_COLLI;
2953 +               else
2954 +                       err = MERGE_ERR_CHANGED;
2955 +       } else {
2956 +               inc_rshash_neg(memcmp_cost + hash_strength);
2957 +
2958 +               if (page_hash(page, hash_strength, 0) == hash)
2959 +                       err = MERGE_ERR_COLLI;
2960 +               else
2961 +                       err = MERGE_ERR_CHANGED;
2962 +       }
2963 +
2964 +       return err;
2965 +}
2966 +
2967 +static struct page *page_trans_compound_anon(struct page *page)
2968 +{
2969 +       if (PageTransCompound(page)) {
2970 +               struct page *head = compound_head(page);
2971 +               /*
2972 +                * head may actually be split and freed from under
2973 +                * us but it's ok here.
2974 +                */
2975 +               if (PageAnon(head))
2976 +                       return head;
2977 +       }
2978 +       return NULL;
2979 +}
2980 +
2981 +static int page_trans_compound_anon_split(struct page *page)
2982 +{
2983 +       int ret = 0;
2984 +       struct page *transhuge_head = page_trans_compound_anon(page);
2985 +       if (transhuge_head) {
2986 +               /* Get the reference on the head to split it. */
2987 +               if (get_page_unless_zero(transhuge_head)) {
2988 +                       /*
2989 +                        * Recheck we got the reference while the head
2990 +                        * was still anonymous.
2991 +                        */
2992 +                       if (PageAnon(transhuge_head))
2993 +                               ret = split_huge_page(transhuge_head);
2994 +                       else
2995 +                               /*
2996 +                                * Retry later if split_huge_page ran
2997 +                                * from under us.
2998 +                                */
2999 +                               ret = 1;
3000 +                       put_page(transhuge_head);
3001 +               } else
3002 +                       /* Retry later if split_huge_page ran from under us. */
3003 +                       ret = 1;
3004 +       }
3005 +       return ret;
3006 +}
3007 +
3008 +/**
3009 + * Try to merge a rmap_item.page with a kpage in stable node. kpage must
3010 + * already be a ksm page.
3011 + *
3012 + * @return 0 if the pages were merged, -EFAULT otherwise.
3013 + */
3014 +static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
3015 +                                     struct page *kpage, u32 hash)
3016 +{
3017 +       struct vm_area_struct *vma = rmap_item->slot->vma;
3018 +       struct mm_struct *mm = vma->vm_mm;
3019 +       pte_t orig_pte = __pte(0);
3020 +       int err = MERGE_ERR_PGERR;
3021 +       struct page *page;
3022 +
3023 +       if (uksm_test_exit(mm))
3024 +               goto out;
3025 +
3026 +       page = rmap_item->page;
3027 +
3028 +       if (page == kpage) { /* ksm page forked */
3029 +               err = 0;
3030 +               goto out;
3031 +       }
3032 +
3033 +       if (PageTransCompound(page) && page_trans_compound_anon_split(page))
3034 +               goto out;
3035 +       BUG_ON(PageTransCompound(page));
3036 +
3037 +       if (!PageAnon(page) || !PageKsm(kpage))
3038 +               goto out;
3039 +
3040 +       /*
3041 +        * We need the page lock to read a stable PageSwapCache in
3042 +        * write_protect_page().  We use trylock_page() instead of
3043 +        * lock_page() because we don't want to wait here - we
3044 +        * prefer to continue scanning and merging different pages,
3045 +        * then come back to this page when it is unlocked.
3046 +        */
3047 +       if (!trylock_page(page))
3048 +               goto out;
3049 +       /*
3050 +        * If this anonymous page is mapped only here, its pte may need
3051 +        * to be write-protected.  If it's mapped elsewhere, all of its
3052 +        * ptes are necessarily already write-protected.  But in either
3053 +        * case, we need to lock and check page_count is not raised.
3054 +        */
3055 +       if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
3056 +               if (pages_identical(page, kpage))
3057 +                       err = replace_page(vma, page, kpage, orig_pte);
3058 +               else
3059 +                       err = check_collision(rmap_item, hash);
3060 +       }
3061 +
3062 +       if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
3063 +               munlock_vma_page(page);
3064 +               if (!PageMlocked(kpage)) {
3065 +                       unlock_page(page);
3066 +                       lock_page(kpage);
3067 +                       mlock_vma_page(kpage);
3068 +                       page = kpage;           /* for final unlock */
3069 +               }
3070 +       }
3071 +
3072 +       unlock_page(page);
3073 +out:
3074 +       return err;
3075 +}
3076 +
3077 +
3078 +
3079 +/**
3080 + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
3081 + * to restore a page mapping that has been changed in try_to_merge_two_pages.
3082 + *
3083 + * @return 0 on success.
3084 + */
3085 +static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
3086 +                            pte_t orig_pte, pte_t wprt_pte)
3087 +{
3088 +       struct mm_struct *mm = vma->vm_mm;
3089 +       pgd_t *pgd;
3090 +       pud_t *pud;
3091 +       pmd_t *pmd;
3092 +       pte_t *ptep;
3093 +       spinlock_t *ptl;
3094 +
3095 +       int err = -EFAULT;
3096 +
3097 +       pgd = pgd_offset(mm, addr);
3098 +       if (!pgd_present(*pgd))
3099 +               goto out;
3100 +
3101 +       pud = pud_offset(pgd, addr);
3102 +       if (!pud_present(*pud))
3103 +               goto out;
3104 +
3105 +       pmd = pmd_offset(pud, addr);
3106 +       if (!pmd_present(*pmd))
3107 +               goto out;
3108 +
3109 +       ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
3110 +       if (!pte_same(*ptep, wprt_pte)) {
3111 +               /* already copied, let it be */
3112 +               pte_unmap_unlock(ptep, ptl);
3113 +               goto out;
3114 +       }
3115 +
3116 +       /*
3117 +        * Still here. While we hold a reference on the ksm page it cannot
3118 +        * return to the free page pool, so there is no way a pte could have
3119 +        * been changed to another page and back to this one. Also remember
3120 +        * that ksm pages are never reused by do_wp_page(). So it is safe to
3121 +        * restore the original pte.
3122 +        */
3123 +       flush_cache_page(vma, addr, pte_pfn(*ptep));
3124 +       ptep_clear_flush(vma, addr, ptep);
3125 +       set_pte_at_notify(mm, addr, ptep, orig_pte);
3126 +
3127 +       pte_unmap_unlock(ptep, ptl);
3128 +       err = 0;
3129 +out:
3130 +       return err;
3131 +}
3132 +
3133 +/**
3134 + * try_to_merge_two_pages() - take two identical pages and prepare
3135 + * them to be merged into one page (rmap_item->page)
3136 + *
3137 + * @return 0 if we successfully merged two identical pages into
3138 + *         one ksm page. MERGE_ERR_COLLI if it was only a hash collision
3139 + *         found while searching the rbtree. MERGE_ERR_CHANGED if rmap_item
3140 + *         has changed since it was hashed. MERGE_ERR_PGERR otherwise.
3141 + *
3142 + */
3143 +static int try_to_merge_two_pages(struct rmap_item *rmap_item,
3144 +                                 struct rmap_item *tree_rmap_item,
3145 +                                 u32 hash)
3146 +{
3147 +       pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
3148 +       pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
3149 +       struct vm_area_struct *vma1 = rmap_item->slot->vma;
3150 +       struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
3151 +       struct page *page = rmap_item->page;
3152 +       struct page *tree_page = tree_rmap_item->page;
3153 +       int err = MERGE_ERR_PGERR;
3154 +       struct address_space *saved_mapping;
3155 +
3156 +
3157 +       if (rmap_item->page == tree_rmap_item->page)
3158 +               goto out;
3159 +
3160 +       if (PageTransCompound(page) && page_trans_compound_anon_split(page))
3161 +               goto out;
3162 +       BUG_ON(PageTransCompound(page));
3163 +
3164 +       if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page))
3165 +               goto out;
3166 +       BUG_ON(PageTransCompound(tree_page));
3167 +
3168 +       if (!PageAnon(page) || !PageAnon(tree_page))
3169 +               goto out;
3170 +
3171 +       if (!trylock_page(page))
3172 +               goto out;
3173 +
3174 +
3175 +       if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
3176 +               unlock_page(page);
3177 +               goto out;
3178 +       }
3179 +
3180 +       /*
3181 +        * While we hold page lock, upgrade page from
3182 +        * PageAnon+anon_vma to PageKsm+NULL stable_node:
3183 +        * stable_tree_insert() will update stable_node.
3184 +        */
3185 +       saved_mapping = page->mapping;
3186 +       set_page_stable_node(page, NULL);
3187 +       mark_page_accessed(page);
3188 +       unlock_page(page);
3189 +
3190 +       if (!trylock_page(tree_page))
3191 +               goto restore_out;
3192 +
3193 +       if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
3194 +               unlock_page(tree_page);
3195 +               goto restore_out;
3196 +       }
3197 +
3198 +       if (pages_identical(page, tree_page)) {
3199 +               err = replace_page(vma2, tree_page, page, wprt_pte2);
3200 +               if (err) {
3201 +                       unlock_page(tree_page);
3202 +                       goto restore_out;
3203 +               }
3204 +
3205 +               if ((vma2->vm_flags & VM_LOCKED)) {
3206 +                       munlock_vma_page(tree_page);
3207 +                       if (!PageMlocked(page)) {
3208 +                               unlock_page(tree_page);
3209 +                               lock_page(page);
3210 +                               mlock_vma_page(page);
3211 +                               tree_page = page; /* for final unlock */
3212 +                       }
3213 +               }
3214 +
3215 +               unlock_page(tree_page);
3216 +
3217 +               goto out; /* success */
3218 +
3219 +       } else {
3220 +               if (tree_rmap_item->hash_max &&
3221 +                   tree_rmap_item->hash_max == rmap_item->hash_max) {
3222 +                       err = MERGE_ERR_COLLI_MAX;
3223 +               } else if (page_hash(page, hash_strength, 0) ==
3224 +                   page_hash(tree_page, hash_strength, 0)) {
3225 +                       inc_rshash_neg(memcmp_cost + hash_strength * 2);
3226 +                       err = MERGE_ERR_COLLI;
3227 +               } else {
3228 +                       err = MERGE_ERR_CHANGED;
3229 +               }
3230 +
3231 +               unlock_page(tree_page);
3232 +       }
3233 +
3234 +restore_out:
3235 +       lock_page(page);
3236 +       if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
3237 +                                 orig_pte1, wprt_pte1))
3238 +               page->mapping = saved_mapping;
3239 +
3240 +       unlock_page(page);
3241 +out:
3242 +       return err;
3243 +}
3244 +
3245 +static inline int hash_cmp(u32 new_val, u32 node_val)
3246 +{
3247 +       if (new_val > node_val)
3248 +               return 1;
3249 +       else if (new_val < node_val)
3250 +               return -1;
3251 +       else
3252 +               return 0;
3253 +}
3254 +
3255 +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
3256 +{
3257 +       u32 hash_max = item->hash_max;
3258 +
3259 +       if (!hash_max) {
3260 +               hash_max = page_hash_max(item->page, hash);
3261 +
3262 +               item->hash_max = hash_max;
3263 +       }
3264 +
3265 +       return hash_max;
3266 +}
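rmap_item_hash_max() above computes the expensive full-strength hash lazily and caches it in the item so it is calculated at most once per scan round. A minimal userspace sketch of the same memoization pattern follows; the names (struct item, strong_hash, item_hash_max) are hypothetical and it is not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical item caching an expensive full-strength hash. */
struct item {
	uint32_t hash_max;	/* 0 means "not computed yet" */
	const char *data;
};

/* Stand-in for page_hash_max(): any strong hash would do here. */
static uint32_t strong_hash(const char *s)
{
	uint32_t h = 2166136261u;	/* FNV-1a */
	while (*s)
		h = (h ^ (uint8_t)*s++) * 16777619u;
	return h ? h : 1;		/* keep 0 reserved for "unset" */
}

/* Same shape as rmap_item_hash_max(): compute once, then reuse. */
static uint32_t item_hash_max(struct item *it)
{
	if (!it->hash_max)
		it->hash_max = strong_hash(it->data);
	return it->hash_max;
}

int main(void)
{
	struct item it = { 0, "example page contents" };
	printf("%u\n", item_hash_max(&it));	/* computed */
	printf("%u\n", item_hash_max(&it));	/* cached */
	return 0;
}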
3267 +
3268 +
3269 +
3270 +/**
3271 + * stable_tree_search() - search the stable tree for a page
3272 + *
3273 + * @item:      the rmap_item we are comparing with
3274 + * @hash:      the hash value of this item->page already calculated
3275 + *
3276 + * @return     the page we have found, NULL otherwise. The returned page has
3277 + *             already had its refcount raised (gotten).
3278 + */
3279 +static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
3280 +{
3281 +       struct rb_node *node = root_stable_treep->rb_node;
3282 +       struct tree_node *tree_node;
3283 +       unsigned long hash_max;
3284 +       struct page *page = item->page;
3285 +       struct stable_node *stable_node;
3286 +
3287 +       stable_node = page_stable_node(page);
3288 +       if (stable_node) {
3289 +               /* ksm page forked, that is
3290 +                * if (PageKsm(page) && !in_stable_tree(rmap_item)),
3291 +                * a reference has actually already been taken outside.
3292 +                */
3293 +               get_page(page);
3294 +               return page;
3295 +       }
3296 +
3297 +       while (node) {
3298 +               int cmp;
3299 +
3300 +               tree_node = rb_entry(node, struct tree_node, node);
3301 +
3302 +               cmp = hash_cmp(hash, tree_node->hash);
3303 +
3304 +               if (cmp < 0)
3305 +                       node = node->rb_left;
3306 +               else if (cmp > 0)
3307 +                       node = node->rb_right;
3308 +               else
3309 +                       break;
3310 +       }
3311 +
3312 +       if (!node)
3313 +               return NULL;
3314 +
3315 +       if (tree_node->count == 1) {
3316 +               stable_node = rb_entry(tree_node->sub_root.rb_node,
3317 +                                      struct stable_node, node);
3318 +               BUG_ON(!stable_node);
3319 +
3320 +               goto get_page_out;
3321 +       }
3322 +
3323 +       /*
3324 +        * OK, we have to search the second-level
3325 +        * subtree; hash the page at full
3326 +        * strength.
3327 +        */
3328 +       node = tree_node->sub_root.rb_node;
3329 +       BUG_ON(!node);
3330 +       hash_max = rmap_item_hash_max(item, hash);
3331 +
3332 +       while (node) {
3333 +               int cmp;
3334 +
3335 +               stable_node = rb_entry(node, struct stable_node, node);
3336 +
3337 +               cmp = hash_cmp(hash_max, stable_node->hash_max);
3338 +
3339 +               if (cmp < 0)
3340 +                       node = node->rb_left;
3341 +               else if (cmp > 0)
3342 +                       node = node->rb_right;
3343 +               else
3344 +                       goto get_page_out;
3345 +       }
3346 +
3347 +       return NULL;
3348 +
3349 +get_page_out:
3350 +       page = get_uksm_page(stable_node, 1, 1);
3351 +       return page;
3352 +}
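stable_tree_search() works on a two-level structure: first-level tree_nodes are keyed by the regular-strength hash, and only when a tree_node holds more than one stable_node does the search descend into a sub-tree keyed by the lazily computed full-strength hash_max. Below is a minimal userspace sketch of that lookup policy, with flat arrays standing in for the rbtrees and hypothetical names throughout; it is only an illustration under those assumptions, not the patch's implementation.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Toy two-level index: buckets keyed by a weak hash, entries inside a
 * bucket disambiguated by a strong hash computed only on demand. */
struct entry {
	uint32_t hash_max;	/* strong hash, 0 = not computed yet */
	const char *payload;
};

struct bucket {
	uint32_t hash;		/* weak hash shared by all entries */
	struct entry entries[4];
	int count;
};

static uint32_t weak_hash(const char *s)  { return (uint32_t)strlen(s); }
static uint32_t strong_hash(const char *s)
{
	uint32_t h = 5381;
	while (*s)
		h = h * 33 + (uint8_t)*s++;
	return h ? h : 1;
}

static const char *lookup(struct bucket *tab, int nbuckets, const char *key)
{
	uint32_t h = weak_hash(key);
	for (int i = 0; i < nbuckets; i++) {
		if (tab[i].hash != h)
			continue;
		if (tab[i].count == 1)	/* single entry: no strong hash needed */
			return tab[i].entries[0].payload;
		/* collision bucket: escalate to the strong hash */
		uint32_t hm = strong_hash(key);
		for (int j = 0; j < tab[i].count; j++) {
			struct entry *e = &tab[i].entries[j];
			if (!e->hash_max)
				e->hash_max = strong_hash(e->payload);
			if (e->hash_max == hm)
				return e->payload;
		}
		return NULL;
	}
	return NULL;
}

int main(void)
{
	struct bucket tab[1] = {{
		.hash = weak_hash("abc"),	/* "abc" and "xyz" collide on length */
		.entries = { { 0, "abc" }, { 0, "xyz" } },
		.count = 2,
	}};
	printf("%s\n", lookup(tab, 1, "xyz"));	/* prints "xyz" */
	return 0;
}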
3353 +
3354 +static int try_merge_rmap_item(struct rmap_item *item,
3355 +                              struct page *kpage,
3356 +                              struct page *tree_page)
3357 +{
3358 +       spinlock_t *ptl;
3359 +       pte_t *ptep;
3360 +       unsigned long addr;
3361 +       struct vm_area_struct *vma = item->slot->vma;
3362 +
3363 +       addr = get_rmap_addr(item);
3364 +       ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0);
3365 +       if (!ptep)
3366 +               return 0;
3367 +
3368 +       if (pte_write(*ptep)) {
3369 +               /* has changed, abort! */
3370 +               pte_unmap_unlock(ptep, ptl);
3371 +               return 0;
3372 +       }
3373 +
3374 +       get_page(tree_page);
3375 +       page_add_anon_rmap(tree_page, vma, addr);
3376 +
3377 +       flush_cache_page(vma, addr, pte_pfn(*ptep));
3378 +       ptep_clear_flush(vma, addr, ptep);
3379 +       set_pte_at_notify(vma->vm_mm, addr, ptep,
3380 +                         mk_pte(tree_page, vma->vm_page_prot));
3381 +
3382 +       page_remove_rmap(kpage);
3383 +       put_page(kpage);
3384 +
3385 +       pte_unmap_unlock(ptep, ptl);
3386 +
3387 +       return 1;
3388 +}
3389 +
3390 +/**
3391 + * try_merge_with_stable() - when two rmap_items need to be inserted into the
3392 + * stable tree but their page turns out to be identical to an existing stable
3393 + * ksm page, this is the last chance to merge them into one.
3394 + *
3395 + * @item1:     the rmap_item holding the page which we wanted to insert
3396 + *             into the stable tree.
3397 + * @item2:     the other rmap_item found during the unstable tree search
3398 + * @kpage:     the page currently mapped by the two rmap_items
3399 + * @tree_page:         the identical page found in the stable tree node
3400 + * @success1:  set to 1 if item1 is successfully merged
3401 + * @success2:  set to 1 if item2 is successfully merged
3402 + */
3403 +static void try_merge_with_stable(struct rmap_item *item1,
3404 +                                 struct rmap_item *item2,
3405 +                                 struct page **kpage,
3406 +                                 struct page *tree_page,
3407 +                                 int *success1, int *success2)
3408 +{
3409 +       struct vm_area_struct *vma1 = item1->slot->vma;
3410 +       struct vm_area_struct *vma2 = item2->slot->vma;
3411 +       *success1 = 0;
3412 +       *success2 = 0;
3413 +
3414 +       if (unlikely(*kpage == tree_page)) {
3415 +               /* I don't think this can really happen */
3416 +               printk(KERN_WARNING "UKSM: unexpected condition detected in "
3417 +                       "try_merge_with_stable() -- *kpage == tree_page !\n");
3418 +               *success1 = 1;
3419 +               *success2 = 1;
3420 +               return;
3421 +       }
3422 +
3423 +       if (!PageAnon(*kpage) || !PageKsm(*kpage))
3424 +               goto failed;
3425 +
3426 +       if (!trylock_page(tree_page))
3427 +               goto failed;
3428 +
3429 +       /* If the old page is still a ksm page, still pointed
3430 +        * to from the right place, and still write protected,
3431 +        * we are confident it has not changed and no memcmp
3432 +        * is needed anymore.
3433 +        * Beware: we cannot take nested pte locks, there is a
3434 +        * deadlock risk.
3435 +        */
3436 +       if (!try_merge_rmap_item(item1, *kpage, tree_page))
3437 +               goto unlock_failed;
3438 +
3439 +       /* ok, now vma2; remember that pte1 has already been set */
3440 +       if (!try_merge_rmap_item(item2, *kpage, tree_page))
3441 +               goto success_1;
3442 +
3443 +       *success2 = 1;
3444 +success_1:
3445 +       *success1 = 1;
3446 +
3447 +
3448 +       if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
3449 +           (*success2 && vma2->vm_flags & VM_LOCKED)) {
3450 +               munlock_vma_page(*kpage);
3451 +               if (!PageMlocked(tree_page))
3452 +                       mlock_vma_page(tree_page);
3453 +       }
3454 +
3455 +       /*
3456 +        * We do not need oldpage any more in the caller, so can break the lock
3457 +        * now.
3458 +        */
3459 +       unlock_page(*kpage);
3460 +       *kpage = tree_page; /* Get unlocked outside. */
3461 +       return;
3462 +
3463 +unlock_failed:
3464 +       unlock_page(tree_page);
3465 +failed:
3466 +       return;
3467 +}
3468 +
3469 +static inline void stable_node_hash_max(struct stable_node *node,
3470 +                                        struct page *page, u32 hash)
3471 +{
3472 +       u32 hash_max = node->hash_max;
3473 +
3474 +       if (!hash_max) {
3475 +               hash_max = page_hash_max(page, hash);
3476 +               node->hash_max = hash_max;
3477 +       }
3478 +}
3479 +
3480 +static inline
3481 +struct stable_node *new_stable_node(struct tree_node *tree_node,
3482 +                                   struct page *kpage, u32 hash_max)
3483 +{
3484 +       struct stable_node *new_stable_node;
3485 +
3486 +       new_stable_node = alloc_stable_node();
3487 +       if (!new_stable_node)
3488 +               return NULL;
3489 +
3490 +       new_stable_node->kpfn = page_to_pfn(kpage);
3491 +       new_stable_node->hash_max = hash_max;
3492 +       new_stable_node->tree_node = tree_node;
3493 +       set_page_stable_node(kpage, new_stable_node);
3494 +
3495 +       return new_stable_node;
3496 +}
3497 +
3498 +static inline
3499 +struct stable_node *first_level_insert(struct tree_node *tree_node,
3500 +                                      struct rmap_item *rmap_item,
3501 +                                      struct rmap_item *tree_rmap_item,
3502 +                                      struct page **kpage, u32 hash,
3503 +                                      int *success1, int *success2)
3504 +{
3505 +       int cmp;
3506 +       struct page *tree_page;
3507 +       u32 hash_max = 0;
3508 +       struct stable_node *stable_node, *new_snode;
3509 +       struct rb_node *parent = NULL, **new;
3510 +
3511 +       /* this tree node contains no sub-tree yet */
3512 +       stable_node = rb_entry(tree_node->sub_root.rb_node,
3513 +                              struct stable_node, node);
3514 +
3515 +       tree_page = get_uksm_page(stable_node, 1, 0);
3516 +       if (tree_page) {
3517 +               cmp = memcmp_pages(*kpage, tree_page, 1);
3518 +               if (!cmp) {
3519 +                       try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
3520 +                                             tree_page, success1, success2);
3521 +                       put_page(tree_page);
3522 +                       if (!*success1 && !*success2)
3523 +                               goto failed;
3524 +
3525 +                       return stable_node;
3526 +
3527 +               } else {
3528 +                       /*
3529 +                        * Collision at the first level: try to create a subtree.
3530 +                        * A new node needs to be created.
3531 +                        */
3532 +                       put_page(tree_page);
3533 +
3534 +                       stable_node_hash_max(stable_node, tree_page,
3535 +                                            tree_node->hash);
3536 +                       hash_max = rmap_item_hash_max(rmap_item, hash);
3537 +                       cmp = hash_cmp(hash_max, stable_node->hash_max);
3538 +
3539 +                       parent = &stable_node->node;
3540 +                       if (cmp < 0) {
3541 +                               new = &parent->rb_left;
3542 +                       } else if (cmp > 0) {
3543 +                               new = &parent->rb_right;
3544 +                       } else {
3545 +                               goto failed;
3546 +                       }
3547 +               }
3548 +
3549 +       } else {
3550 +               /* The only stable_node was deleted; we reuse its tree_node.
3551 +                */
3552 +               parent = NULL;
3553 +               new = &tree_node->sub_root.rb_node;
3554 +       }
3555 +
3556 +       new_snode = new_stable_node(tree_node, *kpage, hash_max);
3557 +       if (!new_snode)
3558 +               goto failed;
3559 +
3560 +       rb_link_node(&new_snode->node, parent, new);
3561 +       rb_insert_color(&new_snode->node, &tree_node->sub_root);
3562 +       tree_node->count++;
3563 +       *success1 = *success2 = 1;
3564 +
3565 +       return new_snode;
3566 +
3567 +failed:
3568 +       return NULL;
3569 +}
3570 +
3571 +static inline
3572 +struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
3573 +                                         struct rmap_item *rmap_item,
3574 +                                         struct rmap_item *tree_rmap_item,
3575 +                                         struct page **kpage, u32 hash,
3576 +                                         int *success1, int *success2)
3577 +{
3578 +       struct page *tree_page;
3579 +       u32 hash_max;
3580 +       struct stable_node *stable_node, *new_snode;
3581 +       struct rb_node *parent, **new;
3582 +
3583 +research:
3584 +       parent = NULL;
3585 +       new = &tree_node->sub_root.rb_node;
3586 +       BUG_ON(!*new);
3587 +       hash_max = rmap_item_hash_max(rmap_item, hash);
3588 +       while (*new) {
3589 +               int cmp;
3590 +
3591 +               stable_node = rb_entry(*new, struct stable_node, node);
3592 +
3593 +               cmp = hash_cmp(hash_max, stable_node->hash_max);
3594 +
3595 +               if (cmp < 0) {
3596 +                       parent = *new;
3597 +                       new = &parent->rb_left;
3598 +               } else if (cmp > 0) {
3599 +                       parent = *new;
3600 +                       new = &parent->rb_right;
3601 +               } else {
3602 +                       tree_page = get_uksm_page(stable_node, 1, 0);
3603 +                       if (tree_page) {
3604 +                               cmp = memcmp_pages(*kpage, tree_page, 1);
3605 +                               if (!cmp) {
3606 +                                       try_merge_with_stable(rmap_item,
3607 +                                               tree_rmap_item, kpage,
3608 +                                               tree_page, success1, success2);
3609 +
3610 +                                       put_page(tree_page);
3611 +                                       if (!*success1 && !*success2)
3612 +                                               goto failed;
3613 +                                       /*
3614 +                                        * successfully merged with a stable
3615 +                                        * node
3616 +                                        */
3617 +                                       return stable_node;
3618 +                               } else {
3619 +                                       put_page(tree_page);
3620 +                                       goto failed;
3621 +                               }
3622 +                       } else {
3623 +                               /*
3624 +                                * The stable node may have been
3625 +                                * deleted and the subtree may have
3626 +                                * been restructured; we cannot
3627 +                                * continue, so re-search it.
3628 +                                */
3629 +                               if (tree_node->count) {
3630 +                                       goto research;
3631 +                               } else {
3632 +                                       /* reuse the tree node */
3633 +                                       parent = NULL;
3634 +                                       new = &tree_node->sub_root.rb_node;
3635 +                               }
3636 +                       }
3637 +               }
3638 +       }
3639 +
3640 +       new_snode = new_stable_node(tree_node, *kpage, hash_max);
3641 +       if (!new_snode)
3642 +               goto failed;
3643 +
3644 +       rb_link_node(&new_snode->node, parent, new);
3645 +       rb_insert_color(&new_snode->node, &tree_node->sub_root);
3646 +       tree_node->count++;
3647 +       *success1 = *success2 = 1;
3648 +
3649 +       return new_snode;
3650 +
3651 +failed:
3652 +       return NULL;
3653 +}
3654 +
3655 +
3656 +/**
3657 + * stable_tree_insert() - try to insert a page merged in the unstable tree
3658 + * into the stable tree
3659 + *
3660 + * @kpage:             the page that needs to be inserted
3661 + * @hash:              the current hash of this page
3662 + * @rmap_item:         the rmap_item being scanned
3663 + * @tree_rmap_item:    the rmap_item found in the unstable tree
3664 + * @success1:          set to 1 if rmap_item is merged
3665 + * @success2:          set to 1 if tree_rmap_item is merged
3666 + *
3667 + * @return             the stable_node in the stable tree if at least one
3668 + *                     rmap_item is inserted into the stable tree, NULL
3669 + *                     otherwise.
3670 + */
3671 +static struct stable_node *
3672 +stable_tree_insert(struct page **kpage, u32 hash,
3673 +                  struct rmap_item *rmap_item,
3674 +                  struct rmap_item *tree_rmap_item,
3675 +                  int *success1, int *success2)
3676 +{
3677 +       struct rb_node **new = &root_stable_treep->rb_node;
3678 +       struct rb_node *parent = NULL;
3679 +       struct stable_node *stable_node;
3680 +       struct tree_node *tree_node;
3681 +       u32 hash_max = 0;
3682 +
3683 +       *success1 = *success2 = 0;
3684 +
3685 +       while (*new) {
3686 +               int cmp;
3687 +
3688 +               tree_node = rb_entry(*new, struct tree_node, node);
3689 +
3690 +               cmp = hash_cmp(hash, tree_node->hash);
3691 +
3692 +               if (cmp < 0) {
3693 +                       parent = *new;
3694 +                       new = &parent->rb_left;
3695 +               } else if (cmp > 0) {
3696 +                       parent = *new;
3697 +                       new = &parent->rb_right;
3698 +               } else
3699 +                       break;
3700 +       }
3701 +
3702 +       if (*new) {
3703 +               if (tree_node->count == 1) {
3704 +                       stable_node = first_level_insert(tree_node, rmap_item,
3705 +                                               tree_rmap_item, kpage,
3706 +                                               hash, success1, success2);
3707 +               } else {
3708 +                       stable_node = stable_subtree_insert(tree_node,
3709 +                                       rmap_item, tree_rmap_item, kpage,
3710 +                                       hash, success1, success2);
3711 +               }
3712 +       } else {
3713 +
3714 +               /* no tree node found */
3715 +               tree_node = alloc_tree_node(stable_tree_node_listp);
3716 +               if (!tree_node) {
3717 +                       stable_node = NULL;
3718 +                       goto out;
3719 +               }
3720 +
3721 +               stable_node = new_stable_node(tree_node, *kpage, hash_max);
3722 +               if (!stable_node) {
3723 +                       free_tree_node(tree_node);
3724 +                       goto out;
3725 +               }
3726 +
3727 +               tree_node->hash = hash;
3728 +               rb_link_node(&tree_node->node, parent, new);
3729 +               rb_insert_color(&tree_node->node, root_stable_treep);
3730 +               parent = NULL;
3731 +               new = &tree_node->sub_root.rb_node;
3732 +
3733 +               rb_link_node(&stable_node->node, parent, new);
3734 +               rb_insert_color(&stable_node->node, &tree_node->sub_root);
3735 +               tree_node->count++;
3736 +               *success1 = *success2 = 1;
3737 +       }
3738 +
3739 +out:
3740 +       return stable_node;
3741 +}
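stable_tree_insert() above, like the other tree routines here, descends the rbtree while remembering both the parent node and the address of the link that will receive the new node, then splices the node in with rb_link_node()/rb_insert_color(). A minimal userspace sketch of that "descend with a link pointer, then splice" pattern follows, shown on a plain unbalanced BST with hypothetical names; rebalancing and all kernel locking are omitted.

#include <stdio.h>
#include <stdlib.h>

struct node {
	unsigned int hash;
	struct node *left, *right;
};

/* Find the node for @hash, inserting it if absent. */
static struct node *bst_insert(struct node **root, unsigned int hash)
{
	struct node **link = root, *n;

	while (*link) {
		if (hash < (*link)->hash)
			link = &(*link)->left;
		else if (hash > (*link)->hash)
			link = &(*link)->right;
		else
			return *link;		/* already present */
	}

	n = calloc(1, sizeof(*n));
	n->hash = hash;
	*link = n;				/* the rb_link_node() analogue */
	return n;
}

int main(void)
{
	struct node *root = NULL;

	bst_insert(&root, 42);
	bst_insert(&root, 7);
	printf("root=%u left=%u\n", root->hash, root->left->hash); /* 42 7 */
	return 0;
}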
3742 +
3743 +
3744 +/**
3745 + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
3746 + *
3747 + * @return     0 on success, -EBUSY if unable to lock the mmap_sem,
3748 + *             -EINVAL if the page mapping has been changed.
3749 + */
3750 +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
3751 +{
3752 +       int err;
3753 +
3754 +       err = get_mergeable_page_lock_mmap(tree_rmap_item);
3755 +
3756 +       if (err == -EINVAL) {
3757 +               /* its page map has been changed, remove it */
3758 +               remove_rmap_item_from_tree(tree_rmap_item);
3759 +       }
3760 +
3761 +       /* On success, the page has been gotten and the mmap_sem is locked. */
3762 +       return err;
3763 +}
3764 +
3765 +
3766 +/**
3767 + * unstable_tree_search_insert() - search the unstable tree for an rmap_item
3768 + * with the same hash value; get its page and trylock the mmap_sem
3769 + */
3770 +static inline
3771 +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
3772 +                                             u32 hash)
3773 +
3774 +{
3775 +       struct rb_node **new = &root_unstable_tree.rb_node;
3776 +       struct rb_node *parent = NULL;
3777 +       struct tree_node *tree_node;
3778 +       u32 hash_max;
3779 +       struct rmap_item *tree_rmap_item;
3780 +
3781 +       while (*new) {
3782 +               int cmp;
3783 +
3784 +               tree_node = rb_entry(*new, struct tree_node, node);
3785 +
3786 +               cmp = hash_cmp(hash, tree_node->hash);
3787 +
3788 +               if (cmp < 0) {
3789 +                       parent = *new;
3790 +                       new = &parent->rb_left;
3791 +               } else if (cmp > 0) {
3792 +                       parent = *new;
3793 +                       new = &parent->rb_right;
3794 +               } else
3795 +                       break;
3796 +       }
3797 +
3798 +       if (*new) {
3799 +               /* got the tree_node */
3800 +               if (tree_node->count == 1) {
3801 +                       tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
3802 +                                                 struct rmap_item, node);
3803 +                       BUG_ON(!tree_rmap_item);
3804 +
3805 +                       goto get_page_out;
3806 +               }
3807 +
3808 +               /* well, search the collision subtree */
3809 +               new = &tree_node->sub_root.rb_node;
3810 +               BUG_ON(!*new);
3811 +               hash_max = rmap_item_hash_max(rmap_item, hash);
3812 +
3813 +               while (*new) {
3814 +                       int cmp;
3815 +
3816 +                       tree_rmap_item = rb_entry(*new, struct rmap_item,
3817 +                                                 node);
3818 +
3819 +                       cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3820 +                       parent = *new;
3821 +                       if (cmp < 0)
3822 +                               new = &parent->rb_left;
3823 +                       else if (cmp > 0)
3824 +                               new = &parent->rb_right;
3825 +                       else
3826 +                               goto get_page_out;
3827 +               }
3828 +       } else {
3829 +               /* alloc a new tree_node */
3830 +               tree_node = alloc_tree_node(&unstable_tree_node_list);
3831 +               if (!tree_node)
3832 +                       return NULL;
3833 +
3834 +               tree_node->hash = hash;
3835 +               rb_link_node(&tree_node->node, parent, new);
3836 +               rb_insert_color(&tree_node->node, &root_unstable_tree);
3837 +               parent = NULL;
3838 +               new = &tree_node->sub_root.rb_node;
3839 +       }
3840 +
3841 +       /* not found even in the sub-tree */
3842 +       rmap_item->tree_node = tree_node;
3843 +       rmap_item->address |= UNSTABLE_FLAG;
3844 +       rmap_item->hash_round = uksm_hash_round;
3845 +       rb_link_node(&rmap_item->node, parent, new);
3846 +       rb_insert_color(&rmap_item->node, &tree_node->sub_root);
3847 +
3848 +       uksm_pages_unshared++;
3849 +       return NULL;
3850 +
3851 +get_page_out:
3852 +       if (tree_rmap_item->page == rmap_item->page)
3853 +               return NULL;
3854 +
3855 +       if (get_tree_rmap_item_page(tree_rmap_item))
3856 +               return NULL;
3857 +
3858 +       return tree_rmap_item;
3859 +}
3860 +
3861 +static void hold_anon_vma(struct rmap_item *rmap_item,
3862 +                         struct anon_vma *anon_vma)
3863 +{
3864 +       rmap_item->anon_vma = anon_vma;
3865 +       get_anon_vma(anon_vma);
3866 +}
3867 +
3868 +
3869 +/**
3870 + * stable_tree_append() - append an rmap_item to a stable node. Deduplication
3871 + * ratio statistics are updated in this function.
3872 + *
3873 + */
3874 +static void stable_tree_append(struct rmap_item *rmap_item,
3875 +                              struct stable_node *stable_node, int logdedup)
3876 +{
3877 +       struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
3878 +       unsigned long key = (unsigned long)rmap_item->slot;
3879 +       unsigned long factor = rmap_item->slot->rung->step;
3880 +
3881 +       BUG_ON(!stable_node);
3882 +       rmap_item->address |= STABLE_FLAG;
3883 +
3884 +       if (hlist_empty(&stable_node->hlist)) {
3885 +               uksm_pages_shared++;
3886 +               goto node_vma_new;
3887 +       } else {
3888 +               uksm_pages_sharing++;
3889 +       }
3890 +
3891 +       hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
3892 +               if (node_vma->key >= key)
3893 +                       break;
3894 +
3895 +               if (logdedup) {
3896 +                       node_vma->slot->pages_bemerged += factor;
3897 +                       if (list_empty(&node_vma->slot->dedup_list))
3898 +                               list_add(&node_vma->slot->dedup_list,
3899 +                                        &vma_slot_dedup);
3900 +               }
3901 +       }
3902 +
3903 +       if (node_vma) {
3904 +               if (node_vma->key == key) {
3905 +                       node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
3906 +                       goto node_vma_ok;
3907 +               } else if (node_vma->key > key) {
3908 +                       node_vma_cont = node_vma;
3909 +               }
3910 +       }
3911 +
3912 +node_vma_new:
3913 +       /* no node_vma for this vma slot in the node yet, alloc a new one */
3914 +       new_node_vma = alloc_node_vma();
3915 +       BUG_ON(!new_node_vma);
3916 +       new_node_vma->head = stable_node;
3917 +       new_node_vma->slot = rmap_item->slot;
3918 +
3919 +       if (!node_vma) {
3920 +               hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
3921 +       } else if (node_vma->key != key) {
3922 +               if (node_vma->key < key)
3923 +                       hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist);
3924 +               else {
3925 +                       hlist_add_before(&new_node_vma->hlist,
3926 +                                        &node_vma->hlist);
3927 +               }
3928 +
3929 +       }
3930 +       node_vma = new_node_vma;
3931 +
3932 +node_vma_ok: /* ok, ready to add to the list */
3933 +       rmap_item->head = node_vma;
3934 +       hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
3935 +       hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
3936 +       if (logdedup) {
3937 +               rmap_item->slot->pages_merged++;
3938 +               if (node_vma_cont) {
3939 +                       node_vma = node_vma_cont;
3940 +                       hlist_for_each_entry_continue(node_vma, hlist) {
3941 +                               node_vma->slot->pages_bemerged += factor;
3942 +                               if (list_empty(&node_vma->slot->dedup_list))
3943 +                                       list_add(&node_vma->slot->dedup_list,
3944 +                                                &vma_slot_dedup);
3945 +                       }
3946 +               }
3947 +       }
3948 +}
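stable_tree_append() keeps the stable node's node_vma list sorted by the slot key so that all rmap_items of one vma share a single node_vma and the dedup accounting can walk the rest of the list from that point. A minimal userspace sketch of that sorted insert-or-reuse step follows; the kernel hlist API is replaced by a plain singly linked list and the names are hypothetical.

#include <stdio.h>
#include <stdlib.h>

struct node_vma {
	unsigned long key;
	int nr_items;
	struct node_vma *next;
};

/* Find the node_vma for @key, inserting a new one in sorted position
 * if it does not exist yet. */
static struct node_vma *get_node_vma(struct node_vma **head, unsigned long key)
{
	struct node_vma **link = head, *nv;

	while (*link && (*link)->key < key)
		link = &(*link)->next;

	if (*link && (*link)->key == key)
		return *link;			/* same vma already present */

	nv = calloc(1, sizeof(*nv));
	nv->key = key;
	nv->next = *link;
	*link = nv;				/* splice in, order preserved */
	return nv;
}

int main(void)
{
	struct node_vma *head = NULL;

	get_node_vma(&head, 30)->nr_items++;
	get_node_vma(&head, 10)->nr_items++;
	get_node_vma(&head, 30)->nr_items++;	/* reuses the key-30 node */

	for (struct node_vma *nv = head; nv; nv = nv->next)
		printf("key=%lu items=%d\n", nv->key, nv->nr_items);
	return 0;
}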
3949 +
3950 +/*
3951 + * We use break_ksm to break COW on a ksm page: it's a stripped down
3952 + *
3953 + *     if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
3954 + *             put_page(page);
3955 + *
3956 + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
3957 + * in case the application has unmapped and remapped mm,addr meanwhile.
3958 + * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
3959 + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
3960 + */
3961 +static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
3962 +{
3963 +       struct page *page;
3964 +       int ret = 0;
3965 +
3966 +       do {
3967 +               cond_resched();
3968 +               page = follow_page(vma, addr, FOLL_GET);
3969 +               if (IS_ERR_OR_NULL(page))
3970 +                       break;
3971 +               if (PageKsm(page)) {
3972 +                       ret = handle_mm_fault(vma->vm_mm, vma, addr,
3973 +                                             FAULT_FLAG_WRITE);
3974 +               } else
3975 +                       ret = VM_FAULT_WRITE;
3976 +               put_page(page);
3977 +       } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
3978 +       /*
3979 +        * We must loop because handle_mm_fault() may back out if there's
3980 +        * any difficulty e.g. if pte accessed bit gets updated concurrently.
3981 +        *
3982 +        * VM_FAULT_WRITE is what we have been hoping for: it indicates that
3983 +        * COW has been broken, even if the vma does not permit VM_WRITE;
3984 +        * but note that a concurrent fault might break PageKsm for us.
3985 +        *
3986 +        * VM_FAULT_SIGBUS could occur if we race with truncation of the
3987 +        * backing file, which also invalidates anonymous pages: that's
3988 +        * okay, that truncation will have unmapped the PageKsm for us.
3989 +        *
3990 +        * VM_FAULT_OOM: at the time of writing (late July 2009), setting
3991 +        * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
3992 +        * current task has TIF_MEMDIE set, and will be OOM killed on return
3993 +        * to user; and ksmd, having no mm, would never be chosen for that.
3994 +        *
3995 +        * But if the mm is in a limited mem_cgroup, then the fault may fail
3996 +        * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
3997 +        * even ksmd can fail in this way - though it's usually breaking ksm
3998 +        * just to undo a merge it made a moment before, so unlikely to oom.
3999 +        *
4000 +        * That's a pity: we might therefore have more kernel pages allocated
4001 +        * than we're counting as nodes in the stable tree; but uksm_do_scan
4002 +        * will retry to break_cow on each pass, so should recover the page
4003 +        * in due course.  The important thing is to not let VM_MERGEABLE
4004 +        * be cleared while any such pages might remain in the area.
4005 +        */
4006 +       return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
4007 +}
4008 +
4009 +static void break_cow(struct rmap_item *rmap_item)
4010 +{
4011 +       struct vm_area_struct *vma = rmap_item->slot->vma;
4012 +       struct mm_struct *mm = vma->vm_mm;
4013 +       unsigned long addr = get_rmap_addr(rmap_item);
4014 +
4015 +       if (uksm_test_exit(mm))
4016 +               goto out;
4017 +
4018 +       break_ksm(vma, addr);
4019 +out:
4020 +       return;
4021 +}
4022 +
4023 +/*
4024 + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
4025 + * than check every pte of a given vma, the locking doesn't quite work for
4026 + * that - an rmap_item is assigned to the stable tree after inserting ksm
4027 + * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
4028 + * rmap_items from parent to child at fork time (so as not to waste time
4029 + * if exit comes before the next scan reaches it).
4030 + *
4031 + * Similarly, although we'd like to remove rmap_items (so updating counts
4032 + * and freeing memory) when unmerging an area, it's easier to leave that
4033 + * to the next pass of ksmd - consider, for example, how ksmd might be
4034 + * in cmp_and_merge_page on one of the rmap_items we would be removing.
4035 + */
4036 +inline int unmerge_uksm_pages(struct vm_area_struct *vma,
4037 +                     unsigned long start, unsigned long end)
4038 +{
4039 +       unsigned long addr;
4040 +       int err = 0;
4041 +
4042 +       for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
4043 +               if (uksm_test_exit(vma->vm_mm))
4044 +                       break;
4045 +               if (signal_pending(current))
4046 +                       err = -ERESTARTSYS;
4047 +               else
4048 +                       err = break_ksm(vma, addr);
4049 +       }
4050 +       return err;
4051 +}
4052 +
4053 +static inline void inc_uksm_pages_scanned(void)
4054 +{
4055 +       u64 delta;
4056 +
4057 +
4058 +       if (uksm_pages_scanned == U64_MAX) {
4059 +               encode_benefit();
4060 +
4061 +               delta = uksm_pages_scanned >> pages_scanned_base;
4062 +
4063 +               if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
4064 +                       pages_scanned_stored >>= 1;
4065 +                       delta >>= 1;
4066 +                       pages_scanned_base++;
4067 +               }
4068 +
4069 +               pages_scanned_stored += delta;
4070 +
4071 +               uksm_pages_scanned = uksm_pages_scanned_last = 0;
4072 +       }
4073 +
4074 +       uksm_pages_scanned++;
4075 +}
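inc_uksm_pages_scanned() keeps the long-running total as pages_scanned_stored scaled by 2^pages_scanned_base: a new chunk is shifted down by the base, and if adding it would overflow, both the stored value and the chunk are halved and the base is bumped. A small self-contained sketch of that overflow-safe scaled accumulator follows; the helper names are hypothetical.

#include <stdint.h>
#include <stdio.h>

static uint64_t stored;		/* scaled total */
static unsigned int base;	/* power-of-two scale factor */

static int would_overflow(uint64_t a, uint64_t b)
{
	return b > UINT64_MAX - a;
}

static void fold_in(uint64_t delta)
{
	delta >>= base;		/* bring the new chunk to the current scale */

	if (would_overflow(stored, delta)) {
		stored >>= 1;
		delta >>= 1;
		base++;		/* total is now (stored << base), coarser */
	}
	stored += delta;
}

int main(void)
{
	fold_in(UINT64_MAX);		/* first huge chunk */
	fold_in(UINT64_MAX);		/* forces a rescale */
	printf("stored=%llu base=%u\n",
	       (unsigned long long)stored, base);
	return 0;
}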
4076 +
4077 +static inline int find_zero_page_hash(int strength, u32 hash)
4078 +{
4079 +       return (zero_hash_table[strength] == hash);
4080 +}
4081 +
4082 +static
4083 +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
4084 +{
4085 +       struct page *zero_page = empty_uksm_zero_page;
4086 +       struct mm_struct *mm = vma->vm_mm;
4087 +       pte_t orig_pte = __pte(0);
4088 +       int err = -EFAULT;
4089 +
4090 +       if (uksm_test_exit(mm))
4091 +               goto out;
4092 +
4093 +       if (PageTransCompound(page) && page_trans_compound_anon_split(page))
4094 +               goto out;
4095 +       BUG_ON(PageTransCompound(page));
4096 +
4097 +       if (!PageAnon(page))
4098 +               goto out;
4099 +
4100 +       if (!trylock_page(page))
4101 +               goto out;
4102 +
4103 +       if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
4104 +               if (is_page_full_zero(page))
4105 +                       err = replace_page(vma, page, zero_page, orig_pte);
4106 +       }
4107 +
4108 +       unlock_page(page);
4109 +out:
4110 +       return err;
4111 +}
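cmp_and_merge_zero_page() write-protects the page and, only if it is entirely zero, replaces it with the special uksm zero page. The full-zero test itself (is_page_full_zero()) is defined elsewhere in the patch; below is a plausible userspace stand-in that scans the page a word at a time and bails out at the first non-zero word. It is only an illustration, not the patch's helper.

#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE 4096

static int buf_is_full_zero(const void *buf, size_t len)
{
	const unsigned long *p = buf;
	size_t n = len / sizeof(*p);

	for (size_t i = 0; i < n; i++)
		if (p[i])
			return 0;
	return 1;
}

int main(void)
{
	static unsigned char page[PAGE_SIZE];	/* zero-initialized */

	printf("%d\n", buf_is_full_zero(page, sizeof(page)));	/* 1 */
	page[123] = 0xff;
	printf("%d\n", buf_is_full_zero(page, sizeof(page)));	/* 0 */
	return 0;
}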
4112 +
4113 +/*
4114 + * cmp_and_merge_page() - first see if page can be merged into the stable
4115 + * tree; if not, compare hash to previous and if it's the same, see if page
4116 + * can be inserted into the unstable tree, or merged with a page already there
4117 + * and both transferred to the stable tree.
4118 + *
4119 + * @rmap_item: the reverse mapping of the page we are searching an identical page for
4120 + * @hash: the hash value of rmap_item->page, already calculated
4121 + */
4122 +static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
4123 +{
4124 +       struct rmap_item *tree_rmap_item;
4125 +       struct page *page;
4126 +       struct page *kpage = NULL;
4127 +       u32 hash_max;
4128 +       int err;
4129 +       unsigned int success1, success2;
4130 +       struct stable_node *snode;
4131 +       int cmp;
4132 +       struct rb_node *parent = NULL, **new;
4133 +
4134 +       remove_rmap_item_from_tree(rmap_item);
4135 +       page = rmap_item->page;
4136 +
4137 +       /* We first start with searching the page inside the stable tree */
4138 +       kpage = stable_tree_search(rmap_item, hash);
4139 +       if (kpage) {
4140 +               err = try_to_merge_with_uksm_page(rmap_item, kpage,
4141 +                                                hash);
4142 +               if (!err) {
4143 +                       /*
4144 +                        * The page was successfully merged, add
4145 +                        * its rmap_item to the stable tree.
4146 +                        * page lock is needed because it's
4147 +                        * racing with try_to_unmap_ksm(), etc.
4148 +                        */
4149 +                       lock_page(kpage);
4150 +                       snode = page_stable_node(kpage);
4151 +                       stable_tree_append(rmap_item, snode, 1);
4152 +                       unlock_page(kpage);
4153 +                       put_page(kpage);
4154 +                       return; /* success */
4155 +               }
4156 +               put_page(kpage);
4157 +
4158 +               /*
4159 +                * If it is a collision and the sub-rbtree has already been
4160 +                * searched (hash_max != 0), we want to abort, because if it
4161 +                * were successfully merged in the unstable tree, the collision
4162 +                * would tend to happen again.
4163 +                */
4164 +               if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
4165 +                       return;
4166 +       }
4167 +
4168 +       tree_rmap_item =
4169 +               unstable_tree_search_insert(rmap_item, hash);
4170 +       if (tree_rmap_item) {
4171 +               err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
4172 +               /*
4173 +                * As soon as we merge this page, we want to remove the
4174 +                * rmap_item of the page we have merged with from the unstable
4175 +                * tree, and insert it instead as new node in the stable tree.
4176 +                */
4177 +               if (!err) {
4178 +                       kpage = page;
4179 +                       remove_rmap_item_from_tree(tree_rmap_item);
4180 +                       lock_page(kpage);
4181 +                       snode = stable_tree_insert(&kpage, hash,
4182 +                                                  rmap_item, tree_rmap_item,
4183 +                                                  &success1, &success2);
4184 +
4185 +                       /*
4186 +                        * Do not log dedup for tree item, it's not counted as
4187 +                        * scanned in this round.
4188 +                        */
4189 +                       if (success2)
4190 +                               stable_tree_append(tree_rmap_item, snode, 0);
4191 +
4192 +                       /*
4193 +                        * The order of these two stable_tree_append() calls is
4194 +                        * important: rmap_item is the one being scanned this round.
4195 +                        */
4196 +                       if (success1)
4197 +                               stable_tree_append(rmap_item, snode, 1);
4198 +
4199 +                       /*
4200 +                        * The original kpage may already have been unlocked
4201 +                        * inside stable_tree_insert(). This page should be
4202 +                        * unlocked before doing
4203 +                        * break_cow().
4204 +                        */
4205 +                       unlock_page(kpage);
4206 +
4207 +                       if (!success1)
4208 +                               break_cow(rmap_item);
4209 +
4210 +                       if (!success2)
4211 +                               break_cow(tree_rmap_item);
4212 +
4213 +               } else if (err == MERGE_ERR_COLLI) {
4214 +                       BUG_ON(tree_rmap_item->tree_node->count > 1);
4215 +
4216 +                       rmap_item_hash_max(tree_rmap_item,
4217 +                                          tree_rmap_item->tree_node->hash);
4218 +
4219 +                       hash_max = rmap_item_hash_max(rmap_item, hash);
4220 +                       cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
4221 +                       parent = &tree_rmap_item->node;
4222 +                       if (cmp < 0)
4223 +                               new = &parent->rb_left;
4224 +                       else if (cmp > 0)
4225 +                               new = &parent->rb_right;
4226 +                       else
4227 +                               goto put_up_out;
4228 +
4229 +                       rmap_item->tree_node = tree_rmap_item->tree_node;
4230 +                       rmap_item->address |= UNSTABLE_FLAG;
4231 +                       rmap_item->hash_round = uksm_hash_round;
4232 +                       rb_link_node(&rmap_item->node, parent, new);
4233 +                       rb_insert_color(&rmap_item->node,
4234 +                                       &tree_rmap_item->tree_node->sub_root);
4235 +                       rmap_item->tree_node->count++;
4236 +               } else {
4237 +                       /*
4238 +                        * Either one of the pages has changed or they collide
4239 +                        * at the max hash; we consider them ill items.
4240 +                        */
4241 +                       remove_rmap_item_from_tree(tree_rmap_item);
4242 +               }
4243 +put_up_out:
4244 +               put_page(tree_rmap_item->page);
4245 +               up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
4246 +       }
4247 +}
4248 +
4249 +
4250 +
4251 +
4252 +static inline unsigned long get_pool_index(struct vma_slot *slot,
4253 +                                          unsigned long index)
4254 +{
4255 +       unsigned long pool_index;
4256 +
4257 +       pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
4258 +       if (pool_index >= slot->pool_size)
4259 +               BUG();
4260 +       return pool_index;
4261 +}
4262 +
4263 +static inline unsigned long index_page_offset(unsigned long index)
4264 +{
4265 +       return offset_in_page(sizeof(struct rmap_list_entry *) * index);
4266 +}
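get_pool_index() and index_page_offset() map a flat rmap-list index to the backing pool page that holds its slot and the offset inside that page: the per-VMA rmap list behaves like a virtual array of pointer-sized slots whose backing pages are allocated on demand. A minimal sketch of that mapping follows, with illustrative x86-64-like constants; it is not the patch's code.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define SLOT_SIZE	sizeof(void *)	/* stands for sizeof(struct rmap_list_entry *) */

static unsigned long pool_index(unsigned long index)
{
	return (SLOT_SIZE * index) >> PAGE_SHIFT;
}

static unsigned long pool_offset(unsigned long index)
{
	return (SLOT_SIZE * index) & (PAGE_SIZE - 1);	/* offset_in_page() analogue */
}

int main(void)
{
	/* With 8-byte slots, 512 entries fit in one 4 KiB pool page. */
	printf("index 511 -> page %lu, offset %lu\n",
	       pool_index(511), pool_offset(511));	/* page 0, offset 4088 */
	printf("index 512 -> page %lu, offset %lu\n",
	       pool_index(512), pool_offset(512));	/* page 1, offset 0 */
	return 0;
}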
4267 +
4268 +static inline
4269 +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
4270 +                                           unsigned long index, int need_alloc)
4271 +{
4272 +       unsigned long pool_index;
4273 +       struct page *page;
4274 +       void *addr;
4275 +
4276 +
4277 +       pool_index = get_pool_index(slot, index);
4278 +       if (!slot->rmap_list_pool[pool_index]) {
4279 +               if (!need_alloc)
4280 +                       return NULL;
4281 +
4282 +               page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
4283 +               if (!page)
4284 +                       return NULL;
4285 +
4286 +               slot->rmap_list_pool[pool_index] = page;
4287 +       }
4288 +
4289 +       addr = kmap(slot->rmap_list_pool[pool_index]);
4290 +       addr += index_page_offset(index);
4291 +
4292 +       return addr;
4293 +}
4294 +
4295 +static inline void put_rmap_list_entry(struct vma_slot *slot,
4296 +                                      unsigned long index)
4297 +{
4298 +       unsigned long pool_index;
4299 +
4300 +       pool_index = get_pool_index(slot, index);
4301 +       BUG_ON(!slot->rmap_list_pool[pool_index]);
4302 +       kunmap(slot->rmap_list_pool[pool_index]);
4303 +}
4304 +
4305 +static inline int entry_is_new(struct rmap_list_entry *entry)
4306 +{
4307 +       return !entry->item;
4308 +}
4309 +
4310 +static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
4311 +                                               unsigned long index)
4312 +{
4313 +       return slot->vma->vm_start + (index << PAGE_SHIFT);
4314 +}
4315 +
4316 +static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
4317 +{
4318 +       unsigned long addr;
4319 +
4320 +       if (is_addr(entry->addr))
4321 +               addr = get_clean_addr(entry->addr);
4322 +       else if (entry->item)
4323 +               addr = get_rmap_addr(entry->item);
4324 +       else
4325 +               BUG();
4326 +
4327 +       return addr;
4328 +}
4329 +
4330 +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
4331 +{
4332 +       if (is_addr(entry->addr))
4333 +               return NULL;
4334 +
4335 +       return entry->item;
4336 +}
4337 +
4338 +static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
4339 +                                           unsigned long index)
4340 +{
4341 +       unsigned long pool_index;
4342 +
4343 +       pool_index = get_pool_index(slot, index);
4344 +       BUG_ON(!slot->rmap_list_pool[pool_index]);
4345 +       slot->pool_counts[pool_index]++;
4346 +}
4347 +
4348 +static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
4349 +                                           unsigned long index)
4350 +{
4351 +       unsigned long pool_index;
4352 +
4353 +       pool_index = get_pool_index(slot, index);
4354 +       BUG_ON(!slot->rmap_list_pool[pool_index]);
4355 +       BUG_ON(!slot->pool_counts[pool_index]);
4356 +       slot->pool_counts[pool_index]--;
4357 +}
4358 +
4359 +static inline int entry_has_rmap(struct rmap_list_entry *entry)
4360 +{
4361 +       return !is_addr(entry->addr) && entry->item;
4362 +}
4363 +
4364 +static inline void swap_entries(struct rmap_list_entry *entry1,
4365 +                               unsigned long index1,
4366 +                               struct rmap_list_entry *entry2,
4367 +                               unsigned long index2)
4368 +{
4369 +       struct rmap_list_entry tmp;
4370 +
4371 +       /* swapping two new entries is meaningless */
4372 +       BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
4373 +
4374 +       tmp = *entry1;
4375 +       *entry1 = *entry2;
4376 +       *entry2 = tmp;
4377 +
4378 +       if (entry_has_rmap(entry1))
4379 +               entry1->item->entry_index = index1;
4380 +
4381 +       if (entry_has_rmap(entry2))
4382 +               entry2->item->entry_index = index2;
4383 +
4384 +       if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
4385 +               inc_rmap_list_pool_count(entry1->item->slot, index1);
4386 +               dec_rmap_list_pool_count(entry1->item->slot, index2);
4387 +       } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
4388 +               inc_rmap_list_pool_count(entry2->item->slot, index2);
4389 +               dec_rmap_list_pool_count(entry2->item->slot, index1);
4390 +       }
4391 +}
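swap_entries() swaps the two slots by value, then repairs the entry_index back-pointer of any rmap_item the slots refer to and adjusts the per-pool page counts. The sketch below shows just the swap-plus-back-pointer-fixup part in userspace C; the pool refcount bookkeeping is omitted and the names are hypothetical.

#include <stddef.h>
#include <stdio.h>

struct item {
	unsigned long entry_index;
};

struct slot_entry {
	struct item *item;	/* NULL means "address-only" slot */
};

static void swap_slots(struct slot_entry *a, unsigned long ia,
		       struct slot_entry *b, unsigned long ib)
{
	struct slot_entry tmp = *a;

	*a = *b;
	*b = tmp;

	/* re-point the back-pointers at the slots' new positions */
	if (a->item)
		a->item->entry_index = ia;
	if (b->item)
		b->item->entry_index = ib;
}

int main(void)
{
	struct item it = { .entry_index = 0 };
	struct slot_entry slots[2] = { { &it }, { NULL } };

	swap_slots(&slots[0], 0, &slots[1], 1);
	printf("item now at index %lu\n", it.entry_index);	/* 1 */
	return 0;
}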
4392 +
4393 +static inline void free_entry_item(struct rmap_list_entry *entry)
4394 +{
4395 +       unsigned long index;
4396 +       struct rmap_item *item;
4397 +
4398 +       if (!is_addr(entry->addr)) {
4399 +               BUG_ON(!entry->item);
4400 +               item = entry->item;
4401 +               entry->addr = get_rmap_addr(item);
4402 +               set_is_addr(entry->addr);
4403 +               index = item->entry_index;
4404 +               remove_rmap_item_from_tree(item);
4405 +               dec_rmap_list_pool_count(item->slot, index);
4406 +               free_rmap_item(item);
4407 +       }
4408 +}
4409 +
4410 +static inline int pool_entry_boundary(unsigned long index)
4411 +{
4412 +       unsigned long linear_addr;
4413 +
4414 +       linear_addr = sizeof(struct rmap_list_entry *) * index;
4415 +       return index && !offset_in_page(linear_addr);
4416 +}
4417 +
4418 +static inline void try_free_last_pool(struct vma_slot *slot,
4419 +                                     unsigned long index)
4420 +{
4421 +       unsigned long pool_index;
4422 +
4423 +       pool_index = get_pool_index(slot, index);
4424 +       if (slot->rmap_list_pool[pool_index] &&
4425 +           !slot->pool_counts[pool_index]) {
4426 +               __free_page(slot->rmap_list_pool[pool_index]);
4427 +               slot->rmap_list_pool[pool_index] = NULL;
4428 +               slot->flags |= UKSM_SLOT_NEED_SORT;
4429 +       }
4430 +
4431 +}
4432 +
4433 +static inline unsigned long vma_item_index(struct vm_area_struct *vma,
4434 +                                          struct rmap_item *item)
4435 +{
4436 +       return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
4437 +}
4438 +
4439 +static int within_same_pool(struct vma_slot *slot,
4440 +                           unsigned long i, unsigned long j)
4441 +{
4442 +       unsigned long pool_i, pool_j;
4443 +
4444 +       pool_i = get_pool_index(slot, i);
4445 +       pool_j = get_pool_index(slot, j);
4446 +
4447 +       return (pool_i == pool_j);
4448 +}
4449 +
4450 +static void sort_rmap_entry_list(struct vma_slot *slot)
4451 +{
4452 +       unsigned long i, j;
4453 +       struct rmap_list_entry *entry, *swap_entry;
4454 +
4455 +       entry = get_rmap_list_entry(slot, 0, 0);
4456 +       for (i = 0; i < slot->pages; ) {
4457 +
4458 +               if (!entry)
4459 +                       goto skip_whole_pool;
4460 +
4461 +               if (entry_is_new(entry))
4462 +                       goto next_entry;
4463 +
4464 +               if (is_addr(entry->addr)) {
4465 +                       entry->addr = 0;
4466 +                       goto next_entry;
4467 +               }
4468 +
4469 +               j = vma_item_index(slot->vma, entry->item);
4470 +               if (j == i)
4471 +                       goto next_entry;
4472 +
4473 +               if (within_same_pool(slot, i, j))
4474 +                       swap_entry = entry + j - i;
4475 +               else
4476 +                       swap_entry = get_rmap_list_entry(slot, j, 1);
4477 +
4478 +               swap_entries(entry, i, swap_entry, j);
4479 +               if (!within_same_pool(slot, i, j))
4480 +                       put_rmap_list_entry(slot, j);
4481 +               continue;
4482 +
4483 +skip_whole_pool:
4484 +               i += PAGE_SIZE / sizeof(*entry);
4485 +               if (i < slot->pages)
4486 +                       entry = get_rmap_list_entry(slot, i, 0);
4487 +               continue;
4488 +
4489 +next_entry:
4490 +               if (i >= slot->pages - 1 ||
4491 +                   !within_same_pool(slot, i, i + 1)) {
4492 +                       put_rmap_list_entry(slot, i);
4493 +                       if (i + 1 < slot->pages)
4494 +                               entry = get_rmap_list_entry(slot, i + 1, 0);
4495 +               } else
4496 +                       entry++;
4497 +               i++;
4498 +               continue;
4499 +       }
4500 +
4501 +       /* free empty pool entries which contain no rmap_item */
4502 +       /* Can be simplified to rely only on pool_counts once the bug is fixed. */
4503 +       for (i = 0; i < slot->pool_size; i++) {
4504 +               unsigned char has_rmap;
4505 +               void *addr;
4506 +
4507 +               if (!slot->rmap_list_pool[i])
4508 +                       continue;
4509 +
4510 +               has_rmap = 0;
4511 +               addr = kmap(slot->rmap_list_pool[i]);
4512 +               BUG_ON(!addr);
4513 +               for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
4514 +                       entry = (struct rmap_list_entry *)addr + j;
4515 +                       if (is_addr(entry->addr))
4516 +                               continue;
4517 +                       if (!entry->item)
4518 +                               continue;
4519 +                       has_rmap = 1;
4520 +               }
4521 +               kunmap(slot->rmap_list_pool[i]);
4522 +               if (!has_rmap) {
4523 +                       BUG_ON(slot->pool_counts[i]);
4524 +                       __free_page(slot->rmap_list_pool[i]);
4525 +                       slot->rmap_list_pool[i] = NULL;
4526 +               }
4527 +       }
4528 +
4529 +       slot->flags &= ~UKSM_SLOT_NEED_SORT;
4530 +}
4531 +
4532 +/*
4533 + * vma_fully_scanned() - whether all the pages in this slot have been scanned.
4534 + */
4535 +static inline int vma_fully_scanned(struct vma_slot *slot)
4536 +{
4537 +       return slot->pages_scanned == slot->pages;
4538 +}
4539 +
4540 +/**
4541 + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
4542 + * its random permutation. The random-permutation index management code is
4543 + * embedded in this function.
4544 + */
4545 +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
4546 +{
4547 +       unsigned long rand_range, addr, swap_index, scan_index;
4548 +       struct rmap_item *item = NULL;
4549 +       struct rmap_list_entry *scan_entry, *swap_entry = NULL;
4550 +       struct page *page;
4551 +
4552 +       scan_index = swap_index = slot->pages_scanned % slot->pages;
4553 +
4554 +       if (pool_entry_boundary(scan_index))
4555 +               try_free_last_pool(slot, scan_index - 1);
4556 +
4557 +       if (vma_fully_scanned(slot)) {
4558 +               if (slot->flags & UKSM_SLOT_NEED_SORT)
4559 +                       slot->flags |= UKSM_SLOT_NEED_RERAND;
4560 +               else
4561 +                       slot->flags &= ~UKSM_SLOT_NEED_RERAND;
4562 +               if (slot->flags & UKSM_SLOT_NEED_SORT)
4563 +                       sort_rmap_entry_list(slot);
4564 +       }
4565 +
4566 +       scan_entry = get_rmap_list_entry(slot, scan_index, 1);
4567 +       if (!scan_entry)
4568 +               return NULL;
4569 +
4570 +       if (entry_is_new(scan_entry)) {
4571 +               scan_entry->addr = get_index_orig_addr(slot, scan_index);
4572 +               set_is_addr(scan_entry->addr);
4573 +       }
4574 +
4575 +       if (slot->flags & UKSM_SLOT_NEED_RERAND) {
4576 +               rand_range = slot->pages - scan_index;
4577 +               BUG_ON(!rand_range);
4578 +               swap_index = scan_index + (prandom_u32() % rand_range);
4579 +       }
4580 +
4581 +       if (swap_index != scan_index) {
4582 +               swap_entry = get_rmap_list_entry(slot, swap_index, 1);
4583 +               if (entry_is_new(swap_entry)) {
4584 +                       swap_entry->addr = get_index_orig_addr(slot,
4585 +                                                              swap_index);
4586 +                       set_is_addr(swap_entry->addr);
4587 +               }
4588 +               swap_entries(scan_entry, scan_index, swap_entry, swap_index);
4589 +       }
4590 +
4591 +       addr = get_entry_address(scan_entry);
4592 +       item = get_entry_item(scan_entry);
4593 +       BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
4594 +
4595 +       page = follow_page(slot->vma, addr, FOLL_GET);
4596 +       if (IS_ERR_OR_NULL(page))
4597 +               goto nopage;
4598 +
4599 +       if (!PageAnon(page) && !page_trans_compound_anon(page))
4600 +               goto putpage;
4601 +
4602 +       /* check whether this is the zero_page pfn or the uksm_zero_page */
4603 +       if ((page_to_pfn(page) == zero_pfn)
4604 +                       || (page_to_pfn(page) == uksm_zero_pfn))
4605 +               goto putpage;
4606 +
4607 +       flush_anon_page(slot->vma, page, addr);
4608 +       flush_dcache_page(page);
4609 +
4610 +
4611 +       *hash = page_hash(page, hash_strength, 1);
4612 +       inc_uksm_pages_scanned();
4613 +       /* if the page content is all zero, re-map it to the zero page */
4614 +       if (find_zero_page_hash(hash_strength, *hash)) {
4615 +               if (!cmp_and_merge_zero_page(slot->vma, page)) {
4616 +                       slot->pages_merged++;
4617 +                       inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
4618 +                       dec_mm_counter(slot->mm, MM_ANONPAGES);
4619 +
4620 +                       /* For full-zero pages, no need to create rmap item */
4621 +                       goto putpage;
4622 +               } else {
4623 +                       inc_rshash_neg(memcmp_cost / 2);
4624 +               }
4625 +       }
4626 +
4627 +       if (!item) {
4628 +               item = alloc_rmap_item();
4629 +               if (item) {
4630 +                       /* It has already been zeroed */
4631 +                       item->slot = slot;
4632 +                       item->address = addr;
4633 +                       item->entry_index = scan_index;
4634 +                       scan_entry->item = item;
4635 +                       inc_rmap_list_pool_count(slot, scan_index);
4636 +               } else
4637 +                       goto putpage;
4638 +       }
4639 +
4640 +       BUG_ON(item->slot != slot);
4641 +       /* the page may have changed */
4642 +       item->page = page;
4643 +       put_rmap_list_entry(slot, scan_index);
4644 +       if (swap_entry)
4645 +               put_rmap_list_entry(slot, swap_index);
4646 +       return item;
4647 +
4648 +putpage:
4649 +       put_page(page);
4650 +       page = NULL;
4651 +nopage:
4652 +       /* no page, store addr back and free rmap_item if possible */
4653 +       free_entry_item(scan_entry);
4654 +       put_rmap_list_entry(slot, scan_index);
4655 +       if (swap_entry)
4656 +               put_rmap_list_entry(slot, swap_index);
4657 +       return NULL;
4658 +}
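
The scan order above is an incremental Fisher-Yates shuffle: whenever the slot carries UKSM_SLOT_NEED_RERAND, the entry at scan_index is swapped with one drawn uniformly from [scan_index, pages), so a full round visits every page exactly once in a fresh random order. A minimal user-space sketch of just that index permutation (names are illustrative; rand() stands in for prandom_u32() and an index array stands in for the rmap_list entries) could look like this:

	#include <stdio.h>
	#include <stdlib.h>

	/* Visit indexes 0..n-1 exactly once, in random order, one step at a time. */
	static void scan_in_random_order(unsigned long *idx, unsigned long n)
	{
		unsigned long i, j, tmp;

		for (i = 0; i < n; i++) {
			j = i + rand() % (n - i);	/* prandom_u32() % rand_range */
			tmp = idx[i];			/* like swap_entries() */
			idx[i] = idx[j];
			idx[j] = tmp;
			printf("scan page index %lu\n", idx[i]);
		}
	}

	int main(void)
	{
		unsigned long idx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

		scan_in_random_order(idx, 8);
		return 0;
	}
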
4659 +
4660 +static inline int in_stable_tree(struct rmap_item *rmap_item)
4661 +{
4662 +       return rmap_item->address & STABLE_FLAG;
4663 +}
4664 +
4665 +/**
4666 + * scan_vma_one_page() - scan the next page in a vma_slot. Called with
4667 + * mmap_sem locked.
4668 + */
4669 +static noinline void scan_vma_one_page(struct vma_slot *slot)
4670 +{
4671 +       u32 hash;
4672 +       struct mm_struct *mm;
4673 +       struct rmap_item *rmap_item = NULL;
4674 +       struct vm_area_struct *vma = slot->vma;
4675 +
4676 +       mm = vma->vm_mm;
4677 +       BUG_ON(!mm);
4678 +       BUG_ON(!slot);
4679 +
4680 +       rmap_item = get_next_rmap_item(slot, &hash);
4681 +       if (!rmap_item)
4682 +               goto out1;
4683 +
4684 +       if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
4685 +               goto out2;
4686 +
4687 +       cmp_and_merge_page(rmap_item, hash);
4688 +out2:
4689 +       put_page(rmap_item->page);
4690 +out1:
4691 +       slot->pages_scanned++;
4692 +       if (slot->fully_scanned_round != fully_scanned_round)
4693 +               scanned_virtual_pages++;
4694 +
4695 +       if (vma_fully_scanned(slot))
4696 +               slot->fully_scanned_round = fully_scanned_round;
4697 +}
4698 +
4699 +static inline unsigned long rung_get_pages(struct scan_rung *rung)
4700 +{
4701 +       struct slot_tree_node *node;
4702 +
4703 +       if (!rung->vma_root.rnode)
4704 +               return 0;
4705 +
4706 +       node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
4707 +
4708 +       return node->size;
4709 +}
4710 +
4711 +#define RUNG_SAMPLED_MIN       3
4712 +
4713 +static inline
4714 +void uksm_calc_rung_step(struct scan_rung *rung,
4715 +                        unsigned long page_time, unsigned long ratio)
4716 +{
4717 +       unsigned long sampled, pages;
4718 +
4719 +       /* will this rung be fully scanned? */
4720 +       if (!rung->cover_msecs) {
4721 +               rung->step = 1;
4722 +               return;
4723 +       }
4724 +
4725 +       sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
4726 +                 * ratio / page_time;
4727 +
4728 +       /*
4729 +        *  Before we finish a scan round and its expensive per-round jobs,
4730 +        *  we need to have a chance to estimate the per-page time. So
4731 +        *  the sampled number cannot be too small.
4732 +        */
4733 +       if (sampled < RUNG_SAMPLED_MIN)
4734 +               sampled = RUNG_SAMPLED_MIN;
4735 +
4736 +       pages = rung_get_pages(rung);
4737 +       if (likely(pages > sampled))
4738 +               rung->step = pages / sampled;
4739 +       else
4740 +               rung->step = 1;
4741 +}
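
As a purely illustrative calculation (assuming TIME_RATIO_SCALE is 100, which is defined earlier in the patch): with cover_msecs = 1000, ratio = 20 and page_time = 500 ns, sampled = 1000 * (NSEC_PER_MSEC / 100) * 20 / 500 = 400000 pages per cover period; a rung holding 4000000 pages then gets step = 10 (one page in ten is sampled), while any rung with fewer than 400000 pages is scanned with step 1.
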
4742 +
4743 +static inline int step_need_recalc(struct scan_rung *rung)
4744 +{
4745 +       unsigned long pages, stepmax;
4746 +
4747 +       pages = rung_get_pages(rung);
4748 +       stepmax = pages / RUNG_SAMPLED_MIN;
4749 +
4750 +       return pages && (rung->step > pages ||
4751 +                        (stepmax && rung->step > stepmax));
4752 +}
4753 +
4754 +static inline
4755 +void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
4756 +{
4757 +       struct vma_slot *slot;
4758 +
4759 +       if (finished)
4760 +               rung->flags |= UKSM_RUNG_ROUND_FINISHED;
4761 +
4762 +       if (step_recalc || step_need_recalc(rung)) {
4763 +               uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4764 +               BUG_ON(step_need_recalc(rung));
4765 +       }
4766 +
4767 +       slot_iter_index = prandom_u32() % rung->step;
4768 +       BUG_ON(!rung->vma_root.rnode);
4769 +       slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
4770 +       BUG_ON(!slot);
4771 +
4772 +       rung->current_scan = slot;
4773 +       rung->current_offset = slot_iter_index;
4774 +}
4775 +
4776 +static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
4777 +{
4778 +       return &slot->rung->vma_root;
4779 +}
4780 +
4781 +/*
4782 + * Return 1 if the current scan position was reset, 0 otherwise.
4783 + */
4784 +static int advance_current_scan(struct scan_rung *rung)
4785 +{
4786 +       unsigned short n;
4787 +       struct vma_slot *slot, *next = NULL;
4788 +
4789 +       BUG_ON(!rung->vma_root.num);
4790 +
4791 +       slot = rung->current_scan;
4792 +       n = (slot->pages - rung->current_offset) % rung->step;
4793 +       slot_iter_index = rung->step - n;
4794 +       next = sradix_tree_next(&rung->vma_root, slot->snode,
4795 +                               slot->sindex, slot_iter);
4796 +
4797 +       if (next) {
4798 +               rung->current_offset = slot_iter_index;
4799 +               rung->current_scan = next;
4800 +               return 0;
4801 +       } else {
4802 +               reset_current_scan(rung, 1, 0);
4803 +               return 1;
4804 +       }
4805 +}
4806 +
4807 +static inline void rung_rm_slot(struct vma_slot *slot)
4808 +{
4809 +       struct scan_rung *rung = slot->rung;
4810 +       struct sradix_tree_root *root;
4811 +
4812 +       if (rung->current_scan == slot)
4813 +               advance_current_scan(rung);
4814 +
4815 +       root = slot_get_root(slot);
4816 +       sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
4817 +       slot->snode = NULL;
4818 +       if (step_need_recalc(rung)) {
4819 +               uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4820 +               BUG_ON(step_need_recalc(rung));
4821 +       }
4822 +
4823 +       /* In case advance_current_scan looped back to this slot again */
4824 +       if (rung->vma_root.num && rung->current_scan == slot)
4825 +               reset_current_scan(slot->rung, 1, 0);
4826 +}
4827 +
4828 +static inline void rung_add_new_slots(struct scan_rung *rung,
4829 +                       struct vma_slot **slots, unsigned long num)
4830 +{
4831 +       int err;
4832 +       struct vma_slot *slot;
4833 +       unsigned long i;
4834 +       struct sradix_tree_root *root = &rung->vma_root;
4835 +
4836 +       err = sradix_tree_enter(root, (void **)slots, num);
4837 +       BUG_ON(err);
4838 +
4839 +       for (i = 0; i < num; i++) {
4840 +               slot = slots[i];
4841 +               slot->rung = rung;
4842 +               BUG_ON(vma_fully_scanned(slot));
4843 +       }
4844 +
4845 +       if (rung->vma_root.num == num)
4846 +               reset_current_scan(rung, 0, 1);
4847 +}
4848 +
4849 +static inline int rung_add_one_slot(struct scan_rung *rung,
4850 +                                    struct vma_slot *slot)
4851 +{
4852 +       int err;
4853 +
4854 +       err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
4855 +       if (err)
4856 +               return err;
4857 +
4858 +       slot->rung = rung;
4859 +       if (rung->vma_root.num == 1)
4860 +               reset_current_scan(rung, 0, 1);
4861 +
4862 +       return 0;
4863 +}
4864 +
4865 +/*
4866 + * Return true if the slot is deleted from its rung.
4867 + */
4868 +static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
4869 +{
4870 +       struct scan_rung *old_rung = slot->rung;
4871 +       int err;
4872 +
4873 +       if (old_rung == rung)
4874 +               return 0;
4875 +
4876 +       rung_rm_slot(slot);
4877 +       err = rung_add_one_slot(rung, slot);
4878 +       if (err) {
4879 +               err = rung_add_one_slot(old_rung, slot);
4880 +               WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
4881 +       }
4882 +
4883 +       return 1;
4884 +}
4885 +
4886 +static inline int vma_rung_up(struct vma_slot *slot)
4887 +{
4888 +       struct scan_rung *rung;
4889 +
4890 +       rung = slot->rung;
4891 +       if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
4892 +               rung++;
4893 +
4894 +       return vma_rung_enter(slot, rung);
4895 +}
4896 +
4897 +static inline int vma_rung_down(struct vma_slot *slot)
4898 +{
4899 +       struct scan_rung *rung;
4900 +
4901 +       rung = slot->rung;
4902 +       if (slot->rung != &uksm_scan_ladder[0])
4903 +               rung--;
4904 +
4905 +       return vma_rung_enter(slot, rung);
4906 +}
4907 +
4908 +/**
4909 + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
4910 + */
4911 +static unsigned long cal_dedup_ratio(struct vma_slot *slot)
4912 +{
4913 +       unsigned long ret;
4914 +
4915 +       BUG_ON(slot->pages_scanned == slot->last_scanned);
4916 +
4917 +       ret = slot->pages_merged;
4918 +
4919 +       /* Thrashing area filtering */
4920 +       if (ret && uksm_thrash_threshold) {
4921 +               if (slot->pages_cowed * 100 / slot->pages_merged
4922 +                   > uksm_thrash_threshold) {
4923 +                       ret = 0;
4924 +               } else {
4925 +                       ret = slot->pages_merged - slot->pages_cowed;
4926 +               }
4927 +       }
4928 +
4929 +       return ret;
4930 +}
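
For example (hypothetical numbers), with uksm_thrash_threshold = 20 a slot that merged 1000 pages of which 300 later broke out through copy-on-write gives 300 * 100 / 1000 = 30 > 20, so its ratio is forced to 0 and the slot is treated as thrashing; if only 100 pages were cowed, 10 <= 20 and the reported value is 1000 - 100 = 900.
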
4931 +
4932 +/**
4933 + * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot.
4934 + */
4935 +static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
4936 +{
4937 +       unsigned long ret;
4938 +       unsigned long pages_scanned;
4939 +
4940 +       pages_scanned = slot->pages_scanned;
4941 +       if (!pages_scanned) {
4942 +               if (uksm_thrash_threshold)
4943 +                       return 0;
4944 +               else
4945 +                       pages_scanned = slot->pages_scanned;
4946 +       }
4947 +
4948 +       ret = slot->pages_bemerged * 100 / pages_scanned;
4949 +
4950 +       /* Thrashing area filtering */
4951 +       if (ret && uksm_thrash_threshold) {
4952 +               if (slot->pages_cowed * 100 / slot->pages_bemerged
4953 +                   > uksm_thrash_threshold) {
4954 +                       ret = 0;
4955 +               } else {
4956 +                       ret = slot->pages_bemerged - slot->pages_cowed;
4957 +               }
4958 +       }
4959 +
4960 +       return ret;
4961 +}
4962 +
4963 +/**
4964 + * stable_node_reinsert() - When the hash_strength has been adjusted, the
4965 + * stable tree need to be restructured, this is the function re-inserting the
4966 + * stable node.
4967 + */
4968 +static inline void stable_node_reinsert(struct stable_node *new_node,
4969 +                                       struct page *page,
4970 +                                       struct rb_root *root_treep,
4971 +                                       struct list_head *tree_node_listp,
4972 +                                       u32 hash)
4973 +{
4974 +       struct rb_node **new = &root_treep->rb_node;
4975 +       struct rb_node *parent = NULL;
4976 +       struct stable_node *stable_node;
4977 +       struct tree_node *tree_node;
4978 +       struct page *tree_page;
4979 +       int cmp;
4980 +
4981 +       while (*new) {
4982 +               int cmp;
4983 +
4984 +               tree_node = rb_entry(*new, struct tree_node, node);
4985 +
4986 +               cmp = hash_cmp(hash, tree_node->hash);
4987 +
4988 +               if (cmp < 0) {
4989 +                       parent = *new;
4990 +                       new = &parent->rb_left;
4991 +               } else if (cmp > 0) {
4992 +                       parent = *new;
4993 +                       new = &parent->rb_right;
4994 +               } else
4995 +                       break;
4996 +       }
4997 +
4998 +       if (*new) {
4999 +               /* found a tree node with the same first-level hash value */
5000 +               stable_node_hash_max(new_node, page, hash);
5001 +               if (tree_node->count == 1) {
5002 +                       stable_node = rb_entry(tree_node->sub_root.rb_node,
5003 +                                              struct stable_node, node);
5004 +                       tree_page = get_uksm_page(stable_node, 1, 0);
5005 +                       if (tree_page) {
5006 +                               stable_node_hash_max(stable_node,
5007 +                                                     tree_page, hash);
5008 +                               put_page(tree_page);
5009 +
5010 +                               /* prepare for stable node insertion */
5011 +
5012 +                               cmp = hash_cmp(new_node->hash_max,
5013 +                                                  stable_node->hash_max);
5014 +                               parent = &stable_node->node;
5015 +                               if (cmp < 0)
5016 +                                       new = &parent->rb_left;
5017 +                               else if (cmp > 0)
5018 +                                       new = &parent->rb_right;
5019 +                               else
5020 +                                       goto failed;
5021 +
5022 +                               goto add_node;
5023 +                       } else {
5024 +                               /* the only stable_node was deleted, but the
5025 +                                * tree node was not, so reuse it.
5026 +                                */
5027 +                               goto tree_node_reuse;
5028 +                       }
5029 +               }
5030 +
5031 +               /* well, search the collision subtree */
5032 +               new = &tree_node->sub_root.rb_node;
5033 +               parent = NULL;
5034 +               BUG_ON(!*new);
5035 +               while (*new) {
5036 +                       int cmp;
5037 +
5038 +                       stable_node = rb_entry(*new, struct stable_node, node);
5039 +
5040 +                       cmp = hash_cmp(new_node->hash_max,
5041 +                                          stable_node->hash_max);
5042 +
5043 +                       if (cmp < 0) {
5044 +                               parent = *new;
5045 +                               new = &parent->rb_left;
5046 +                       } else if (cmp > 0) {
5047 +                               parent = *new;
5048 +                               new = &parent->rb_right;
5049 +                       } else {
5050 +                               /* oh, no, still a collision */
5051 +                               goto failed;
5052 +                       }
5053 +               }
5054 +
5055 +               goto add_node;
5056 +       }
5057 +
5058 +       /* no tree node found */
5059 +       tree_node = alloc_tree_node(tree_node_listp);
5060 +       if (!tree_node) {
5061 +               printk(KERN_ERR "UKSM: memory allocation error!\n");
5062 +               goto failed;
5063 +       } else {
5064 +               tree_node->hash = hash;
5065 +               rb_link_node(&tree_node->node, parent, new);
5066 +               rb_insert_color(&tree_node->node, root_treep);
5067 +
5068 +tree_node_reuse:
5069 +               /* prepare for stable node insertion */
5070 +               parent = NULL;
5071 +               new = &tree_node->sub_root.rb_node;
5072 +       }
5073 +
5074 +add_node:
5075 +       rb_link_node(&new_node->node, parent, new);
5076 +       rb_insert_color(&new_node->node, &tree_node->sub_root);
5077 +       new_node->tree_node = tree_node;
5078 +       tree_node->count++;
5079 +       return;
5080 +
5081 +failed:
5082 +       /* This can only happen when two nodes have collided
5083 +        * at both hash levels.
5084 +        */
5085 +       new_node->tree_node = NULL;
5086 +       return;
5087 +}
5088 +
5089 +static inline void free_all_tree_nodes(struct list_head *list)
5090 +{
5091 +       struct tree_node *node, *tmp;
5092 +
5093 +       list_for_each_entry_safe(node, tmp, list, all_list) {
5094 +               free_tree_node(node);
5095 +       }
5096 +}
5097 +
5098 +/**
5099 + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
5100 + * strength to the current hash_strength. It re-structures the whole tree.
5101 + */
5102 +static inline void stable_tree_delta_hash(u32 prev_hash_strength)
5103 +{
5104 +       struct stable_node *node, *tmp;
5105 +       struct rb_root *root_new_treep;
5106 +       struct list_head *new_tree_node_listp;
5107 +
5108 +       stable_tree_index = (stable_tree_index + 1) % 2;
5109 +       root_new_treep = &root_stable_tree[stable_tree_index];
5110 +       new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
5111 +       *root_new_treep = RB_ROOT;
5112 +       BUG_ON(!list_empty(new_tree_node_listp));
5113 +
5114 +       /*
5115 +        * We need the _safe iterator: a node could be removed by get_uksm_page()
5116 +        */
5117 +       list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
5118 +               void *addr;
5119 +               struct page *node_page;
5120 +               u32 hash;
5121 +
5122 +               /*
5123 +                * We are completely re-structuring the stable nodes into a new
5124 +                * stable tree. We don't want to touch the old tree links or the
5125 +                * old tree_nodes; the old tree_nodes are freed all at once below.
5126 +                */
5127 +               node_page = get_uksm_page(node, 0, 0);
5128 +               if (!node_page)
5129 +                       continue;
5130 +
5131 +               if (node->tree_node) {
5132 +                       hash = node->tree_node->hash;
5133 +
5134 +                       addr = kmap_atomic(node_page);
5135 +
5136 +                       hash = delta_hash(addr, prev_hash_strength,
5137 +                                         hash_strength, hash);
5138 +                       kunmap_atomic(addr);
5139 +               } else {
5140 +                       /*
5141 +                        * It was not inserted into the rbtree due to a collision
5142 +                        * in the last scan round.
5143 +                        */
5144 +                       hash = page_hash(node_page, hash_strength, 0);
5145 +               }
5146 +
5147 +               stable_node_reinsert(node, node_page, root_new_treep,
5148 +                                    new_tree_node_listp, hash);
5149 +               put_page(node_page);
5150 +       }
5151 +
5152 +       root_stable_treep = root_new_treep;
5153 +       free_all_tree_nodes(stable_tree_node_listp);
5154 +       BUG_ON(!list_empty(stable_tree_node_listp));
5155 +       stable_tree_node_listp = new_tree_node_listp;
5156 +}
5157 +
5158 +static inline void inc_hash_strength(unsigned long delta)
5159 +{
5160 +       hash_strength += 1 << delta;
5161 +       if (hash_strength > HASH_STRENGTH_MAX)
5162 +               hash_strength = HASH_STRENGTH_MAX;
5163 +}
5164 +
5165 +static inline void dec_hash_strength(unsigned long delta)
5166 +{
5167 +       unsigned long change = 1 << delta;
5168 +
5169 +       if (hash_strength <= change + 1)
5170 +               hash_strength = 1;
5171 +       else
5172 +               hash_strength -= change;
5173 +}
5174 +
5175 +static inline void inc_hash_strength_delta(void)
5176 +{
5177 +       hash_strength_delta++;
5178 +       if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
5179 +               hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
5180 +}
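
Together with rshash_adjust() below, these helpers implement an exponential search step: consecutive moves in the same direction change hash_strength by 1, 2, 4, 8, ... samples (the shift is capped at HASH_STRENGTH_DELTA_MAX and the strength itself is clamped to the range [1, HASH_STRENGTH_MAX]), and the step falls back to 1 whenever the probing direction flips.
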
5181 +
5182 +/*
5183 +static inline unsigned long get_current_neg_ratio(void)
5184 +{
5185 +       if (!rshash_pos || rshash_neg > rshash_pos)
5186 +               return 100;
5187 +
5188 +       return div64_u64(100 * rshash_neg , rshash_pos);
5189 +}
5190 +*/
5191 +
5192 +static inline unsigned long get_current_neg_ratio(void)
5193 +{
5194 +       u64 pos = benefit.pos;
5195 +       u64 neg = benefit.neg;
5196 +
5197 +       if (!neg)
5198 +               return 0;
5199 +
5200 +       if (!pos || neg > pos)
5201 +               return 100;
5202 +
5203 +       if (neg > div64_u64(U64_MAX, 100))
5204 +               pos = div64_u64(pos, 100);
5205 +       else
5206 +               neg *= 100;
5207 +
5208 +       return div64_u64(neg, pos);
5209 +}
5210 +
5211 +static inline unsigned long get_current_benefit(void)
5212 +{
5213 +       u64 pos = benefit.pos;
5214 +       u64 neg = benefit.neg;
5215 +       u64 scanned = benefit.scanned;
5216 +
5217 +       if (neg > pos)
5218 +               return 0;
5219 +
5220 +       return div64_u64((pos - neg), scanned);
5221 +}
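
The scaling branch in get_current_neg_ratio() exists only to avoid 64-bit overflow when forming the percentage neg * 100 / pos: with, say, neg around 2^60 (hypothetical), neg > U64_MAX / 100, so pos is divided by 100 instead of neg being multiplied by 100; either path approximates the same ratio, just with less precision in the overflow-prone case.
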
5222 +
5223 +static inline int judge_rshash_direction(void)
5224 +{
5225 +       u64 current_neg_ratio, stable_benefit;
5226 +       u64 current_benefit, delta = 0;
5227 +       int ret = STILL;
5228 +
5229 +       /* Try to probe a value shortly after boot, and in case the system
5230 +          stays still for a long time. */
5231 +       if ((fully_scanned_round & 0xFFULL) == 10) {
5232 +               ret = OBSCURE;
5233 +               goto out;
5234 +       }
5235 +
5236 +       current_neg_ratio = get_current_neg_ratio();
5237 +
5238 +       if (current_neg_ratio == 0) {
5239 +               rshash_neg_cont_zero++;
5240 +               if (rshash_neg_cont_zero > 2)
5241 +                       return GO_DOWN;
5242 +               else
5243 +                       return STILL;
5244 +       }
5245 +       rshash_neg_cont_zero = 0;
5246 +
5247 +       if (current_neg_ratio > 90) {
5248 +               ret = GO_UP;
5249 +               goto out;
5250 +       }
5251 +
5252 +       current_benefit = get_current_benefit();
5253 +       stable_benefit = rshash_state.stable_benefit;
5254 +
5255 +       if (!stable_benefit) {
5256 +               ret = OBSCURE;
5257 +               goto out;
5258 +       }
5259 +
5260 +       if (current_benefit > stable_benefit)
5261 +               delta = current_benefit - stable_benefit;
5262 +       else if (current_benefit < stable_benefit)
5263 +               delta = stable_benefit - current_benefit;
5264 +
5265 +       delta = div64_u64(100 * delta , stable_benefit);
5266 +
5267 +       if (delta > 50) {
5268 +               rshash_cont_obscure++;
5269 +               if (rshash_cont_obscure > 2)
5270 +                       return OBSCURE;
5271 +               else
5272 +                       return STILL;
5273 +       }
5274 +
5275 +out:
5276 +       rshash_cont_obscure = 0;
5277 +       return ret;
5278 +}
5279 +
5280 +/**
5281 + * rshash_adjust() - The main function controlling the random sampling state
5282 + * machine for hash strength adaptation.
5283 + *
5284 + * return true if hash_strength has changed.
5285 + */
5286 +static inline int rshash_adjust(void)
5287 +{
5288 +       unsigned long prev_hash_strength = hash_strength;
5289 +
5290 +       if (!encode_benefit())
5291 +               return 0;
5292 +
5293 +       switch (rshash_state.state) {
5294 +       case RSHASH_STILL:
5295 +               switch (judge_rshash_direction()) {
5296 +               case GO_UP:
5297 +                       if (rshash_state.pre_direct == GO_DOWN)
5298 +                               hash_strength_delta = 0;
5299 +
5300 +                       inc_hash_strength(hash_strength_delta);
5301 +                       inc_hash_strength_delta();
5302 +                       rshash_state.stable_benefit = get_current_benefit();
5303 +                       rshash_state.pre_direct = GO_UP;
5304 +                       break;
5305 +
5306 +               case GO_DOWN:
5307 +                       if (rshash_state.pre_direct == GO_UP)
5308 +                               hash_strength_delta = 0;
5309 +
5310 +                       dec_hash_strength(hash_strength_delta);
5311 +                       inc_hash_strength_delta();
5312 +                       rshash_state.stable_benefit = get_current_benefit();
5313 +                       rshash_state.pre_direct = GO_DOWN;
5314 +                       break;
5315 +
5316 +               case OBSCURE:
5317 +                       rshash_state.stable_point = hash_strength;
5318 +                       rshash_state.turn_point_down = hash_strength;
5319 +                       rshash_state.turn_point_up = hash_strength;
5320 +                       rshash_state.turn_benefit_down = get_current_benefit();
5321 +                       rshash_state.turn_benefit_up = get_current_benefit();
5322 +                       rshash_state.lookup_window_index = 0;
5323 +                       rshash_state.state = RSHASH_TRYDOWN;
5324 +                       dec_hash_strength(hash_strength_delta);
5325 +                       inc_hash_strength_delta();
5326 +                       break;
5327 +
5328 +               case STILL:
5329 +                       break;
5330 +               default:
5331 +                       BUG();
5332 +               }
5333 +               break;
5334 +
5335 +       case RSHASH_TRYDOWN:
5336 +               if (rshash_state.lookup_window_index++ % 5 == 0)
5337 +                       rshash_state.below_count = 0;
5338 +
5339 +               if (get_current_benefit() < rshash_state.stable_benefit)
5340 +                       rshash_state.below_count++;
5341 +               else if (get_current_benefit() >
5342 +                        rshash_state.turn_benefit_down) {
5343 +                       rshash_state.turn_point_down = hash_strength;
5344 +                       rshash_state.turn_benefit_down = get_current_benefit();
5345 +               }
5346 +
5347 +               if (rshash_state.below_count >= 3 ||
5348 +                   judge_rshash_direction() == GO_UP ||
5349 +                   hash_strength == 1) {
5350 +                       hash_strength = rshash_state.stable_point;
5351 +                       hash_strength_delta = 0;
5352 +                       inc_hash_strength(hash_strength_delta);
5353 +                       inc_hash_strength_delta();
5354 +                       rshash_state.lookup_window_index = 0;
5355 +                       rshash_state.state = RSHASH_TRYUP;
5356 +                       hash_strength_delta = 0;
5357 +               } else {
5358 +                       dec_hash_strength(hash_strength_delta);
5359 +                       inc_hash_strength_delta();
5360 +               }
5361 +               break;
5362 +
5363 +       case RSHASH_TRYUP:
5364 +               if (rshash_state.lookup_window_index++ % 5 == 0)
5365 +                       rshash_state.below_count = 0;
5366 +
5367 +               if (get_current_benefit() < rshash_state.turn_benefit_down)
5368 +                       rshash_state.below_count++;
5369 +               else if (get_current_benefit() > rshash_state.turn_benefit_up) {
5370 +                       rshash_state.turn_point_up = hash_strength;
5371 +                       rshash_state.turn_benefit_up = get_current_benefit();
5372 +               }
5373 +
5374 +               if (rshash_state.below_count >= 3 ||
5375 +                   judge_rshash_direction() == GO_DOWN ||
5376 +                   hash_strength == HASH_STRENGTH_MAX) {
5377 +                       hash_strength = rshash_state.turn_benefit_up >
5378 +                               rshash_state.turn_benefit_down ?
5379 +                               rshash_state.turn_point_up :
5380 +                               rshash_state.turn_point_down;
5381 +
5382 +                       rshash_state.state = RSHASH_PRE_STILL;
5383 +               } else {
5384 +                       inc_hash_strength(hash_strength_delta);
5385 +                       inc_hash_strength_delta();
5386 +               }
5387 +
5388 +               break;
5389 +
5390 +       case RSHASH_NEW:
5391 +       case RSHASH_PRE_STILL:
5392 +               rshash_state.stable_benefit = get_current_benefit();
5393 +               rshash_state.state = RSHASH_STILL;
5394 +               hash_strength_delta = 0;
5395 +               break;
5396 +       default:
5397 +               BUG();
5398 +       }
5399 +
5400 +       /* rshash_neg = rshash_pos = 0; */
5401 +       reset_benefit();
5402 +
5403 +       if (prev_hash_strength != hash_strength)
5404 +               stable_tree_delta_hash(prev_hash_strength);
5405 +
5406 +       return prev_hash_strength != hash_strength;
5407 +}
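
In short, the sampling state machine cycles RSHASH_STILL -> RSHASH_TRYDOWN -> RSHASH_TRYUP -> RSHASH_PRE_STILL -> RSHASH_STILL: while STILL it nudges hash_strength up or down using the exponential step above, an OBSCURE judgement records the current point and enters TRYDOWN, TRYDOWN and TRYUP probe lower and then higher strengths while remembering the best benefit seen (turn_point_down / turn_point_up), and PRE_STILL records the benefit measured at the chosen turning point as the new stable_benefit before settling back to STILL.
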
5408 +
5409 +/**
5410 + * round_update_ladder() - The main function that updates all the ladder
5411 + * adjustments whenever a scan round is finished.
5412 + */
5413 +static noinline void round_update_ladder(void)
5414 +{
5415 +       int i;
5416 +       unsigned long dedup;
5417 +       struct vma_slot *slot, *tmp_slot;
5418 +
5419 +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5420 +               uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
5421 +       }
5422 +
5423 +       list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
5424 +
5425 +               /* the slot may already be rung_rm_slot()'ed when its mm exits */
5426 +               if (slot->snode) {
5427 +                       dedup = cal_dedup_ratio_old(slot);
5428 +                       if (dedup && dedup >= uksm_abundant_threshold)
5429 +                               vma_rung_up(slot);
5430 +               }
5431 +
5432 +               slot->pages_bemerged = 0;
5433 +               slot->pages_cowed = 0;
5434 +
5435 +               list_del_init(&slot->dedup_list);
5436 +       }
5437 +}
5438 +
5439 +static void uksm_del_vma_slot(struct vma_slot *slot)
5440 +{
5441 +       int i, j;
5442 +       struct rmap_list_entry *entry;
5443 +
5444 +       if (slot->snode) {
5445 +               /*
5446 +                * If it failed while entering the rung, the slot was never
5447 +                * linked in, so removal is not necessary.
5448 +                */
5449 +               rung_rm_slot(slot);
5450 +       }
5451 +
5452 +       if (!list_empty(&slot->dedup_list))
5453 +               list_del(&slot->dedup_list);
5454 +
5455 +       if (!slot->rmap_list_pool || !slot->pool_counts) {
5456 +               /* In case it OOMed in uksm_vma_enter() */
5457 +               goto out;
5458 +       }
5459 +
5460 +       for (i = 0; i < slot->pool_size; i++) {
5461 +               void *addr;
5462 +
5463 +               if (!slot->rmap_list_pool[i])
5464 +                       continue;
5465 +
5466 +               addr = kmap(slot->rmap_list_pool[i]);
5467 +               for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
5468 +                       entry = (struct rmap_list_entry *)addr + j;
5469 +                       if (is_addr(entry->addr))
5470 +                               continue;
5471 +                       if (!entry->item)
5472 +                               continue;
5473 +
5474 +                       remove_rmap_item_from_tree(entry->item);
5475 +                       free_rmap_item(entry->item);
5476 +                       slot->pool_counts[i]--;
5477 +               }
5478 +               BUG_ON(slot->pool_counts[i]);
5479 +               kunmap(slot->rmap_list_pool[i]);
5480 +               __free_page(slot->rmap_list_pool[i]);
5481 +       }
5482 +       kfree(slot->rmap_list_pool);
5483 +       kfree(slot->pool_counts);
5484 +
5485 +out:
5486 +       slot->rung = NULL;
5487 +       BUG_ON(uksm_pages_total < slot->pages);
5488 +       if (slot->flags & UKSM_SLOT_IN_UKSM)
5489 +               uksm_pages_total -= slot->pages;
5490 +
5491 +       if (slot->fully_scanned_round == fully_scanned_round)
5492 +               scanned_virtual_pages -= slot->pages;
5493 +       else
5494 +               scanned_virtual_pages -= slot->pages_scanned;
5495 +       free_vma_slot(slot);
5496 +}
5497 +
5498 +
5499 +#define SPIN_LOCK_PERIOD       32
5500 +static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
5501 +static inline void cleanup_vma_slots(void)
5502 +{
5503 +       struct vma_slot *slot;
5504 +       int i;
5505 +
5506 +       i = 0;
5507 +       spin_lock(&vma_slot_list_lock);
5508 +       while (!list_empty(&vma_slot_del)) {
5509 +               slot = list_entry(vma_slot_del.next,
5510 +                                 struct vma_slot, slot_list);
5511 +               list_del(&slot->slot_list);
5512 +               cleanup_slots[i++] = slot;
5513 +               if (i == SPIN_LOCK_PERIOD) {
5514 +                       spin_unlock(&vma_slot_list_lock);
5515 +                       while (--i >= 0)
5516 +                               uksm_del_vma_slot(cleanup_slots[i]);
5517 +                       i = 0;
5518 +                       spin_lock(&vma_slot_list_lock);
5519 +               }
5520 +       }
5521 +       spin_unlock(&vma_slot_list_lock);
5522 +
5523 +       while (--i >= 0)
5524 +               uksm_del_vma_slot(cleanup_slots[i]);
5525 +}
5526 +
5527 +/*
5528 + * Exponential moving average formula.
5529 + */
5530 +static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
5531 +{
5532 +       /*
5533 +        * For a very high burst, even the EMA cannot work well: a falsely
5534 +        * high per-page time estimate feeds back as very high overhead of
5535 +        * context switches and rung updates, which then leads to an even
5536 +        * higher per-page time, and the estimate may never converge.
5537 +        *
5538 +        * Instead, we try to approach such a value in a binary manner.
5539 +        */
5540 +       if (curr > last_ema * 10)
5541 +               return last_ema * 2;
5542 +
5543 +       return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
5544 +}
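
As an illustration (EMA_ALPHA is defined earlier in the patch; assume it is 20 here purely for arithmetic): with last_ema = 1000 ns, a sample of 2000 ns yields (20 * 2000 + 80 * 1000) / 100 = 1200 ns, while a burst sample of 50000 ns exceeds 10 * last_ema and is answered with 2 * 1000 = 2000 ns, so a single outlier cannot blow up the average.
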
5545 +
5546 +/*
5547 + * Convert a cpu ratio, given by the user in units of 1/TIME_RATIO_SCALE,
5548 + * to nanoseconds of scan time based on the current uksm_sleep_jiffies.
5549 + */
5550 +static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
5551 +{
5552 +       return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
5553 +               (TIME_RATIO_SCALE - ratio) * ratio;
5554 +}
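
Again for illustration only (assuming TIME_RATIO_SCALE is 100): with uksm_sleep_jiffies equivalent to 20 ms and ratio = 20, a rung is granted 20000000 / (100 - 20) * 20 = 5000000 ns of scanning per sleep period, i.e. 5 ms of scan against 20 ms of sleep, which matches the requested 20% share of total time.
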
5555 +
5556 +
5557 +static inline unsigned long rung_real_ratio(int cpu_time_ratio)
5558 +{
5559 +       unsigned long ret;
5560 +
5561 +       BUG_ON(!cpu_time_ratio);
5562 +
5563 +       if (cpu_time_ratio > 0)
5564 +               ret = cpu_time_ratio;
5565 +       else
5566 +               ret = (unsigned long)(-cpu_time_ratio) *
5567 +                       uksm_max_cpu_percentage / 100UL;
5568 +
5569 +       return ret ? ret : 1;
5570 +}
5571 +
5572 +static noinline void uksm_calc_scan_pages(void)
5573 +{
5574 +       struct scan_rung *ladder = uksm_scan_ladder;
5575 +       unsigned long sleep_usecs, nsecs;
5576 +       unsigned long ratio;
5577 +       int i;
5578 +       unsigned long per_page;
5579 +
5580 +       if (uksm_ema_page_time > 100000 ||
5581 +           (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
5582 +               uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
5583 +
5584 +       per_page = uksm_ema_page_time;
5585 +       BUG_ON(!per_page);
5586 +
5587 +       /*
5588 +        * Every 8 eval rounds, we try to re-probe a uksm_sleep_jiffies value
5589 +        * based on the saved user input.
5590 +        */
5591 +       if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
5592 +               uksm_sleep_jiffies = uksm_sleep_saved;
5593 +
5594 +       /* We require each rung to scan at least 1 page per period. */
5595 +       nsecs = per_page;
5596 +       ratio = rung_real_ratio(ladder[0].cpu_ratio);
5597 +       if (cpu_ratio_to_nsec(ratio) < nsecs) {
5598 +               sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
5599 +                               / NSEC_PER_USEC;
5600 +               uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
5601 +       }
5602 +
5603 +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5604 +               ratio = rung_real_ratio(ladder[i].cpu_ratio);
5605 +               ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
5606 +                                       per_page;
5607 +               BUG_ON(!ladder[i].pages_to_scan);
5608 +               uksm_calc_rung_step(&ladder[i], per_page, ratio);
5609 +       }
5610 +}
5611 +
5612 +/*
5613 + * Convert the scan time of this round (ns) to the next expected minimum
5614 + * sleep time (ms), being careful of possible overflows. ratio is taken
5615 + * from rung_real_ratio().
5616 + */
5617 +static inline
5618 +unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
5619 +{
5620 +       scan_time >>= 20; /* to msec level now */
5621 +       BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
5622 +
5623 +       return (unsigned int) ((unsigned long) scan_time *
5624 +                              (TIME_RATIO_SCALE - ratio) / ratio);
5625 +}
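
This is the inverse of cpu_ratio_to_nsec(): the right shift by 20 approximates nanoseconds as milliseconds (dividing by 2^20 instead of 10^6), and a round that took roughly 5 ms at ratio = 20 (with the same illustrative TIME_RATIO_SCALE = 100) then demands at least 5 * (100 - 20) / 20 = 20 ms of sleep before the next scan.
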
5626 +
5627 +#define __round_mask(x, y) ((__typeof__(x))((y)-1))
5628 +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
5629 +
5630 +static inline unsigned long vma_pool_size(struct vma_slot *slot)
5631 +{
5632 +       return round_up(sizeof(struct rmap_list_entry) * slot->pages,
5633 +                       PAGE_SIZE) >> PAGE_SHIFT;
5634 +}
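
vma_pool_size() is a ceiling division: one pool page is needed for every PAGE_SIZE / sizeof(struct rmap_list_entry) pages of the VMA. For example, with 4 KiB pages and a (purely illustrative) 16-byte rmap_list_entry, a 1000-page VMA needs round_up(16000, 4096) >> PAGE_SHIFT = 4 pool pages, each able to track 256 entries.
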
5635 +
5636 +static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
5637 +{
5638 +       struct scan_rung *rung;
5639 +       unsigned long pool_size, i;
5640 +       struct vma_slot *slot;
5641 +       int failed;
5642 +
5643 +       rung = &uksm_scan_ladder[0];
5644 +
5645 +       failed = 0;
5646 +       for (i = 0; i < num; i++) {
5647 +               slot = slots[i];
5648 +
5649 +               pool_size = vma_pool_size(slot);
5650 +               slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
5651 +                                              pool_size, GFP_KERNEL);
5652 +               if (!slot->rmap_list_pool)
5653 +                       break;
5654 +
5655 +               slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size,
5656 +                                           GFP_KERNEL);
5657 +               if (!slot->pool_counts) {
5658 +                       kfree(slot->rmap_list_pool);
5659 +                       break;
5660 +               }
5661 +
5662 +               slot->pool_size = pool_size;
5663 +               BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
5664 +               slot->flags |= UKSM_SLOT_IN_UKSM;
5665 +               uksm_pages_total += slot->pages;
5666 +       }
5667 +
5668 +       if (i)
5669 +               rung_add_new_slots(rung, slots, i);
5670 +
5671 +       return;
5672 +}
5673 +
5674 +static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
5675 +
5676 +static void uksm_enter_all_slots(void)
5677 +{
5678 +       struct vma_slot *slot;
5679 +       unsigned long index;
5680 +       struct list_head empty_vma_list;
5681 +       int i;
5682 +
5683 +       i = 0;
5684 +       index = 0;
5685 +       INIT_LIST_HEAD(&empty_vma_list);
5686 +
5687 +       spin_lock(&vma_slot_list_lock);
5688 +       while (!list_empty(&vma_slot_new)) {
5689 +               slot = list_entry(vma_slot_new.next,
5690 +                                 struct vma_slot, slot_list);
5691 +
5692 +               if (!slot->vma->anon_vma) {
5693 +                       list_move(&slot->slot_list, &empty_vma_list);
5694 +               } else if (vma_can_enter(slot->vma)) {
5695 +                       batch_slots[index++] = slot;
5696 +                       list_del_init(&slot->slot_list);
5697 +               } else {
5698 +                       list_move(&slot->slot_list, &vma_slot_noadd);
5699 +               }
5700 +
5701 +               if (++i == SPIN_LOCK_PERIOD ||
5702 +                   (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
5703 +                       spin_unlock(&vma_slot_list_lock);
5704 +
5705 +                       if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
5706 +                               uksm_vma_enter(batch_slots, index);
5707 +                               index = 0;
5708 +                       }
5709 +                       i = 0;
5710 +                       cond_resched();
5711 +                       spin_lock(&vma_slot_list_lock);
5712 +               }
5713 +       }
5714 +
5715 +       list_splice(&empty_vma_list, &vma_slot_new);
5716 +
5717 +       spin_unlock(&vma_slot_list_lock);
5718 +
5719 +       if (index)
5720 +               uksm_vma_enter(batch_slots, index);
5721 +
5722 +}
5723 +
5724 +static inline int rung_round_finished(struct scan_rung *rung)
5725 +{
5726 +       return rung->flags & UKSM_RUNG_ROUND_FINISHED;
5727 +}
5728 +
5729 +static inline void judge_slot(struct vma_slot *slot)
5730 +{
5731 +       struct scan_rung *rung = slot->rung;
5732 +       unsigned long dedup;
5733 +       int deleted;
5734 +
5735 +       dedup = cal_dedup_ratio(slot);
5736 +       if (vma_fully_scanned(slot) && uksm_thrash_threshold)
5737 +               deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
5738 +       else if (dedup && dedup >= uksm_abundant_threshold)
5739 +               deleted = vma_rung_up(slot);
5740 +       else
5741 +               deleted = vma_rung_down(slot);
5742 +
5743 +       slot->pages_merged = 0;
5744 +       slot->pages_cowed = 0;
5745 +
5746 +       if (vma_fully_scanned(slot))
5747 +               slot->pages_scanned = 0;
5748 +
5749 +       slot->last_scanned = slot->pages_scanned;
5750 +
5751 +       /* If it was deleted above, the rung was already advanced. */
5752 +       if (!deleted)
5753 +               advance_current_scan(rung);
5754 +}
5755 +
5756 +
5757 +static inline int hash_round_finished(void)
5758 +{
5759 +       if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
5760 +               scanned_virtual_pages = 0;
5761 +               if (uksm_pages_scanned)
5762 +                       fully_scanned_round++;
5763 +
5764 +               return 1;
5765 +       } else {
5766 +               return 0;
5767 +       }
5768 +}
5769 +
5770 +#define UKSM_MMSEM_BATCH       5
5771 +#define BUSY_RETRY             100
5772 +
5773 +/**
5774 + * uksm_do_scan()  - the main worker function.
5775 + */
5776 +static noinline void uksm_do_scan(void)
5777 +{
5778 +       struct vma_slot *slot, *iter;
5779 +       struct mm_struct *busy_mm;
5780 +       unsigned char round_finished, all_rungs_emtpy;
5781 +       int i, err, mmsem_batch;
5782 +       unsigned long pcost;
5783 +       long long delta_exec;
5784 +       unsigned long vpages, max_cpu_ratio;
5785 +       unsigned long long start_time, end_time, scan_time;
5786 +       unsigned int expected_jiffies;
5787 +
5788 +       might_sleep();
5789 +
5790 +       vpages = 0;
5791 +
5792 +       start_time = task_sched_runtime(current);
5793 +       max_cpu_ratio = 0;
5794 +       mmsem_batch = 0;
5795 +
5796 +       for (i = 0; i < SCAN_LADDER_SIZE;) {
5797 +               struct scan_rung *rung = &uksm_scan_ladder[i];
5798 +               unsigned long ratio;
5799 +               int busy_retry;
5800 +
5801 +               if (!rung->pages_to_scan) {
5802 +                       i++;
5803 +                       continue;
5804 +               }
5805 +
5806 +               if (!rung->vma_root.num) {
5807 +                       rung->pages_to_scan = 0;
5808 +                       i++;
5809 +                       continue;
5810 +               }
5811 +
5812 +               ratio = rung_real_ratio(rung->cpu_ratio);
5813 +               if (ratio > max_cpu_ratio)
5814 +                       max_cpu_ratio = ratio;
5815 +
5816 +               busy_retry = BUSY_RETRY;
5817 +               /*
5818 +                * Do not consider rung_round_finished() here, just used up the
5819 +                * Do not consider rung_round_finished() here, just use up the
5820 +                */
5821 +               while (rung->pages_to_scan && rung->vma_root.num &&
5822 +                      likely(!freezing(current))) {
5823 +                       int reset = 0;
5824 +
5825 +                       slot = rung->current_scan;
5826 +
5827 +                       BUG_ON(vma_fully_scanned(slot));
5828 +
5829 +                       if (mmsem_batch) {
5830 +                               err = 0;
5831 +                       } else {
5832 +                               err = try_down_read_slot_mmap_sem(slot);
5833 +                       }
5834 +
5835 +                       if (err == -ENOENT) {
5836 +rm_slot:
5837 +                               rung_rm_slot(slot);
5838 +                               continue;
5839 +                       }
5840 +
5841 +                       busy_mm = slot->mm;
5842 +
5843 +                       if (err == -EBUSY) {
5844 +                               /* skip other vmas on the same mm */
5845 +                               do {
5846 +                                       reset = advance_current_scan(rung);
5847 +                                       iter = rung->current_scan;
5848 +                                       busy_retry--;
5849 +                                       if (iter->vma->vm_mm != busy_mm ||
5850 +                                           !busy_retry || reset)
5851 +                                               break;
5852 +                               } while (1);
5853 +
5854 +                               if (iter->vma->vm_mm != busy_mm) {
5855 +                                       continue;
5856 +                               } else {
5857 +                                       /* scan round finished */
5858 +                                       break;
5859 +                               }
5860 +                       }
5861 +
5862 +                       BUG_ON(!vma_can_enter(slot->vma));
5863 +                       if (uksm_test_exit(slot->vma->vm_mm)) {
5864 +                               mmsem_batch = 0;
5865 +                               up_read(&slot->vma->vm_mm->mmap_sem);
5866 +                               goto rm_slot;
5867 +                       }
5868 +
5869 +                       if (mmsem_batch)
5870 +                               mmsem_batch--;
5871 +                       else
5872 +                               mmsem_batch = UKSM_MMSEM_BATCH;
5873 +
5874 +                       /* OK, we have taken the mmap_sem, ready to scan */
5875 +                       scan_vma_one_page(slot);
5876 +                       rung->pages_to_scan--;
5877 +                       vpages++;
5878 +
5879 +                       if (rung->current_offset + rung->step > slot->pages - 1
5880 +                           || vma_fully_scanned(slot)) {
5881 +                               up_read(&slot->vma->vm_mm->mmap_sem);
5882 +                               judge_slot(slot);
5883 +                               mmsem_batch = 0;
5884 +                       } else {
5885 +                               rung->current_offset += rung->step;
5886 +                               if (!mmsem_batch)
5887 +                                       up_read(&slot->vma->vm_mm->mmap_sem);
5888 +                       }
5889 +
5890 +                       busy_retry = BUSY_RETRY;
5891 +                       cond_resched();
5892 +               }
5893 +
5894 +               if (mmsem_batch) {
5895 +                       up_read(&slot->vma->vm_mm->mmap_sem);
5896 +                       mmsem_batch = 0;
5897 +               }
5898 +
5899 +               if (freezing(current))
5900 +                       break;
5901 +
5902 +               cond_resched();
5903 +       }
5904 +       end_time = task_sched_runtime(current);
5905 +       delta_exec = end_time - start_time;
5906 +
5907 +       if (freezing(current))
5908 +               return;
5909 +
5910 +       cleanup_vma_slots();
5911 +       uksm_enter_all_slots();
5912 +
5913 +       round_finished = 1;
5914 +       all_rungs_emtpy = 1;
5915 +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5916 +               struct scan_rung *rung = &uksm_scan_ladder[i];
5917 +
5918 +               if (rung->vma_root.num) {
5919 +                       all_rungs_emtpy = 0;
5920 +                       if (!rung_round_finished(rung))
5921 +                               round_finished = 0;
5922 +               }
5923 +       }
5924 +
5925 +       if (all_rungs_emtpy)
5926 +               round_finished = 0;
5927 +
5928 +       if (round_finished) {
5929 +               round_update_ladder();
5930 +               uksm_eval_round++;
5931 +
5932 +               if (hash_round_finished() && rshash_adjust()) {
5933 +                       /* Reset the unstable root iff hash strength changed */
5934 +                       uksm_hash_round++;
5935 +                       root_unstable_tree = RB_ROOT;
5936 +                       free_all_tree_nodes(&unstable_tree_node_list);
5937 +               }
5938 +
5939 +               /*
5940 +                * A number of pages can hang around indefinitely on per-cpu
5941 +                * pagevecs, raised page count preventing write_protect_page
5942 +                * from merging them.  Though it doesn't really matter much,
5943 +                * it is puzzling to see some stuck in pages_volatile until
5944 +                * other activity jostles them out, and they also prevented
5945 +                * LTP's KSM test from succeeding deterministically; so drain
5946 +                * them here (here rather than on entry to uksm_do_scan(),
5947 +                * so we don't IPI too often when pages_to_scan is set low).
5948 +                */
5949 +               lru_add_drain_all();
5950 +       }
5951 +
5952 +
5953 +       if (vpages && delta_exec > 0) {
5954 +               pcost = (unsigned long) delta_exec / vpages;
5955 +               if (likely(uksm_ema_page_time))
5956 +                       uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
5957 +               else
5958 +                       uksm_ema_page_time = pcost;
5959 +       }
5960 +
5961 +       uksm_calc_scan_pages();
5962 +       uksm_sleep_real = uksm_sleep_jiffies;
5963 +       /* in case of radical cpu bursts, apply the upper bound */
5964 +       end_time = task_sched_runtime(current);
5965 +       if (max_cpu_ratio && end_time > start_time) {
5966 +               scan_time = end_time - start_time;
5967 +               expected_jiffies = msecs_to_jiffies(
5968 +                       scan_time_to_sleep(scan_time, max_cpu_ratio));
5969 +
5970 +               if (expected_jiffies > uksm_sleep_real)
5971 +                       uksm_sleep_real = expected_jiffies;
5972 +
5973 +               /* We have a 1 second upper bound for responsiveness. */
5974 +               if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
5975 +                       uksm_sleep_real = msecs_to_jiffies(1000);
5976 +       }
5977 +
5978 +       return;
5979 +}
5980 +
5981 +static int ksmd_should_run(void)
5982 +{
5983 +       return uksm_run & UKSM_RUN_MERGE;
5984 +}
5985 +
5986 +static int uksm_scan_thread(void *nothing)
5987 +{
5988 +       set_freezable();
5989 +       set_user_nice(current, 5);
5990 +
5991 +       while (!kthread_should_stop()) {
5992 +               mutex_lock(&uksm_thread_mutex);
5993 +               if (ksmd_should_run()) {
5994 +                       uksm_do_scan();
5995 +               }
5996 +               mutex_unlock(&uksm_thread_mutex);
5997 +
5998 +               try_to_freeze();
5999 +
6000 +               if (ksmd_should_run()) {
6001 +                       schedule_timeout_interruptible(uksm_sleep_real);
6002 +                       uksm_sleep_times++;
6003 +               } else {
6004 +                       wait_event_freezable(uksm_thread_wait,
6005 +                               ksmd_should_run() || kthread_should_stop());
6006 +               }
6007 +       }
6008 +       return 0;
6009 +}
6010 +
6011 +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
6012 +{
6013 +       struct stable_node *stable_node;
6014 +       struct node_vma *node_vma;
6015 +       struct rmap_item *rmap_item;
6016 +       int ret = SWAP_AGAIN;
6017 +       int search_new_forks = 0;
6018 +       unsigned long address;
6019 +
6020 +       VM_BUG_ON_PAGE(!PageKsm(page), page);
6021 +       VM_BUG_ON_PAGE(!PageLocked(page), page);
6022 +
6023 +       stable_node = page_stable_node(page);
6024 +       if (!stable_node)
6025 +               return ret;
6026 +again:
6027 +       hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
6028 +               hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
6029 +                       struct anon_vma *anon_vma = rmap_item->anon_vma;
6030 +                       struct anon_vma_chain *vmac;
6031 +                       struct vm_area_struct *vma;
6032 +
6033 +                       anon_vma_lock_read(anon_vma);
6034 +                       anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
6035 +                                                      0, ULONG_MAX) {
6036 +                               vma = vmac->vma;
6037 +                               address = get_rmap_addr(rmap_item);
6038 +
6039 +                               if (address < vma->vm_start ||
6040 +                                   address >= vma->vm_end)
6041 +                                       continue;
6042 +
6043 +                               if ((rmap_item->slot->vma == vma) ==
6044 +                                   search_new_forks)
6045 +                                       continue;
6046 +
6047 +                               if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
6048 +                                       continue;
6049 +
6050 +                               ret = rwc->rmap_one(page, vma, address, rwc->arg);
6051 +                               if (ret != SWAP_AGAIN) {
6052 +                                       anon_vma_unlock_read(anon_vma);
6053 +                                       goto out;
6054 +                               }
6055 +
6056 +                               if (rwc->done && rwc->done(page)) {
6057 +                                       anon_vma_unlock_read(anon_vma);
6058 +                                       goto out;
6059 +                               }
6060 +                       }
6061 +                       anon_vma_unlock_read(anon_vma);
6062 +               }
6063 +       }
6064 +       if (!search_new_forks++)
6065 +               goto again;
6066 +out:
6067 +       return ret;
6068 +}
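/*
 * Note on rmap_walk_ksm() above: the stable node's rmap_items are walked
 * twice.  With search_new_forks == 0 only the VMA each rmap_item was
 * registered from is visited; the second pass (search_new_forks == 1)
 * covers the remaining VMAs of the same anon_vma interval tree, i.e.
 * forked address spaces that may also map the KSM page.
 */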
6069 +
6070 +#ifdef CONFIG_MIGRATION
6071 +/* Common ksm interface, though the implementation may be uksm-specific */
6072 +void ksm_migrate_page(struct page *newpage, struct page *oldpage)
6073 +{
6074 +       struct stable_node *stable_node;
6075 +
6076 +       VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6077 +       VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6078 +       VM_BUG_ON(newpage->mapping != oldpage->mapping);
6079 +
6080 +       stable_node = page_stable_node(newpage);
6081 +       if (stable_node) {
6082 +               VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
6083 +               stable_node->kpfn = page_to_pfn(newpage);
6084 +       }
6085 +}
6086 +#endif /* CONFIG_MIGRATION */
6087 +
6088 +#ifdef CONFIG_MEMORY_HOTREMOVE
6089 +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
6090 +                                                unsigned long end_pfn)
6091 +{
6092 +       struct rb_node *node;
6093 +
6094 +       for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
6095 +               struct stable_node *stable_node;
6096 +
6097 +               stable_node = rb_entry(node, struct stable_node, node);
6098 +               if (stable_node->kpfn >= start_pfn &&
6099 +                   stable_node->kpfn < end_pfn)
6100 +                       return stable_node;
6101 +       }
6102 +       return NULL;
6103 +}
6104 +
6105 +static int uksm_memory_callback(struct notifier_block *self,
6106 +                              unsigned long action, void *arg)
6107 +{
6108 +       struct memory_notify *mn = arg;
6109 +       struct stable_node *stable_node;
6110 +
6111 +       switch (action) {
6112 +       case MEM_GOING_OFFLINE:
6113 +               /*
6114 +                * Keep it very simple for now: just lock out ksmd and
6115 +                * MADV_UNMERGEABLE while any memory is going offline.
6116 +                * mutex_lock_nested() is necessary because lockdep was alarmed
6117 +                * that here we take uksm_thread_mutex inside notifier chain
6118 +                * mutex, and later take notifier chain mutex inside
6119 +                * uksm_thread_mutex to unlock it.   But that's safe because both
6120 +                * are inside mem_hotplug_mutex.
6121 +                */
6122 +               mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
6123 +               break;
6124 +
6125 +       case MEM_OFFLINE:
6126 +               /*
6127 +                * Most of the work is done by page migration; but there might
6128 +                * be a few stable_nodes left over, still pointing to struct
6129 +                * pages which have been offlined: prune those from the tree.
6130 +                */
6131 +               while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
6132 +                                       mn->start_pfn + mn->nr_pages)) != NULL)
6133 +                       remove_node_from_stable_tree(stable_node, 1, 1);
6134 +               /* fallthrough */
6135 +
6136 +       case MEM_CANCEL_OFFLINE:
6137 +               mutex_unlock(&uksm_thread_mutex);
6138 +               break;
6139 +       }
6140 +       return NOTIFY_OK;
6141 +}
6142 +#endif /* CONFIG_MEMORY_HOTREMOVE */
6143 +
6144 +#ifdef CONFIG_SYSFS
6145 +/*
6146 + * This all compiles without CONFIG_SYSFS, but is a waste of space.
6147 + */
6148 +
6149 +#define UKSM_ATTR_RO(_name) \
6150 +       static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
6151 +#define UKSM_ATTR(_name) \
6152 +       static struct kobj_attribute _name##_attr = \
6153 +               __ATTR(_name, 0644, _name##_show, _name##_store)
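/*
 * For reference, the expansion of the helper above for the "run" knob
 * defined later in this file:
 *
 *	UKSM_ATTR(run);
 * becomes
 *	static struct kobj_attribute run_attr =
 *		__ATTR(run, 0644, run_show, run_store);
 *
 * i.e. a world-readable, root-writable sysfs attribute bound to
 * run_show()/run_store().
 */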
6154 +
6155 +static ssize_t max_cpu_percentage_show(struct kobject *kobj,
6156 +                                   struct kobj_attribute *attr, char *buf)
6157 +{
6158 +       return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
6159 +}
6160 +
6161 +static ssize_t max_cpu_percentage_store(struct kobject *kobj,
6162 +                                    struct kobj_attribute *attr,
6163 +                                    const char *buf, size_t count)
6164 +{
6165 +       unsigned long max_cpu_percentage;
6166 +       int err;
6167 +
6168 +       err = kstrtoul(buf, 10, &max_cpu_percentage);
6169 +       if (err || max_cpu_percentage > 100)
6170 +               return -EINVAL;
6171 +
6172 +       if (max_cpu_percentage == 100)
6173 +               max_cpu_percentage = 99;
6174 +       else if (max_cpu_percentage < 10)
6175 +               max_cpu_percentage = 10;
6176 +
6177 +       uksm_max_cpu_percentage = max_cpu_percentage;
6178 +
6179 +       return count;
6180 +}
6181 +UKSM_ATTR(max_cpu_percentage);
6182 +
6183 +static ssize_t sleep_millisecs_show(struct kobject *kobj,
6184 +                                   struct kobj_attribute *attr, char *buf)
6185 +{
6186 +       return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
6187 +}
6188 +
6189 +static ssize_t sleep_millisecs_store(struct kobject *kobj,
6190 +                                    struct kobj_attribute *attr,
6191 +                                    const char *buf, size_t count)
6192 +{
6193 +       unsigned long msecs;
6194 +       int err;
6195 +
6196 +       err = kstrtoul(buf, 10, &msecs);
6197 +       if (err || msecs > MSEC_PER_SEC)
6198 +               return -EINVAL;
6199 +
6200 +       uksm_sleep_jiffies = msecs_to_jiffies(msecs);
6201 +       uksm_sleep_saved = uksm_sleep_jiffies;
6202 +
6203 +       return count;
6204 +}
6205 +UKSM_ATTR(sleep_millisecs);
6206 +
6207 +
6208 +static ssize_t cpu_governor_show(struct kobject *kobj,
6209 +                                 struct kobj_attribute *attr, char *buf)
6210 +{
6211 +       int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6212 +       int i;
6213 +
6214 +       buf[0] = '\0';
6215 +       for (i = 0; i < n ; i++) {
6216 +               if (uksm_cpu_governor == i)
6217 +                       strcat(buf, "[");
6218 +
6219 +               strcat(buf, uksm_cpu_governor_str[i]);
6220 +
6221 +               if (uksm_cpu_governor == i)
6222 +                       strcat(buf, "]");
6223 +
6224 +               strcat(buf, " ");
6225 +       }
6226 +       strcat(buf, "\n");
6227 +
6228 +       return strlen(buf);
6229 +}
6230 +
6231 +static inline void init_performance_values(void)
6232 +{
6233 +       int i;
6234 +       struct scan_rung *rung;
6235 +       struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
6236 +
6237 +
6238 +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6239 +               rung = uksm_scan_ladder + i;
6240 +               rung->cpu_ratio = preset->cpu_ratio[i];
6241 +               rung->cover_msecs = preset->cover_msecs[i];
6242 +       }
6243 +
6244 +       uksm_max_cpu_percentage = preset->max_cpu;
6245 +}
6246 +
6247 +static ssize_t cpu_governor_store(struct kobject *kobj,
6248 +                                  struct kobj_attribute *attr,
6249 +                                  const char *buf, size_t count)
6250 +{
6251 +       int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6252 +
6253 +       for (n--; n >= 0; n--) {
6254 +               if (!strncmp(buf, uksm_cpu_governor_str[n],
6255 +                            strlen(uksm_cpu_governor_str[n])))
6256 +                       break;
6257 +       }
6258 +
6259 +       if (n < 0)
6260 +               return -EINVAL;
6261 +       else
6262 +               uksm_cpu_governor = n;
6263 +
6264 +       init_performance_values();
6265 +
6266 +       return count;
6267 +}
6268 +UKSM_ATTR(cpu_governor);
6269 +
6270 +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
6271 +                       char *buf)
6272 +{
6273 +       return sprintf(buf, "%u\n", uksm_run);
6274 +}
6275 +
6276 +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
6277 +                        const char *buf, size_t count)
6278 +{
6279 +       int err;
6280 +       unsigned long flags;
6281 +
6282 +       err = kstrtoul(buf, 10, &flags);
6283 +       if (err || flags > UINT_MAX)
6284 +               return -EINVAL;
6285 +       if (flags > UKSM_RUN_MERGE)
6286 +               return -EINVAL;
6287 +
6288 +       mutex_lock(&uksm_thread_mutex);
6289 +       if (uksm_run != flags) {
6290 +               uksm_run = flags;
6291 +       }
6292 +       mutex_unlock(&uksm_thread_mutex);
6293 +
6294 +       if (flags & UKSM_RUN_MERGE)
6295 +               wake_up_interruptible(&uksm_thread_wait);
6296 +
6297 +       return count;
6298 +}
6299 +UKSM_ATTR(run);
6300 +
6301 +static ssize_t abundant_threshold_show(struct kobject *kobj,
6302 +                                    struct kobj_attribute *attr, char *buf)
6303 +{
6304 +       return sprintf(buf, "%u\n", uksm_abundant_threshold);
6305 +}
6306 +
6307 +static ssize_t abundant_threshold_store(struct kobject *kobj,
6308 +                                     struct kobj_attribute *attr,
6309 +                                     const char *buf, size_t count)
6310 +{
6311 +       int err;
6312 +       unsigned long flags;
6313 +
6314 +       err = kstrtoul(buf, 10, &flags);
6315 +       if (err || flags > 99)
6316 +               return -EINVAL;
6317 +
6318 +       uksm_abundant_threshold = flags;
6319 +
6320 +       return count;
6321 +}
6322 +UKSM_ATTR(abundant_threshold);
6323 +
6324 +static ssize_t thrash_threshold_show(struct kobject *kobj,
6325 +                                    struct kobj_attribute *attr, char *buf)
6326 +{
6327 +       return sprintf(buf, "%u\n", uksm_thrash_threshold);
6328 +}
6329 +
6330 +static ssize_t thrash_threshold_store(struct kobject *kobj,
6331 +                                     struct kobj_attribute *attr,
6332 +                                     const char *buf, size_t count)
6333 +{
6334 +       int err;
6335 +       unsigned long flags;
6336 +
6337 +       err = kstrtoul(buf, 10, &flags);
6338 +       if (err || flags > 99)
6339 +               return -EINVAL;
6340 +
6341 +       uksm_thrash_threshold = flags;
6342 +
6343 +       return count;
6344 +}
6345 +UKSM_ATTR(thrash_threshold);
6346 +
6347 +static ssize_t cpu_ratios_show(struct kobject *kobj,
6348 +                              struct kobj_attribute *attr, char *buf)
6349 +{
6350 +       int i, size;
6351 +       struct scan_rung *rung;
6352 +       char *p = buf;
6353 +
6354 +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6355 +               rung = &uksm_scan_ladder[i];
6356 +
6357 +               if (rung->cpu_ratio > 0)
6358 +                       size = sprintf(p, "%d ", rung->cpu_ratio);
6359 +               else
6360 +                       size = sprintf(p, "MAX/%d ",
6361 +                                       TIME_RATIO_SCALE / -rung->cpu_ratio);
6362 +
6363 +               p += size;
6364 +       }
6365 +
6366 +       *p++ = '\n';
6367 +       *p = '\0';
6368 +
6369 +       return p - buf;
6370 +}
6371 +
6372 +static ssize_t cpu_ratios_store(struct kobject *kobj,
6373 +                                     struct kobj_attribute *attr,
6374 +                                     const char *buf, size_t count)
6375 +{
6376 +       int i, cpuratios[SCAN_LADDER_SIZE], err;
6377 +       unsigned long value;
6378 +       struct scan_rung *rung;
6379 +       char *p, *end = NULL;
6380 +
6381 +       p = kzalloc(count + 1, GFP_KERNEL); /* +1 keeps the copy NUL-terminated */
6382 +       if (!p)
6383 +               return -ENOMEM;
6384 +
6385 +       memcpy(p, buf, count);
6386 +
6387 +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6388 +               if (i != SCAN_LADDER_SIZE -1) {
6389 +                       end = strchr(p, ' ');
6390 +                       if (!end)
6391 +                               return -EINVAL;
6392 +
6393 +                       *end = '\0';
6394 +               }
6395 +
6396 +               if (strstr(p, "MAX/")) {
6397 +                       p = strchr(p, '/') + 1;
6398 +                       err = kstrtoul(p, 10, &value);
6399 +                       if (err || value > TIME_RATIO_SCALE || !value)
6400 +                               return -EINVAL;
6401 +
6402 +                       cpuratios[i] = - (int) (TIME_RATIO_SCALE / value);
6403 +               } else {
6404 +                       err = kstrtoul(p, 10, &value);
6405 +                       if (err || value > TIME_RATIO_SCALE || !value)
6406 +                               return -EINVAL;
6407 +
6408 +                       cpuratios[i] = value;
6409 +               }
6410 +
6411 +               p = end + 1;
6412 +       }
6413 +
6414 +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6415 +               rung = &uksm_scan_ladder[i];
6416 +
6417 +               rung->cpu_ratio = cpuratios[i];
6418 +       }
6419 +
6420 +       return count;
6421 +}
6422 +UKSM_ATTR(cpu_ratios);
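/*
 * Input sketch for cpu_ratios_store() above: the write must contain one
 * space-separated field per scan rung (SCAN_LADDER_SIZE fields), each
 * either a plain ratio in (0, TIME_RATIO_SCALE] or "MAX/<divisor>".
 * Assuming SCAN_LADDER_SIZE is 4 (its real value is defined earlier in
 * this file), a string such as "20 40 MAX/20 MAX/10" would be accepted,
 * with the "MAX/" forms stored as negative ratios relative to
 * TIME_RATIO_SCALE.
 */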
6423 +
6424 +static ssize_t eval_intervals_show(struct kobject *kobj,
6425 +                              struct kobj_attribute *attr, char *buf)
6426 +{
6427 +       int i, size;
6428 +       struct scan_rung *rung;
6429 +       char *p = buf;
6430 +
6431 +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6432 +               rung = &uksm_scan_ladder[i];
6433 +               size = sprintf(p, "%u ", rung->cover_msecs);
6434 +               p += size;
6435 +       }
6436 +
6437 +       *p++ = '\n';
6438 +       *p = '\0';
6439 +
6440 +       return p - buf;
6441 +}
6442 +
6443 +static ssize_t eval_intervals_store(struct kobject *kobj,
6444 +                                     struct kobj_attribute *attr,
6445 +                                     const char *buf, size_t count)
6446 +{
6447 +       int i, err;
6448 +       unsigned long values[SCAN_LADDER_SIZE];
6449 +       struct scan_rung *rung;
6450 +       char *p, *end = NULL;
6451 +
6452 +       p = kzalloc(count + 1, GFP_KERNEL); /* +1 keeps the copy NUL-terminated */
6453 +       if (!p)
6454 +               return -ENOMEM;
6455 +
6456 +       memcpy(p, buf, count);
6457 +
6458 +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6459 +               if (i != SCAN_LADDER_SIZE -1) {
6460 +                       end = strchr(p, ' ');
6461 +                       if (!end)
6462 +                               return -EINVAL;
6463 +
6464 +                       *end = '\0';
6465 +               }
6466 +
6467 +               err = kstrtoul(p, 10, &values[i]);
6468 +               if (err)
6469 +                       return -EINVAL;
6470 +
6471 +               p = end + 1;
6472 +       }
6473 +
6474 +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6475 +               rung = &uksm_scan_ladder[i];
6476 +
6477 +               rung->cover_msecs = values[i];
6478 +       }
6479 +
6480 +       return count;
6481 +}
6482 +UKSM_ATTR(eval_intervals);
6483 +
6484 +static ssize_t ema_per_page_time_show(struct kobject *kobj,
6485 +                                struct kobj_attribute *attr, char *buf)
6486 +{
6487 +       return sprintf(buf, "%lu\n", uksm_ema_page_time);
6488 +}
6489 +UKSM_ATTR_RO(ema_per_page_time);
6490 +
6491 +static ssize_t pages_shared_show(struct kobject *kobj,
6492 +                                struct kobj_attribute *attr, char *buf)
6493 +{
6494 +       return sprintf(buf, "%lu\n", uksm_pages_shared);
6495 +}
6496 +UKSM_ATTR_RO(pages_shared);
6497 +
6498 +static ssize_t pages_sharing_show(struct kobject *kobj,
6499 +                                 struct kobj_attribute *attr, char *buf)
6500 +{
6501 +       return sprintf(buf, "%lu\n", uksm_pages_sharing);
6502 +}
6503 +UKSM_ATTR_RO(pages_sharing);
6504 +
6505 +static ssize_t pages_unshared_show(struct kobject *kobj,
6506 +                                  struct kobj_attribute *attr, char *buf)
6507 +{
6508 +       return sprintf(buf, "%lu\n", uksm_pages_unshared);
6509 +}
6510 +UKSM_ATTR_RO(pages_unshared);
6511 +
6512 +static ssize_t full_scans_show(struct kobject *kobj,
6513 +                              struct kobj_attribute *attr, char *buf)
6514 +{
6515 +       return sprintf(buf, "%llu\n", fully_scanned_round);
6516 +}
6517 +UKSM_ATTR_RO(full_scans);
6518 +
6519 +static ssize_t pages_scanned_show(struct kobject *kobj,
6520 +                                 struct kobj_attribute *attr, char *buf)
6521 +{
6522 +       unsigned long base = 0;
6523 +       u64 delta, ret;
6524 +
6525 +       if (pages_scanned_stored) {
6526 +               base = pages_scanned_base;
6527 +               ret = pages_scanned_stored;
6528 +               delta = uksm_pages_scanned >> base;
6529 +               if (CAN_OVERFLOW_U64(ret, delta)) {
6530 +                       ret >>= 1;
6531 +                       delta >>= 1;
6532 +                       base++;
6533 +                       ret += delta;
6534 +               }
6535 +       } else {
6536 +               ret = uksm_pages_scanned;
6537 +       }
6538 +
6539 +       while (ret > ULONG_MAX) {
6540 +               ret >>= 1;
6541 +               base++;
6542 +       }
6543 +
6544 +       if (base)
6545 +               return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
6546 +       else
6547 +               return sprintf(buf, "%lu\n", (unsigned long)ret);
6548 +}
6549 +UKSM_ATTR_RO(pages_scanned);
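/*
 * Decoding sketch for the output above (illustration only): once the scan
 * counter can overflow, pages_scanned is reported as "<mantissa> * 2^<base>"
 * and the approximate true count is mantissa << base.
 */
static inline unsigned long long pages_scanned_decode(unsigned long mantissa,
						      unsigned long base)
{
	/* base stays small in practice; the shift itself overflows for base >= 64 */
	return (unsigned long long)mantissa << base;
}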
6550 +
6551 +static ssize_t hash_strength_show(struct kobject *kobj,
6552 +                                 struct kobj_attribute *attr, char *buf)
6553 +{
6554 +       return sprintf(buf, "%lu\n", hash_strength);
6555 +}
6556 +UKSM_ATTR_RO(hash_strength);
6557 +
6558 +static ssize_t sleep_times_show(struct kobject *kobj,
6559 +                                 struct kobj_attribute *attr, char *buf)
6560 +{
6561 +       return sprintf(buf, "%llu\n", uksm_sleep_times);
6562 +}
6563 +UKSM_ATTR_RO(sleep_times);
6564 +
6565 +
6566 +static struct attribute *uksm_attrs[] = {
6567 +       &max_cpu_percentage_attr.attr,
6568 +       &sleep_millisecs_attr.attr,
6569 +       &cpu_governor_attr.attr,
6570 +       &run_attr.attr,
6571 +       &ema_per_page_time_attr.attr,
6572 +       &pages_shared_attr.attr,
6573 +       &pages_sharing_attr.attr,
6574 +       &pages_unshared_attr.attr,
6575 +       &full_scans_attr.attr,
6576 +       &pages_scanned_attr.attr,
6577 +       &hash_strength_attr.attr,
6578 +       &sleep_times_attr.attr,
6579 +       &thrash_threshold_attr.attr,
6580 +       &abundant_threshold_attr.attr,
6581 +       &cpu_ratios_attr.attr,
6582 +       &eval_intervals_attr.attr,
6583 +       NULL,
6584 +};
6585 +
6586 +static struct attribute_group uksm_attr_group = {
6587 +       .attrs = uksm_attrs,
6588 +       .name = "uksm",
6589 +};
6590 +#endif /* CONFIG_SYSFS */
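/*
 * Userspace sketch (illustration only, not part of the patch): with the
 * attribute group registered on mm_kobj under the name "uksm" (see
 * uksm_init() below), the knobs above appear under /sys/kernel/mm/uksm/.
 * Assuming UKSM_RUN_MERGE is 1, starting the scanner from userspace is
 * just a write to the "run" attribute:
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/mm/uksm/run", "w");

	if (!f) {
		perror("uksm run");	/* kernel built without UKSM, or not root */
		return 1;
	}
	fputs("1\n", f);		/* 1 == UKSM_RUN_MERGE, see run_store() above */
	return fclose(f) ? 1 : 0;
}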
6591 +
6592 +static inline void init_scan_ladder(void)
6593 +{
6594 +       int i;
6595 +       struct scan_rung *rung;
6596 +
6597 +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6598 +               rung = uksm_scan_ladder + i;
6599 +               slot_tree_init_root(&rung->vma_root);
6600 +       }
6601 +
6602 +       init_performance_values();
6603 +       uksm_calc_scan_pages();
6604 +}
6605 +
6606 +static inline int cal_positive_negative_costs(void)
6607 +{
6608 +       struct page *p1, *p2;
6609 +       unsigned char *addr1, *addr2;
6610 +       unsigned long i, time_start, hash_cost;
6611 +       unsigned long loopnum = 0;
6612 +
6613 +       /* IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
6614 +       volatile u32 hash;
6615 +       volatile int ret;
6616 +
6617 +       p1 = alloc_page(GFP_KERNEL);
6618 +       if (!p1)
6619 +               return -ENOMEM;
6620 +
6621 +       p2 = alloc_page(GFP_KERNEL);
6622 +       if (!p2)
6623 +               return -ENOMEM;
6624 +
6625 +       addr1 = kmap_atomic(p1);
6626 +       addr2 = kmap_atomic(p2);
6627 +       memset(addr1, prandom_u32(), PAGE_SIZE);
6628 +       memcpy(addr2, addr1, PAGE_SIZE);
6629 +
6630 +       /* make sure that the two pages differ in the last byte */
6631 +       addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
6632 +       kunmap_atomic(addr2);
6633 +       kunmap_atomic(addr1);
6634 +
6635 +       time_start = jiffies;
6636 +       while (jiffies - time_start < 100) {
6637 +               for (i = 0; i < 100; i++)
6638 +                       hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
6639 +               loopnum += 100;
6640 +       }
6641 +       hash_cost = (jiffies - time_start);
6642 +
6643 +       time_start = jiffies;
6644 +       for (i = 0; i < loopnum; i++)
6645 +               ret = pages_identical(p1, p2);
6646 +       memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
6647 +       memcmp_cost /= hash_cost;
6648 +       printk(KERN_INFO "UKSM: relative memcmp_cost = %lu "
6649 +                        "hash=%u cmp_ret=%d.\n",
6650 +              memcmp_cost, hash, ret);
6651 +
6652 +       __free_page(p1);
6653 +       __free_page(p2);
6654 +       return 0;
6655 +}
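/*
 * What the benchmark above produces (restated for clarity): both loops run
 * the same number of iterations, so
 *
 *	memcmp_cost ~= HASH_STRENGTH_FULL * t(pages_identical) / t(full-strength hash)
 *
 * i.e. how many single hash-strength steps one whole-page comparison costs.
 * This relative cost is presumably what the hash-strength adaptation code
 * earlier in the patch trades off against the cost of hash collisions.
 */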
6656 +
6657 +static int init_zeropage_hash_table(void)
6658 +{
6659 +       struct page *page;
6660 +       char *addr;
6661 +       int i;
6662 +
6663 +       page = alloc_page(GFP_KERNEL);
6664 +       if (!page)
6665 +               return -ENOMEM;
6666 +
6667 +       addr = kmap_atomic(page);
6668 +       memset(addr, 0, PAGE_SIZE);
6669 +       kunmap_atomic(addr);
6670 +
6671 +       zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
6672 +               GFP_KERNEL);
6673 +       if (!zero_hash_table)
6674 +               return -ENOMEM;
6675 +
6676 +       for (i = 0; i < HASH_STRENGTH_MAX; i++)
6677 +               zero_hash_table[i] = page_hash(page, i, 0);
6678 +
6679 +       __free_page(page);
6680 +
6681 +       return 0;
6682 +}
6683 +
6684 +static inline int init_random_sampling(void)
6685 +{
6686 +       unsigned long i;
6687 +       random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
6688 +       if (!random_nums)
6689 +               return -ENOMEM;
6690 +
6691 +       for (i = 0; i < HASH_STRENGTH_FULL; i++)
6692 +               random_nums[i] = i;
6693 +
6694 +       for (i = 0; i < HASH_STRENGTH_FULL; i++) {
6695 +               unsigned long rand_range, swap_index, tmp;
6696 +
6697 +               rand_range = HASH_STRENGTH_FULL - i;
6698 +               swap_index = i + prandom_u32() % rand_range;
6699 +               tmp = random_nums[i];
6700 +               random_nums[i] =  random_nums[swap_index];
6701 +               random_nums[swap_index] = tmp;
6702 +       }
6703 +
6704 +       rshash_state.state = RSHASH_NEW;
6705 +       rshash_state.below_count = 0;
6706 +       rshash_state.lookup_window_index = 0;
6707 +
6708 +       return cal_positive_negative_costs();
6709 +}
6710 +
6711 +static int __init uksm_slab_init(void)
6712 +{
6713 +       rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
6714 +       if (!rmap_item_cache)
6715 +               goto out;
6716 +
6717 +       stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
6718 +       if (!stable_node_cache)
6719 +               goto out_free1;
6720 +
6721 +       node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
6722 +       if (!node_vma_cache)
6723 +               goto out_free2;
6724 +
6725 +       vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
6726 +       if (!vma_slot_cache)
6727 +               goto out_free3;
6728 +
6729 +       tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
6730 +       if (!tree_node_cache)
6731 +               goto out_free4;
6732 +
6733 +       return 0;
6734 +
6735 +out_free4:
6736 +       kmem_cache_destroy(vma_slot_cache);
6737 +out_free3:
6738 +       kmem_cache_destroy(node_vma_cache);
6739 +out_free2:
6740 +       kmem_cache_destroy(stable_node_cache);
6741 +out_free1:
6742 +       kmem_cache_destroy(rmap_item_cache);
6743 +out:
6744 +       return -ENOMEM;
6745 +}
6746 +
6747 +static void __init uksm_slab_free(void)
6748 +{
6749 +       kmem_cache_destroy(stable_node_cache);
6750 +       kmem_cache_destroy(rmap_item_cache);
6751 +       kmem_cache_destroy(node_vma_cache);
6752 +       kmem_cache_destroy(vma_slot_cache);
6753 +       kmem_cache_destroy(tree_node_cache);
6754 +}
6755 +
6756 +/* Common interface to ksm, but the behavior differs from ksm's. */
6757 +int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
6758 +               unsigned long end, int advice, unsigned long *vm_flags)
6759 +{
6760 +       int err;
6761 +
6762 +       switch (advice) {
6763 +       case MADV_MERGEABLE:
6764 +               return 0;               /* just ignore the advice */
6765 +
6766 +       case MADV_UNMERGEABLE:
6767 +               if (!(*vm_flags & VM_MERGEABLE))
6768 +                       return 0;               /* just ignore the advice */
6769 +
6770 +               if (vma->anon_vma) {
6771 +                       err = unmerge_uksm_pages(vma, start, end);
6772 +                       if (err)
6773 +                               return err;
6774 +               }
6775 +
6776 +               uksm_remove_vma(vma);
6777 +               *vm_flags &= ~VM_MERGEABLE;
6778 +               break;
6779 +       }
6780 +
6781 +       return 0;
6782 +}
6783 +
6784 +/* Common interface to ksm, actually the same. */
6785 +struct page *ksm_might_need_to_copy(struct page *page,
6786 +                       struct vm_area_struct *vma, unsigned long address)
6787 +{
6788 +       struct anon_vma *anon_vma = page_anon_vma(page);
6789 +       struct page *new_page;
6790 +
6791 +       if (PageKsm(page)) {
6792 +               if (page_stable_node(page))
6793 +                       return page;    /* no need to copy it */
6794 +       } else if (!anon_vma) {
6795 +               return page;            /* no need to copy it */
6796 +       } else if (anon_vma->root == vma->anon_vma->root &&
6797 +                page->index == linear_page_index(vma, address)) {
6798 +               return page;            /* still no need to copy it */
6799 +       }
6800 +       if (!PageUptodate(page))
6801 +               return page;            /* let do_swap_page report the error */
6802 +
6803 +       new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
6804 +       if (new_page) {
6805 +               copy_user_highpage(new_page, page, address, vma);
6806 +
6807 +               SetPageDirty(new_page);
6808 +               __SetPageUptodate(new_page);
6809 +               __set_page_locked(new_page);
6810 +       }
6811 +
6812 +       return new_page;
6813 +}
6814 +
6815 +static int __init uksm_init(void)
6816 +{
6817 +       struct task_struct *uksm_thread;
6818 +       int err;
6819 +
6820 +       uksm_sleep_jiffies = msecs_to_jiffies(100);
6821 +       uksm_sleep_saved = uksm_sleep_jiffies;
6822 +
6823 +       slot_tree_init();
6824 +       init_scan_ladder();
6825 +
6826 +
6827 +       err = init_random_sampling();
6828 +       if (err)
6829 +               goto out_free2;
6830 +
6831 +       err = uksm_slab_init();
6832 +       if (err)
6833 +               goto out_free1;
6834 +
6835 +       err = init_zeropage_hash_table();
6836 +       if (err)
6837 +               goto out_free0;
6838 +
6839 +       uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
6840 +       if (IS_ERR(uksm_thread)) {
6841 +               printk(KERN_ERR "uksm: creating kthread failed\n");
6842 +               err = PTR_ERR(uksm_thread);
6843 +               goto out_free;
6844 +       }
6845 +
6846 +#ifdef CONFIG_SYSFS
6847 +       err = sysfs_create_group(mm_kobj, &uksm_attr_group);
6848 +       if (err) {
6849 +               printk(KERN_ERR "uksm: register sysfs failed\n");
6850 +               kthread_stop(uksm_thread);
6851 +               goto out_free;
6852 +       }
6853 +#else
6854 +       uksm_run = UKSM_RUN_MERGE;      /* no way for user to start it */
6855 +
6856 +#endif /* CONFIG_SYSFS */
6857 +
6858 +#ifdef CONFIG_MEMORY_HOTREMOVE
6859 +       /*
6860 +        * Choose a high priority since the callback takes uksm_thread_mutex:
6861 +        * later callbacks could only be taking locks which nest within that.
6862 +        */
6863 +       hotplug_memory_notifier(uksm_memory_callback, 100);
6864 +#endif
6865 +       return 0;
6866 +
6867 +out_free:
6868 +       kfree(zero_hash_table);
6869 +out_free0:
6870 +       uksm_slab_free();
6871 +out_free1:
6872 +       kfree(random_nums);
6873 +out_free2:
6874 +       kfree(uksm_scan_ladder);
6875 +       return err;
6876 +}
6877 +
6878 +#ifdef MODULE
6879 +subsys_initcall(uksm_init);
6880 +#else
6881 +late_initcall(uksm_init);
6882 +#endif
6883 +
6884 diff --git a/mm/vmstat.c b/mm/vmstat.c
6885 index 1b12d39..24b803f 100644
6886 --- a/mm/vmstat.c
6887 +++ b/mm/vmstat.c
6888 @@ -795,6 +795,9 @@ const char * const vmstat_text[] = {
6889         "nr_anon_transparent_hugepages",
6890         "nr_free_cma",
6891  
6892 +#ifdef CONFIG_UKSM
6893 +       "nr_uksm_zero_pages",
6894 +#endif
6895         /* enum writeback_stat_item counters */
6896         "nr_dirty_threshold",
6897         "nr_dirty_background_threshold",