packages/kernel.git: uksm-0.1.2.3-for-v3.18.patch (as of commit "3.18.44 - Dirty COW CVE-2016-5195 fix")
1diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
2index 081c497..75bde3d 100644
3--- a/Documentation/vm/00-INDEX
4+++ b/Documentation/vm/00-INDEX
5@@ -16,6 +16,8 @@ hwpoison.txt
6 - explains what hwpoison is
7 ksm.txt
8 - how to use the Kernel Samepage Merging feature.
9+uksm.txt
10+ - Introduction to Ultra KSM
11 numa
12 - information about NUMA specific code in the Linux vm.
13 numa_memory_policy.txt
14diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
15new file mode 100644
16index 0000000..08bd645
17--- /dev/null
18+++ b/Documentation/vm/uksm.txt
19@@ -0,0 +1,58 @@
20+The Ultra Kernel Samepage Merging feature
21+----------------------------------------------
22+/*
23+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
24+ *
25+ * This is an improvement upon KSM. Some basic data structures and routines
26+ * are borrowed from ksm.c .
27+ *
28+ * Its new features:
29+ * 1. Full system scan:
30+ * It automatically scans all user processes' anonymous VMAs. Kernel-user
31+ * interaction to submit a memory area to KSM is no longer needed.
32+ *
33+ * 2. Rich area detection:
34+ * It automatically detects rich areas containing abundant duplicated
35+ * pages. Rich areas are given a full scan speed. Poor areas are
36+ * sampled at a reasonable speed with very low CPU consumption.
37+ *
38+ * 3. Ultra Per-page scan speed improvement:
39+ * A new hash algorithm is proposed. As a result, on a machine with
40+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
41+ * can scan memory areas that do not contain duplicated pages at a speed of
42+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
43+ * 477MB/sec ~ 923MB/sec.
44+ *
45+ * 4. Thrashing area avoidance:
46+ * A thrashing area (a VMA that has frequent KSM page break-outs) can be
47+ * filtered out. My benchmark shows it's more efficient than KSM's per-page
48+ * hash value based volatile page detection.
49+ *
50+ *
51+ * 5. Misc changes upon KSM:
52+ * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page
53+ * comparison. It's much faster than the default C version on x86.
54+ * rmap_item now has a struct page *page member to loosely cache an
55+ * address-->page mapping, which avoids many time-costly calls to
56+ * follow_page().
57+ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
58+ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
59+ * ksm is needed for this case.
60+ *
61+ * 6. Full Zero Page consideration (contributed by Figo Zhang)
62+ * Now uksmd considers full zero pages as special pages and merges them into a
63+ * special unswappable uksm zero page.
64+ */
65+
66+ChangeLog:
67+
68+2012-05-05 The creation of this Doc
69+2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
70+2012-05-28 UKSM 0.1.1.2 bug fix release
71+2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
72+2012-07-2 UKSM 0.1.2-beta2
73+2012-07-10 UKSM 0.1.2-beta3
74+2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
75+2012-10-13 UKSM 0.1.2.1 Bug fixes.
76+2012-12-31 UKSM 0.1.2.2 Minor bug fixes
77+2014-07-02 UKSM 0.1.2.3 Fix a "__this_cpu_read() in preemptible" bug
78diff --git a/fs/exec.c b/fs/exec.c
79index 7302b75..84b0df5 100644
80--- a/fs/exec.c
81+++ b/fs/exec.c
82@@ -19,7 +19,7 @@
83 * current->executable is only used by the procfs. This allows a dispatch
84 * table to check for several different types of binary formats. We keep
85 * trying until we recognize the file or we run out of supported binary
86- * formats.
87+ * formats.
88 */
89
90 #include <linux/slab.h>
91@@ -56,6 +56,7 @@
92 #include <linux/pipe_fs_i.h>
93 #include <linux/oom.h>
94 #include <linux/compat.h>
95+#include <linux/ksm.h>
96
97 #include <asm/uaccess.h>
98 #include <asm/mmu_context.h>
99@@ -1128,6 +1129,7 @@ void setup_new_exec(struct linux_binprm * bprm)
100 /* An exec changes our domain. We are no longer part of the thread
101 group */
102 current->self_exec_id++;
103+
104 flush_signal_handlers(current, 0);
105 do_close_on_exec(current->files);
106 }
107diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
108index aa1eee0..5a9149c 100644
109--- a/fs/proc/meminfo.c
110+++ b/fs/proc/meminfo.c
111@@ -121,6 +121,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
112 "SUnreclaim: %8lu kB\n"
113 "KernelStack: %8lu kB\n"
114 "PageTables: %8lu kB\n"
115+#ifdef CONFIG_UKSM
116+ "KsmZeroPages: %8lu kB\n"
117+#endif
118 #ifdef CONFIG_QUICKLIST
119 "Quicklists: %8lu kB\n"
120 #endif
121@@ -175,6 +178,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
122 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
123 global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
124 K(global_page_state(NR_PAGETABLE)),
125+#ifdef CONFIG_UKSM
126+ K(global_page_state(NR_UKSM_ZERO_PAGES)),
127+#endif
128 #ifdef CONFIG_QUICKLIST
129 K(quicklist_total_size()),
130 #endif
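
The /proc/meminfo hunk above exports the UKSM zero-page counter as a new "KsmZeroPages:" field, guarded by CONFIG_UKSM. As a quick illustration, here is a minimal user-space sketch (not part of the patch; on kernels built without CONFIG_UKSM the field is simply absent) that reads it back:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* Field name added by the hunk above (CONFIG_UKSM only). */
		if (strncmp(line, "KsmZeroPages:", 13) == 0)
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}
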
131diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
132index 752e30d..1e7c826 100644
133--- a/include/asm-generic/pgtable.h
134+++ b/include/asm-generic/pgtable.h
135@@ -537,12 +537,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
136 unsigned long size);
137 #endif
138
139+#ifdef CONFIG_UKSM
140+static inline int is_uksm_zero_pfn(unsigned long pfn)
141+{
142+ extern unsigned long uksm_zero_pfn;
143+ return pfn == uksm_zero_pfn;
144+}
145+#else
146+static inline int is_uksm_zero_pfn(unsigned long pfn)
147+{
148+ return 0;
149+}
150+#endif
151+
152 #ifdef __HAVE_COLOR_ZERO_PAGE
153 static inline int is_zero_pfn(unsigned long pfn)
154 {
155 extern unsigned long zero_pfn;
156 unsigned long offset_from_zero_pfn = pfn - zero_pfn;
157- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
158+ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn);
159 }
160
161 #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
162@@ -551,7 +564,7 @@ static inline int is_zero_pfn(unsigned long pfn)
163 static inline int is_zero_pfn(unsigned long pfn)
164 {
165 extern unsigned long zero_pfn;
166- return pfn == zero_pfn;
167+ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
168 }
169
170 static inline unsigned long my_zero_pfn(unsigned long addr)
171diff --git a/include/linux/ksm.h b/include/linux/ksm.h
172index 3be6bb1..51557d1 100644
173--- a/include/linux/ksm.h
174+++ b/include/linux/ksm.h
175@@ -19,21 +19,6 @@ struct mem_cgroup;
176 #ifdef CONFIG_KSM
177 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
178 unsigned long end, int advice, unsigned long *vm_flags);
179-int __ksm_enter(struct mm_struct *mm);
180-void __ksm_exit(struct mm_struct *mm);
181-
182-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
183-{
184- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
185- return __ksm_enter(mm);
186- return 0;
187-}
188-
189-static inline void ksm_exit(struct mm_struct *mm)
190-{
191- if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
192- __ksm_exit(mm);
193-}
194
195 /*
196 * A KSM page is one of those write-protected "shared pages" or "merged pages"
197@@ -76,6 +61,33 @@ struct page *ksm_might_need_to_copy(struct page *page,
198 int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
199 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
200
201+#ifdef CONFIG_KSM_LEGACY
202+int __ksm_enter(struct mm_struct *mm);
203+void __ksm_exit(struct mm_struct *mm);
204+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
205+{
206+ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
207+ return __ksm_enter(mm);
208+ return 0;
209+}
210+
211+static inline void ksm_exit(struct mm_struct *mm)
212+{
213+ if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
214+ __ksm_exit(mm);
215+}
216+
217+#elif defined(CONFIG_UKSM)
218+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
219+{
220+ return 0;
221+}
222+
223+static inline void ksm_exit(struct mm_struct *mm)
224+{
225+}
226+#endif /* !CONFIG_UKSM */
227+
228 #else /* !CONFIG_KSM */
229
230 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
231@@ -123,4 +135,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
232 #endif /* CONFIG_MMU */
233 #endif /* !CONFIG_KSM */
234
235+#include <linux/uksm.h>
236+
237 #endif /* __LINUX_KSM_H */
238diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
239index 6e0b286..ebd6243 100644
240--- a/include/linux/mm_types.h
241+++ b/include/linux/mm_types.h
242@@ -308,6 +308,9 @@ struct vm_area_struct {
243 #ifdef CONFIG_NUMA
244 struct mempolicy *vm_policy; /* NUMA policy for the VMA */
245 #endif
246+#ifdef CONFIG_UKSM
247+ struct vma_slot *uksm_vma_slot;
248+#endif
249 };
250
251 struct core_thread {
252diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
253index ffe66e3..355f0e9 100644
254--- a/include/linux/mmzone.h
255+++ b/include/linux/mmzone.h
256@@ -157,6 +157,9 @@ enum zone_stat_item {
257 WORKINGSET_NODERECLAIM,
258 NR_ANON_TRANSPARENT_HUGEPAGES,
259 NR_FREE_CMA_PAGES,
260+#ifdef CONFIG_UKSM
261+ NR_UKSM_ZERO_PAGES,
262+#endif
263 NR_VM_ZONE_STAT_ITEMS };
264
265 /*
266@@ -865,7 +868,7 @@ static inline int is_highmem_idx(enum zone_type idx)
267 }
268
269 /**
270- * is_highmem - helper function to quickly check if a struct zone is a
271+ * is_highmem - helper function to quickly check if a struct zone is a
272 * highmem zone or not. This is an attempt to keep references
273 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
274 * @zone - pointer to struct zone variable
275diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h
276new file mode 100644
277index 0000000..6780fdb
278--- /dev/null
279+++ b/include/linux/sradix-tree.h
280@@ -0,0 +1,77 @@
281+#ifndef _LINUX_SRADIX_TREE_H
282+#define _LINUX_SRADIX_TREE_H
283+
284+
285+#define INIT_SRADIX_TREE(root, mask) \
286+do { \
287+ (root)->height = 0; \
288+ (root)->gfp_mask = (mask); \
289+ (root)->rnode = NULL; \
290+} while (0)
291+
292+#define ULONG_BITS (sizeof(unsigned long) * 8)
293+#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
294+//#define SRADIX_TREE_MAP_SHIFT 6
295+//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT)
296+//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1)
297+
298+struct sradix_tree_node {
299+ unsigned int height; /* Height from the bottom */
300+ unsigned int count;
301+ unsigned int fulls; /* Number of full sublevel trees */
302+ struct sradix_tree_node *parent;
303+ void *stores[0];
304+};
305+
306+/* A simple radix tree implementation */
307+struct sradix_tree_root {
308+ unsigned int height;
309+ struct sradix_tree_node *rnode;
310+
311+ /* Where found to have available empty stores in its sublevels */
312+ struct sradix_tree_node *enter_node;
313+ unsigned int shift;
314+ unsigned int stores_size;
315+ unsigned int mask;
316+ unsigned long min; /* The first hole index */
317+ unsigned long num;
318+ //unsigned long *height_to_maxindex;
319+
320+ /* How the node is allocated and freed. */
321+ struct sradix_tree_node *(*alloc)(void);
322+ void (*free)(struct sradix_tree_node *node);
323+
324+ /* When a new node is added and removed */
325+ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child);
326+ void (*assign)(struct sradix_tree_node *node, unsigned index, void *item);
327+ void (*rm)(struct sradix_tree_node *node, unsigned offset);
328+};
329+
330+struct sradix_tree_path {
331+ struct sradix_tree_node *node;
332+ int offset;
333+};
334+
335+static inline
336+void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift)
337+{
338+ root->height = 0;
339+ root->rnode = NULL;
340+ root->shift = shift;
341+ root->stores_size = 1UL << shift;
342+ root->mask = root->stores_size - 1;
343+}
344+
345+
346+extern void *sradix_tree_next(struct sradix_tree_root *root,
347+ struct sradix_tree_node *node, unsigned long index,
348+ int (*iter)(void *, unsigned long));
349+
350+extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num);
351+
352+extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
353+ struct sradix_tree_node *node, unsigned long index);
354+
355+extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index);
356+
357+#endif /* _LINUX_SRADIX_TREE_H */
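
The header above only declares the interface; the tree operations themselves are added in lib/sradix-tree.c further down in this patch. A minimal, hypothetical usage sketch (the demo_* names are illustrative, not from the patch) that wires the allocation callbacks the same way mm/uksm.c later does for its slot tree, using only functions declared above:

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/sradix-tree.h>

#define DEMO_SHIFT 4	/* 16 stores per node; an arbitrary choice */

/* Node wrapper supplying the storage behind sradix_tree_node.stores[],
 * mirroring the slot_tree_node layout used later in mm/uksm.c. */
struct demo_node {
	struct sradix_tree_node snode;
	void *stores[1UL << DEMO_SHIFT];
};

static struct sradix_tree_node *demo_alloc(void)
{
	struct demo_node *n = kzalloc(sizeof(*n), GFP_KERNEL);

	return n ? &n->snode : NULL;
}

static void demo_free(struct sradix_tree_node *node)
{
	kfree(container_of(node, struct demo_node, snode));
}

/* Static so that min/num/enter_node start out zeroed. */
static struct sradix_tree_root demo_root;

static int demo_use(void *item)
{
	init_sradix_tree_root(&demo_root, DEMO_SHIFT);
	demo_root.alloc = demo_alloc;
	demo_root.free = demo_free;
	/* extend/assign/rm may stay NULL; the library checks before calling them. */

	if (sradix_tree_enter(&demo_root, &item, 1))
		return -ENOMEM;

	/* The first item lands at the first free index, i.e. index 0. */
	return sradix_tree_lookup(&demo_root, 0) == item ? 0 : -EINVAL;
}
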
358diff --git a/include/linux/uksm.h b/include/linux/uksm.h
359new file mode 100644
360index 0000000..a644bca
361--- /dev/null
362+++ b/include/linux/uksm.h
363@@ -0,0 +1,146 @@
364+#ifndef __LINUX_UKSM_H
365+#define __LINUX_UKSM_H
366+/*
367+ * Memory merging support.
368+ *
369+ * This code enables dynamic sharing of identical pages found in different
370+ * memory areas, even if they are not shared by fork().
371+ */
372+
373+/* if !CONFIG_UKSM this file should not be compiled at all. */
374+#ifdef CONFIG_UKSM
375+
376+#include <linux/bitops.h>
377+#include <linux/mm.h>
378+#include <linux/pagemap.h>
379+#include <linux/rmap.h>
380+#include <linux/sched.h>
381+
382+extern unsigned long zero_pfn __read_mostly;
383+extern unsigned long uksm_zero_pfn __read_mostly;
384+extern struct page *empty_uksm_zero_page;
385+
386+/* must be done before linked to mm */
387+extern void uksm_vma_add_new(struct vm_area_struct *vma);
388+extern void uksm_remove_vma(struct vm_area_struct *vma);
389+
390+#define UKSM_SLOT_NEED_SORT (1 << 0)
391+#define UKSM_SLOT_NEED_RERAND (1 << 1)
392+#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */
393+#define UKSM_SLOT_FUL_SCANNED (1 << 3)
394+#define UKSM_SLOT_IN_UKSM (1 << 4)
395+
396+struct vma_slot {
397+ struct sradix_tree_node *snode;
398+ unsigned long sindex;
399+
400+ struct list_head slot_list;
401+ unsigned long fully_scanned_round;
402+ unsigned long dedup_num;
403+ unsigned long pages_scanned;
404+ unsigned long last_scanned;
405+ unsigned long pages_to_scan;
406+ struct scan_rung *rung;
407+ struct page **rmap_list_pool;
408+ unsigned int *pool_counts;
409+ unsigned long pool_size;
410+ struct vm_area_struct *vma;
411+ struct mm_struct *mm;
412+ unsigned long ctime_j;
413+ unsigned long pages;
414+ unsigned long flags;
415+ unsigned long pages_cowed; /* pages cowed this round */
416+ unsigned long pages_merged; /* pages merged this round */
417+ unsigned long pages_bemerged;
418+
419+ /* when it has page merged in this eval round */
420+ struct list_head dedup_list;
421+};
422+
423+static inline void uksm_unmap_zero_page(pte_t pte)
424+{
425+ if (pte_pfn(pte) == uksm_zero_pfn)
426+ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
427+}
428+
429+static inline void uksm_map_zero_page(pte_t pte)
430+{
431+ if (pte_pfn(pte) == uksm_zero_pfn)
432+ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
433+}
434+
435+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
436+{
437+ if (vma->uksm_vma_slot && PageKsm(page))
438+ vma->uksm_vma_slot->pages_cowed++;
439+}
440+
441+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
442+{
443+ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
444+ vma->uksm_vma_slot->pages_cowed++;
445+}
446+
447+static inline int uksm_flags_can_scan(unsigned long vm_flags)
448+{
449+#ifndef VM_SAO
450+#define VM_SAO 0
451+#endif
452+ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND |
453+ VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP |
454+ VM_SHARED | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN | VM_SAO));
455+}
456+
457+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
458+{
459+ if (uksm_flags_can_scan(*vm_flags_p))
460+ *vm_flags_p |= VM_MERGEABLE;
461+}
462+
463+/*
464+ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will
465+ * be removed when uksm zero page patch is stable enough.
466+ */
467+static inline void uksm_bugon_zeropage(pte_t pte)
468+{
469+ BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
470+}
471+#else
472+static inline void uksm_vma_add_new(struct vm_area_struct *vma)
473+{
474+}
475+
476+static inline void uksm_remove_vma(struct vm_area_struct *vma)
477+{
478+}
479+
480+static inline void uksm_unmap_zero_page(pte_t pte)
481+{
482+}
483+
484+static inline void uksm_map_zero_page(pte_t pte)
485+{
486+}
487+
488+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
489+{
490+}
491+
492+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
493+{
494+}
495+
496+static inline int uksm_flags_can_scan(unsigned long vm_flags)
497+{
498+ return 0;
499+}
500+
501+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
502+{
503+}
504+
505+static inline void uksm_bugon_zeropage(pte_t pte)
506+{
507+}
508+#endif /* !CONFIG_UKSM */
509+#endif /* __LINUX_UKSM_H */
510diff --git a/kernel/fork.c b/kernel/fork.c
511index 9b7d746..73ad90d 100644
512--- a/kernel/fork.c
513+++ b/kernel/fork.c
514@@ -412,7 +412,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
515 goto fail_nomem;
516 charge = len;
517 }
518- tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
519+ tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
520 if (!tmp)
521 goto fail_nomem;
522 *tmp = *mpnt;
523@@ -467,7 +467,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
524 __vma_link_rb(mm, tmp, rb_link, rb_parent);
525 rb_link = &tmp->vm_rb.rb_right;
526 rb_parent = &tmp->vm_rb;
527-
528+ uksm_vma_add_new(tmp);
529 mm->map_count++;
530 retval = copy_page_range(mm, oldmm, mpnt);
531
532diff --git a/lib/Makefile b/lib/Makefile
533index 0211d2b..426536f 100644
534--- a/lib/Makefile
535+++ b/lib/Makefile
536@@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
537 endif
538
539 lib-y := ctype.o string.o vsprintf.o cmdline.o \
540- rbtree.o radix-tree.o dump_stack.o timerqueue.o\
541+ rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\
542 idr.o int_sqrt.o extable.o \
543 sha1.o md5.o irq_regs.o argv_split.o \
544 proportions.o flex_proportions.o ratelimit.o show_mem.o \
545diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c
546new file mode 100644
547index 0000000..8d06329
548--- /dev/null
549+++ b/lib/sradix-tree.c
550@@ -0,0 +1,476 @@
551+#include <linux/errno.h>
552+#include <linux/mm.h>
553+#include <linux/mman.h>
554+#include <linux/spinlock.h>
555+#include <linux/slab.h>
556+#include <linux/gcd.h>
557+#include <linux/sradix-tree.h>
558+
559+static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node)
560+{
561+ return node->fulls == root->stores_size ||
562+ (node->height == 1 && node->count == root->stores_size);
563+}
564+
565+/*
566+ * Extend a sradix tree so it can store key @index.
567+ */
568+static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
569+{
570+ struct sradix_tree_node *node;
571+ unsigned int height;
572+
573+ if (unlikely(root->rnode == NULL)) {
574+ if (!(node = root->alloc()))
575+ return -ENOMEM;
576+
577+ node->height = 1;
578+ root->rnode = node;
579+ root->height = 1;
580+ }
581+
582+ /* Figure out what the height should be. */
583+ height = root->height;
584+ index >>= root->shift * height;
585+
586+ while (index) {
587+ index >>= root->shift;
588+ height++;
589+ }
590+
591+ while (height > root->height) {
592+ unsigned int newheight;
593+ if (!(node = root->alloc()))
594+ return -ENOMEM;
595+
596+ /* Increase the height. */
597+ node->stores[0] = root->rnode;
598+ root->rnode->parent = node;
599+ if (root->extend)
600+ root->extend(node, root->rnode);
601+
602+ newheight = root->height + 1;
603+ node->height = newheight;
604+ node->count = 1;
605+ if (sradix_node_full(root, root->rnode))
606+ node->fulls = 1;
607+
608+ root->rnode = node;
609+ root->height = newheight;
610+ }
611+
612+ return 0;
613+}
614+
615+/*
616+ * Search for the next item from the current node that is not NULL
617+ * and can satisfy root->iter().
618+ */
619+void *sradix_tree_next(struct sradix_tree_root *root,
620+ struct sradix_tree_node *node, unsigned long index,
621+ int (*iter)(void *item, unsigned long height))
622+{
623+ unsigned long offset;
624+ void *item;
625+
626+ if (unlikely(node == NULL)) {
627+ node = root->rnode;
628+ for (offset = 0; offset < root->stores_size; offset++) {
629+ item = node->stores[offset];
630+ if (item && (!iter || iter(item, node->height)))
631+ break;
632+ }
633+
634+ if (unlikely(offset >= root->stores_size))
635+ return NULL;
636+
637+ if (node->height == 1)
638+ return item;
639+ else
640+ goto go_down;
641+ }
642+
643+ while (node) {
644+ offset = (index & root->mask) + 1;
645+ for (;offset < root->stores_size; offset++) {
646+ item = node->stores[offset];
647+ if (item && (!iter || iter(item, node->height)))
648+ break;
649+ }
650+
651+ if (offset < root->stores_size)
652+ break;
653+
654+ node = node->parent;
655+ index >>= root->shift;
656+ }
657+
658+ if (!node)
659+ return NULL;
660+
661+ while (node->height > 1) {
662+go_down:
663+ node = item;
664+ for (offset = 0; offset < root->stores_size; offset++) {
665+ item = node->stores[offset];
666+ if (item && (!iter || iter(item, node->height)))
667+ break;
668+ }
669+
670+ if (unlikely(offset >= root->stores_size))
671+ return NULL;
672+ }
673+
674+ BUG_ON(offset > root->stores_size);
675+
676+ return item;
677+}
678+
679+/*
680+ * Blindly insert the item into the tree. Typically, we reuse the
681+ * first empty store item.
682+ */
683+int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
684+{
685+ unsigned long index;
686+ unsigned int height;
687+ struct sradix_tree_node *node, *tmp = NULL;
688+ int offset, offset_saved;
689+ void **store = NULL;
690+ int error, i, j, shift;
691+
692+go_on:
693+ index = root->min;
694+
695+ if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
696+ node = root->enter_node;
697+ BUG_ON((index >> (root->shift * root->height)));
698+ } else {
699+ node = root->rnode;
700+ if (node == NULL || (index >> (root->shift * root->height))
701+ || sradix_node_full(root, node)) {
702+ error = sradix_tree_extend(root, index);
703+ if (error)
704+ return error;
705+
706+ node = root->rnode;
707+ }
708+ }
709+
710+
711+ height = node->height;
712+ shift = (height - 1) * root->shift;
713+ offset = (index >> shift) & root->mask;
714+ while (shift > 0) {
715+ offset_saved = offset;
716+ for (; offset < root->stores_size; offset++) {
717+ store = &node->stores[offset];
718+ tmp = *store;
719+
720+ if (!tmp || !sradix_node_full(root, tmp))
721+ break;
722+ }
723+ BUG_ON(offset >= root->stores_size);
724+
725+ if (offset != offset_saved) {
726+ index += (offset - offset_saved) << shift;
727+ index &= ~((1UL << shift) - 1);
728+ }
729+
730+ if (!tmp) {
731+ if (!(tmp = root->alloc()))
732+ return -ENOMEM;
733+
734+ tmp->height = shift / root->shift;
735+ *store = tmp;
736+ tmp->parent = node;
737+ node->count++;
738+// if (root->extend)
739+// root->extend(node, tmp);
740+ }
741+
742+ node = tmp;
743+ shift -= root->shift;
744+ offset = (index >> shift) & root->mask;
745+ }
746+
747+ BUG_ON(node->height != 1);
748+
749+
750+ store = &node->stores[offset];
751+ for (i = 0, j = 0;
752+ j < root->stores_size - node->count &&
753+ i < root->stores_size - offset && j < num; i++) {
754+ if (!store[i]) {
755+ store[i] = item[j];
756+ if (root->assign)
757+ root->assign(node, index + i, item[j]);
758+ j++;
759+ }
760+ }
761+
762+ node->count += j;
763+ root->num += j;
764+ num -= j;
765+
766+ while (sradix_node_full(root, node)) {
767+ node = node->parent;
768+ if (!node)
769+ break;
770+
771+ node->fulls++;
772+ }
773+
774+ if (unlikely(!node)) {
775+ /* All nodes are full */
776+ root->min = 1 << (root->height * root->shift);
777+ root->enter_node = NULL;
778+ } else {
779+ root->min = index + i - 1;
780+ root->min |= (1UL << (node->height - 1)) - 1;
781+ root->min++;
782+ root->enter_node = node;
783+ }
784+
785+ if (num) {
786+ item += j;
787+ goto go_on;
788+ }
789+
790+ return 0;
791+}
792+
793+
794+/**
795+ * sradix_tree_shrink - shrink height of a sradix tree to minimal
796+ * @root sradix tree root
797+ *
798+ */
799+static inline void sradix_tree_shrink(struct sradix_tree_root *root)
800+{
801+ /* try to shrink tree height */
802+ while (root->height > 1) {
803+ struct sradix_tree_node *to_free = root->rnode;
804+
805+ /*
806+ * If the candidate node has more than one child, or its child
807+ * is not at the leftmost store, we cannot shrink.
808+ */
809+ if (to_free->count != 1 || !to_free->stores[0])
810+ break;
811+
812+ root->rnode = to_free->stores[0];
813+ root->rnode->parent = NULL;
814+ root->height--;
815+ if (unlikely(root->enter_node == to_free)) {
816+ root->enter_node = NULL;
817+ }
818+ root->free(to_free);
819+ }
820+}
821+
822+/*
823+ * Del the item on the known leaf node and index
824+ */
825+void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
826+ struct sradix_tree_node *node, unsigned long index)
827+{
828+ unsigned int offset;
829+ struct sradix_tree_node *start, *end;
830+
831+ BUG_ON(node->height != 1);
832+
833+ start = node;
834+ while (node && !(--node->count))
835+ node = node->parent;
836+
837+ end = node;
838+ if (!node) {
839+ root->rnode = NULL;
840+ root->height = 0;
841+ root->min = 0;
842+ root->num = 0;
843+ root->enter_node = NULL;
844+ } else {
845+ offset = (index >> (root->shift * (node->height - 1))) & root->mask;
846+ if (root->rm)
847+ root->rm(node, offset);
848+ node->stores[offset] = NULL;
849+ root->num--;
850+ if (root->min > index) {
851+ root->min = index;
852+ root->enter_node = node;
853+ }
854+ }
855+
856+ if (start != end) {
857+ do {
858+ node = start;
859+ start = start->parent;
860+ if (unlikely(root->enter_node == node))
861+ root->enter_node = end;
862+ root->free(node);
863+ } while (start != end);
864+
865+ /*
866+ * Note that shrink may free "end", so enter_node still needs to
867+ * be checked inside.
868+ */
869+ sradix_tree_shrink(root);
870+ } else if (node->count == root->stores_size - 1) {
871+ /* It WAS a full leaf node. Update the ancestors */
872+ node = node->parent;
873+ while (node) {
874+ node->fulls--;
875+ if (node->fulls != root->stores_size - 1)
876+ break;
877+
878+ node = node->parent;
879+ }
880+ }
881+}
882+
883+void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
884+{
885+ unsigned int height, offset;
886+ struct sradix_tree_node *node;
887+ int shift;
888+
889+ node = root->rnode;
890+ if (node == NULL || (index >> (root->shift * root->height)))
891+ return NULL;
892+
893+ height = root->height;
894+ shift = (height - 1) * root->shift;
895+
896+ do {
897+ offset = (index >> shift) & root->mask;
898+ node = node->stores[offset];
899+ if (!node)
900+ return NULL;
901+
902+ shift -= root->shift;
903+ } while (shift >= 0);
904+
905+ return node;
906+}
907+
908+/*
909+ * Return the item if it exists, otherwise create it in place
910+ * and return the created item.
911+ */
912+void *sradix_tree_lookup_create(struct sradix_tree_root *root,
913+ unsigned long index, void *(*item_alloc)(void))
914+{
915+ unsigned int height, offset;
916+ struct sradix_tree_node *node, *tmp;
917+ void *item;
918+ int shift, error;
919+
920+ if (root->rnode == NULL || (index >> (root->shift * root->height))) {
921+ if (item_alloc) {
922+ error = sradix_tree_extend(root, index);
923+ if (error)
924+ return NULL;
925+ } else {
926+ return NULL;
927+ }
928+ }
929+
930+ node = root->rnode;
931+ height = root->height;
932+ shift = (height - 1) * root->shift;
933+
934+ do {
935+ offset = (index >> shift) & root->mask;
936+ if (!node->stores[offset]) {
937+ if (!(tmp = root->alloc()))
938+ return NULL;
939+
940+ tmp->height = shift / root->shift;
941+ node->stores[offset] = tmp;
942+ tmp->parent = node;
943+ node->count++;
944+ node = tmp;
945+ } else {
946+ node = node->stores[offset];
947+ }
948+
949+ shift -= root->shift;
950+ } while (shift > 0);
951+
952+ BUG_ON(node->height != 1);
953+ offset = index & root->mask;
954+ if (node->stores[offset]) {
955+ return node->stores[offset];
956+ } else if (item_alloc) {
957+ if (!(item = item_alloc()))
958+ return NULL;
959+
960+ node->stores[offset] = item;
961+
962+ /*
963+ * NOTE: we do NOT call root->assign here, since the item we just
964+ * created carries no meaning yet. The caller can invoke it afterwards
965+ * if necessary.
966+ */
967+
968+ node->count++;
969+ root->num++;
970+
971+ while (sradix_node_full(root, node)) {
972+ node = node->parent;
973+ if (!node)
974+ break;
975+
976+ node->fulls++;
977+ }
978+
979+ if (unlikely(!node)) {
980+ /* All nodes are full */
981+ root->min = 1 << (root->height * root->shift);
982+ } else {
983+ if (root->min == index) {
984+ root->min |= (1UL << (node->height - 1)) - 1;
985+ root->min++;
986+ root->enter_node = node;
987+ }
988+ }
989+
990+ return item;
991+ } else {
992+ return NULL;
993+ }
994+
995+}
996+
997+int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
998+{
999+ unsigned int height, offset;
1000+ struct sradix_tree_node *node;
1001+ int shift;
1002+
1003+ node = root->rnode;
1004+ if (node == NULL || (index >> (root->shift * root->height)))
1005+ return -ENOENT;
1006+
1007+ height = root->height;
1008+ shift = (height - 1) * root->shift;
1009+
1010+ do {
1011+ offset = (index >> shift) & root->mask;
1012+ node = node->stores[offset];
1013+ if (!node)
1014+ return -ENOENT;
1015+
1016+ shift -= root->shift;
1017+ } while (shift > 0);
1018+
1019+ offset = index & root->mask;
1020+ if (!node->stores[offset])
1021+ return -ENOENT;
1022+
1023+ sradix_tree_delete_from_leaf(root, node, index);
1024+
1025+ return 0;
1026+}
1027diff --git a/mm/Kconfig b/mm/Kconfig
1028index 1d1ae6b..511dde6 100644
1029--- a/mm/Kconfig
1030+++ b/mm/Kconfig
1031@@ -339,6 +339,32 @@ config KSM
1032 See Documentation/vm/ksm.txt for more information: KSM is inactive
1033 until a program has madvised that an area is MADV_MERGEABLE, and
1034 root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
1035+choice
1036+ prompt "Choose UKSM/KSM strategy"
1037+ default UKSM
1038+ depends on KSM
1039+ help
1040+ This option allows you to select a UKSM/KSM strategy.
1041+
1042+config UKSM
1043+ bool "Ultra-KSM for page merging"
1044+ depends on KSM
1045+ help
1046+ UKSM is inspired by the Linux kernel project KSM (Kernel Samepage
1047+ Merging), but with a fundamentally rewritten core algorithm. With
1048+ an advanced algorithm, UKSM can now transparently scan all anonymously
1049+ mapped user space applications with significantly improved scan speed
1050+ and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit
1051+ from UKSM. UKSM now has its first stable release and its first real-world
1052+ enterprise user. For more information, please go to its project page:
1053+ (www.kerneldedup.org)
1054+
1055+config KSM_LEGACY
1056+ bool "Legacy KSM implementation"
1057+ depends on KSM
1058+ help
1059+ The legacy KSM implementation from Red Hat.
1060+endchoice
1061
1062 config DEFAULT_MMAP_MIN_ADDR
1063 int "Low address space to protect from user allocation"
1064diff --git a/mm/Makefile b/mm/Makefile
1065index 8405eb0..7689f0c 100644
1066--- a/mm/Makefile
1067+++ b/mm/Makefile
1068@@ -44,7 +44,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o
1069 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
1070 obj-$(CONFIG_SLOB) += slob.o
1071 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
1072-obj-$(CONFIG_KSM) += ksm.o
1073+obj-$(CONFIG_KSM_LEGACY) += ksm.o
1074+obj-$(CONFIG_UKSM) += uksm.o
1075 obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
1076 obj-$(CONFIG_SLAB) += slab.o
1077 obj-$(CONFIG_SLUB) += slub.o
1078diff --git a/mm/memory.c b/mm/memory.c
1079index d5f2ae9..86b5d09 100644
1080--- a/mm/memory.c
1081+++ b/mm/memory.c
1082@@ -120,6 +120,28 @@ unsigned long highest_memmap_pfn __read_mostly;
1083
1084 EXPORT_SYMBOL(zero_pfn);
1085
1086+#ifdef CONFIG_UKSM
1087+unsigned long uksm_zero_pfn __read_mostly;
1088+EXPORT_SYMBOL_GPL(uksm_zero_pfn);
1089+struct page *empty_uksm_zero_page;
1090+
1091+static int __init setup_uksm_zero_page(void)
1092+{
1093+ unsigned long addr;
1094+ addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0);
1095+ if (!addr)
1096+ panic("Oh boy, that early out of memory?");
1097+
1098+ empty_uksm_zero_page = virt_to_page((void *) addr);
1099+ SetPageReserved(empty_uksm_zero_page);
1100+
1101+ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
1102+
1103+ return 0;
1104+}
1105+core_initcall(setup_uksm_zero_page);
1106+#endif
1107+
1108 /*
1109 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
1110 */
1111@@ -131,6 +153,7 @@ static int __init init_zero_pfn(void)
1112 core_initcall(init_zero_pfn);
1113
1114
1115+
1116 #if defined(SPLIT_RSS_COUNTING)
1117
1118 void sync_mm_rss(struct mm_struct *mm)
1119@@ -878,6 +901,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1120 rss[MM_ANONPAGES]++;
1121 else
1122 rss[MM_FILEPAGES]++;
1123+
1124+ /* Should return NULL in vm_normal_page() */
1125+ uksm_bugon_zeropage(pte);
1126+ } else {
1127+ uksm_map_zero_page(pte);
1128 }
1129
1130 out_set_pte:
1131@@ -1120,8 +1148,10 @@ again:
1132 ptent = ptep_get_and_clear_full(mm, addr, pte,
1133 tlb->fullmm);
1134 tlb_remove_tlb_entry(tlb, pte, addr);
1135- if (unlikely(!page))
1136+ if (unlikely(!page)) {
1137+ uksm_unmap_zero_page(ptent);
1138 continue;
1139+ }
1140 if (unlikely(details) && details->nonlinear_vma
1141 && linear_page_index(details->nonlinear_vma,
1142 addr) != page->index) {
1143@@ -1983,8 +2013,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
1144 clear_page(kaddr);
1145 kunmap_atomic(kaddr);
1146 flush_dcache_page(dst);
1147- } else
1148+ } else {
1149 copy_user_highpage(dst, src, va, vma);
1150+ uksm_cow_page(vma, src);
1151+ }
1152 }
1153
1154 /*
1155@@ -2198,6 +2230,7 @@ gotten:
1156 new_page = alloc_zeroed_user_highpage_movable(vma, address);
1157 if (!new_page)
1158 goto oom;
1159+ uksm_cow_pte(vma, orig_pte);
1160 } else {
1161 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1162 if (!new_page)
1163@@ -2223,8 +2256,11 @@ gotten:
1164 dec_mm_counter_fast(mm, MM_FILEPAGES);
1165 inc_mm_counter_fast(mm, MM_ANONPAGES);
1166 }
1167- } else
1168+ uksm_bugon_zeropage(orig_pte);
1169+ } else {
1170+ uksm_unmap_zero_page(orig_pte);
1171 inc_mm_counter_fast(mm, MM_ANONPAGES);
1172+ }
1173 flush_cache_page(vma, address, pte_pfn(orig_pte));
1174 entry = mk_pte(new_page, vma->vm_page_prot);
1175 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1176diff --git a/mm/mmap.c b/mm/mmap.c
1177index ae91989..844f366 100644
1178--- a/mm/mmap.c
1179+++ b/mm/mmap.c
1180@@ -41,6 +41,7 @@
1181 #include <linux/notifier.h>
1182 #include <linux/memory.h>
1183 #include <linux/printk.h>
1184+#include <linux/ksm.h>
1185
1186 #include <asm/uaccess.h>
1187 #include <asm/cacheflush.h>
1188@@ -279,6 +280,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
1189 if (vma->vm_file)
1190 fput(vma->vm_file);
1191 mpol_put(vma_policy(vma));
1192+ uksm_remove_vma(vma);
1193 kmem_cache_free(vm_area_cachep, vma);
1194 return next;
1195 }
1196@@ -739,9 +741,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
1197 long adjust_next = 0;
1198 int remove_next = 0;
1199
1200+/*
1201+ * To avoid deadlock, uksm_remove_vma must be done before any spin_lock is
1202+ * acquired.
1203+ */
1204+ uksm_remove_vma(vma);
1205+
1206 if (next && !insert) {
1207 struct vm_area_struct *exporter = NULL;
1208
1209+ uksm_remove_vma(next);
1210 if (end >= next->vm_end) {
1211 /*
1212 * vma expands, overlapping all the next, and
1213@@ -838,6 +847,7 @@ again: remove_next = 1 + (end > next->vm_end);
1214 end_changed = true;
1215 }
1216 vma->vm_pgoff = pgoff;
1217+
1218 if (adjust_next) {
1219 next->vm_start += adjust_next << PAGE_SHIFT;
1220 next->vm_pgoff += adjust_next;
1221@@ -908,16 +918,22 @@ again: remove_next = 1 + (end > next->vm_end);
1222 * up the code too much to do both in one go.
1223 */
1224 next = vma->vm_next;
1225- if (remove_next == 2)
1226+ if (remove_next == 2) {
1227+ uksm_remove_vma(next);
1228 goto again;
1229- else if (next)
1230+ } else if (next) {
1231 vma_gap_update(next);
1232- else
1233+ } else {
1234 mm->highest_vm_end = end;
1235+ }
1236+ } else {
1237+ if (next && !insert)
1238+ uksm_vma_add_new(next);
1239 }
1240 if (insert && file)
1241 uprobe_mmap(insert);
1242
1243+ uksm_vma_add_new(vma);
1244 validate_mm(mm);
1245
1246 return 0;
1247@@ -1310,6 +1326,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1248 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
1249 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1250
1251+ /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */
1252+ uksm_vm_flags_mod(&vm_flags);
1253+
1254 if (flags & MAP_LOCKED)
1255 if (!can_do_mlock())
1256 return -EPERM;
1257@@ -1651,6 +1670,7 @@ munmap_back:
1258 allow_write_access(file);
1259 }
1260 file = vma->vm_file;
1261+ uksm_vma_add_new(vma);
1262 out:
1263 perf_event_mmap(vma);
1264
1265@@ -1692,6 +1712,7 @@ allow_write_and_free_vma:
1266 if (vm_flags & VM_DENYWRITE)
1267 allow_write_access(file);
1268 free_vma:
1269+ uksm_remove_vma(vma);
1270 kmem_cache_free(vm_area_cachep, vma);
1271 unacct_error:
1272 if (charged)
1273@@ -2488,6 +2509,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1274 else
1275 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1276
1277+ uksm_vma_add_new(new);
1278+
1279 /* Success. */
1280 if (!err)
1281 return 0;
1282@@ -2654,6 +2677,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
1283 return addr;
1284
1285 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1286+ uksm_vm_flags_mod(&flags);
1287
1288 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
1289 if (error & ~PAGE_MASK)
1290@@ -2712,6 +2736,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
1291 vma->vm_flags = flags;
1292 vma->vm_page_prot = vm_get_page_prot(flags);
1293 vma_link(mm, vma, prev, rb_link, rb_parent);
1294+ uksm_vma_add_new(vma);
1295 out:
1296 perf_event_mmap(vma);
1297 mm->total_vm += len >> PAGE_SHIFT;
1298@@ -2747,6 +2772,12 @@ void exit_mmap(struct mm_struct *mm)
1299 /* mm's last user has gone, and its about to be pulled down */
1300 mmu_notifier_release(mm);
1301
1302+ /*
1303+ * Taking write lock on mmap_sem does not harm others,
1304+ * but it's crucial for uksm to avoid races.
1305+ */
1306+ down_write(&mm->mmap_sem);
1307+
1308 if (mm->locked_vm) {
1309 vma = mm->mmap;
1310 while (vma) {
1311@@ -2783,6 +2814,11 @@ void exit_mmap(struct mm_struct *mm)
1312 }
1313 vm_unacct_memory(nr_accounted);
1314
1315+ mm->mmap = NULL;
1316+ mm->mm_rb = RB_ROOT;
1317+ vmacache_invalidate(mm);
1318+ up_write(&mm->mmap_sem);
1319+
1320 WARN_ON(atomic_long_read(&mm->nr_ptes) >
1321 (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
1322 }
1323@@ -2891,6 +2927,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
1324 new_vma->vm_ops->open(new_vma);
1325 vma_link(mm, new_vma, prev, rb_link, rb_parent);
1326 *need_rmap_locks = false;
1327+ uksm_vma_add_new(new_vma);
1328 }
1329 }
1330 return new_vma;
1331@@ -3004,10 +3041,10 @@ static struct vm_area_struct *__install_special_mapping(
1332 ret = insert_vm_struct(mm, vma);
1333 if (ret)
1334 goto out;
1335-
1336 mm->total_vm += len >> PAGE_SHIFT;
1337
1338 perf_event_mmap(vma);
1339+ uksm_vma_add_new(vma);
1340
1341 return vma;
1342
1343diff --git a/mm/rmap.c b/mm/rmap.c
1344index 3e4c721..d39d8a3 100644
1345--- a/mm/rmap.c
1346+++ b/mm/rmap.c
1347@@ -908,9 +908,9 @@ void page_move_anon_rmap(struct page *page,
1348
1349 /**
1350 * __page_set_anon_rmap - set up new anonymous rmap
1351- * @page: Page to add to rmap
1352+ * @page: Page to add to rmap
1353 * @vma: VM area to add page to.
1354- * @address: User virtual address of the mapping
1355+ * @address: User virtual address of the mapping
1356 * @exclusive: the page is exclusively owned by the current process
1357 */
1358 static void __page_set_anon_rmap(struct page *page,
1359diff --git a/mm/uksm.c b/mm/uksm.c
1360new file mode 100644
1361index 0000000..c76fcfc
1362--- /dev/null
1363+++ b/mm/uksm.c
1364@@ -0,0 +1,5519 @@
1365+/*
1366+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
1367+ *
1368+ * This is an improvement upon KSM. Some basic data structures and routines
1369+ * are borrowed from ksm.c .
1370+ *
1371+ * Its new features:
1372+ * 1. Full system scan:
1373+ * It automatically scans all user processes' anonymous VMAs. Kernel-user
1374+ * interaction to submit a memory area to KSM is no longer needed.
1375+ *
1376+ * 2. Rich area detection:
1377+ * It automatically detects rich areas containing abundant duplicated
1378+ * pages. Rich areas are given a full scan speed. Poor areas are
1379+ * sampled at a reasonable speed with very low CPU consumption.
1380+ *
1381+ * 3. Ultra Per-page scan speed improvement:
1382+ * A new hash algorithm is proposed. As a result, on a machine with
1383+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
1384+ * can scan memory areas that do not contain duplicated pages at a speed of
1385+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
1386+ * 477MB/sec ~ 923MB/sec.
1387+ *
1388+ * 4. Thrashing area avoidance:
1389+ * A thrashing area (a VMA that has frequent KSM page break-outs) can be
1390+ * filtered out. My benchmark shows it's more efficient than KSM's per-page
1391+ * hash value based volatile page detection.
1392+ *
1393+ *
1394+ * 5. Misc changes upon KSM:
1395+ * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page
1396+ * comparison. It's much faster than the default C version on x86.
1397+ * * rmap_item now has a struct page *page member to loosely cache an
1398+ * address-->page mapping, which avoids many time-costly calls to
1399+ * follow_page().
1400+ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
1401+ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
1402+ * ksm is needed for this case.
1403+ *
1404+ * 6. Full Zero Page consideration (contributed by Figo Zhang)
1405+ * Now uksmd considers full zero pages as special pages and merges them into a
1406+ * special unswappable uksm zero page.
1407+ */
1408+
1409+#include <linux/errno.h>
1410+#include <linux/mm.h>
1411+#include <linux/fs.h>
1412+#include <linux/mman.h>
1413+#include <linux/sched.h>
1414+#include <linux/rwsem.h>
1415+#include <linux/pagemap.h>
1416+#include <linux/rmap.h>
1417+#include <linux/spinlock.h>
1418+#include <linux/jhash.h>
1419+#include <linux/delay.h>
1420+#include <linux/kthread.h>
1421+#include <linux/wait.h>
1422+#include <linux/slab.h>
1423+#include <linux/rbtree.h>
1424+#include <linux/memory.h>
1425+#include <linux/mmu_notifier.h>
1426+#include <linux/swap.h>
1427+#include <linux/ksm.h>
1428+#include <linux/crypto.h>
1429+#include <linux/scatterlist.h>
1430+#include <crypto/hash.h>
1431+#include <linux/random.h>
1432+#include <linux/math64.h>
1433+#include <linux/gcd.h>
1434+#include <linux/freezer.h>
1435+#include <linux/sradix-tree.h>
1436+
1437+#include <asm/tlbflush.h>
1438+#include "internal.h"
1439+
1440+#ifdef CONFIG_X86
1441+#undef memcmp
1442+
1443+#ifdef CONFIG_X86_32
1444+#define memcmp memcmpx86_32
1445+/*
1446+ * Compare 4-byte-aligned address s1 and s2, with length n
1447+ */
1448+int memcmpx86_32(void *s1, void *s2, size_t n)
1449+{
1450+ size_t num = n / 4;
1451+ register int res;
1452+
1453+ __asm__ __volatile__
1454+ (
1455+ "testl %3,%3\n\t"
1456+ "repe; cmpsd\n\t"
1457+ "je 1f\n\t"
1458+ "sbbl %0,%0\n\t"
1459+ "orl $1,%0\n"
1460+ "1:"
1461+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1462+ : "0" (0)
1463+ : "cc");
1464+
1465+ return res;
1466+}
1467+
1468+/*
1469+ * Check the page is all zero ?
1470+ */
1471+static int is_full_zero(const void *s1, size_t len)
1472+{
1473+ unsigned char same;
1474+
1475+ len /= 4;
1476+
1477+ __asm__ __volatile__
1478+ ("repe; scasl;"
1479+ "sete %0"
1480+ : "=qm" (same), "+D" (s1), "+c" (len)
1481+ : "a" (0)
1482+ : "cc");
1483+
1484+ return same;
1485+}
1486+
1487+
1488+#elif defined(CONFIG_X86_64)
1489+#define memcmp memcmpx86_64
1490+/*
1491+ * Compare 8-byte-aligned address s1 and s2, with length n
1492+ */
1493+int memcmpx86_64(void *s1, void *s2, size_t n)
1494+{
1495+ size_t num = n / 8;
1496+ register int res;
1497+
1498+ __asm__ __volatile__
1499+ (
1500+ "testq %q3,%q3\n\t"
1501+ "repe; cmpsq\n\t"
1502+ "je 1f\n\t"
1503+ "sbbq %q0,%q0\n\t"
1504+ "orq $1,%q0\n"
1505+ "1:"
1506+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1507+ : "0" (0)
1508+ : "cc");
1509+
1510+ return res;
1511+}
1512+
1513+static int is_full_zero(const void *s1, size_t len)
1514+{
1515+ unsigned char same;
1516+
1517+ len /= 8;
1518+
1519+ __asm__ __volatile__
1520+ ("repe; scasq;"
1521+ "sete %0"
1522+ : "=qm" (same), "+D" (s1), "+c" (len)
1523+ : "a" (0)
1524+ : "cc");
1525+
1526+ return same;
1527+}
1528+
1529+#endif
1530+#else
1531+static int is_full_zero(const void *s1, size_t len)
1532+{
1533+ unsigned long *src = s1;
1534+ int i;
1535+
1536+ len /= sizeof(*src);
1537+
1538+ for (i = 0; i < len; i++) {
1539+ if (src[i])
1540+ return 0;
1541+ }
1542+
1543+ return 1;
1544+}
1545+#endif
1546+
1547+#define UKSM_RUNG_ROUND_FINISHED (1 << 0)
1548+#define TIME_RATIO_SCALE 10000
1549+
1550+#define SLOT_TREE_NODE_SHIFT 8
1551+#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT)
1552+struct slot_tree_node {
1553+ unsigned long size;
1554+ struct sradix_tree_node snode;
1555+ void *stores[SLOT_TREE_NODE_STORE_SIZE];
1556+};
1557+
1558+static struct kmem_cache *slot_tree_node_cachep;
1559+
1560+static struct sradix_tree_node *slot_tree_node_alloc(void)
1561+{
1562+ struct slot_tree_node *p;
1563+ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL);
1564+ if (!p)
1565+ return NULL;
1566+
1567+ return &p->snode;
1568+}
1569+
1570+static void slot_tree_node_free(struct sradix_tree_node *node)
1571+{
1572+ struct slot_tree_node *p;
1573+
1574+ p = container_of(node, struct slot_tree_node, snode);
1575+ kmem_cache_free(slot_tree_node_cachep, p);
1576+}
1577+
1578+static void slot_tree_node_extend(struct sradix_tree_node *parent,
1579+ struct sradix_tree_node *child)
1580+{
1581+ struct slot_tree_node *p, *c;
1582+
1583+ p = container_of(parent, struct slot_tree_node, snode);
1584+ c = container_of(child, struct slot_tree_node, snode);
1585+
1586+ p->size += c->size;
1587+}
1588+
1589+void slot_tree_node_assign(struct sradix_tree_node *node,
1590+ unsigned index, void *item)
1591+{
1592+ struct vma_slot *slot = item;
1593+ struct slot_tree_node *cur;
1594+
1595+ slot->snode = node;
1596+ slot->sindex = index;
1597+
1598+ while (node) {
1599+ cur = container_of(node, struct slot_tree_node, snode);
1600+ cur->size += slot->pages;
1601+ node = node->parent;
1602+ }
1603+}
1604+
1605+void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset)
1606+{
1607+ struct vma_slot *slot;
1608+ struct slot_tree_node *cur;
1609+ unsigned long pages;
1610+
1611+ if (node->height == 1) {
1612+ slot = node->stores[offset];
1613+ pages = slot->pages;
1614+ } else {
1615+ cur = container_of(node->stores[offset],
1616+ struct slot_tree_node, snode);
1617+ pages = cur->size;
1618+ }
1619+
1620+ while (node) {
1621+ cur = container_of(node, struct slot_tree_node, snode);
1622+ cur->size -= pages;
1623+ node = node->parent;
1624+ }
1625+}
1626+
1627+unsigned long slot_iter_index;
1628+int slot_iter(void *item, unsigned long height)
1629+{
1630+ struct slot_tree_node *node;
1631+ struct vma_slot *slot;
1632+
1633+ if (height == 1) {
1634+ slot = item;
1635+ if (slot_iter_index < slot->pages) {
1636+ /*in this one*/
1637+ return 1;
1638+ } else {
1639+ slot_iter_index -= slot->pages;
1640+ return 0;
1641+ }
1642+
1643+ } else {
1644+ node = container_of(item, struct slot_tree_node, snode);
1645+ if (slot_iter_index < node->size) {
1646+ /*in this one*/
1647+ return 1;
1648+ } else {
1649+ slot_iter_index -= node->size;
1650+ return 0;
1651+ }
1652+ }
1653+}
1654+
1655+
1656+static inline void slot_tree_init_root(struct sradix_tree_root *root)
1657+{
1658+ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
1659+ root->alloc = slot_tree_node_alloc;
1660+ root->free = slot_tree_node_free;
1661+ root->extend = slot_tree_node_extend;
1662+ root->assign = slot_tree_node_assign;
1663+ root->rm = slot_tree_node_rm;
1664+}
1665+
1666+void slot_tree_init(void)
1667+{
1668+ slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
1669+ sizeof(struct slot_tree_node), 0,
1670+ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
1671+ NULL);
1672+}
1673+
1674+
1675+/* Each rung of this ladder is a list of VMAs having a same scan ratio */
1676+struct scan_rung {
1677+ //struct list_head scanned_list;
1678+ struct sradix_tree_root vma_root;
1679+ struct sradix_tree_root vma_root2;
1680+
1681+ struct vma_slot *current_scan;
1682+ unsigned long current_offset;
1683+
1684+ /*
1685+ * The initial value for current_offset, it should loop over
1686+ * [0 ~ step-1] to let every slot have a chance to be scanned.
1687+ */
1688+ unsigned long offset_init;
1689+ unsigned long step; /* dynamic step for current_offset */
1690+ unsigned int flags;
1691+ unsigned long pages_to_scan;
1692+ //unsigned long fully_scanned_slots;
1693+ /*
1694+ * A little bit tricky - if cpu_time_ratio > 0, then the value is the
1695+ * cpu time ratio it can spend in rung_i for every scan
1696+ * period. if < 0, then it is the cpu time ratio relative to the
1697+ * max cpu percentage user specified. Both in unit of
1698+ * 1/TIME_RATIO_SCALE
1699+ */
1700+ int cpu_ratio;
1701+
1702+ /*
1703+ * How long it will take for all slots in this rung to be fully
1704+ * scanned? If it's zero, we don't care about the cover time:
1705+ * it's fully scanned.
1706+ */
1707+ unsigned int cover_msecs;
1708+ //unsigned long vma_num;
1709+ //unsigned long pages; /* Sum of all slot's pages in rung */
1710+};
1711+
1712+/**
1713+ * node of either the stable or unstable rbtree
1714+ *
1715+ */
1716+struct tree_node {
1717+ struct rb_node node; /* link in the main (un)stable rbtree */
1718+ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
1719+ u32 hash;
1720+ unsigned long count; /* TODO: merged with sub_root */
1721+ struct list_head all_list; /* all tree nodes in stable/unstable tree */
1722+};
1723+
1724+/**
1725+ * struct stable_node - node of the stable rbtree
1726+ * @node: rb node of this ksm page in the stable tree
1727+ * @hlist: hlist head of rmap_items using this ksm page
1728+ * @kpfn: page frame number of this ksm page
1729+ */
1730+struct stable_node {
1731+ struct rb_node node; /* link in sub-rbtree */
1732+ struct tree_node *tree_node; /* its tree_node root in the stable tree, NULL if it's in the hell list */
1733+ struct hlist_head hlist;
1734+ unsigned long kpfn;
1735+ u32 hash_max; /* if ==0 then it's not been calculated yet */
1736+ struct list_head all_list; /* in a list for all stable nodes */
1737+};
1738+
1739+/**
1740+ * struct node_vma - group rmap_items linked in the same stable
1741+ * node together.
1742+ */
1743+struct node_vma {
1744+ union {
1745+ struct vma_slot *slot;
1746+ unsigned long key; /* slot is used as key sorted on hlist */
1747+ };
1748+ struct hlist_node hlist;
1749+ struct hlist_head rmap_hlist;
1750+ struct stable_node *head;
1751+};
1752+
1753+/**
1754+ * struct rmap_item - reverse mapping item for virtual addresses
1755+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
1756+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
1757+ * @mm: the memory structure this rmap_item is pointing into
1758+ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
1759+ * @node: rb node of this rmap_item in the unstable tree
1760+ * @head: pointer to stable_node heading this list in the stable tree
1761+ * @hlist: link into hlist of rmap_items hanging off that stable_node
1762+ */
1763+struct rmap_item {
1764+ struct vma_slot *slot;
1765+ struct page *page;
1766+ unsigned long address; /* + low bits used for flags below */
1767+ unsigned long hash_round;
1768+ unsigned long entry_index;
1769+ union {
1770+ struct {/* when in unstable tree */
1771+ struct rb_node node;
1772+ struct tree_node *tree_node;
1773+ u32 hash_max;
1774+ };
1775+ struct { /* when in stable tree */
1776+ struct node_vma *head;
1777+ struct hlist_node hlist;
1778+ struct anon_vma *anon_vma;
1779+ };
1780+ };
1781+} __attribute__((aligned(4)));
1782+
1783+struct rmap_list_entry {
1784+ union {
1785+ struct rmap_item *item;
1786+ unsigned long addr;
1787+ };
1788+ /* lowest bit is used for is_addr tag */
1789+} __attribute__((aligned(4))); /* 4-aligned to fit into pages */
1790+
1791+
1792+/* Basic data structure definition ends */
1793+
1794+
1795+/*
1796+ * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
1797+ * The flags use the low bits of rmap_item.address
1798+ */
1799+#define UNSTABLE_FLAG 0x1
1800+#define STABLE_FLAG 0x2
1801+#define get_rmap_addr(x) ((x)->address & PAGE_MASK)
1802+
1803+/*
1804+ * rmap_list_entry helpers
1805+ */
1806+#define IS_ADDR_FLAG 1
1807+#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG)
1808+#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG)
1809+#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
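
The three macros above implement a classic tagged-pointer trick: because struct rmap_item and struct rmap_list_entry are __attribute__((aligned(4))), the low bit of a genuine item pointer is always clear, so that bit can be reused to mark entries that hold a plain address instead. A small stand-alone illustration (hypothetical demo code, not from the patch):

#include <assert.h>
#include <stdint.h>

#define DEMO_IS_ADDR_FLAG 1UL

struct demo_item { int payload; } __attribute__((aligned(4)));

int main(void)
{
	static struct demo_item it;
	uintptr_t entry;

	/* A real item pointer: 4-byte aligned, so bit 0 is guaranteed clear. */
	entry = (uintptr_t)&it;
	assert((entry & DEMO_IS_ADDR_FLAG) == 0);

	/* A bare address is stored with bit 0 set as the "is_addr" tag. */
	entry = 0x1000 | DEMO_IS_ADDR_FLAG;
	assert(entry & DEMO_IS_ADDR_FLAG);
	assert((entry & ~DEMO_IS_ADDR_FLAG) == 0x1000);

	return 0;
}
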
1810+
1811+
1812+/*
1813+ * High speed caches for frequently allocated and freed structs
1814+ */
1815+static struct kmem_cache *rmap_item_cache;
1816+static struct kmem_cache *stable_node_cache;
1817+static struct kmem_cache *node_vma_cache;
1818+static struct kmem_cache *vma_slot_cache;
1819+static struct kmem_cache *tree_node_cache;
1820+#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
1821+ sizeof(struct __struct), __alignof__(struct __struct),\
1822+ (__flags), NULL)
1823+
1824+/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
1825+#define SCAN_LADDER_SIZE 4
1826+static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
1827+
1828+/* The evaluation rounds uksmd has finished */
1829+static unsigned long long uksm_eval_round = 1;
1830+
1831+/*
1832+ * we add 1 to this var when we consider we should rebuild the whole
1833+ * unstable tree.
1834+ */
1835+static unsigned long uksm_hash_round = 1;
1836+
1837+/*
1838+ * How many times the whole memory is scanned.
1839+ */
1840+static unsigned long long fully_scanned_round = 1;
1841+
1842+/* The total number of virtual pages of all vma slots */
1843+static u64 uksm_pages_total;
1844+
1845+/* The number of pages that have been scanned since startup */
1846+static u64 uksm_pages_scanned;
1847+
1848+static u64 scanned_virtual_pages;
1849+
1850+/* The number of pages that have been scanned since the last encode_benefit call */
1851+static u64 uksm_pages_scanned_last;
1852+
1853+/* If the scanned number is too large, we encode it here */
1854+static u64 pages_scanned_stored;
1855+
1856+static unsigned long pages_scanned_base;
1857+
1858+/* The number of nodes in the stable tree */
1859+static unsigned long uksm_pages_shared;
1860+
1861+/* The number of page slots additionally sharing those nodes */
1862+static unsigned long uksm_pages_sharing;
1863+
1864+/* The number of nodes in the unstable tree */
1865+static unsigned long uksm_pages_unshared;
1866+
1867+/*
1868+ * Milliseconds ksmd should sleep between scans,
1869+ * >= 100ms to be consistent with
1870+ * scan_time_to_sleep_msec()
1871+ */
1872+static unsigned int uksm_sleep_jiffies;
1873+
1874+/* The real value for the uksmd next sleep */
1875+static unsigned int uksm_sleep_real;
1876+
1877+/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
1878+static unsigned int uksm_sleep_saved;
1879+
1880+/* Max percentage of cpu utilization ksmd can take to scan in one batch */
1881+static unsigned int uksm_max_cpu_percentage;
1882+
1883+static int uksm_cpu_governor;
1884+
1885+static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
1886+
1887+struct uksm_cpu_preset_s {
1888+ int cpu_ratio[SCAN_LADDER_SIZE];
1889+ unsigned int cover_msecs[SCAN_LADDER_SIZE];
1890+ unsigned int max_cpu; /* percentage */
1891+};
1892+
1893+struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
1894+ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
1895+ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
1896+ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
1897+ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
1898+};
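
Reading the presets together with the cpu_ratio comment in struct scan_rung above: a positive value is a fraction of CPU time in units of 1/TIME_RATIO_SCALE, while a negative value is that same fraction taken relative to the user-set maximum CPU percentage. A small helper showing one way to interpret this (illustration only, not a function from the patch; TIME_RATIO_SCALE (10000) is defined near the top of this file):

/* Returns hundredths of a percent so that small ratios are not truncated. */
static unsigned int approx_cpu_centipercent(int cpu_ratio, unsigned int max_cpu)
{
	if (cpu_ratio >= 0)
		return (unsigned int)cpu_ratio * 10000 / TIME_RATIO_SCALE;
	return (unsigned int)(-cpu_ratio) * max_cpu * 100 / TIME_RATIO_SCALE;
}

/*
 * With the "full" preset {20, 40, -2500, -10000} and max_cpu = 95:
 *   rung 0: 20/10000     ->  0.20% of CPU time
 *   rung 2: 25% of 95%   -> 23.75%
 *   rung 3: 100% of 95%  -> 95.00%
 */
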
1899+
1900+/* The default value for uksm_ema_page_time if it's not initialized */
1901+#define UKSM_PAGE_TIME_DEFAULT 500
1902+
1903+/* cost to scan one page by exponential moving average, in nsecs */
1904+static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
1905+
1906+/* The exponential moving average alpha weight, in percentage. */
1907+#define EMA_ALPHA 20
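+/*
+ * A rough sketch of the usual percentage-weighted EMA update this alpha is
+ * meant for (the exact update used elsewhere in this file may differ):
+ *
+ *   ema = (EMA_ALPHA * sample + (100 - EMA_ALPHA) * ema) / 100;
+ *
+ * e.g. with EMA_ALPHA == 20, ema == 500 and a new sample of 1000 nsecs,
+ * the estimate moves to (20 * 1000 + 80 * 500) / 100 == 600 nsecs.
+ */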
1908+
1909+/*
1910+ * The threshold used to filter out thrashing areas.
1911+ * If it is 0, filtering is disabled; otherwise it is the percentage upper
1912+ * bound of the thrashing ratio of all areas. Any area with a bigger
1913+ * thrashing ratio is considered to have a zero duplication ratio.
1914+ */
1915+static unsigned int uksm_thrash_threshold = 50;
1916+
1917+/* The dedup ratio above which an area is considered abundant */
1918+static unsigned int uksm_abundant_threshold = 10;
1919+
1920+/* All slots having merged pages in this eval round. */
1921+struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
1922+
1923+/* How many times uksmd has slept since startup */
1924+static unsigned long long uksm_sleep_times;
1925+
1926+#define UKSM_RUN_STOP 0
1927+#define UKSM_RUN_MERGE 1
1928+static unsigned int uksm_run = 1;
1929+
1930+static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
1931+static DEFINE_MUTEX(uksm_thread_mutex);
1932+
1933+/*
1934+ * List vma_slot_new holds newly created vma_slots waiting to be added by
1935+ * ksmd. If one cannot be added (e.g. because it is too small), it is moved
1936+ * to vma_slot_noadd. vma_slot_del is the list for vma_slots whose
1937+ * corresponding VMA has been removed/freed.
1938+ */
1939+struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
1940+struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
1941+struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
1942+static DEFINE_SPINLOCK(vma_slot_list_lock);
1943+
1944+/* The unstable tree heads */
1945+static struct rb_root root_unstable_tree = RB_ROOT;
1946+
1947+/*
1948+ * All tree_nodes are in a list to be freed at once when unstable tree is
1949+ * freed after each scan round.
1950+ */
1951+static struct list_head unstable_tree_node_list =
1952+ LIST_HEAD_INIT(unstable_tree_node_list);
1953+
1954+/* List contains all stable nodes */
1955+static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
1956+
1957+/*
1958+ * When the hash strength is changed, the stable tree must be delta_hashed and
1959+ * re-structured. We use two sets of the structs below to speed up the
1960+ * re-structuring of the stable tree.
1961+ */
1962+static struct list_head
1963+stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
1964+ LIST_HEAD_INIT(stable_tree_node_list[1])};
1965+
1966+static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
1967+static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
1968+static struct rb_root *root_stable_treep = &root_stable_tree[0];
1969+static unsigned long stable_tree_index;
1970+
1971+/* The hash strength needed to hash a full page */
1972+#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32))
1973+
1974+/* The hash strength needed for loop-back hashing */
1975+#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10)
1976+
1977+/* The random offsets in a page */
1978+static u32 *random_nums;
1979+
1980+/* The hash strength */
1981+static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
1982+
1983+/* The delta value each time the hash strength increases or decreases */
1984+static unsigned long hash_strength_delta;
1985+#define HASH_STRENGTH_DELTA_MAX 5
1986+
1987+/* The time we have saved due to random_sample_hash */
1988+static u64 rshash_pos;
1989+
1990+/* The time we have wasted due to hash collision */
1991+static u64 rshash_neg;
1992+
1993+struct uksm_benefit {
1994+ u64 pos;
1995+ u64 neg;
1996+ u64 scanned;
1997+ unsigned long base;
1998+} benefit;
1999+
2000+/*
2001+ * The relative cost of memcmp, compared to 1 time unit of random sample
2002+ * hash. This value is measured when the uksm module is initialized.
2003+ */
2004+static unsigned long memcmp_cost;
2005+
2006+static unsigned long rshash_neg_cont_zero;
2007+static unsigned long rshash_cont_obscure;
2008+
2009+/* The possible states of hash strength adjustment heuristic */
2010+enum rshash_states {
2011+ RSHASH_STILL,
2012+ RSHASH_TRYUP,
2013+ RSHASH_TRYDOWN,
2014+ RSHASH_NEW,
2015+ RSHASH_PRE_STILL,
2016+};
2017+
2018+/* The possible direction we are about to adjust hash strength */
2019+enum rshash_direct {
2020+ GO_UP,
2021+ GO_DOWN,
2022+ OBSCURE,
2023+ STILL,
2024+};
2025+
2026+/* random sampling hash state machine */
2027+static struct {
2028+ enum rshash_states state;
2029+ enum rshash_direct pre_direct;
2030+ u8 below_count;
2031+ /* Keep a lookup window of size 5; if above_count/below_count > 3
2032+ * within this window, we stop trying.
2033+ */
2034+ u8 lookup_window_index;
2035+ u64 stable_benefit;
2036+ unsigned long turn_point_down;
2037+ unsigned long turn_benefit_down;
2038+ unsigned long turn_point_up;
2039+ unsigned long turn_benefit_up;
2040+ unsigned long stable_point;
2041+} rshash_state;
2042+
2043+/* zero page hash table, hash_strength in [0, HASH_STRENGTH_MAX] */
2044+static u32 *zero_hash_table;
2045+
2046+static inline struct node_vma *alloc_node_vma(void)
2047+{
2048+ struct node_vma *node_vma;
2049+ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
2050+ if (node_vma) {
2051+ INIT_HLIST_HEAD(&node_vma->rmap_hlist);
2052+ INIT_HLIST_NODE(&node_vma->hlist);
2053+ }
2054+ return node_vma;
2055+}
2056+
2057+static inline void free_node_vma(struct node_vma *node_vma)
2058+{
2059+ kmem_cache_free(node_vma_cache, node_vma);
2060+}
2061+
2062+
2063+static inline struct vma_slot *alloc_vma_slot(void)
2064+{
2065+ struct vma_slot *slot;
2066+
2067+ /*
2068+ * In case uksm has not been initialized yet.
2069+ * TODO: reconsider the call site of uksm_init() in the future.
2070+ */
2071+ if (!vma_slot_cache)
2072+ return NULL;
2073+
2074+ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
2075+ if (slot) {
2076+ INIT_LIST_HEAD(&slot->slot_list);
2077+ INIT_LIST_HEAD(&slot->dedup_list);
2078+ slot->flags |= UKSM_SLOT_NEED_RERAND;
2079+ }
2080+ return slot;
2081+}
2082+
2083+static inline void free_vma_slot(struct vma_slot *vma_slot)
2084+{
2085+ kmem_cache_free(vma_slot_cache, vma_slot);
2086+}
2087+
2088+
2089+
2090+static inline struct rmap_item *alloc_rmap_item(void)
2091+{
2092+ struct rmap_item *rmap_item;
2093+
2094+ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
2095+ if (rmap_item) {
2096+ /* BUG if the lowest bit is not clear; it is reserved for flag use */
2097+ BUG_ON(is_addr(rmap_item));
2098+ }
2099+ return rmap_item;
2100+}
2101+
2102+static inline void free_rmap_item(struct rmap_item *rmap_item)
2103+{
2104+ rmap_item->slot = NULL; /* debug safety */
2105+ kmem_cache_free(rmap_item_cache, rmap_item);
2106+}
2107+
2108+static inline struct stable_node *alloc_stable_node(void)
2109+{
2110+ struct stable_node *node;
2111+ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
2112+ if (!node)
2113+ return NULL;
2114+
2115+ INIT_HLIST_HEAD(&node->hlist);
2116+ list_add(&node->all_list, &stable_node_list);
2117+ return node;
2118+}
2119+
2120+static inline void free_stable_node(struct stable_node *stable_node)
2121+{
2122+ list_del(&stable_node->all_list);
2123+ kmem_cache_free(stable_node_cache, stable_node);
2124+}
2125+
2126+static inline struct tree_node *alloc_tree_node(struct list_head *list)
2127+{
2128+ struct tree_node *node;
2129+ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
2130+ if (!node)
2131+ return NULL;
2132+
2133+ list_add(&node->all_list, list);
2134+ return node;
2135+}
2136+
2137+static inline void free_tree_node(struct tree_node *node)
2138+{
2139+ list_del(&node->all_list);
2140+ kmem_cache_free(tree_node_cache, node);
2141+}
2142+
2143+static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
2144+{
2145+ struct anon_vma *anon_vma = rmap_item->anon_vma;
2146+
2147+ put_anon_vma(anon_vma);
2148+}
2149+
2150+
2151+/**
2152+ * Remove a stable node from stable_tree, may unlink from its tree_node and
2153+ * may remove its parent tree_node if no other stable node is pending.
2154+ *
2155+ * @stable_node The node to be removed
2156+ * @unlink_rb Will this node be unlinked from the rbtree?
2157+ * @remove_tree_node Will its tree_node be removed if empty?
2158+ */
2159+static void remove_node_from_stable_tree(struct stable_node *stable_node,
2160+ int unlink_rb, int remove_tree_node)
2161+{
2162+ struct node_vma *node_vma;
2163+ struct rmap_item *rmap_item;
2164+ struct hlist_node *n;
2165+
2166+ if (!hlist_empty(&stable_node->hlist)) {
2167+ hlist_for_each_entry_safe(node_vma, n,
2168+ &stable_node->hlist, hlist) {
2169+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
2170+ uksm_pages_sharing--;
2171+
2172+ uksm_drop_anon_vma(rmap_item);
2173+ rmap_item->address &= PAGE_MASK;
2174+ }
2175+ free_node_vma(node_vma);
2176+ cond_resched();
2177+ }
2178+
2179+ /* the last one is counted as shared */
2180+ uksm_pages_shared--;
2181+ uksm_pages_sharing++;
2182+ }
2183+
2184+ if (stable_node->tree_node && unlink_rb) {
2185+ rb_erase(&stable_node->node,
2186+ &stable_node->tree_node->sub_root);
2187+
2188+ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
2189+ remove_tree_node) {
2190+ rb_erase(&stable_node->tree_node->node,
2191+ root_stable_treep);
2192+ free_tree_node(stable_node->tree_node);
2193+ } else {
2194+ stable_node->tree_node->count--;
2195+ }
2196+ }
2197+
2198+ free_stable_node(stable_node);
2199+}
2200+
2201+
2202+/*
2203+ * get_uksm_page: checks if the page indicated by the stable node
2204+ * is still its ksm page, despite having held no reference to it.
2205+ * In which case we can trust the content of the page, and it
2206+ * returns the gotten page; but if the page has now been zapped,
2207+ * remove the stale node from the stable tree and return NULL.
2208+ *
2209+ * You would expect the stable_node to hold a reference to the ksm page.
2210+ * But if it increments the page's count, swapping out has to wait for
2211+ * ksmd to come around again before it can free the page, which may take
2212+ * seconds or even minutes: much too unresponsive. So instead we use a
2213+ * "keyhole reference": access to the ksm page from the stable node peeps
2214+ * out through its keyhole to see if that page still holds the right key,
2215+ * pointing back to this stable node. This relies on freeing a PageAnon
2216+ * page to reset its page->mapping to NULL, and relies on no other use of
2217+ * a page to put something that might look like our key in page->mapping.
2218+ *
2219+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
2220+ * but this is different - made simpler by uksm_thread_mutex being held, but
2221+ * interesting for assuming that no other use of the struct page could ever
2222+ * put our expected_mapping into page->mapping (or a field of the union which
2223+ * coincides with page->mapping). The RCU calls are not for KSM at all, but
2224+ * to keep the page_count protocol described with page_cache_get_speculative.
2225+ *
2226+ * Note: it is possible that get_uksm_page() will return NULL one moment,
2227+ * then page the next, if the page is in between page_freeze_refs() and
2228+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
2229+ * is on its way to being freed; but it is an anomaly to bear in mind.
2230+ *
2231+ * @unlink_rb: whether the removal of this node should first unlink it from
2232+ * its rbtree. stable_node_reinsert will prevent this when restructuring the
2233+ * node from its old tree.
2234+ *
2235+ * @remove_tree_node: if this is the last one in its tree_node, will the
2236+ * tree_node be freed? If we are inserting a stable node, the tree_node may
2237+ * be reused, so don't free it.
2238+ */
2239+static struct page *get_uksm_page(struct stable_node *stable_node,
2240+ int unlink_rb, int remove_tree_node)
2241+{
2242+ struct page *page;
2243+ void *expected_mapping;
2244+
2245+ page = pfn_to_page(stable_node->kpfn);
2246+ expected_mapping = (void *)stable_node +
2247+ (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
2248+ rcu_read_lock();
2249+ if (page->mapping != expected_mapping)
2250+ goto stale;
2251+ if (!get_page_unless_zero(page))
2252+ goto stale;
2253+ if (page->mapping != expected_mapping) {
2254+ put_page(page);
2255+ goto stale;
2256+ }
2257+ rcu_read_unlock();
2258+ return page;
2259+stale:
2260+ rcu_read_unlock();
2261+ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
2262+
2263+ return NULL;
2264+}
2265+
2266+/*
2267+ * Removing rmap_item from stable or unstable tree.
2268+ * This function will clean the information from the stable/unstable tree.
2269+ */
2270+static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
2271+{
2272+ if (rmap_item->address & STABLE_FLAG) {
2273+ struct stable_node *stable_node;
2274+ struct node_vma *node_vma;
2275+ struct page *page;
2276+
2277+ node_vma = rmap_item->head;
2278+ stable_node = node_vma->head;
2279+ page = get_uksm_page(stable_node, 1, 1);
2280+ if (!page)
2281+ goto out;
2282+
2283+ /*
2284+ * page lock is needed because it's racing with
2285+ * try_to_unmap_ksm(), etc.
2286+ */
2287+ lock_page(page);
2288+ hlist_del(&rmap_item->hlist);
2289+
2290+ if (hlist_empty(&node_vma->rmap_hlist)) {
2291+ hlist_del(&node_vma->hlist);
2292+ free_node_vma(node_vma);
2293+ }
2294+ unlock_page(page);
2295+
2296+ put_page(page);
2297+ if (hlist_empty(&stable_node->hlist)) {
2298+ /* do NOT call remove_node_from_stable_tree() here:
2299+ * it's possible for a forked rmap_item to not be in
2300+ * the stable tree while the in-tree rmap_items have
2301+ * been deleted.
2302+ */
2303+ uksm_pages_shared--;
2304+ } else
2305+ uksm_pages_sharing--;
2306+
2307+
2308+ uksm_drop_anon_vma(rmap_item);
2309+ } else if (rmap_item->address & UNSTABLE_FLAG) {
2310+ if (rmap_item->hash_round == uksm_hash_round) {
2311+
2312+ rb_erase(&rmap_item->node,
2313+ &rmap_item->tree_node->sub_root);
2314+ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
2315+ rb_erase(&rmap_item->tree_node->node,
2316+ &root_unstable_tree);
2317+
2318+ free_tree_node(rmap_item->tree_node);
2319+ } else
2320+ rmap_item->tree_node->count--;
2321+ }
2322+ uksm_pages_unshared--;
2323+ }
2324+
2325+ rmap_item->address &= PAGE_MASK;
2326+ rmap_item->hash_max = 0;
2327+
2328+out:
2329+ cond_resched(); /* we're called from many long loops */
2330+}
2331+
2332+static inline int slot_in_uksm(struct vma_slot *slot)
2333+{
2334+ return list_empty(&slot->slot_list);
2335+}
2336+
2337+/*
2338+ * Test if the mm is exiting
2339+ */
2340+static inline bool uksm_test_exit(struct mm_struct *mm)
2341+{
2342+ return atomic_read(&mm->mm_users) == 0;
2343+}
2344+
2345+/**
2346+ * Needs to do two things:
2347+ * 1. check if the slot was moved to the del list
2348+ * 2. make sure the mmap_sem is manipulated under a valid vma.
2349+ *
2350+ * My concern here is that in some cases, this may cause
2351+ * vma_slot_list_lock waiters to be serialized further by some
2352+ * sem->wait_lock; can this really be expensive?
2353+ *
2354+ *
2355+ * @return
2356+ * 0: if successfully locked mmap_sem
2357+ * -ENOENT: this slot was moved to del list
2358+ * -EBUSY: vma lock failed
2359+ */
2360+static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
2361+{
2362+ struct vm_area_struct *vma;
2363+ struct mm_struct *mm;
2364+ struct rw_semaphore *sem;
2365+
2366+ spin_lock(&vma_slot_list_lock);
2367+
2368+ /* the slot_list was removed and re-inited from the new list when it
2369+ * entered uksm_list; if it is not empty now, it was moved to the del list
2370+ */
2371+ if (!slot_in_uksm(slot)) {
2372+ spin_unlock(&vma_slot_list_lock);
2373+ return -ENOENT;
2374+ }
2375+
2376+ BUG_ON(slot->pages != vma_pages(slot->vma));
2377+ /* Ok, vma still valid */
2378+ vma = slot->vma;
2379+ mm = vma->vm_mm;
2380+ sem = &mm->mmap_sem;
2381+
2382+ if (uksm_test_exit(mm)) {
2383+ spin_unlock(&vma_slot_list_lock);
2384+ return -ENOENT;
2385+ }
2386+
2387+ if (down_read_trylock(sem)) {
2388+ spin_unlock(&vma_slot_list_lock);
2389+ return 0;
2390+ }
2391+
2392+ spin_unlock(&vma_slot_list_lock);
2393+ return -EBUSY;
2394+}
2395+
2396+static inline unsigned long
2397+vma_page_address(struct page *page, struct vm_area_struct *vma)
2398+{
2399+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
2400+ unsigned long address;
2401+
2402+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
2403+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
2404+ /* page should be within @vma mapping range */
2405+ return -EFAULT;
2406+ }
2407+ return address;
2408+}
2409+
2410+
2411+/* return 0 on success with the item's mmap_sem locked */
2412+static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
2413+{
2414+ struct mm_struct *mm;
2415+ struct vma_slot *slot = item->slot;
2416+ int err = -EINVAL;
2417+
2418+ struct page *page;
2419+
2420+ /*
2421+ * try_down_read_slot_mmap_sem() returns non-zero if the slot has been
2422+ * removed by uksm_remove_vma() or if the mmap_sem could not be taken.
2423+ */
2424+ if (try_down_read_slot_mmap_sem(slot))
2425+ return -EBUSY;
2426+
2427+ mm = slot->vma->vm_mm;
2428+
2429+ if (uksm_test_exit(mm))
2430+ goto failout_up;
2431+
2432+ page = item->page;
2433+ rcu_read_lock();
2434+ if (!get_page_unless_zero(page)) {
2435+ rcu_read_unlock();
2436+ goto failout_up;
2437+ }
2438+
2439+ /* No need to consider huge page here. */
2440+ if (item->slot->vma->anon_vma != page_anon_vma(page) ||
2441+ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
2442+ /*
2443+ * TODO:
2444+ * should we release this item because of its stale page
2445+ * mapping?
2446+ */
2447+ put_page(page);
2448+ rcu_read_unlock();
2449+ goto failout_up;
2450+ }
2451+ rcu_read_unlock();
2452+ return 0;
2453+
2454+failout_up:
2455+ up_read(&mm->mmap_sem);
2456+ return err;
2457+}
2458+
2459+/*
2460+ * What kind of VMA is considered?
2461+ */
2462+static inline int vma_can_enter(struct vm_area_struct *vma)
2463+{
2464+ return uksm_flags_can_scan(vma->vm_flags);
2465+}
2466+
2467+/*
2468+ * Called whenever a fresh new vma is created. A new vma_slot
2469+ * is created and inserted into a global list. Must be called
2470+ * after the vma is inserted into its mm.
2471+ */
2472+void uksm_vma_add_new(struct vm_area_struct *vma)
2473+{
2474+ struct vma_slot *slot;
2475+
2476+ if (!vma_can_enter(vma)) {
2477+ vma->uksm_vma_slot = NULL;
2478+ return;
2479+ }
2480+
2481+ slot = alloc_vma_slot();
2482+ if (!slot) {
2483+ vma->uksm_vma_slot = NULL;
2484+ return;
2485+ }
2486+
2487+ vma->uksm_vma_slot = slot;
2488+ vma->vm_flags |= VM_MERGEABLE;
2489+ slot->vma = vma;
2490+ slot->mm = vma->vm_mm;
2491+ slot->ctime_j = jiffies;
2492+ slot->pages = vma_pages(vma);
2493+ spin_lock(&vma_slot_list_lock);
2494+ list_add_tail(&slot->slot_list, &vma_slot_new);
2495+ spin_unlock(&vma_slot_list_lock);
2496+}
2497+
2498+/*
2499+ * Called after vma is unlinked from its mm
2500+ */
2501+void uksm_remove_vma(struct vm_area_struct *vma)
2502+{
2503+ struct vma_slot *slot;
2504+
2505+ if (!vma->uksm_vma_slot)
2506+ return;
2507+
2508+ slot = vma->uksm_vma_slot;
2509+ spin_lock(&vma_slot_list_lock);
2510+ if (slot_in_uksm(slot)) {
2511+ /**
2512+ * This slot has been added by ksmd, so move to the del list
2513+ * waiting for ksmd to free it.
2514+ */
2515+ list_add_tail(&slot->slot_list, &vma_slot_del);
2516+ } else {
2517+ /**
2518+ * It's still on the new list. It's ok to free the slot directly.
2519+ */
2520+ list_del(&slot->slot_list);
2521+ free_vma_slot(slot);
2522+ }
2523+ spin_unlock(&vma_slot_list_lock);
2524+ vma->uksm_vma_slot = NULL;
2525+}
2526+
2527+/* 32/3 < they < 32/2 */
2528+#define shiftl 8
2529+#define shiftr 12
2530+
2531+#define HASH_FROM_TO(from, to) \
2532+for (index = from; index < to; index++) { \
2533+ pos = random_nums[index]; \
2534+ hash += key[pos]; \
2535+ hash += (hash << shiftl); \
2536+ hash ^= (hash >> shiftr); \
2537+}
2538+
2539+
2540+#define HASH_FROM_DOWN_TO(from, to) \
2541+for (index = from - 1; index >= to; index--) { \
2542+ hash ^= (hash >> shiftr); \
2543+ hash ^= (hash >> (shiftr*2)); \
2544+ hash -= (hash << shiftl); \
2545+ hash += (hash << (shiftl*2)); \
2546+ pos = random_nums[index]; \
2547+ hash -= key[pos]; \
2548+}
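+/*
+ * Roughly: HASH_FROM_TO() folds the 32-bit words sampled at offsets
+ * random_nums[from..to-1] into the running hash with add/shift/xor mixing,
+ * and HASH_FROM_DOWN_TO() applies the inverse steps so the contribution of
+ * offsets [to..from-1] can be removed again. This is what lets a hash
+ * computed at one strength be adjusted incrementally instead of rehashing
+ * the whole page (see delta_hash() below).
+ */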
2549+
2550+/*
2551+ * The main random sample hash function.
2552+ */
2553+static u32 random_sample_hash(void *addr, u32 hash_strength)
2554+{
2555+ u32 hash = 0xdeadbeef;
2556+ int index, pos, loop = hash_strength;
2557+ u32 *key = (u32 *)addr;
2558+
2559+ if (loop > HASH_STRENGTH_FULL)
2560+ loop = HASH_STRENGTH_FULL;
2561+
2562+ HASH_FROM_TO(0, loop);
2563+
2564+ if (hash_strength > HASH_STRENGTH_FULL) {
2565+ loop = hash_strength - HASH_STRENGTH_FULL;
2566+ HASH_FROM_TO(0, loop);
2567+ }
2568+
2569+ return hash;
2570+}
2571+
2572+
2573+/**
2574+ * Used when the hash strength is adjusted.
2575+ *
2576+ * @addr The page's virtual address
2577+ * @from The original hash strength
2578+ * @to The hash strength changed to
2579+ * @hash The hash value generated with the "from" hash strength
2580+ *
2581+ * return the hash value
2582+ */
2583+static u32 delta_hash(void *addr, int from, int to, u32 hash)
2584+{
2585+ u32 *key = (u32 *)addr;
2586+ int index, pos; /* make sure they are int type */
2587+
2588+ if (to > from) {
2589+ if (from >= HASH_STRENGTH_FULL) {
2590+ from -= HASH_STRENGTH_FULL;
2591+ to -= HASH_STRENGTH_FULL;
2592+ HASH_FROM_TO(from, to);
2593+ } else if (to <= HASH_STRENGTH_FULL) {
2594+ HASH_FROM_TO(from, to);
2595+ } else {
2596+ HASH_FROM_TO(from, HASH_STRENGTH_FULL);
2597+ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
2598+ }
2599+ } else {
2600+ if (from <= HASH_STRENGTH_FULL) {
2601+ HASH_FROM_DOWN_TO(from, to);
2602+ } else if (to >= HASH_STRENGTH_FULL) {
2603+ from -= HASH_STRENGTH_FULL;
2604+ to -= HASH_STRENGTH_FULL;
2605+ HASH_FROM_DOWN_TO(from, to);
2606+ } else {
2607+ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
2608+ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
2609+ }
2610+ }
2611+
2612+ return hash;
2613+}
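+/*
+ * Note on strengths above HASH_STRENGTH_FULL: as in random_sample_hash(),
+ * the extra (strength - HASH_STRENGTH_FULL) samples simply wrap around to
+ * offsets [0, extra) of the page, which is why both branches above treat
+ * the part beyond HASH_STRENGTH_FULL as a second pass starting from 0.
+ */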
2614+
2615+
2616+
2617+
2618+#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
2619+
2620+/**
2621+ *
2622+ * Called when rshash_pos or rshash_neg is about to overflow, or when a scan
2623+ * round has finished.
2624+ *
2625+ * return 0 if no page has been scanned since the last call, 1 otherwise.
2626+ */
2627+static inline int encode_benefit(void)
2628+{
2629+ u64 scanned_delta, pos_delta, neg_delta;
2630+ unsigned long base = benefit.base;
2631+
2632+ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
2633+
2634+ if (!scanned_delta)
2635+ return 0;
2636+
2637+ scanned_delta >>= base;
2638+ pos_delta = rshash_pos >> base;
2639+ neg_delta = rshash_neg >> base;
2640+
2641+ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
2642+ CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
2643+ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
2644+ benefit.scanned >>= 1;
2645+ benefit.neg >>= 1;
2646+ benefit.pos >>= 1;
2647+ benefit.base++;
2648+ scanned_delta >>= 1;
2649+ pos_delta >>= 1;
2650+ neg_delta >>= 1;
2651+ }
2652+
2653+ benefit.pos += pos_delta;
2654+ benefit.neg += neg_delta;
2655+ benefit.scanned += scanned_delta;
2656+
2657+ BUG_ON(!benefit.scanned);
2658+
2659+ rshash_pos = rshash_neg = 0;
2660+ uksm_pages_scanned_last = uksm_pages_scanned;
2661+
2662+ return 1;
2663+}
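+/*
+ * The benefit counters are kept in a scaled fixed-point form: each stored
+ * field approximates (real accumulated value >> benefit.base). Whenever an
+ * addition would overflow a u64, all three fields are halved and base is
+ * bumped by one, e.g. pos=6, neg=2, scanned=4 at base=1 stand for roughly
+ * 12, 4 and 8. The rescaling presumably preserves the pos/neg/scanned
+ * ratios that the hash strength heuristic cares about.
+ */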
2664+
2665+static inline void reset_benefit(void)
2666+{
2667+ benefit.pos = 0;
2668+ benefit.neg = 0;
2669+ benefit.base = 0;
2670+ benefit.scanned = 0;
2671+}
2672+
2673+static inline void inc_rshash_pos(unsigned long delta)
2674+{
2675+ if (CAN_OVERFLOW_U64(rshash_pos, delta))
2676+ encode_benefit();
2677+
2678+ rshash_pos += delta;
2679+}
2680+
2681+static inline void inc_rshash_neg(unsigned long delta)
2682+{
2683+ if (CAN_OVERFLOW_U64(rshash_neg, delta))
2684+ encode_benefit();
2685+
2686+ rshash_neg += delta;
2687+}
2688+
2689+
2690+static inline u32 page_hash(struct page *page, unsigned long hash_strength,
2691+ int cost_accounting)
2692+{
2693+ u32 val;
2694+ unsigned long delta;
2695+
2696+ void *addr = kmap_atomic(page);
2697+
2698+ val = random_sample_hash(addr, hash_strength);
2699+ kunmap_atomic(addr);
2700+
2701+ if (cost_accounting) {
2702+ if (HASH_STRENGTH_FULL > hash_strength)
2703+ delta = HASH_STRENGTH_FULL - hash_strength;
2704+ else
2705+ delta = 0;
2706+
2707+ inc_rshash_pos(delta);
2708+ }
2709+
2710+ return val;
2711+}
2712+
2713+static int memcmp_pages(struct page *page1, struct page *page2,
2714+ int cost_accounting)
2715+{
2716+ char *addr1, *addr2;
2717+ int ret;
2718+
2719+ addr1 = kmap_atomic(page1);
2720+ addr2 = kmap_atomic(page2);
2721+ ret = memcmp(addr1, addr2, PAGE_SIZE);
2722+ kunmap_atomic(addr2);
2723+ kunmap_atomic(addr1);
2724+
2725+ if (cost_accounting)
2726+ inc_rshash_neg(memcmp_cost);
2727+
2728+ return ret;
2729+}
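+/*
+ * Cost accounting sketch: page_hash() credits rshash_pos with the hashing
+ * work saved relative to a full-strength hash (HASH_STRENGTH_FULL -
+ * hash_strength), while each cost-accounted memcmp charges rshash_neg with
+ * memcmp_cost (collisions charge extra elsewhere). These two sides are
+ * folded into the benefit struct by encode_benefit() and, presumably,
+ * weighed against each other by the hash strength adjustment heuristic.
+ */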
2730+
2731+static inline int pages_identical(struct page *page1, struct page *page2)
2732+{
2733+ return !memcmp_pages(page1, page2, 0);
2734+}
2735+
2736+static inline int is_page_full_zero(struct page *page)
2737+{
2738+ char *addr;
2739+ int ret;
2740+
2741+ addr = kmap_atomic(page);
2742+ ret = is_full_zero(addr, PAGE_SIZE);
2743+ kunmap_atomic(addr);
2744+
2745+ return ret;
2746+}
2747+
2748+static int write_protect_page(struct vm_area_struct *vma, struct page *page,
2749+ pte_t *orig_pte, pte_t *old_pte)
2750+{
2751+ struct mm_struct *mm = vma->vm_mm;
2752+ unsigned long addr;
2753+ pte_t *ptep;
2754+ spinlock_t *ptl;
2755+ int swapped;
2756+ int err = -EFAULT;
2757+ unsigned long mmun_start; /* For mmu_notifiers */
2758+ unsigned long mmun_end; /* For mmu_notifiers */
2759+
2760+ addr = page_address_in_vma(page, vma);
2761+ if (addr == -EFAULT)
2762+ goto out;
2763+
2764+ BUG_ON(PageTransCompound(page));
2765+
2766+ mmun_start = addr;
2767+ mmun_end = addr + PAGE_SIZE;
2768+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2769+
2770+ ptep = page_check_address(page, mm, addr, &ptl, 0);
2771+ if (!ptep)
2772+ goto out_mn;
2773+
2774+ if (old_pte)
2775+ *old_pte = *ptep;
2776+
2777+ if (pte_write(*ptep) || pte_dirty(*ptep)) {
2778+ pte_t entry;
2779+
2780+ swapped = PageSwapCache(page);
2781+ flush_cache_page(vma, addr, page_to_pfn(page));
2782+ /*
2783+ * Ok this is tricky: when get_user_pages_fast() runs it doesn't
2784+ * take any lock, therefore the check that we are going to make
2785+ * with the pagecount against the mapcount is racy and
2786+ * O_DIRECT can happen right after the check.
2787+ * So we clear the pte and flush the tlb before the check;
2788+ * this assures us that no O_DIRECT can happen after the check
2789+ * or in the middle of the check.
2790+ */
2791+ entry = ptep_clear_flush(vma, addr, ptep);
2792+ /*
2793+ * Check that no O_DIRECT or similar I/O is in progress on the
2794+ * page
2795+ */
2796+ if (page_mapcount(page) + 1 + swapped != page_count(page)) {
2797+ set_pte_at(mm, addr, ptep, entry);
2798+ goto out_unlock;
2799+ }
2800+ if (pte_dirty(entry))
2801+ set_page_dirty(page);
2802+ entry = pte_mkclean(pte_wrprotect(entry));
2803+ set_pte_at_notify(mm, addr, ptep, entry);
2804+ }
2805+ *orig_pte = *ptep;
2806+ err = 0;
2807+
2808+out_unlock:
2809+ pte_unmap_unlock(ptep, ptl);
2810+out_mn:
2811+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2812+out:
2813+ return err;
2814+}
2815+
2816+#define MERGE_ERR_PGERR 1 /* the page is invalid, cannot continue */
2817+#define MERGE_ERR_COLLI 2 /* there is a collision */
2818+#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */
2819+#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */
2820+
2821+
2822+/**
2823+ * replace_page - replace page in vma by new ksm page
2824+ * @vma: vma that holds the pte pointing to page
2825+ * @page: the page we are replacing by kpage
2826+ * @kpage: the ksm page we replace page by
2827+ * @orig_pte: the original value of the pte
2828+ *
2829+ * Returns 0 on success, MERGE_ERR_PGERR on failure.
2830+ */
2831+static int replace_page(struct vm_area_struct *vma, struct page *page,
2832+ struct page *kpage, pte_t orig_pte)
2833+{
2834+ struct mm_struct *mm = vma->vm_mm;
2835+ pgd_t *pgd;
2836+ pud_t *pud;
2837+ pmd_t *pmd;
2838+ pte_t *ptep;
2839+ spinlock_t *ptl;
2840+ pte_t entry;
2841+
2842+ unsigned long addr;
2843+ int err = MERGE_ERR_PGERR;
2844+ unsigned long mmun_start; /* For mmu_notifiers */
2845+ unsigned long mmun_end; /* For mmu_notifiers */
2846+
2847+ addr = page_address_in_vma(page, vma);
2848+ if (addr == -EFAULT)
2849+ goto out;
2850+
2851+ pgd = pgd_offset(mm, addr);
2852+ if (!pgd_present(*pgd))
2853+ goto out;
2854+
2855+ pud = pud_offset(pgd, addr);
2856+ if (!pud_present(*pud))
2857+ goto out;
2858+
2859+ pmd = pmd_offset(pud, addr);
2860+ BUG_ON(pmd_trans_huge(*pmd));
2861+ if (!pmd_present(*pmd))
2862+ goto out;
2863+
2864+ mmun_start = addr;
2865+ mmun_end = addr + PAGE_SIZE;
2866+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2867+
2868+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
2869+ if (!pte_same(*ptep, orig_pte)) {
2870+ pte_unmap_unlock(ptep, ptl);
2871+ goto out_mn;
2872+ }
2873+
2874+ flush_cache_page(vma, addr, pte_pfn(*ptep));
2875+ ptep_clear_flush(vma, addr, ptep);
2876+ entry = mk_pte(kpage, vma->vm_page_prot);
2877+
2878+ /* special treatment is needed for zero_page */
2879+ if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
2880+ (page_to_pfn(kpage) == zero_pfn))
2881+ entry = pte_mkspecial(entry);
2882+ else {
2883+ get_page(kpage);
2884+ page_add_anon_rmap(kpage, vma, addr);
2885+ }
2886+
2887+ set_pte_at_notify(mm, addr, ptep, entry);
2888+
2889+ page_remove_rmap(page);
2890+ if (!page_mapped(page))
2891+ try_to_free_swap(page);
2892+ put_page(page);
2893+
2894+ pte_unmap_unlock(ptep, ptl);
2895+ err = 0;
2896+out_mn:
2897+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2898+out:
2899+ return err;
2900+}
2901+
2902+
2903+/**
2904+ * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value.
2905+ * A zero hash value at HASH_STRENGTH_MAX is used to indicate that its
2906+ * hash_max member has not been calculated.
2907+ *
2908+ * @page The page needs to be hashed
2909+ * @hash_old The hash value calculated with current hash strength
2910+ *
2911+ * return the new hash value calculated at HASH_STRENGTH_MAX
2912+ */
2913+static inline u32 page_hash_max(struct page *page, u32 hash_old)
2914+{
2915+ u32 hash_max = 0;
2916+ void *addr;
2917+
2918+ addr = kmap_atomic(page);
2919+ hash_max = delta_hash(addr, hash_strength,
2920+ HASH_STRENGTH_MAX, hash_old);
2921+
2922+ kunmap_atomic(addr);
2923+
2924+ if (!hash_max)
2925+ hash_max = 1;
2926+
2927+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2928+ return hash_max;
2929+}
2930+
2931+/*
2932+ * We compare the hash again, to ensure that it is really a hash collision
2933+ * instead of being caused by page write.
2934+ */
2935+static inline int check_collision(struct rmap_item *rmap_item,
2936+ u32 hash)
2937+{
2938+ int err;
2939+ struct page *page = rmap_item->page;
2940+
2941+ /* if this rmap_item has already been hash_maxed, then the collision
2942+ * must have appeared in the second-level rbtree search. In this case we
2943+ * check if its hash_max value has changed. Otherwise, the collision
2944+ * happened in the first-level rbtree search, so we check against its
2945+ * current hash value.
2946+ */
2947+ if (rmap_item->hash_max) {
2948+ inc_rshash_neg(memcmp_cost);
2949+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2950+
2951+ if (rmap_item->hash_max == page_hash_max(page, hash))
2952+ err = MERGE_ERR_COLLI;
2953+ else
2954+ err = MERGE_ERR_CHANGED;
2955+ } else {
2956+ inc_rshash_neg(memcmp_cost + hash_strength);
2957+
2958+ if (page_hash(page, hash_strength, 0) == hash)
2959+ err = MERGE_ERR_COLLI;
2960+ else
2961+ err = MERGE_ERR_CHANGED;
2962+ }
2963+
2964+ return err;
2965+}
2966+
2967+static struct page *page_trans_compound_anon(struct page *page)
2968+{
2969+ if (PageTransCompound(page)) {
2970+ struct page *head = compound_head(page);
2971+ /*
2972+ * head may actually be split and freed from under
2973+ * us but it's ok here.
2974+ */
2975+ if (PageAnon(head))
2976+ return head;
2977+ }
2978+ return NULL;
2979+}
2980+
2981+static int page_trans_compound_anon_split(struct page *page)
2982+{
2983+ int ret = 0;
2984+ struct page *transhuge_head = page_trans_compound_anon(page);
2985+ if (transhuge_head) {
2986+ /* Get the reference on the head to split it. */
2987+ if (get_page_unless_zero(transhuge_head)) {
2988+ /*
2989+ * Recheck we got the reference while the head
2990+ * was still anonymous.
2991+ */
2992+ if (PageAnon(transhuge_head))
2993+ ret = split_huge_page(transhuge_head);
2994+ else
2995+ /*
2996+ * Retry later if split_huge_page run
2997+ * from under us.
2998+ */
2999+ ret = 1;
3000+ put_page(transhuge_head);
3001+ } else
3002+ /* Retry later if split_huge_page run from under us. */
3003+ ret = 1;
3004+ }
3005+ return ret;
3006+}
3007+
3008+/**
3009+ * Try to merge a rmap_item.page with a kpage in stable node. kpage must
3010+ * already be a ksm page.
3011+ *
3012+ * @return 0 if the pages were merged, -EFAULT otherwise.
3013+ */
3014+static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
3015+ struct page *kpage, u32 hash)
3016+{
3017+ struct vm_area_struct *vma = rmap_item->slot->vma;
3018+ struct mm_struct *mm = vma->vm_mm;
3019+ pte_t orig_pte = __pte(0);
3020+ int err = MERGE_ERR_PGERR;
3021+ struct page *page;
3022+
3023+ if (uksm_test_exit(mm))
3024+ goto out;
3025+
3026+ page = rmap_item->page;
3027+
3028+ if (page == kpage) { /* ksm page forked */
3029+ err = 0;
3030+ goto out;
3031+ }
3032+
3033+ if (PageTransCompound(page) && page_trans_compound_anon_split(page))
3034+ goto out;
3035+ BUG_ON(PageTransCompound(page));
3036+
3037+ if (!PageAnon(page) || !PageKsm(kpage))
3038+ goto out;
3039+
3040+ /*
3041+ * We need the page lock to read a stable PageSwapCache in
3042+ * write_protect_page(). We use trylock_page() instead of
3043+ * lock_page() because we don't want to wait here - we
3044+ * prefer to continue scanning and merging different pages,
3045+ * then come back to this page when it is unlocked.
3046+ */
3047+ if (!trylock_page(page))
3048+ goto out;
3049+ /*
3050+ * If this anonymous page is mapped only here, its pte may need
3051+ * to be write-protected. If it's mapped elsewhere, all of its
3052+ * ptes are necessarily already write-protected. But in either
3053+ * case, we need to lock and check page_count is not raised.
3054+ */
3055+ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
3056+ if (pages_identical(page, kpage))
3057+ err = replace_page(vma, page, kpage, orig_pte);
3058+ else
3059+ err = check_collision(rmap_item, hash);
3060+ }
3061+
3062+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
3063+ munlock_vma_page(page);
3064+ if (!PageMlocked(kpage)) {
3065+ unlock_page(page);
3066+ lock_page(kpage);
3067+ mlock_vma_page(kpage);
3068+ page = kpage; /* for final unlock */
3069+ }
3070+ }
3071+
3072+ unlock_page(page);
3073+out:
3074+ return err;
3075+}
3076+
3077+
3078+
3079+/**
3080+ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
3081+ * to restore a page mapping that has been changed in try_to_merge_two_pages.
3082+ *
3083+ * @return 0 on success.
3084+ */
3085+static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
3086+ pte_t orig_pte, pte_t wprt_pte)
3087+{
3088+ struct mm_struct *mm = vma->vm_mm;
3089+ pgd_t *pgd;
3090+ pud_t *pud;
3091+ pmd_t *pmd;
3092+ pte_t *ptep;
3093+ spinlock_t *ptl;
3094+
3095+ int err = -EFAULT;
3096+
3097+ pgd = pgd_offset(mm, addr);
3098+ if (!pgd_present(*pgd))
3099+ goto out;
3100+
3101+ pud = pud_offset(pgd, addr);
3102+ if (!pud_present(*pud))
3103+ goto out;
3104+
3105+ pmd = pmd_offset(pud, addr);
3106+ if (!pmd_present(*pmd))
3107+ goto out;
3108+
3109+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
3110+ if (!pte_same(*ptep, wprt_pte)) {
3111+ /* already copied, let it be */
3112+ pte_unmap_unlock(ptep, ptl);
3113+ goto out;
3114+ }
3115+
3116+ /*
3117+ * Good, still here. While we still hold the ksm page, it cannot
3118+ * return to the free page pool, so there is no way a pte was changed
3119+ * to another page and then back to this page. Also note that ksm
3120+ * pages are not reused in do_wp_page(). So it's safe to restore the original
3121+ * pte.
3122+ */
3123+ flush_cache_page(vma, addr, pte_pfn(*ptep));
3124+ ptep_clear_flush(vma, addr, ptep);
3125+ set_pte_at_notify(mm, addr, ptep, orig_pte);
3126+
3127+ pte_unmap_unlock(ptep, ptl);
3128+ err = 0;
3129+out:
3130+ return err;
3131+}
3132+
3133+/**
3134+ * try_to_merge_two_pages() - take two identical pages and prepare
3135+ * them to be merged into one page(rmap_item->page)
3136+ *
3137+ * @return 0 if we successfully merged two identical pages into
3138+ * one ksm page. MERGE_ERR_COLLI if it is only a hash collision
3139+ * during the rbtree search. MERGE_ERR_CHANGED if rmap_item has
3140+ * changed since it was hashed. MERGE_ERR_PGERR otherwise.
3141+ *
3142+ */
3143+static int try_to_merge_two_pages(struct rmap_item *rmap_item,
3144+ struct rmap_item *tree_rmap_item,
3145+ u32 hash)
3146+{
3147+ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
3148+ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
3149+ struct vm_area_struct *vma1 = rmap_item->slot->vma;
3150+ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
3151+ struct page *page = rmap_item->page;
3152+ struct page *tree_page = tree_rmap_item->page;
3153+ int err = MERGE_ERR_PGERR;
3154+ struct address_space *saved_mapping;
3155+
3156+
3157+ if (rmap_item->page == tree_rmap_item->page)
3158+ goto out;
3159+
3160+ if (PageTransCompound(page) && page_trans_compound_anon_split(page))
3161+ goto out;
3162+ BUG_ON(PageTransCompound(page));
3163+
3164+ if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page))
3165+ goto out;
3166+ BUG_ON(PageTransCompound(tree_page));
3167+
3168+ if (!PageAnon(page) || !PageAnon(tree_page))
3169+ goto out;
3170+
3171+ if (!trylock_page(page))
3172+ goto out;
3173+
3174+
3175+ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
3176+ unlock_page(page);
3177+ goto out;
3178+ }
3179+
3180+ /*
3181+ * While we hold page lock, upgrade page from
3182+ * PageAnon+anon_vma to PageKsm+NULL stable_node:
3183+ * stable_tree_insert() will update stable_node.
3184+ */
3185+ saved_mapping = page->mapping;
3186+ set_page_stable_node(page, NULL);
3187+ mark_page_accessed(page);
3188+ unlock_page(page);
3189+
3190+ if (!trylock_page(tree_page))
3191+ goto restore_out;
3192+
3193+ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
3194+ unlock_page(tree_page);
3195+ goto restore_out;
3196+ }
3197+
3198+ if (pages_identical(page, tree_page)) {
3199+ err = replace_page(vma2, tree_page, page, wprt_pte2);
3200+ if (err) {
3201+ unlock_page(tree_page);
3202+ goto restore_out;
3203+ }
3204+
3205+ if ((vma2->vm_flags & VM_LOCKED)) {
3206+ munlock_vma_page(tree_page);
3207+ if (!PageMlocked(page)) {
3208+ unlock_page(tree_page);
3209+ lock_page(page);
3210+ mlock_vma_page(page);
3211+ tree_page = page; /* for final unlock */
3212+ }
3213+ }
3214+
3215+ unlock_page(tree_page);
3216+
3217+ goto out; /* success */
3218+
3219+ } else {
3220+ if (tree_rmap_item->hash_max &&
3221+ tree_rmap_item->hash_max == rmap_item->hash_max) {
3222+ err = MERGE_ERR_COLLI_MAX;
3223+ } else if (page_hash(page, hash_strength, 0) ==
3224+ page_hash(tree_page, hash_strength, 0)) {
3225+ inc_rshash_neg(memcmp_cost + hash_strength * 2);
3226+ err = MERGE_ERR_COLLI;
3227+ } else {
3228+ err = MERGE_ERR_CHANGED;
3229+ }
3230+
3231+ unlock_page(tree_page);
3232+ }
3233+
3234+restore_out:
3235+ lock_page(page);
3236+ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
3237+ orig_pte1, wprt_pte1))
3238+ page->mapping = saved_mapping;
3239+
3240+ unlock_page(page);
3241+out:
3242+ return err;
3243+}
3244+
3245+static inline int hash_cmp(u32 new_val, u32 node_val)
3246+{
3247+ if (new_val > node_val)
3248+ return 1;
3249+ else if (new_val < node_val)
3250+ return -1;
3251+ else
3252+ return 0;
3253+}
3254+
3255+static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
3256+{
3257+ u32 hash_max = item->hash_max;
3258+
3259+ if (!hash_max) {
3260+ hash_max = page_hash_max(item->page, hash);
3261+
3262+ item->hash_max = hash_max;
3263+ }
3264+
3265+ return hash_max;
3266+}
3267+
3268+
3269+
3270+/**
3271+ * stable_tree_search() - search the stable tree for a page
3272+ *
3273+ * @item: the rmap_item we are comparing with
3274+ * @hash: the hash value of this item->page already calculated
3275+ *
3276+ * @return the page we have found, NULL otherwise. The page returned has
3277+ * been gotten.
3278+ */
3279+static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
3280+{
3281+ struct rb_node *node = root_stable_treep->rb_node;
3282+ struct tree_node *tree_node;
3283+ unsigned long hash_max;
3284+ struct page *page = item->page;
3285+ struct stable_node *stable_node;
3286+
3287+ stable_node = page_stable_node(page);
3288+ if (stable_node) {
3289+ /* ksm page forked, that is
3290+ * if (PageKsm(page) && !in_stable_tree(rmap_item))
3291+ * it's actually gotten once outside.
3292+ */
3293+ get_page(page);
3294+ return page;
3295+ }
3296+
3297+ while (node) {
3298+ int cmp;
3299+
3300+ tree_node = rb_entry(node, struct tree_node, node);
3301+
3302+ cmp = hash_cmp(hash, tree_node->hash);
3303+
3304+ if (cmp < 0)
3305+ node = node->rb_left;
3306+ else if (cmp > 0)
3307+ node = node->rb_right;
3308+ else
3309+ break;
3310+ }
3311+
3312+ if (!node)
3313+ return NULL;
3314+
3315+ if (tree_node->count == 1) {
3316+ stable_node = rb_entry(tree_node->sub_root.rb_node,
3317+ struct stable_node, node);
3318+ BUG_ON(!stable_node);
3319+
3320+ goto get_page_out;
3321+ }
3322+
3323+ /*
3324+ * ok, we have to search the second
3325+ * level subtree, hash the page to a
3326+ * full strength.
3327+ */
3328+ node = tree_node->sub_root.rb_node;
3329+ BUG_ON(!node);
3330+ hash_max = rmap_item_hash_max(item, hash);
3331+
3332+ while (node) {
3333+ int cmp;
3334+
3335+ stable_node = rb_entry(node, struct stable_node, node);
3336+
3337+ cmp = hash_cmp(hash_max, stable_node->hash_max);
3338+
3339+ if (cmp < 0)
3340+ node = node->rb_left;
3341+ else if (cmp > 0)
3342+ node = node->rb_right;
3343+ else
3344+ goto get_page_out;
3345+ }
3346+
3347+ return NULL;
3348+
3349+get_page_out:
3350+ page = get_uksm_page(stable_node, 1, 1);
3351+ return page;
3352+}
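+/*
+ * So the stable tree is two-level: the outer rbtree of tree_nodes is keyed
+ * by the current-strength hash, and a tree_node holding more than one
+ * stable_node keeps a sub-rbtree keyed by hash_max, the full-strength hash
+ * computed lazily via rmap_item_hash_max(). A lone stable_node under a
+ * tree_node is matched directly, without computing hash_max.
+ */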
3353+
3354+static int try_merge_rmap_item(struct rmap_item *item,
3355+ struct page *kpage,
3356+ struct page *tree_page)
3357+{
3358+ spinlock_t *ptl;
3359+ pte_t *ptep;
3360+ unsigned long addr;
3361+ struct vm_area_struct *vma = item->slot->vma;
3362+
3363+ addr = get_rmap_addr(item);
3364+ ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0);
3365+ if (!ptep)
3366+ return 0;
3367+
3368+ if (pte_write(*ptep)) {
3369+ /* has changed, abort! */
3370+ pte_unmap_unlock(ptep, ptl);
3371+ return 0;
3372+ }
3373+
3374+ get_page(tree_page);
3375+ page_add_anon_rmap(tree_page, vma, addr);
3376+
3377+ flush_cache_page(vma, addr, pte_pfn(*ptep));
3378+ ptep_clear_flush(vma, addr, ptep);
3379+ set_pte_at_notify(vma->vm_mm, addr, ptep,
3380+ mk_pte(tree_page, vma->vm_page_prot));
3381+
3382+ page_remove_rmap(kpage);
3383+ put_page(kpage);
3384+
3385+ pte_unmap_unlock(ptep, ptl);
3386+
3387+ return 1;
3388+}
3389+
3390+/**
3391+ * try_merge_with_stable() - when two rmap_items need to be inserted
3392+ * into the stable tree and the page was found to be identical to a stable
3393+ * ksm page, this is the last chance to merge them into one.
3394+ *
3395+ * @item1: the rmap_item holding the page which we wanted to insert
3396+ * into the stable tree.
3397+ * @item2: the other rmap_item we found during the unstable tree search
3398+ * @kpage: the page currently mapped by the two rmap_items
3399+ * @tree_page: the page we found identical in the stable tree node
3400+ * @success1: return if item1 is successfully merged
3401+ * @success2: return if item2 is successfully merged
3402+ */
3403+static void try_merge_with_stable(struct rmap_item *item1,
3404+ struct rmap_item *item2,
3405+ struct page **kpage,
3406+ struct page *tree_page,
3407+ int *success1, int *success2)
3408+{
3409+ struct vm_area_struct *vma1 = item1->slot->vma;
3410+ struct vm_area_struct *vma2 = item2->slot->vma;
3411+ *success1 = 0;
3412+ *success2 = 0;
3413+
3414+ if (unlikely(*kpage == tree_page)) {
3415+ /* I don't think this can really happen */
3416+ printk(KERN_WARNING "UKSM: unexpected condition detected in "
3417+ "try_merge_with_stable() -- *kpage == tree_page !\n");
3418+ *success1 = 1;
3419+ *success2 = 1;
3420+ return;
3421+ }
3422+
3423+ if (!PageAnon(*kpage) || !PageKsm(*kpage))
3424+ goto failed;
3425+
3426+ if (!trylock_page(tree_page))
3427+ goto failed;
3428+
3429+ /* If the oldpage is still ksm and still pointed
3430+ * to in the right place, and still write protected,
3431+ * we are confident it has not changed; no need to
3432+ * memcmp anymore.
3433+ * Beware: we cannot take nested pte locks,
3434+ * deadlock risk.
3435+ */
3436+ if (!try_merge_rmap_item(item1, *kpage, tree_page))
3437+ goto unlock_failed;
3438+
3439+ /* ok, then vma2, remind that pte1 already set */
3440+ if (!try_merge_rmap_item(item2, *kpage, tree_page))
3441+ goto success_1;
3442+
3443+ *success2 = 1;
3444+success_1:
3445+ *success1 = 1;
3446+
3447+
3448+ if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
3449+ (*success2 && vma2->vm_flags & VM_LOCKED)) {
3450+ munlock_vma_page(*kpage);
3451+ if (!PageMlocked(tree_page))
3452+ mlock_vma_page(tree_page);
3453+ }
3454+
3455+ /*
3456+ * We do not need oldpage any more in the caller, so can break the lock
3457+ * now.
3458+ */
3459+ unlock_page(*kpage);
3460+ *kpage = tree_page; /* Get unlocked outside. */
3461+ return;
3462+
3463+unlock_failed:
3464+ unlock_page(tree_page);
3465+failed:
3466+ return;
3467+}
3468+
3469+static inline void stable_node_hash_max(struct stable_node *node,
3470+ struct page *page, u32 hash)
3471+{
3472+ u32 hash_max = node->hash_max;
3473+
3474+ if (!hash_max) {
3475+ hash_max = page_hash_max(page, hash);
3476+ node->hash_max = hash_max;
3477+ }
3478+}
3479+
3480+static inline
3481+struct stable_node *new_stable_node(struct tree_node *tree_node,
3482+ struct page *kpage, u32 hash_max)
3483+{
3484+ struct stable_node *new_stable_node;
3485+
3486+ new_stable_node = alloc_stable_node();
3487+ if (!new_stable_node)
3488+ return NULL;
3489+
3490+ new_stable_node->kpfn = page_to_pfn(kpage);
3491+ new_stable_node->hash_max = hash_max;
3492+ new_stable_node->tree_node = tree_node;
3493+ set_page_stable_node(kpage, new_stable_node);
3494+
3495+ return new_stable_node;
3496+}
3497+
3498+static inline
3499+struct stable_node *first_level_insert(struct tree_node *tree_node,
3500+ struct rmap_item *rmap_item,
3501+ struct rmap_item *tree_rmap_item,
3502+ struct page **kpage, u32 hash,
3503+ int *success1, int *success2)
3504+{
3505+ int cmp;
3506+ struct page *tree_page;
3507+ u32 hash_max = 0;
3508+ struct stable_node *stable_node, *new_snode;
3509+ struct rb_node *parent = NULL, **new;
3510+
3511+ /* this tree node contains no sub-tree yet */
3512+ stable_node = rb_entry(tree_node->sub_root.rb_node,
3513+ struct stable_node, node);
3514+
3515+ tree_page = get_uksm_page(stable_node, 1, 0);
3516+ if (tree_page) {
3517+ cmp = memcmp_pages(*kpage, tree_page, 1);
3518+ if (!cmp) {
3519+ try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
3520+ tree_page, success1, success2);
3521+ put_page(tree_page);
3522+ if (!*success1 && !*success2)
3523+ goto failed;
3524+
3525+ return stable_node;
3526+
3527+ } else {
3528+ /*
3529+ * Collision at the first level; try to create a subtree.
3530+ * A new node needs to be created.
3531+ */
3532+ put_page(tree_page);
3533+
3534+ stable_node_hash_max(stable_node, tree_page,
3535+ tree_node->hash);
3536+ hash_max = rmap_item_hash_max(rmap_item, hash);
3537+ cmp = hash_cmp(hash_max, stable_node->hash_max);
3538+
3539+ parent = &stable_node->node;
3540+ if (cmp < 0) {
3541+ new = &parent->rb_left;
3542+ } else if (cmp > 0) {
3543+ new = &parent->rb_right;
3544+ } else {
3545+ goto failed;
3546+ }
3547+ }
3548+
3549+ } else {
3550+ /* the only stable_node was deleted, we reuse its tree_node.
3551+ */
3552+ parent = NULL;
3553+ new = &tree_node->sub_root.rb_node;
3554+ }
3555+
3556+ new_snode = new_stable_node(tree_node, *kpage, hash_max);
3557+ if (!new_snode)
3558+ goto failed;
3559+
3560+ rb_link_node(&new_snode->node, parent, new);
3561+ rb_insert_color(&new_snode->node, &tree_node->sub_root);
3562+ tree_node->count++;
3563+ *success1 = *success2 = 1;
3564+
3565+ return new_snode;
3566+
3567+failed:
3568+ return NULL;
3569+}
3570+
3571+static inline
3572+struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
3573+ struct rmap_item *rmap_item,
3574+ struct rmap_item *tree_rmap_item,
3575+ struct page **kpage, u32 hash,
3576+ int *success1, int *success2)
3577+{
3578+ struct page *tree_page;
3579+ u32 hash_max;
3580+ struct stable_node *stable_node, *new_snode;
3581+ struct rb_node *parent, **new;
3582+
3583+research:
3584+ parent = NULL;
3585+ new = &tree_node->sub_root.rb_node;
3586+ BUG_ON(!*new);
3587+ hash_max = rmap_item_hash_max(rmap_item, hash);
3588+ while (*new) {
3589+ int cmp;
3590+
3591+ stable_node = rb_entry(*new, struct stable_node, node);
3592+
3593+ cmp = hash_cmp(hash_max, stable_node->hash_max);
3594+
3595+ if (cmp < 0) {
3596+ parent = *new;
3597+ new = &parent->rb_left;
3598+ } else if (cmp > 0) {
3599+ parent = *new;
3600+ new = &parent->rb_right;
3601+ } else {
3602+ tree_page = get_uksm_page(stable_node, 1, 0);
3603+ if (tree_page) {
3604+ cmp = memcmp_pages(*kpage, tree_page, 1);
3605+ if (!cmp) {
3606+ try_merge_with_stable(rmap_item,
3607+ tree_rmap_item, kpage,
3608+ tree_page, success1, success2);
3609+
3610+ put_page(tree_page);
3611+ if (!*success1 && !*success2)
3612+ goto failed;
3613+ /*
3614+ * successfully merged with a stable
3615+ * node
3616+ */
3617+ return stable_node;
3618+ } else {
3619+ put_page(tree_page);
3620+ goto failed;
3621+ }
3622+ } else {
3623+ /*
3624+ * the stable node may have been
3625+ * deleted and the subtree
3626+ * restructured; cannot
3627+ * continue, re-search it.
3628+ */
3629+ if (tree_node->count) {
3630+ goto research;
3631+ } else {
3632+ /* reuse the tree node*/
3633+ parent = NULL;
3634+ new = &tree_node->sub_root.rb_node;
3635+ }
3636+ }
3637+ }
3638+ }
3639+
3640+ new_snode = new_stable_node(tree_node, *kpage, hash_max);
3641+ if (!new_snode)
3642+ goto failed;
3643+
3644+ rb_link_node(&new_snode->node, parent, new);
3645+ rb_insert_color(&new_snode->node, &tree_node->sub_root);
3646+ tree_node->count++;
3647+ *success1 = *success2 = 1;
3648+
3649+ return new_snode;
3650+
3651+failed:
3652+ return NULL;
3653+}
3654+
3655+
3656+/**
3657+ * stable_tree_insert() - try to insert a merged page in unstable tree to
3658+ * the stable tree
3659+ *
3660+ * @kpage: the page need to be inserted
3661+ * @hash: the current hash of this page
3662+ * @rmap_item: the rmap_item being scanned
3663+ * @tree_rmap_item: the rmap_item found on unstable tree
3664+ * @success1: return if rmap_item is merged
3665+ * @success2: return if tree_rmap_item is merged
3666+ *
3667+ * @return the stable_node on stable tree if at least one
3668+ * rmap_item is inserted into stable tree, NULL
3669+ * otherwise.
3670+ */
3671+static struct stable_node *
3672+stable_tree_insert(struct page **kpage, u32 hash,
3673+ struct rmap_item *rmap_item,
3674+ struct rmap_item *tree_rmap_item,
3675+ int *success1, int *success2)
3676+{
3677+ struct rb_node **new = &root_stable_treep->rb_node;
3678+ struct rb_node *parent = NULL;
3679+ struct stable_node *stable_node;
3680+ struct tree_node *tree_node;
3681+ u32 hash_max = 0;
3682+
3683+ *success1 = *success2 = 0;
3684+
3685+ while (*new) {
3686+ int cmp;
3687+
3688+ tree_node = rb_entry(*new, struct tree_node, node);
3689+
3690+ cmp = hash_cmp(hash, tree_node->hash);
3691+
3692+ if (cmp < 0) {
3693+ parent = *new;
3694+ new = &parent->rb_left;
3695+ } else if (cmp > 0) {
3696+ parent = *new;
3697+ new = &parent->rb_right;
3698+ } else
3699+ break;
3700+ }
3701+
3702+ if (*new) {
3703+ if (tree_node->count == 1) {
3704+ stable_node = first_level_insert(tree_node, rmap_item,
3705+ tree_rmap_item, kpage,
3706+ hash, success1, success2);
3707+ } else {
3708+ stable_node = stable_subtree_insert(tree_node,
3709+ rmap_item, tree_rmap_item, kpage,
3710+ hash, success1, success2);
3711+ }
3712+ } else {
3713+
3714+ /* no tree node found */
3715+ tree_node = alloc_tree_node(stable_tree_node_listp);
3716+ if (!tree_node) {
3717+ stable_node = NULL;
3718+ goto out;
3719+ }
3720+
3721+ stable_node = new_stable_node(tree_node, *kpage, hash_max);
3722+ if (!stable_node) {
3723+ free_tree_node(tree_node);
3724+ goto out;
3725+ }
3726+
3727+ tree_node->hash = hash;
3728+ rb_link_node(&tree_node->node, parent, new);
3729+ rb_insert_color(&tree_node->node, root_stable_treep);
3730+ parent = NULL;
3731+ new = &tree_node->sub_root.rb_node;
3732+
3733+ rb_link_node(&stable_node->node, parent, new);
3734+ rb_insert_color(&stable_node->node, &tree_node->sub_root);
3735+ tree_node->count++;
3736+ *success1 = *success2 = 1;
3737+ }
3738+
3739+out:
3740+ return stable_node;
3741+}
3742+
3743+
3744+/**
3745+ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
3746+ *
3747+ * @return 0 on success, -EBUSY if unable to lock the mmap_sem,
3748+ * -EINVAL if the page mapping has been changed.
3749+ */
3750+static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
3751+{
3752+ int err;
3753+
3754+ err = get_mergeable_page_lock_mmap(tree_rmap_item);
3755+
3756+ if (err == -EINVAL) {
3757+ /* its page map has been changed, remove it */
3758+ remove_rmap_item_from_tree(tree_rmap_item);
3759+ }
3760+
3761+ /* On success, the page has been gotten and the mmap_sem is locked. */
3762+ return err;
3763+}
3764+
3765+
3766+/**
3767+ * unstable_tree_search_insert() - search the unstable tree for an rmap_item
3768+ * with the same hash value. Get its page and trylock the mmap_sem.
3769+ */
3770+static inline
3771+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
3772+ u32 hash)
3773+
3774+{
3775+ struct rb_node **new = &root_unstable_tree.rb_node;
3776+ struct rb_node *parent = NULL;
3777+ struct tree_node *tree_node;
3778+ u32 hash_max;
3779+ struct rmap_item *tree_rmap_item;
3780+
3781+ while (*new) {
3782+ int cmp;
3783+
3784+ tree_node = rb_entry(*new, struct tree_node, node);
3785+
3786+ cmp = hash_cmp(hash, tree_node->hash);
3787+
3788+ if (cmp < 0) {
3789+ parent = *new;
3790+ new = &parent->rb_left;
3791+ } else if (cmp > 0) {
3792+ parent = *new;
3793+ new = &parent->rb_right;
3794+ } else
3795+ break;
3796+ }
3797+
3798+ if (*new) {
3799+ /* got the tree_node */
3800+ if (tree_node->count == 1) {
3801+ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
3802+ struct rmap_item, node);
3803+ BUG_ON(!tree_rmap_item);
3804+
3805+ goto get_page_out;
3806+ }
3807+
3808+ /* well, search the collision subtree */
3809+ new = &tree_node->sub_root.rb_node;
3810+ BUG_ON(!*new);
3811+ hash_max = rmap_item_hash_max(rmap_item, hash);
3812+
3813+ while (*new) {
3814+ int cmp;
3815+
3816+ tree_rmap_item = rb_entry(*new, struct rmap_item,
3817+ node);
3818+
3819+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3820+ parent = *new;
3821+ if (cmp < 0)
3822+ new = &parent->rb_left;
3823+ else if (cmp > 0)
3824+ new = &parent->rb_right;
3825+ else
3826+ goto get_page_out;
3827+ }
3828+ } else {
3829+ /* alloc a new tree_node */
3830+ tree_node = alloc_tree_node(&unstable_tree_node_list);
3831+ if (!tree_node)
3832+ return NULL;
3833+
3834+ tree_node->hash = hash;
3835+ rb_link_node(&tree_node->node, parent, new);
3836+ rb_insert_color(&tree_node->node, &root_unstable_tree);
3837+ parent = NULL;
3838+ new = &tree_node->sub_root.rb_node;
3839+ }
3840+
3841+ /* not found even in the sub-tree */
3842+ rmap_item->tree_node = tree_node;
3843+ rmap_item->address |= UNSTABLE_FLAG;
3844+ rmap_item->hash_round = uksm_hash_round;
3845+ rb_link_node(&rmap_item->node, parent, new);
3846+ rb_insert_color(&rmap_item->node, &tree_node->sub_root);
3847+
3848+ uksm_pages_unshared++;
3849+ return NULL;
3850+
3851+get_page_out:
3852+ if (tree_rmap_item->page == rmap_item->page)
3853+ return NULL;
3854+
3855+ if (get_tree_rmap_item_page(tree_rmap_item))
3856+ return NULL;
3857+
3858+ return tree_rmap_item;
3859+}
3860+
3861+static void hold_anon_vma(struct rmap_item *rmap_item,
3862+ struct anon_vma *anon_vma)
3863+{
3864+ rmap_item->anon_vma = anon_vma;
3865+ get_anon_vma(anon_vma);
3866+}
3867+
3868+
3869+/**
3870+ * stable_tree_append() - append a rmap_item to a stable node. Deduplication
3871+ * ratio statistics are collected in this function.
3872+ *
3873+ */
3874+static void stable_tree_append(struct rmap_item *rmap_item,
3875+ struct stable_node *stable_node, int logdedup)
3876+{
3877+ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
3878+ unsigned long key = (unsigned long)rmap_item->slot;
3879+ unsigned long factor = rmap_item->slot->rung->step;
3880+
3881+ BUG_ON(!stable_node);
3882+ rmap_item->address |= STABLE_FLAG;
3883+
3884+ if (hlist_empty(&stable_node->hlist)) {
3885+ uksm_pages_shared++;
3886+ goto node_vma_new;
3887+ } else {
3888+ uksm_pages_sharing++;
3889+ }
3890+
3891+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
3892+ if (node_vma->key >= key)
3893+ break;
3894+
3895+ if (logdedup) {
3896+ node_vma->slot->pages_bemerged += factor;
3897+ if (list_empty(&node_vma->slot->dedup_list))
3898+ list_add(&node_vma->slot->dedup_list,
3899+ &vma_slot_dedup);
3900+ }
3901+ }
3902+
3903+ if (node_vma) {
3904+ if (node_vma->key == key) {
3905+ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
3906+ goto node_vma_ok;
3907+ } else if (node_vma->key > key) {
3908+ node_vma_cont = node_vma;
3909+ }
3910+ }
3911+
3912+node_vma_new:
3913+	/* no node_vma for this vma in the stable node yet, alloc a new one */
3914+ new_node_vma = alloc_node_vma();
3915+ BUG_ON(!new_node_vma);
3916+ new_node_vma->head = stable_node;
3917+ new_node_vma->slot = rmap_item->slot;
3918+
3919+ if (!node_vma) {
3920+ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
3921+ } else if (node_vma->key != key) {
3922+ if (node_vma->key < key)
3923+ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist);
3924+ else {
3925+ hlist_add_before(&new_node_vma->hlist,
3926+ &node_vma->hlist);
3927+ }
3928+
3929+ }
3930+ node_vma = new_node_vma;
3931+
3932+node_vma_ok: /* ok, ready to add to the list */
3933+ rmap_item->head = node_vma;
3934+ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
3935+ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
3936+ if (logdedup) {
3937+ rmap_item->slot->pages_merged++;
3938+ if (node_vma_cont) {
3939+ node_vma = node_vma_cont;
3940+ hlist_for_each_entry_continue(node_vma, hlist) {
3941+ node_vma->slot->pages_bemerged += factor;
3942+ if (list_empty(&node_vma->slot->dedup_list))
3943+ list_add(&node_vma->slot->dedup_list,
3944+ &vma_slot_dedup);
3945+ }
3946+ }
3947+ }
3948+}
3949+
3950+/*
3951+ * We use break_ksm to break COW on a ksm page: it's a stripped down
3952+ *
3953+ * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
3954+ * put_page(page);
3955+ *
3956+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
3957+ * in case the application has unmapped and remapped mm,addr meanwhile.
3958+ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
3959+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
3960+ */
3961+static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
3962+{
3963+ struct page *page;
3964+ int ret = 0;
3965+
3966+ do {
3967+ cond_resched();
3968+ page = follow_page(vma, addr, FOLL_GET);
3969+ if (IS_ERR_OR_NULL(page))
3970+ break;
3971+ if (PageKsm(page)) {
3972+ ret = handle_mm_fault(vma->vm_mm, vma, addr,
3973+ FAULT_FLAG_WRITE);
3974+ } else
3975+ ret = VM_FAULT_WRITE;
3976+ put_page(page);
3977+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
3978+ /*
3979+ * We must loop because handle_mm_fault() may back out if there's
3980+ * any difficulty e.g. if pte accessed bit gets updated concurrently.
3981+ *
3982+ * VM_FAULT_WRITE is what we have been hoping for: it indicates that
3983+ * COW has been broken, even if the vma does not permit VM_WRITE;
3984+ * but note that a concurrent fault might break PageKsm for us.
3985+ *
3986+ * VM_FAULT_SIGBUS could occur if we race with truncation of the
3987+ * backing file, which also invalidates anonymous pages: that's
3988+ * okay, that truncation will have unmapped the PageKsm for us.
3989+ *
3990+ * VM_FAULT_OOM: at the time of writing (late July 2009), setting
3991+ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
3992+ * current task has TIF_MEMDIE set, and will be OOM killed on return
3993+ * to user; and ksmd, having no mm, would never be chosen for that.
3994+ *
3995+ * But if the mm is in a limited mem_cgroup, then the fault may fail
3996+ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
3997+ * even ksmd can fail in this way - though it's usually breaking ksm
3998+ * just to undo a merge it made a moment before, so unlikely to oom.
3999+ *
4000+ * That's a pity: we might therefore have more kernel pages allocated
4001+ * than we're counting as nodes in the stable tree; but uksm_do_scan
4002+ * will retry to break_cow on each pass, so should recover the page
4003+ * in due course. The important thing is to not let VM_MERGEABLE
4004+ * be cleared while any such pages might remain in the area.
4005+ */
4006+ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
4007+}
4008+
4009+static void break_cow(struct rmap_item *rmap_item)
4010+{
4011+ struct vm_area_struct *vma = rmap_item->slot->vma;
4012+ struct mm_struct *mm = vma->vm_mm;
4013+ unsigned long addr = get_rmap_addr(rmap_item);
4014+
4015+ if (uksm_test_exit(mm))
4016+ goto out;
4017+
4018+ break_ksm(vma, addr);
4019+out:
4020+ return;
4021+}
4022+
4023+/*
4024+ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
4025+ * than check every pte of a given vma, the locking doesn't quite work for
4026+ * that - an rmap_item is assigned to the stable tree after inserting ksm
4027+ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
4028+ * rmap_items from parent to child at fork time (so as not to waste time
4029+ * if exit comes before the next scan reaches it).
4030+ *
4031+ * Similarly, although we'd like to remove rmap_items (so updating counts
4032+ * and freeing memory) when unmerging an area, it's easier to leave that
4033+ * to the next pass of ksmd - consider, for example, how ksmd might be
4034+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
4035+ */
4036+inline int unmerge_uksm_pages(struct vm_area_struct *vma,
4037+ unsigned long start, unsigned long end)
4038+{
4039+ unsigned long addr;
4040+ int err = 0;
4041+
4042+ for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
4043+ if (uksm_test_exit(vma->vm_mm))
4044+ break;
4045+ if (signal_pending(current))
4046+ err = -ERESTARTSYS;
4047+ else
4048+ err = break_ksm(vma, addr);
4049+ }
4050+ return err;
4051+}
4052+
4053+static inline void inc_uksm_pages_scanned(void)
4054+{
4055+ u64 delta;
4056+
4058+ if (uksm_pages_scanned == U64_MAX) {
4059+ encode_benefit();
4060+
4061+ delta = uksm_pages_scanned >> pages_scanned_base;
4062+
4063+ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
4064+ pages_scanned_stored >>= 1;
4065+ delta >>= 1;
4066+ pages_scanned_base++;
4067+ }
4068+
4069+ pages_scanned_stored += delta;
4070+
4071+ uksm_pages_scanned = uksm_pages_scanned_last = 0;
4072+ }
4073+
4074+ uksm_pages_scanned++;
4075+}
4076+
4077+static inline int find_zero_page_hash(int strength, u32 hash)
4078+{
4079+ return (zero_hash_table[strength] == hash);
4080+}
4081+
4082+static
4083+int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
4084+{
4085+ struct page *zero_page = empty_uksm_zero_page;
4086+ struct mm_struct *mm = vma->vm_mm;
4087+ pte_t orig_pte = __pte(0);
4088+ int err = -EFAULT;
4089+
4090+ if (uksm_test_exit(mm))
4091+ goto out;
4092+
4093+ if (PageTransCompound(page) && page_trans_compound_anon_split(page))
4094+ goto out;
4095+ BUG_ON(PageTransCompound(page));
4096+
4097+ if (!PageAnon(page))
4098+ goto out;
4099+
4100+ if (!trylock_page(page))
4101+ goto out;
4102+
4103+ if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
4104+ if (is_page_full_zero(page))
4105+ err = replace_page(vma, page, zero_page, orig_pte);
4106+ }
4107+
4108+ unlock_page(page);
4109+out:
4110+ return err;
4111+}
4112+
4113+/*
4114+ * cmp_and_merge_page() - first see if page can be merged into the stable
4115+ * tree; if not, compare hash to previous and if it's the same, see if page
4116+ * can be inserted into the unstable tree, or merged with a page already there
4117+ * and both transferred to the stable tree.
4118+ *
4119+ * @page: the page that we are searching identical page to.
4120+ * @rmap_item: the reverse mapping into the virtual address of this page
4121+ */
4122+static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
4123+{
4124+ struct rmap_item *tree_rmap_item;
4125+ struct page *page;
4126+ struct page *kpage = NULL;
4127+ u32 hash_max;
4128+ int err;
4129+ unsigned int success1, success2;
4130+ struct stable_node *snode;
4131+ int cmp;
4132+ struct rb_node *parent = NULL, **new;
4133+
4134+ remove_rmap_item_from_tree(rmap_item);
4135+ page = rmap_item->page;
4136+
4137+ /* We first start with searching the page inside the stable tree */
4138+ kpage = stable_tree_search(rmap_item, hash);
4139+ if (kpage) {
4140+ err = try_to_merge_with_uksm_page(rmap_item, kpage,
4141+ hash);
4142+ if (!err) {
4143+ /*
4144+ * The page was successfully merged, add
4145+ * its rmap_item to the stable tree.
4146+ * page lock is needed because it's
4147+ * racing with try_to_unmap_ksm(), etc.
4148+ */
4149+ lock_page(kpage);
4150+ snode = page_stable_node(kpage);
4151+ stable_tree_append(rmap_item, snode, 1);
4152+ unlock_page(kpage);
4153+ put_page(kpage);
4154+ return; /* success */
4155+ }
4156+ put_page(kpage);
4157+
4158+ /*
4159+		 * If it's a collision and the sub-rbtree has already been
4160+		 * searched (hash_max != 0), we want to abort, because if it
4161+		 * were merged into the unstable tree, the collision would
4162+		 * tend to happen again.
4163+ */
4164+ if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
4165+ return;
4166+ }
4167+
4168+ tree_rmap_item =
4169+ unstable_tree_search_insert(rmap_item, hash);
4170+ if (tree_rmap_item) {
4171+ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
4172+ /*
4173+ * As soon as we merge this page, we want to remove the
4174+ * rmap_item of the page we have merged with from the unstable
4175+ * tree, and insert it instead as new node in the stable tree.
4176+ */
4177+ if (!err) {
4178+ kpage = page;
4179+ remove_rmap_item_from_tree(tree_rmap_item);
4180+ lock_page(kpage);
4181+ snode = stable_tree_insert(&kpage, hash,
4182+ rmap_item, tree_rmap_item,
4183+ &success1, &success2);
4184+
4185+ /*
4186+ * Do not log dedup for tree item, it's not counted as
4187+ * scanned in this round.
4188+ */
4189+ if (success2)
4190+ stable_tree_append(tree_rmap_item, snode, 0);
4191+
4192+ /*
4193+			 * The order of these two stable_tree_append() calls is
4194+			 * important: rmap_item is the one being scanned this round.
4195+ */
4196+ if (success1)
4197+ stable_tree_append(rmap_item, snode, 1);
4198+
4199+ /*
4200+ * The original kpage may be unlocked inside
4201+ * stable_tree_insert() already. This page
4202+ * should be unlocked before doing
4203+ * break_cow().
4204+ */
4205+ unlock_page(kpage);
4206+
4207+ if (!success1)
4208+ break_cow(rmap_item);
4209+
4210+ if (!success2)
4211+ break_cow(tree_rmap_item);
4212+
4213+ } else if (err == MERGE_ERR_COLLI) {
4214+ BUG_ON(tree_rmap_item->tree_node->count > 1);
4215+
4216+ rmap_item_hash_max(tree_rmap_item,
4217+ tree_rmap_item->tree_node->hash);
4218+
4219+ hash_max = rmap_item_hash_max(rmap_item, hash);
4220+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
4221+ parent = &tree_rmap_item->node;
4222+ if (cmp < 0)
4223+ new = &parent->rb_left;
4224+ else if (cmp > 0)
4225+ new = &parent->rb_right;
4226+ else
4227+ goto put_up_out;
4228+
4229+ rmap_item->tree_node = tree_rmap_item->tree_node;
4230+ rmap_item->address |= UNSTABLE_FLAG;
4231+ rmap_item->hash_round = uksm_hash_round;
4232+ rb_link_node(&rmap_item->node, parent, new);
4233+ rb_insert_color(&rmap_item->node,
4234+ &tree_rmap_item->tree_node->sub_root);
4235+ rmap_item->tree_node->count++;
4236+ } else {
4237+ /*
4238+			 * Either one of the pages has changed or they collide
4239+			 * at the max hash; we consider them ill items.
4240+ */
4241+ remove_rmap_item_from_tree(tree_rmap_item);
4242+ }
4243+put_up_out:
4244+ put_page(tree_rmap_item->page);
4245+ up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
4246+ }
4247+}
4248+
4249+
4250+
4251+
4252+static inline unsigned long get_pool_index(struct vma_slot *slot,
4253+ unsigned long index)
4254+{
4255+ unsigned long pool_index;
4256+
4257+ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
4258+ if (pool_index >= slot->pool_size)
4259+ BUG();
4260+ return pool_index;
4261+}
4262+
4263+static inline unsigned long index_page_offset(unsigned long index)
4264+{
4265+ return offset_in_page(sizeof(struct rmap_list_entry *) * index);
4266+}
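+
+/*
+ * Worked example (illustrative only; assumes a 64-bit build with 4 KiB
+ * pages, so sizeof(struct rmap_list_entry *) == 8 and one pool page holds
+ * 512 entries):
+ *
+ *   index = 1000 -> linear offset = 8 * 1000 = 8000 bytes
+ *   get_pool_index()    -> 8000 >> PAGE_SHIFT = 1    (second pool page)
+ *   index_page_offset() -> 8000 % PAGE_SIZE   = 3904 (offset inside it)
+ */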
4267+
4268+static inline
4269+struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
4270+ unsigned long index, int need_alloc)
4271+{
4272+ unsigned long pool_index;
4273+ struct page *page;
4274+ void *addr;
4275+
4276+
4277+ pool_index = get_pool_index(slot, index);
4278+ if (!slot->rmap_list_pool[pool_index]) {
4279+ if (!need_alloc)
4280+ return NULL;
4281+
4282+ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
4283+ if (!page)
4284+ return NULL;
4285+
4286+ slot->rmap_list_pool[pool_index] = page;
4287+ }
4288+
4289+ addr = kmap(slot->rmap_list_pool[pool_index]);
4290+ addr += index_page_offset(index);
4291+
4292+ return addr;
4293+}
4294+
4295+static inline void put_rmap_list_entry(struct vma_slot *slot,
4296+ unsigned long index)
4297+{
4298+ unsigned long pool_index;
4299+
4300+ pool_index = get_pool_index(slot, index);
4301+ BUG_ON(!slot->rmap_list_pool[pool_index]);
4302+ kunmap(slot->rmap_list_pool[pool_index]);
4303+}
4304+
4305+static inline int entry_is_new(struct rmap_list_entry *entry)
4306+{
4307+ return !entry->item;
4308+}
4309+
4310+static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
4311+ unsigned long index)
4312+{
4313+ return slot->vma->vm_start + (index << PAGE_SHIFT);
4314+}
4315+
4316+static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
4317+{
4318+ unsigned long addr;
4319+
4320+ if (is_addr(entry->addr))
4321+ addr = get_clean_addr(entry->addr);
4322+ else if (entry->item)
4323+ addr = get_rmap_addr(entry->item);
4324+ else
4325+ BUG();
4326+
4327+ return addr;
4328+}
4329+
4330+static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
4331+{
4332+ if (is_addr(entry->addr))
4333+ return NULL;
4334+
4335+ return entry->item;
4336+}
4337+
4338+static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
4339+ unsigned long index)
4340+{
4341+ unsigned long pool_index;
4342+
4343+ pool_index = get_pool_index(slot, index);
4344+ BUG_ON(!slot->rmap_list_pool[pool_index]);
4345+ slot->pool_counts[pool_index]++;
4346+}
4347+
4348+static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
4349+ unsigned long index)
4350+{
4351+ unsigned long pool_index;
4352+
4353+ pool_index = get_pool_index(slot, index);
4354+ BUG_ON(!slot->rmap_list_pool[pool_index]);
4355+ BUG_ON(!slot->pool_counts[pool_index]);
4356+ slot->pool_counts[pool_index]--;
4357+}
4358+
4359+static inline int entry_has_rmap(struct rmap_list_entry *entry)
4360+{
4361+ return !is_addr(entry->addr) && entry->item;
4362+}
4363+
4364+static inline void swap_entries(struct rmap_list_entry *entry1,
4365+ unsigned long index1,
4366+ struct rmap_list_entry *entry2,
4367+ unsigned long index2)
4368+{
4369+ struct rmap_list_entry tmp;
4370+
4371+ /* swapping two new entries is meaningless */
4372+ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
4373+
4374+ tmp = *entry1;
4375+ *entry1 = *entry2;
4376+ *entry2 = tmp;
4377+
4378+ if (entry_has_rmap(entry1))
4379+ entry1->item->entry_index = index1;
4380+
4381+ if (entry_has_rmap(entry2))
4382+ entry2->item->entry_index = index2;
4383+
4384+ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
4385+ inc_rmap_list_pool_count(entry1->item->slot, index1);
4386+ dec_rmap_list_pool_count(entry1->item->slot, index2);
4387+ } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
4388+ inc_rmap_list_pool_count(entry2->item->slot, index2);
4389+ dec_rmap_list_pool_count(entry2->item->slot, index1);
4390+ }
4391+}
4392+
4393+static inline void free_entry_item(struct rmap_list_entry *entry)
4394+{
4395+ unsigned long index;
4396+ struct rmap_item *item;
4397+
4398+ if (!is_addr(entry->addr)) {
4399+ BUG_ON(!entry->item);
4400+ item = entry->item;
4401+ entry->addr = get_rmap_addr(item);
4402+ set_is_addr(entry->addr);
4403+ index = item->entry_index;
4404+ remove_rmap_item_from_tree(item);
4405+ dec_rmap_list_pool_count(item->slot, index);
4406+ free_rmap_item(item);
4407+ }
4408+}
4409+
4410+static inline int pool_entry_boundary(unsigned long index)
4411+{
4412+ unsigned long linear_addr;
4413+
4414+ linear_addr = sizeof(struct rmap_list_entry *) * index;
4415+ return index && !offset_in_page(linear_addr);
4416+}
4417+
4418+static inline void try_free_last_pool(struct vma_slot *slot,
4419+ unsigned long index)
4420+{
4421+ unsigned long pool_index;
4422+
4423+ pool_index = get_pool_index(slot, index);
4424+ if (slot->rmap_list_pool[pool_index] &&
4425+ !slot->pool_counts[pool_index]) {
4426+ __free_page(slot->rmap_list_pool[pool_index]);
4427+ slot->rmap_list_pool[pool_index] = NULL;
4428+ slot->flags |= UKSM_SLOT_NEED_SORT;
4429+ }
4431+}
4432+
4433+static inline unsigned long vma_item_index(struct vm_area_struct *vma,
4434+ struct rmap_item *item)
4435+{
4436+ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
4437+}
4438+
4439+static int within_same_pool(struct vma_slot *slot,
4440+ unsigned long i, unsigned long j)
4441+{
4442+ unsigned long pool_i, pool_j;
4443+
4444+ pool_i = get_pool_index(slot, i);
4445+ pool_j = get_pool_index(slot, j);
4446+
4447+ return (pool_i == pool_j);
4448+}
4449+
4450+static void sort_rmap_entry_list(struct vma_slot *slot)
4451+{
4452+ unsigned long i, j;
4453+ struct rmap_list_entry *entry, *swap_entry;
4454+
4455+ entry = get_rmap_list_entry(slot, 0, 0);
4456+ for (i = 0; i < slot->pages; ) {
4457+
4458+ if (!entry)
4459+ goto skip_whole_pool;
4460+
4461+ if (entry_is_new(entry))
4462+ goto next_entry;
4463+
4464+ if (is_addr(entry->addr)) {
4465+ entry->addr = 0;
4466+ goto next_entry;
4467+ }
4468+
4469+ j = vma_item_index(slot->vma, entry->item);
4470+ if (j == i)
4471+ goto next_entry;
4472+
4473+ if (within_same_pool(slot, i, j))
4474+ swap_entry = entry + j - i;
4475+ else
4476+ swap_entry = get_rmap_list_entry(slot, j, 1);
4477+
4478+ swap_entries(entry, i, swap_entry, j);
4479+ if (!within_same_pool(slot, i, j))
4480+ put_rmap_list_entry(slot, j);
4481+ continue;
4482+
4483+skip_whole_pool:
4484+ i += PAGE_SIZE / sizeof(*entry);
4485+ if (i < slot->pages)
4486+ entry = get_rmap_list_entry(slot, i, 0);
4487+ continue;
4488+
4489+next_entry:
4490+ if (i >= slot->pages - 1 ||
4491+ !within_same_pool(slot, i, i + 1)) {
4492+ put_rmap_list_entry(slot, i);
4493+ if (i + 1 < slot->pages)
4494+ entry = get_rmap_list_entry(slot, i + 1, 0);
4495+ } else
4496+ entry++;
4497+ i++;
4498+ continue;
4499+ }
4500+
4501+ /* free empty pool entries which contain no rmap_item */
4502+	/* CAN be simplified to rely only on pool_counts once bug-free! */
4503+ for (i = 0; i < slot->pool_size; i++) {
4504+ unsigned char has_rmap;
4505+ void *addr;
4506+
4507+ if (!slot->rmap_list_pool[i])
4508+ continue;
4509+
4510+ has_rmap = 0;
4511+ addr = kmap(slot->rmap_list_pool[i]);
4512+ BUG_ON(!addr);
4513+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
4514+ entry = (struct rmap_list_entry *)addr + j;
4515+ if (is_addr(entry->addr))
4516+ continue;
4517+ if (!entry->item)
4518+ continue;
4519+ has_rmap = 1;
4520+ }
4521+ kunmap(slot->rmap_list_pool[i]);
4522+ if (!has_rmap) {
4523+ BUG_ON(slot->pool_counts[i]);
4524+ __free_page(slot->rmap_list_pool[i]);
4525+ slot->rmap_list_pool[i] = NULL;
4526+ }
4527+ }
4528+
4529+ slot->flags &= ~UKSM_SLOT_NEED_SORT;
4530+}
4531+
4532+/*
4533+ * vma_fully_scanned() - whether all the pages in this slot have been scanned.
4534+ */
4535+static inline int vma_fully_scanned(struct vma_slot *slot)
4536+{
4537+ return slot->pages_scanned == slot->pages;
4538+}
4539+
4540+/**
4541+ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
4542+ * its random permutation. This function is embedded with the random
4543+ * permutation index management code.
4544+ */
4545+static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
4546+{
4547+ unsigned long rand_range, addr, swap_index, scan_index;
4548+ struct rmap_item *item = NULL;
4549+ struct rmap_list_entry *scan_entry, *swap_entry = NULL;
4550+ struct page *page;
4551+
4552+ scan_index = swap_index = slot->pages_scanned % slot->pages;
4553+
4554+ if (pool_entry_boundary(scan_index))
4555+ try_free_last_pool(slot, scan_index - 1);
4556+
4557+ if (vma_fully_scanned(slot)) {
4558+ if (slot->flags & UKSM_SLOT_NEED_SORT)
4559+ slot->flags |= UKSM_SLOT_NEED_RERAND;
4560+ else
4561+ slot->flags &= ~UKSM_SLOT_NEED_RERAND;
4562+ if (slot->flags & UKSM_SLOT_NEED_SORT)
4563+ sort_rmap_entry_list(slot);
4564+ }
4565+
4566+ scan_entry = get_rmap_list_entry(slot, scan_index, 1);
4567+ if (!scan_entry)
4568+ return NULL;
4569+
4570+ if (entry_is_new(scan_entry)) {
4571+ scan_entry->addr = get_index_orig_addr(slot, scan_index);
4572+ set_is_addr(scan_entry->addr);
4573+ }
4574+
4575+ if (slot->flags & UKSM_SLOT_NEED_RERAND) {
4576+ rand_range = slot->pages - scan_index;
4577+ BUG_ON(!rand_range);
4578+ swap_index = scan_index + (prandom_u32() % rand_range);
4579+ }
4580+
4581+ if (swap_index != scan_index) {
4582+ swap_entry = get_rmap_list_entry(slot, swap_index, 1);
4583+ if (entry_is_new(swap_entry)) {
4584+ swap_entry->addr = get_index_orig_addr(slot,
4585+ swap_index);
4586+ set_is_addr(swap_entry->addr);
4587+ }
4588+ swap_entries(scan_entry, scan_index, swap_entry, swap_index);
4589+ }
4590+
4591+ addr = get_entry_address(scan_entry);
4592+ item = get_entry_item(scan_entry);
4593+ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
4594+
4595+ page = follow_page(slot->vma, addr, FOLL_GET);
4596+ if (IS_ERR_OR_NULL(page))
4597+ goto nopage;
4598+
4599+ if (!PageAnon(page) && !page_trans_compound_anon(page))
4600+ goto putpage;
4601+
4602+	/* check whether this is the zero_page pfn or the uksm_zero_page */
4603+ if ((page_to_pfn(page) == zero_pfn)
4604+ || (page_to_pfn(page) == uksm_zero_pfn))
4605+ goto putpage;
4606+
4607+ flush_anon_page(slot->vma, page, addr);
4608+ flush_dcache_page(page);
4609+
4610+
4611+ *hash = page_hash(page, hash_strength, 1);
4612+ inc_uksm_pages_scanned();
4613+	/* if the page content is all zero, re-map it to the zero page */
4614+ if (find_zero_page_hash(hash_strength, *hash)) {
4615+ if (!cmp_and_merge_zero_page(slot->vma, page)) {
4616+ slot->pages_merged++;
4617+ inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
4618+ dec_mm_counter(slot->mm, MM_ANONPAGES);
4619+
4620+ /* For full-zero pages, no need to create rmap item */
4621+ goto putpage;
4622+ } else {
4623+ inc_rshash_neg(memcmp_cost / 2);
4624+ }
4625+ }
4626+
4627+ if (!item) {
4628+ item = alloc_rmap_item();
4629+ if (item) {
4630+ /* It has already been zeroed */
4631+ item->slot = slot;
4632+ item->address = addr;
4633+ item->entry_index = scan_index;
4634+ scan_entry->item = item;
4635+ inc_rmap_list_pool_count(slot, scan_index);
4636+ } else
4637+ goto putpage;
4638+ }
4639+
4640+ BUG_ON(item->slot != slot);
4641+ /* the page may have changed */
4642+ item->page = page;
4643+ put_rmap_list_entry(slot, scan_index);
4644+ if (swap_entry)
4645+ put_rmap_list_entry(slot, swap_index);
4646+ return item;
4647+
4648+putpage:
4649+ put_page(page);
4650+ page = NULL;
4651+nopage:
4652+ /* no page, store addr back and free rmap_item if possible */
4653+ free_entry_item(scan_entry);
4654+ put_rmap_list_entry(slot, scan_index);
4655+ if (swap_entry)
4656+ put_rmap_list_entry(slot, swap_index);
4657+ return NULL;
4658+}
4659+
4660+static inline int in_stable_tree(struct rmap_item *rmap_item)
4661+{
4662+ return rmap_item->address & STABLE_FLAG;
4663+}
4664+
4665+/**
4666+ * scan_vma_one_page() - scan the next page in a vma_slot. Called with
4667+ * mmap_sem locked.
4668+ */
4669+static noinline void scan_vma_one_page(struct vma_slot *slot)
4670+{
4671+ u32 hash;
4672+ struct mm_struct *mm;
4673+ struct rmap_item *rmap_item = NULL;
4674+ struct vm_area_struct *vma = slot->vma;
4675+
4676+ mm = vma->vm_mm;
4677+ BUG_ON(!mm);
4678+ BUG_ON(!slot);
4679+
4680+ rmap_item = get_next_rmap_item(slot, &hash);
4681+ if (!rmap_item)
4682+ goto out1;
4683+
4684+ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
4685+ goto out2;
4686+
4687+ cmp_and_merge_page(rmap_item, hash);
4688+out2:
4689+ put_page(rmap_item->page);
4690+out1:
4691+ slot->pages_scanned++;
4692+ if (slot->fully_scanned_round != fully_scanned_round)
4693+ scanned_virtual_pages++;
4694+
4695+ if (vma_fully_scanned(slot))
4696+ slot->fully_scanned_round = fully_scanned_round;
4697+}
4698+
4699+static inline unsigned long rung_get_pages(struct scan_rung *rung)
4700+{
4701+ struct slot_tree_node *node;
4702+
4703+ if (!rung->vma_root.rnode)
4704+ return 0;
4705+
4706+ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
4707+
4708+ return node->size;
4709+}
4710+
4711+#define RUNG_SAMPLED_MIN 3
4712+
4713+static inline
4714+void uksm_calc_rung_step(struct scan_rung *rung,
4715+ unsigned long page_time, unsigned long ratio)
4716+{
4717+ unsigned long sampled, pages;
4718+
4719+ /* will be fully scanned ? */
4720+ if (!rung->cover_msecs) {
4721+ rung->step = 1;
4722+ return;
4723+ }
4724+
4725+ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
4726+ * ratio / page_time;
4727+
4728+ /*
4729+	 * Before we finish a scan round and its expensive per-round jobs,
4730+	 * we need a chance to estimate the per-page time. So the sampled
4731+	 * number cannot be too small.
4732+ */
4733+ if (sampled < RUNG_SAMPLED_MIN)
4734+ sampled = RUNG_SAMPLED_MIN;
4735+
4736+ pages = rung_get_pages(rung);
4737+ if (likely(pages > sampled))
4738+ rung->step = pages / sampled;
4739+ else
4740+ rung->step = 1;
4741+}
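+
+/*
+ * Worked example for uksm_calc_rung_step() (illustrative numbers only;
+ * TIME_RATIO_SCALE == 1000 is assumed here, so the first factor is
+ * NSEC_PER_MSEC / 1000 == 1000):
+ *
+ *   cover_msecs = 1000, ratio = 500, page_time = 5000 ns
+ *   sampled = 1000 * 1000 * 500 / 5000 = 100000 pages per cover period
+ *   with rung_get_pages() == 1000000  ->  step = 1000000 / 100000 = 10,
+ *   i.e. roughly every 10th page of the rung is sampled.
+ */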
4742+
4743+static inline int step_need_recalc(struct scan_rung *rung)
4744+{
4745+ unsigned long pages, stepmax;
4746+
4747+ pages = rung_get_pages(rung);
4748+ stepmax = pages / RUNG_SAMPLED_MIN;
4749+
4750+ return pages && (rung->step > pages ||
4751+ (stepmax && rung->step > stepmax));
4752+}
4753+
4754+static inline
4755+void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
4756+{
4757+ struct vma_slot *slot;
4758+
4759+ if (finished)
4760+ rung->flags |= UKSM_RUNG_ROUND_FINISHED;
4761+
4762+ if (step_recalc || step_need_recalc(rung)) {
4763+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4764+ BUG_ON(step_need_recalc(rung));
4765+ }
4766+
4767+ slot_iter_index = prandom_u32() % rung->step;
4768+ BUG_ON(!rung->vma_root.rnode);
4769+ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
4770+ BUG_ON(!slot);
4771+
4772+ rung->current_scan = slot;
4773+ rung->current_offset = slot_iter_index;
4774+}
4775+
4776+static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
4777+{
4778+ return &slot->rung->vma_root;
4779+}
4780+
4781+/*
4782+ * Return 1 if the scan position was reset, 0 otherwise.
4783+ */
4784+static int advance_current_scan(struct scan_rung *rung)
4785+{
4786+ unsigned short n;
4787+ struct vma_slot *slot, *next = NULL;
4788+
4789+ BUG_ON(!rung->vma_root.num);
4790+
4791+ slot = rung->current_scan;
4792+ n = (slot->pages - rung->current_offset) % rung->step;
4793+ slot_iter_index = rung->step - n;
4794+ next = sradix_tree_next(&rung->vma_root, slot->snode,
4795+ slot->sindex, slot_iter);
4796+
4797+ if (next) {
4798+ rung->current_offset = slot_iter_index;
4799+ rung->current_scan = next;
4800+ return 0;
4801+ } else {
4802+ reset_current_scan(rung, 1, 0);
4803+ return 1;
4804+ }
4805+}
4806+
4807+static inline void rung_rm_slot(struct vma_slot *slot)
4808+{
4809+ struct scan_rung *rung = slot->rung;
4810+ struct sradix_tree_root *root;
4811+
4812+ if (rung->current_scan == slot)
4813+ advance_current_scan(rung);
4814+
4815+ root = slot_get_root(slot);
4816+ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
4817+ slot->snode = NULL;
4818+ if (step_need_recalc(rung)) {
4819+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4820+ BUG_ON(step_need_recalc(rung));
4821+ }
4822+
4823+	/* In case advance_current_scan() looped back to this slot again */
4824+ if (rung->vma_root.num && rung->current_scan == slot)
4825+ reset_current_scan(slot->rung, 1, 0);
4826+}
4827+
4828+static inline void rung_add_new_slots(struct scan_rung *rung,
4829+ struct vma_slot **slots, unsigned long num)
4830+{
4831+ int err;
4832+ struct vma_slot *slot;
4833+ unsigned long i;
4834+ struct sradix_tree_root *root = &rung->vma_root;
4835+
4836+ err = sradix_tree_enter(root, (void **)slots, num);
4837+ BUG_ON(err);
4838+
4839+ for (i = 0; i < num; i++) {
4840+ slot = slots[i];
4841+ slot->rung = rung;
4842+ BUG_ON(vma_fully_scanned(slot));
4843+ }
4844+
4845+ if (rung->vma_root.num == num)
4846+ reset_current_scan(rung, 0, 1);
4847+}
4848+
4849+static inline int rung_add_one_slot(struct scan_rung *rung,
4850+ struct vma_slot *slot)
4851+{
4852+ int err;
4853+
4854+ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
4855+ if (err)
4856+ return err;
4857+
4858+ slot->rung = rung;
4859+ if (rung->vma_root.num == 1)
4860+ reset_current_scan(rung, 0, 1);
4861+
4862+ return 0;
4863+}
4864+
4865+/*
4866+ * Return true if the slot is deleted from its rung.
4867+ */
4868+static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
4869+{
4870+ struct scan_rung *old_rung = slot->rung;
4871+ int err;
4872+
4873+ if (old_rung == rung)
4874+ return 0;
4875+
4876+ rung_rm_slot(slot);
4877+ err = rung_add_one_slot(rung, slot);
4878+ if (err) {
4879+ err = rung_add_one_slot(old_rung, slot);
4880+ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
4881+ }
4882+
4883+ return 1;
4884+}
4885+
4886+static inline int vma_rung_up(struct vma_slot *slot)
4887+{
4888+ struct scan_rung *rung;
4889+
4890+ rung = slot->rung;
4891+ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
4892+ rung++;
4893+
4894+ return vma_rung_enter(slot, rung);
4895+}
4896+
4897+static inline int vma_rung_down(struct vma_slot *slot)
4898+{
4899+ struct scan_rung *rung;
4900+
4901+ rung = slot->rung;
4902+ if (slot->rung != &uksm_scan_ladder[0])
4903+ rung--;
4904+
4905+ return vma_rung_enter(slot, rung);
4906+}
4907+
4908+/**
4909+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
4910+ */
4911+static unsigned long cal_dedup_ratio(struct vma_slot *slot)
4912+{
4913+ unsigned long ret;
4914+
4915+ BUG_ON(slot->pages_scanned == slot->last_scanned);
4916+
4917+ ret = slot->pages_merged;
4918+
4919+ /* Thrashing area filtering */
4920+ if (ret && uksm_thrash_threshold) {
4921+ if (slot->pages_cowed * 100 / slot->pages_merged
4922+ > uksm_thrash_threshold) {
4923+ ret = 0;
4924+ } else {
4925+ ret = slot->pages_merged - slot->pages_cowed;
4926+ }
4927+ }
4928+
4929+ return ret;
4930+}
4931+
4932+/**
4933+ * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot.
4934+ */
4935+static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
4936+{
4937+ unsigned long ret;
4938+ unsigned long pages_scanned;
4939+
4940+ pages_scanned = slot->pages_scanned;
4941+ if (!pages_scanned) {
4942+ if (uksm_thrash_threshold)
4943+ return 0;
4944+ else
4945+ pages_scanned = slot->pages_scanned;
4946+ }
4947+
4948+ ret = slot->pages_bemerged * 100 / pages_scanned;
4949+
4950+ /* Thrashing area filtering */
4951+ if (ret && uksm_thrash_threshold) {
4952+ if (slot->pages_cowed * 100 / slot->pages_bemerged
4953+ > uksm_thrash_threshold) {
4954+ ret = 0;
4955+ } else {
4956+ ret = slot->pages_bemerged - slot->pages_cowed;
4957+ }
4958+ }
4959+
4960+ return ret;
4961+}
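+
+/*
+ * Worked example of the thrashing filter above (hypothetical numbers):
+ * with pages_scanned = 1000 and pages_bemerged = 200, ret starts at 20.
+ * If uksm_thrash_threshold were 50:
+ *   pages_cowed = 30  -> 30 * 100 / 200 = 15 <= 50, ret = 200 - 30 = 170
+ *   pages_cowed = 150 -> 150 * 100 / 200 = 75 > 50, ret = 0 (thrashing,
+ *   so the slot will not be promoted by round_update_ladder()).
+ */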
4962+
4963+/**
4964+ * stable_node_reinsert() - When the hash_strength has been adjusted, the
4965+ * stable tree need to be restructured, this is the function re-inserting the
4966+ * stable node.
4967+ */
4968+static inline void stable_node_reinsert(struct stable_node *new_node,
4969+ struct page *page,
4970+ struct rb_root *root_treep,
4971+ struct list_head *tree_node_listp,
4972+ u32 hash)
4973+{
4974+ struct rb_node **new = &root_treep->rb_node;
4975+ struct rb_node *parent = NULL;
4976+ struct stable_node *stable_node;
4977+ struct tree_node *tree_node;
4978+ struct page *tree_page;
4979+ int cmp;
4980+
4981+ while (*new) {
4982+ int cmp;
4983+
4984+ tree_node = rb_entry(*new, struct tree_node, node);
4985+
4986+ cmp = hash_cmp(hash, tree_node->hash);
4987+
4988+ if (cmp < 0) {
4989+ parent = *new;
4990+ new = &parent->rb_left;
4991+ } else if (cmp > 0) {
4992+ parent = *new;
4993+ new = &parent->rb_right;
4994+ } else
4995+ break;
4996+ }
4997+
4998+ if (*new) {
4999+ /* find a stable tree node with same first level hash value */
5000+ stable_node_hash_max(new_node, page, hash);
5001+ if (tree_node->count == 1) {
5002+ stable_node = rb_entry(tree_node->sub_root.rb_node,
5003+ struct stable_node, node);
5004+ tree_page = get_uksm_page(stable_node, 1, 0);
5005+ if (tree_page) {
5006+ stable_node_hash_max(stable_node,
5007+ tree_page, hash);
5008+ put_page(tree_page);
5009+
5010+ /* prepare for stable node insertion */
5011+
5012+ cmp = hash_cmp(new_node->hash_max,
5013+ stable_node->hash_max);
5014+ parent = &stable_node->node;
5015+ if (cmp < 0)
5016+ new = &parent->rb_left;
5017+ else if (cmp > 0)
5018+ new = &parent->rb_right;
5019+ else
5020+ goto failed;
5021+
5022+ goto add_node;
5023+ } else {
5024+ /* the only stable_node deleted, the tree node
5025+				/* the only stable_node was deleted, but the
5026+				 * tree node was not.
5027+ goto tree_node_reuse;
5028+ }
5029+ }
5030+
5031+ /* well, search the collision subtree */
5032+ new = &tree_node->sub_root.rb_node;
5033+ parent = NULL;
5034+ BUG_ON(!*new);
5035+ while (*new) {
5036+ int cmp;
5037+
5038+ stable_node = rb_entry(*new, struct stable_node, node);
5039+
5040+ cmp = hash_cmp(new_node->hash_max,
5041+ stable_node->hash_max);
5042+
5043+ if (cmp < 0) {
5044+ parent = *new;
5045+ new = &parent->rb_left;
5046+ } else if (cmp > 0) {
5047+ parent = *new;
5048+ new = &parent->rb_right;
5049+ } else {
5050+ /* oh, no, still a collision */
5051+ goto failed;
5052+ }
5053+ }
5054+
5055+ goto add_node;
5056+ }
5057+
5058+ /* no tree node found */
5059+ tree_node = alloc_tree_node(tree_node_listp);
5060+ if (!tree_node) {
5061+ printk(KERN_ERR "UKSM: memory allocation error!\n");
5062+ goto failed;
5063+ } else {
5064+ tree_node->hash = hash;
5065+ rb_link_node(&tree_node->node, parent, new);
5066+ rb_insert_color(&tree_node->node, root_treep);
5067+
5068+tree_node_reuse:
5069+ /* prepare for stable node insertion */
5070+ parent = NULL;
5071+ new = &tree_node->sub_root.rb_node;
5072+ }
5073+
5074+add_node:
5075+ rb_link_node(&new_node->node, parent, new);
5076+ rb_insert_color(&new_node->node, &tree_node->sub_root);
5077+ new_node->tree_node = tree_node;
5078+ tree_node->count++;
5079+ return;
5080+
5081+failed:
5082+ /* This can only happen when two nodes have collided
5083+	 * at both hash levels.
5084+ */
5085+ new_node->tree_node = NULL;
5086+ return;
5087+}
5088+
5089+static inline void free_all_tree_nodes(struct list_head *list)
5090+{
5091+ struct tree_node *node, *tmp;
5092+
5093+ list_for_each_entry_safe(node, tmp, list, all_list) {
5094+ free_tree_node(node);
5095+ }
5096+}
5097+
5098+/**
5099+ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
5100+ * strength to the current hash_strength. It re-structures the whole tree.
5101+ */
5102+static inline void stable_tree_delta_hash(u32 prev_hash_strength)
5103+{
5104+ struct stable_node *node, *tmp;
5105+ struct rb_root *root_new_treep;
5106+ struct list_head *new_tree_node_listp;
5107+
5108+ stable_tree_index = (stable_tree_index + 1) % 2;
5109+ root_new_treep = &root_stable_tree[stable_tree_index];
5110+ new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
5111+ *root_new_treep = RB_ROOT;
5112+ BUG_ON(!list_empty(new_tree_node_listp));
5113+
5114+ /*
5115+ * we need to be safe, the node could be removed by get_uksm_page()
5116+ */
5117+ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
5118+ void *addr;
5119+ struct page *node_page;
5120+ u32 hash;
5121+
5122+ /*
5123+ * We are completely re-structuring the stable nodes to a new
5124+ * stable tree. We don't want to touch the old tree unlinks and
5125+ * old tree_nodes. The old tree_nodes will be freed at once.
5126+ */
5127+ node_page = get_uksm_page(node, 0, 0);
5128+ if (!node_page)
5129+ continue;
5130+
5131+ if (node->tree_node) {
5132+ hash = node->tree_node->hash;
5133+
5134+ addr = kmap_atomic(node_page);
5135+
5136+ hash = delta_hash(addr, prev_hash_strength,
5137+ hash_strength, hash);
5138+ kunmap_atomic(addr);
5139+ } else {
5140+ /*
5141+			 * It was not inserted into the rbtree due to a collision
5142+			 * in the last round's scan.
5143+ */
5144+ hash = page_hash(node_page, hash_strength, 0);
5145+ }
5146+
5147+ stable_node_reinsert(node, node_page, root_new_treep,
5148+ new_tree_node_listp, hash);
5149+ put_page(node_page);
5150+ }
5151+
5152+ root_stable_treep = root_new_treep;
5153+ free_all_tree_nodes(stable_tree_node_listp);
5154+ BUG_ON(!list_empty(stable_tree_node_listp));
5155+ stable_tree_node_listp = new_tree_node_listp;
5156+}
5157+
5158+static inline void inc_hash_strength(unsigned long delta)
5159+{
5160+ hash_strength += 1 << delta;
5161+ if (hash_strength > HASH_STRENGTH_MAX)
5162+ hash_strength = HASH_STRENGTH_MAX;
5163+}
5164+
5165+static inline void dec_hash_strength(unsigned long delta)
5166+{
5167+ unsigned long change = 1 << delta;
5168+
5169+ if (hash_strength <= change + 1)
5170+ hash_strength = 1;
5171+ else
5172+ hash_strength -= change;
5173+}
5174+
5175+static inline void inc_hash_strength_delta(void)
5176+{
5177+ hash_strength_delta++;
5178+ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
5179+ hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
5180+}
5181+
5182+/*
5183+static inline unsigned long get_current_neg_ratio(void)
5184+{
5185+ if (!rshash_pos || rshash_neg > rshash_pos)
5186+ return 100;
5187+
5188+ return div64_u64(100 * rshash_neg , rshash_pos);
5189+}
5190+*/
5191+
5192+static inline unsigned long get_current_neg_ratio(void)
5193+{
5194+ u64 pos = benefit.pos;
5195+ u64 neg = benefit.neg;
5196+
5197+ if (!neg)
5198+ return 0;
5199+
5200+ if (!pos || neg > pos)
5201+ return 100;
5202+
5203+ if (neg > div64_u64(U64_MAX, 100))
5204+ pos = div64_u64(pos, 100);
5205+ else
5206+ neg *= 100;
5207+
5208+ return div64_u64(neg, pos);
5209+}
5210+
5211+static inline unsigned long get_current_benefit(void)
5212+{
5213+ u64 pos = benefit.pos;
5214+ u64 neg = benefit.neg;
5215+ u64 scanned = benefit.scanned;
5216+
5217+ if (neg > pos)
5218+ return 0;
5219+
5220+ return div64_u64((pos - neg), scanned);
5221+}
5222+
5223+static inline int judge_rshash_direction(void)
5224+{
5225+ u64 current_neg_ratio, stable_benefit;
5226+ u64 current_benefit, delta = 0;
5227+ int ret = STILL;
5228+
5229+	/* Try to probe a value shortly after boot, and periodically in
5230+	   case the system stays unchanged for a long time. */
5231+ if ((fully_scanned_round & 0xFFULL) == 10) {
5232+ ret = OBSCURE;
5233+ goto out;
5234+ }
5235+
5236+ current_neg_ratio = get_current_neg_ratio();
5237+
5238+ if (current_neg_ratio == 0) {
5239+ rshash_neg_cont_zero++;
5240+ if (rshash_neg_cont_zero > 2)
5241+ return GO_DOWN;
5242+ else
5243+ return STILL;
5244+ }
5245+ rshash_neg_cont_zero = 0;
5246+
5247+ if (current_neg_ratio > 90) {
5248+ ret = GO_UP;
5249+ goto out;
5250+ }
5251+
5252+ current_benefit = get_current_benefit();
5253+ stable_benefit = rshash_state.stable_benefit;
5254+
5255+ if (!stable_benefit) {
5256+ ret = OBSCURE;
5257+ goto out;
5258+ }
5259+
5260+ if (current_benefit > stable_benefit)
5261+ delta = current_benefit - stable_benefit;
5262+ else if (current_benefit < stable_benefit)
5263+ delta = stable_benefit - current_benefit;
5264+
5265+ delta = div64_u64(100 * delta , stable_benefit);
5266+
5267+ if (delta > 50) {
5268+ rshash_cont_obscure++;
5269+ if (rshash_cont_obscure > 2)
5270+ return OBSCURE;
5271+ else
5272+ return STILL;
5273+ }
5274+
5275+out:
5276+ rshash_cont_obscure = 0;
5277+ return ret;
5278+}
5279+
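+/*
+ * Reading aid for the sampling state machine driven by rshash_adjust()
+ * below (a summary of the code, not new behaviour):
+ *
+ *   judge_rshash_direction() returns
+ *     GO_UP   - the negative (wasted) benefit ratio is high, try a
+ *               stronger hash
+ *     GO_DOWN - the negative ratio stays at zero, a weaker hash suffices
+ *     OBSCURE - the benefit is noisy (or the periodic re-probe fired)
+ *     STILL   - keep the current hash_strength
+ *
+ *   RSHASH_STILL --OBSCURE--> RSHASH_TRYDOWN --> RSHASH_TRYUP
+ *        ^                                            |
+ *        +--------------- RSHASH_PRE_STILL <----------+
+ *
+ * RSHASH_NEW is handled like RSHASH_PRE_STILL: record the current benefit
+ * as the stable baseline and return to RSHASH_STILL.
+ */
+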
5280+/**
5281+ * rshash_adjust() - The main function to control the random sampling state
5282+ * machine for hash strength adapting.
5283+ *
5284+ * return true if hash_strength has changed.
5285+ */
5286+static inline int rshash_adjust(void)
5287+{
5288+ unsigned long prev_hash_strength = hash_strength;
5289+
5290+ if (!encode_benefit())
5291+ return 0;
5292+
5293+ switch (rshash_state.state) {
5294+ case RSHASH_STILL:
5295+ switch (judge_rshash_direction()) {
5296+ case GO_UP:
5297+ if (rshash_state.pre_direct == GO_DOWN)
5298+ hash_strength_delta = 0;
5299+
5300+ inc_hash_strength(hash_strength_delta);
5301+ inc_hash_strength_delta();
5302+ rshash_state.stable_benefit = get_current_benefit();
5303+ rshash_state.pre_direct = GO_UP;
5304+ break;
5305+
5306+ case GO_DOWN:
5307+ if (rshash_state.pre_direct == GO_UP)
5308+ hash_strength_delta = 0;
5309+
5310+ dec_hash_strength(hash_strength_delta);
5311+ inc_hash_strength_delta();
5312+ rshash_state.stable_benefit = get_current_benefit();
5313+ rshash_state.pre_direct = GO_DOWN;
5314+ break;
5315+
5316+ case OBSCURE:
5317+ rshash_state.stable_point = hash_strength;
5318+ rshash_state.turn_point_down = hash_strength;
5319+ rshash_state.turn_point_up = hash_strength;
5320+ rshash_state.turn_benefit_down = get_current_benefit();
5321+ rshash_state.turn_benefit_up = get_current_benefit();
5322+ rshash_state.lookup_window_index = 0;
5323+ rshash_state.state = RSHASH_TRYDOWN;
5324+ dec_hash_strength(hash_strength_delta);
5325+ inc_hash_strength_delta();
5326+ break;
5327+
5328+ case STILL:
5329+ break;
5330+ default:
5331+ BUG();
5332+ }
5333+ break;
5334+
5335+ case RSHASH_TRYDOWN:
5336+ if (rshash_state.lookup_window_index++ % 5 == 0)
5337+ rshash_state.below_count = 0;
5338+
5339+ if (get_current_benefit() < rshash_state.stable_benefit)
5340+ rshash_state.below_count++;
5341+ else if (get_current_benefit() >
5342+ rshash_state.turn_benefit_down) {
5343+ rshash_state.turn_point_down = hash_strength;
5344+ rshash_state.turn_benefit_down = get_current_benefit();
5345+ }
5346+
5347+ if (rshash_state.below_count >= 3 ||
5348+ judge_rshash_direction() == GO_UP ||
5349+ hash_strength == 1) {
5350+ hash_strength = rshash_state.stable_point;
5351+ hash_strength_delta = 0;
5352+ inc_hash_strength(hash_strength_delta);
5353+ inc_hash_strength_delta();
5354+ rshash_state.lookup_window_index = 0;
5355+ rshash_state.state = RSHASH_TRYUP;
5356+ hash_strength_delta = 0;
5357+ } else {
5358+ dec_hash_strength(hash_strength_delta);
5359+ inc_hash_strength_delta();
5360+ }
5361+ break;
5362+
5363+ case RSHASH_TRYUP:
5364+ if (rshash_state.lookup_window_index++ % 5 == 0)
5365+ rshash_state.below_count = 0;
5366+
5367+ if (get_current_benefit() < rshash_state.turn_benefit_down)
5368+ rshash_state.below_count++;
5369+ else if (get_current_benefit() > rshash_state.turn_benefit_up) {
5370+ rshash_state.turn_point_up = hash_strength;
5371+ rshash_state.turn_benefit_up = get_current_benefit();
5372+ }
5373+
5374+ if (rshash_state.below_count >= 3 ||
5375+ judge_rshash_direction() == GO_DOWN ||
5376+ hash_strength == HASH_STRENGTH_MAX) {
5377+ hash_strength = rshash_state.turn_benefit_up >
5378+ rshash_state.turn_benefit_down ?
5379+ rshash_state.turn_point_up :
5380+ rshash_state.turn_point_down;
5381+
5382+ rshash_state.state = RSHASH_PRE_STILL;
5383+ } else {
5384+ inc_hash_strength(hash_strength_delta);
5385+ inc_hash_strength_delta();
5386+ }
5387+
5388+ break;
5389+
5390+ case RSHASH_NEW:
5391+ case RSHASH_PRE_STILL:
5392+ rshash_state.stable_benefit = get_current_benefit();
5393+ rshash_state.state = RSHASH_STILL;
5394+ hash_strength_delta = 0;
5395+ break;
5396+ default:
5397+ BUG();
5398+ }
5399+
5400+ /* rshash_neg = rshash_pos = 0; */
5401+ reset_benefit();
5402+
5403+ if (prev_hash_strength != hash_strength)
5404+ stable_tree_delta_hash(prev_hash_strength);
5405+
5406+ return prev_hash_strength != hash_strength;
5407+}
5408+
5409+/**
5410+ * round_update_ladder() - The main function that updates all the
5411+ * adjustments whenever a scan round is finished.
5412+ */
5413+static noinline void round_update_ladder(void)
5414+{
5415+ int i;
5416+ unsigned long dedup;
5417+ struct vma_slot *slot, *tmp_slot;
5418+
5419+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5420+ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
5421+ }
5422+
5423+ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
5424+
5425+		/* the slot may have been removed by rung_rm_slot() when its mm exited */
5426+ if (slot->snode) {
5427+ dedup = cal_dedup_ratio_old(slot);
5428+ if (dedup && dedup >= uksm_abundant_threshold)
5429+ vma_rung_up(slot);
5430+ }
5431+
5432+ slot->pages_bemerged = 0;
5433+ slot->pages_cowed = 0;
5434+
5435+ list_del_init(&slot->dedup_list);
5436+ }
5437+}
5438+
5439+static void uksm_del_vma_slot(struct vma_slot *slot)
5440+{
5441+ int i, j;
5442+ struct rmap_list_entry *entry;
5443+
5444+ if (slot->snode) {
5445+ /*
5446+		 * If it failed while entering the rung, removing it from the
5447+		 * rung is not necessary.
5448+ */
5449+ rung_rm_slot(slot);
5450+ }
5451+
5452+ if (!list_empty(&slot->dedup_list))
5453+ list_del(&slot->dedup_list);
5454+
5455+ if (!slot->rmap_list_pool || !slot->pool_counts) {
5456+ /* In case it OOMed in uksm_vma_enter() */
5457+ goto out;
5458+ }
5459+
5460+ for (i = 0; i < slot->pool_size; i++) {
5461+ void *addr;
5462+
5463+ if (!slot->rmap_list_pool[i])
5464+ continue;
5465+
5466+ addr = kmap(slot->rmap_list_pool[i]);
5467+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
5468+ entry = (struct rmap_list_entry *)addr + j;
5469+ if (is_addr(entry->addr))
5470+ continue;
5471+ if (!entry->item)
5472+ continue;
5473+
5474+ remove_rmap_item_from_tree(entry->item);
5475+ free_rmap_item(entry->item);
5476+ slot->pool_counts[i]--;
5477+ }
5478+ BUG_ON(slot->pool_counts[i]);
5479+ kunmap(slot->rmap_list_pool[i]);
5480+ __free_page(slot->rmap_list_pool[i]);
5481+ }
5482+ kfree(slot->rmap_list_pool);
5483+ kfree(slot->pool_counts);
5484+
5485+out:
5486+ slot->rung = NULL;
5487+ BUG_ON(uksm_pages_total < slot->pages);
5488+ if (slot->flags & UKSM_SLOT_IN_UKSM)
5489+ uksm_pages_total -= slot->pages;
5490+
5491+ if (slot->fully_scanned_round == fully_scanned_round)
5492+ scanned_virtual_pages -= slot->pages;
5493+ else
5494+ scanned_virtual_pages -= slot->pages_scanned;
5495+ free_vma_slot(slot);
5496+}
5497+
5498+
5499+#define SPIN_LOCK_PERIOD 32
5500+static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
5501+static inline void cleanup_vma_slots(void)
5502+{
5503+ struct vma_slot *slot;
5504+ int i;
5505+
5506+ i = 0;
5507+ spin_lock(&vma_slot_list_lock);
5508+ while (!list_empty(&vma_slot_del)) {
5509+ slot = list_entry(vma_slot_del.next,
5510+ struct vma_slot, slot_list);
5511+ list_del(&slot->slot_list);
5512+ cleanup_slots[i++] = slot;
5513+ if (i == SPIN_LOCK_PERIOD) {
5514+ spin_unlock(&vma_slot_list_lock);
5515+ while (--i >= 0)
5516+ uksm_del_vma_slot(cleanup_slots[i]);
5517+ i = 0;
5518+ spin_lock(&vma_slot_list_lock);
5519+ }
5520+ }
5521+ spin_unlock(&vma_slot_list_lock);
5522+
5523+ while (--i >= 0)
5524+ uksm_del_vma_slot(cleanup_slots[i]);
5525+}
5526+
5527+/*
5528+ * exponential moving average formula
5529+ */
5530+static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
5531+{
5532+ /*
5533+	 * For a very high burst, even the EMA cannot work well: a falsely
5534+	 * high per-page time estimate feeds back into very high context
5535+	 * switch and rung-update overhead, which then leads to an even
5536+	 * higher per-page time, so the estimate may not converge.
5537+ *
5538+ * Instead, we try to approach this value in a binary manner.
5539+ */
5540+ if (curr > last_ema * 10)
5541+ return last_ema * 2;
5542+
5543+ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
5544+}
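+
+/*
+ * Worked example (hypothetical values; EMA_ALPHA is assumed to be 20 here
+ * purely for illustration):
+ *   last_ema = 100 ns, curr = 300 ns
+ *     -> (20 * 300 + 80 * 100) / 100 = 140 ns, so the estimate moves 20%
+ *        of the way toward the new sample.
+ *   last_ema = 100 ns, curr = 5000 ns (a burst, more than 10x the EMA)
+ *     -> the binary clamp returns 100 * 2 = 200 ns instead.
+ */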
5545+
5546+/*
5547+ * Convert a cpu ratio, in units of 1/TIME_RATIO_SCALE as configured by the
5548+ * user, to nanoseconds of scan time based on current uksm_sleep_jiffies.
5549+ */
5550+static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
5551+{
5552+ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
5553+ (TIME_RATIO_SCALE - ratio) * ratio;
5554+}
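+
+/*
+ * Worked example (illustrative only; TIME_RATIO_SCALE == 1000 assumed):
+ * with uksm_sleep_jiffies equal to 20 ms (20000 us) and ratio = 200 (20%),
+ *   1000 * 20000 / (1000 - 200) * 200 = 5000000 ns = 5 ms,
+ * i.e. 5 ms of scanning per 20 ms of sleep, 5 / (5 + 20) = 20% CPU.
+ */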
5555+
5556+
5557+static inline unsigned long rung_real_ratio(int cpu_time_ratio)
5558+{
5559+ unsigned long ret;
5560+
5561+ BUG_ON(!cpu_time_ratio);
5562+
5563+ if (cpu_time_ratio > 0)
5564+ ret = cpu_time_ratio;
5565+ else
5566+ ret = (unsigned long)(-cpu_time_ratio) *
5567+ uksm_max_cpu_percentage / 100UL;
5568+
5569+ return ret ? ret : 1;
5570+}
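+
+/*
+ * Example (hypothetical numbers): a positive cpu_time_ratio is used as-is,
+ * while a negative one is scaled by uksm_max_cpu_percentage / 100, e.g.
+ * cpu_time_ratio = -500 with uksm_max_cpu_percentage = 20 gives
+ * 500 * 20 / 100 = 100. A zero result is rounded up to 1.
+ */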
5571+
5572+static noinline void uksm_calc_scan_pages(void)
5573+{
5574+ struct scan_rung *ladder = uksm_scan_ladder;
5575+ unsigned long sleep_usecs, nsecs;
5576+ unsigned long ratio;
5577+ int i;
5578+ unsigned long per_page;
5579+
5580+ if (uksm_ema_page_time > 100000 ||
5581+ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
5582+ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
5583+
5584+ per_page = uksm_ema_page_time;
5585+ BUG_ON(!per_page);
5586+
5587+ /*
5588+	 * Every 8 eval rounds, we try to probe a uksm_sleep_jiffies value
5589+ * based on saved user input.
5590+ */
5591+ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
5592+ uksm_sleep_jiffies = uksm_sleep_saved;
5593+
5594+	/* We require each rung to scan at least 1 page in a period. */
5595+ nsecs = per_page;
5596+ ratio = rung_real_ratio(ladder[0].cpu_ratio);
5597+ if (cpu_ratio_to_nsec(ratio) < nsecs) {
5598+ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
5599+ / NSEC_PER_USEC;
5600+ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
5601+ }
5602+
5603+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5604+ ratio = rung_real_ratio(ladder[i].cpu_ratio);
5605+ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
5606+ per_page;
5607+ BUG_ON(!ladder[i].pages_to_scan);
5608+ uksm_calc_rung_step(&ladder[i], per_page, ratio);
5609+ }
5610+}
5611+
5612+/*
5613+ * Convert the scan time of this round (ns) to the next expected minimum
5614+ * sleep time (ms); be careful of possible overflows. ratio is taken from
5615+ * rung_real_ratio().
5616+ */
5617+static inline
5618+unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
5619+{
5620+ scan_time >>= 20; /* to msec level now */
5621+ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
5622+
5623+ return (unsigned int) ((unsigned long) scan_time *
5624+ (TIME_RATIO_SCALE - ratio) / ratio);
5625+}
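+
+/*
+ * Worked example (illustrative only; TIME_RATIO_SCALE == 1000 assumed):
+ * scan_time = 8000000 ns  ->  8000000 >> 20 ~= 7 (approximately msecs),
+ * and with ratio = 200 (20%) the expected sleep is
+ *   7 * (1000 - 200) / 200 = 28 ms,
+ * so ~7 ms of scanning is followed by ~28 ms of sleep, keeping the CPU
+ * share near 7 / (7 + 28) = 20%.
+ */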
5626+
5627+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
5628+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
5629+
5630+static inline unsigned long vma_pool_size(struct vma_slot *slot)
5631+{
5632+ return round_up(sizeof(struct rmap_list_entry) * slot->pages,
5633+ PAGE_SIZE) >> PAGE_SHIFT;
5634+}
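+
+/*
+ * Example (illustrative; assumes 4 KiB pages and a pointer-sized
+ * struct rmap_list_entry of 8 bytes): a VMA of 10000 pages needs
+ * 80000 bytes of entries, rounded up to 20 pool pages.
+ */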
5635+
5636+static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
5637+{
5638+ struct scan_rung *rung;
5639+ unsigned long pool_size, i;
5640+ struct vma_slot *slot;
5641+ int failed;
5642+
5643+ rung = &uksm_scan_ladder[0];
5644+
5645+ failed = 0;
5646+ for (i = 0; i < num; i++) {
5647+ slot = slots[i];
5648+
5649+ pool_size = vma_pool_size(slot);
5650+ slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
5651+ pool_size, GFP_KERNEL);
5652+ if (!slot->rmap_list_pool)
5653+ break;
5654+
5655+ slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size,
5656+ GFP_KERNEL);
5657+ if (!slot->pool_counts) {
5658+ kfree(slot->rmap_list_pool);
5659+ break;
5660+ }
5661+
5662+ slot->pool_size = pool_size;
5663+ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
5664+ slot->flags |= UKSM_SLOT_IN_UKSM;
5665+ uksm_pages_total += slot->pages;
5666+ }
5667+
5668+ if (i)
5669+ rung_add_new_slots(rung, slots, i);
5670+
5671+ return;
5672+}
5673+
5674+static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
5675+
5676+static void uksm_enter_all_slots(void)
5677+{
5678+ struct vma_slot *slot;
5679+ unsigned long index;
5680+ struct list_head empty_vma_list;
5681+ int i;
5682+
5683+ i = 0;
5684+ index = 0;
5685+ INIT_LIST_HEAD(&empty_vma_list);
5686+
5687+ spin_lock(&vma_slot_list_lock);
5688+ while (!list_empty(&vma_slot_new)) {
5689+ slot = list_entry(vma_slot_new.next,
5690+ struct vma_slot, slot_list);
5691+
5692+ if (!slot->vma->anon_vma) {
5693+ list_move(&slot->slot_list, &empty_vma_list);
5694+ } else if (vma_can_enter(slot->vma)) {
5695+ batch_slots[index++] = slot;
5696+ list_del_init(&slot->slot_list);
5697+ } else {
5698+ list_move(&slot->slot_list, &vma_slot_noadd);
5699+ }
5700+
5701+ if (++i == SPIN_LOCK_PERIOD ||
5702+ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
5703+ spin_unlock(&vma_slot_list_lock);
5704+
5705+ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
5706+ uksm_vma_enter(batch_slots, index);
5707+ index = 0;
5708+ }
5709+ i = 0;
5710+ cond_resched();
5711+ spin_lock(&vma_slot_list_lock);
5712+ }
5713+ }
5714+
5715+ list_splice(&empty_vma_list, &vma_slot_new);
5716+
5717+ spin_unlock(&vma_slot_list_lock);
5718+
5719+ if (index)
5720+ uksm_vma_enter(batch_slots, index);
5721+
5722+}
5723+
5724+static inline int rung_round_finished(struct scan_rung *rung)
5725+{
5726+ return rung->flags & UKSM_RUNG_ROUND_FINISHED;
5727+}
5728+
5729+static inline void judge_slot(struct vma_slot *slot)
5730+{
5731+ struct scan_rung *rung = slot->rung;
5732+ unsigned long dedup;
5733+ int deleted;
5734+
5735+ dedup = cal_dedup_ratio(slot);
5736+ if (vma_fully_scanned(slot) && uksm_thrash_threshold)
5737+ deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
5738+ else if (dedup && dedup >= uksm_abundant_threshold)
5739+ deleted = vma_rung_up(slot);
5740+ else
5741+ deleted = vma_rung_down(slot);
5742+
5743+ slot->pages_merged = 0;
5744+ slot->pages_cowed = 0;
5745+
5746+ if (vma_fully_scanned(slot))
5747+ slot->pages_scanned = 0;
5748+
5749+ slot->last_scanned = slot->pages_scanned;
5750+
5751+	/* If it was deleted above, the rung was already advanced. */
5752+ if (!deleted)
5753+ advance_current_scan(rung);
5754+}
5755+
5756+
5757+static inline int hash_round_finished(void)
5758+{
5759+ if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
5760+ scanned_virtual_pages = 0;
5761+ if (uksm_pages_scanned)
5762+ fully_scanned_round++;
5763+
5764+ return 1;
5765+ } else {
5766+ return 0;
5767+ }
5768+}
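+
+/*
+ * In other words, a "fully scanned round" is declared once roughly a
+ * quarter of uksm_pages_total virtual pages have been scanned; e.g. with
+ * 1000000 tracked pages the counters reset after ~250000 scanned pages.
+ */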
5769+
5770+#define UKSM_MMSEM_BATCH 5
5771+#define BUSY_RETRY 100
5772+
5773+/**
5774+ * uksm_do_scan() - the main worker function.
5775+ */
5776+static noinline void uksm_do_scan(void)
5777+{
5778+ struct vma_slot *slot, *iter;
5779+ struct mm_struct *busy_mm;
5780+	unsigned char round_finished, all_rungs_empty;
5781+ int i, err, mmsem_batch;
5782+ unsigned long pcost;
5783+ long long delta_exec;
5784+ unsigned long vpages, max_cpu_ratio;
5785+ unsigned long long start_time, end_time, scan_time;
5786+ unsigned int expected_jiffies;
5787+
5788+ might_sleep();
5789+
5790+ vpages = 0;
5791+
5792+ start_time = task_sched_runtime(current);
5793+ max_cpu_ratio = 0;
5794+ mmsem_batch = 0;
5795+
5796+ for (i = 0; i < SCAN_LADDER_SIZE;) {
5797+ struct scan_rung *rung = &uksm_scan_ladder[i];
5798+ unsigned long ratio;
5799+ int busy_retry;
5800+
5801+ if (!rung->pages_to_scan) {
5802+ i++;
5803+ continue;
5804+ }
5805+
5806+ if (!rung->vma_root.num) {
5807+ rung->pages_to_scan = 0;
5808+ i++;
5809+ continue;
5810+ }
5811+
5812+ ratio = rung_real_ratio(rung->cpu_ratio);
5813+ if (ratio > max_cpu_ratio)
5814+ max_cpu_ratio = ratio;
5815+
5816+ busy_retry = BUSY_RETRY;
5817+ /*
5818+		 * Do not consider rung_round_finished() here, just use up the
5819+		 * rung->pages_to_scan quota.
5820+ */
5821+ while (rung->pages_to_scan && rung->vma_root.num &&
5822+ likely(!freezing(current))) {
5823+ int reset = 0;
5824+
5825+ slot = rung->current_scan;
5826+
5827+ BUG_ON(vma_fully_scanned(slot));
5828+
5829+ if (mmsem_batch) {
5830+ err = 0;
5831+ } else {
5832+ err = try_down_read_slot_mmap_sem(slot);
5833+ }
5834+
5835+ if (err == -ENOENT) {
5836+rm_slot:
5837+ rung_rm_slot(slot);
5838+ continue;
5839+ }
5840+
5841+ busy_mm = slot->mm;
5842+
5843+ if (err == -EBUSY) {
5844+ /* skip other vmas on the same mm */
5845+ do {
5846+ reset = advance_current_scan(rung);
5847+ iter = rung->current_scan;
5848+ busy_retry--;
5849+ if (iter->vma->vm_mm != busy_mm ||
5850+ !busy_retry || reset)
5851+ break;
5852+ } while (1);
5853+
5854+ if (iter->vma->vm_mm != busy_mm) {
5855+ continue;
5856+ } else {
5857+					/* scan round finished */
5858+ break;
5859+ }
5860+ }
5861+
5862+ BUG_ON(!vma_can_enter(slot->vma));
5863+ if (uksm_test_exit(slot->vma->vm_mm)) {
5864+ mmsem_batch = 0;
5865+ up_read(&slot->vma->vm_mm->mmap_sem);
5866+ goto rm_slot;
5867+ }
5868+
5869+ if (mmsem_batch)
5870+ mmsem_batch--;
5871+ else
5872+ mmsem_batch = UKSM_MMSEM_BATCH;
5873+
5874+			/* OK, we have taken the mmap_sem, ready to scan */
5875+ scan_vma_one_page(slot);
5876+ rung->pages_to_scan--;
5877+ vpages++;
5878+
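+			/*
+			 * The next step would run past the end of the VMA,
+			 * or the VMA is fully scanned: release mmap_sem and
+			 * let judge_slot() re-place it on the ladder.
+			 */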
5879+ if (rung->current_offset + rung->step > slot->pages - 1
5880+ || vma_fully_scanned(slot)) {
5881+ up_read(&slot->vma->vm_mm->mmap_sem);
5882+ judge_slot(slot);
5883+ mmsem_batch = 0;
5884+ } else {
5885+ rung->current_offset += rung->step;
5886+ if (!mmsem_batch)
5887+ up_read(&slot->vma->vm_mm->mmap_sem);
5888+ }
5889+
5890+ busy_retry = BUSY_RETRY;
5891+ cond_resched();
5892+ }
5893+
5894+ if (mmsem_batch) {
5895+ up_read(&slot->vma->vm_mm->mmap_sem);
5896+ mmsem_batch = 0;
5897+ }
5898+
5899+ if (freezing(current))
5900+ break;
5901+
5902+ cond_resched();
5903+ }
5904+ end_time = task_sched_runtime(current);
5905+ delta_exec = end_time - start_time;
5906+
5907+ if (freezing(current))
5908+ return;
5909+
5910+ cleanup_vma_slots();
5911+ uksm_enter_all_slots();
5912+
5913+ round_finished = 1;
5914+	all_rungs_empty = 1;
5915+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5916+ struct scan_rung *rung = &uksm_scan_ladder[i];
5917+
5918+ if (rung->vma_root.num) {
5919+			all_rungs_empty = 0;
5920+ if (!rung_round_finished(rung))
5921+ round_finished = 0;
5922+ }
5923+ }
5924+
5925+	if (all_rungs_empty)
5926+ round_finished = 0;
5927+
5928+ if (round_finished) {
5929+ round_update_ladder();
5930+ uksm_eval_round++;
5931+
5932+ if (hash_round_finished() && rshash_adjust()) {
5933+ /* Reset the unstable root iff hash strength changed */
5934+ uksm_hash_round++;
5935+ root_unstable_tree = RB_ROOT;
5936+ free_all_tree_nodes(&unstable_tree_node_list);
5937+ }
5938+
5939+ /*
5940+ * A number of pages can hang around indefinitely on per-cpu
5941+ * pagevecs, raised page count preventing write_protect_page
5942+ * from merging them. Though it doesn't really matter much,
5943+ * it is puzzling to see some stuck in pages_volatile until
5944+ * other activity jostles them out, and they also prevented
5945+ * LTP's KSM test from succeeding deterministically; so drain
5946+ * them here (here rather than on entry to uksm_do_scan(),
5947+ * so we don't IPI too often when pages_to_scan is set low).
5948+ */
5949+ lru_add_drain_all();
5950+ }
5951+
5952+
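+	/*
+	 * Fold this pass's CPU cost per scanned virtual page into the
+	 * uksm_ema_page_time exponential moving average.
+	 */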
5953+ if (vpages && delta_exec > 0) {
5954+ pcost = (unsigned long) delta_exec / vpages;
5955+ if (likely(uksm_ema_page_time))
5956+ uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
5957+ else
5958+ uksm_ema_page_time = pcost;
5959+ }
5960+
5961+ uksm_calc_scan_pages();
5962+ uksm_sleep_real = uksm_sleep_jiffies;
5963+ /* in case of radical cpu bursts, apply the upper bound */
5964+ end_time = task_sched_runtime(current);
5965+ if (max_cpu_ratio && end_time > start_time) {
5966+ scan_time = end_time - start_time;
5967+ expected_jiffies = msecs_to_jiffies(
5968+ scan_time_to_sleep(scan_time, max_cpu_ratio));
5969+
5970+ if (expected_jiffies > uksm_sleep_real)
5971+ uksm_sleep_real = expected_jiffies;
5972+
5973+		/* We have a 1 second upper bound for responsiveness. */
5974+ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
5975+ uksm_sleep_real = msecs_to_jiffies(1000);
5976+ }
5977+
5978+ return;
5979+}
5980+
5981+static int ksmd_should_run(void)
5982+{
5983+ return uksm_run & UKSM_RUN_MERGE;
5984+}
5985+
5986+static int uksm_scan_thread(void *nothing)
5987+{
5988+ set_freezable();
5989+ set_user_nice(current, 5);
5990+
5991+ while (!kthread_should_stop()) {
5992+ mutex_lock(&uksm_thread_mutex);
5993+ if (ksmd_should_run()) {
5994+ uksm_do_scan();
5995+ }
5996+ mutex_unlock(&uksm_thread_mutex);
5997+
5998+ try_to_freeze();
5999+
6000+ if (ksmd_should_run()) {
6001+ schedule_timeout_interruptible(uksm_sleep_real);
6002+ uksm_sleep_times++;
6003+ } else {
6004+ wait_event_freezable(uksm_thread_wait,
6005+ ksmd_should_run() || kthread_should_stop());
6006+ }
6007+ }
6008+ return 0;
6009+}
6010+
6011+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
6012+{
6013+ struct stable_node *stable_node;
6014+ struct node_vma *node_vma;
6015+ struct rmap_item *rmap_item;
6016+ int ret = SWAP_AGAIN;
6017+ int search_new_forks = 0;
6018+ unsigned long address;
6019+
6020+ VM_BUG_ON_PAGE(!PageKsm(page), page);
6021+ VM_BUG_ON_PAGE(!PageLocked(page), page);
6022+
6023+ stable_node = page_stable_node(page);
6024+ if (!stable_node)
6025+ return ret;
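+	/*
+	 * Two passes over the stable node: first the VMAs the rmap_items
+	 * were created in, then any forked copies of those VMAs.
+	 */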
6026+again:
6027+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
6028+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
6029+ struct anon_vma *anon_vma = rmap_item->anon_vma;
6030+ struct anon_vma_chain *vmac;
6031+ struct vm_area_struct *vma;
6032+
6033+ anon_vma_lock_read(anon_vma);
6034+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
6035+ 0, ULONG_MAX) {
6036+ vma = vmac->vma;
6037+ address = get_rmap_addr(rmap_item);
6038+
6039+ if (address < vma->vm_start ||
6040+ address >= vma->vm_end)
6041+ continue;
6042+
6043+ if ((rmap_item->slot->vma == vma) ==
6044+ search_new_forks)
6045+ continue;
6046+
6047+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
6048+ continue;
6049+
6050+ ret = rwc->rmap_one(page, vma, address, rwc->arg);
6051+ if (ret != SWAP_AGAIN) {
6052+ anon_vma_unlock_read(anon_vma);
6053+ goto out;
6054+ }
6055+
6056+ if (rwc->done && rwc->done(page)) {
6057+ anon_vma_unlock_read(anon_vma);
6058+ goto out;
6059+ }
6060+ }
6061+ anon_vma_unlock_read(anon_vma);
6062+ }
6063+ }
6064+ if (!search_new_forks++)
6065+ goto again;
6066+out:
6067+ return ret;
6068+}
6069+
6070+#ifdef CONFIG_MIGRATION
6071+/* Common ksm interface, though the behavior may be uksm-specific */
6072+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
6073+{
6074+ struct stable_node *stable_node;
6075+
6076+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6077+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6078+ VM_BUG_ON(newpage->mapping != oldpage->mapping);
6079+
6080+ stable_node = page_stable_node(newpage);
6081+ if (stable_node) {
6082+ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
6083+ stable_node->kpfn = page_to_pfn(newpage);
6084+ }
6085+}
6086+#endif /* CONFIG_MIGRATION */
6087+
6088+#ifdef CONFIG_MEMORY_HOTREMOVE
6089+static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
6090+ unsigned long end_pfn)
6091+{
6092+ struct rb_node *node;
6093+
6094+ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
6095+ struct stable_node *stable_node;
6096+
6097+ stable_node = rb_entry(node, struct stable_node, node);
6098+ if (stable_node->kpfn >= start_pfn &&
6099+ stable_node->kpfn < end_pfn)
6100+ return stable_node;
6101+ }
6102+ return NULL;
6103+}
6104+
6105+static int uksm_memory_callback(struct notifier_block *self,
6106+ unsigned long action, void *arg)
6107+{
6108+ struct memory_notify *mn = arg;
6109+ struct stable_node *stable_node;
6110+
6111+ switch (action) {
6112+ case MEM_GOING_OFFLINE:
6113+ /*
6114+ * Keep it very simple for now: just lock out ksmd and
6115+ * MADV_UNMERGEABLE while any memory is going offline.
6116+ * mutex_lock_nested() is necessary because lockdep was alarmed
6117+ * that here we take uksm_thread_mutex inside notifier chain
6118+ * mutex, and later take notifier chain mutex inside
6119+ * uksm_thread_mutex to unlock it. But that's safe because both
6120+ * are inside mem_hotplug_mutex.
6121+ */
6122+ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
6123+ break;
6124+
6125+ case MEM_OFFLINE:
6126+ /*
6127+ * Most of the work is done by page migration; but there might
6128+ * be a few stable_nodes left over, still pointing to struct
6129+ * pages which have been offlined: prune those from the tree.
6130+ */
6131+ while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
6132+ mn->start_pfn + mn->nr_pages)) != NULL)
6133+ remove_node_from_stable_tree(stable_node, 1, 1);
6134+ /* fallthrough */
6135+
6136+ case MEM_CANCEL_OFFLINE:
6137+ mutex_unlock(&uksm_thread_mutex);
6138+ break;
6139+ }
6140+ return NOTIFY_OK;
6141+}
6142+#endif /* CONFIG_MEMORY_HOTREMOVE */
6143+
6144+#ifdef CONFIG_SYSFS
6145+/*
6146+ * This all compiles without CONFIG_SYSFS, but is a waste of space.
6147+ */
6148+
6149+#define UKSM_ATTR_RO(_name) \
6150+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
6151+#define UKSM_ATTR(_name) \
6152+ static struct kobj_attribute _name##_attr = \
6153+ __ATTR(_name, 0644, _name##_show, _name##_store)
6154+
6155+static ssize_t max_cpu_percentage_show(struct kobject *kobj,
6156+ struct kobj_attribute *attr, char *buf)
6157+{
6158+ return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
6159+}
6160+
6161+static ssize_t max_cpu_percentage_store(struct kobject *kobj,
6162+ struct kobj_attribute *attr,
6163+ const char *buf, size_t count)
6164+{
6165+ unsigned long max_cpu_percentage;
6166+ int err;
6167+
6168+ err = kstrtoul(buf, 10, &max_cpu_percentage);
6169+ if (err || max_cpu_percentage > 100)
6170+ return -EINVAL;
6171+
6172+ if (max_cpu_percentage == 100)
6173+ max_cpu_percentage = 99;
6174+ else if (max_cpu_percentage < 10)
6175+ max_cpu_percentage = 10;
6176+
6177+ uksm_max_cpu_percentage = max_cpu_percentage;
6178+
6179+ return count;
6180+}
6181+UKSM_ATTR(max_cpu_percentage);
6182+
6183+static ssize_t sleep_millisecs_show(struct kobject *kobj,
6184+ struct kobj_attribute *attr, char *buf)
6185+{
6186+ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
6187+}
6188+
6189+static ssize_t sleep_millisecs_store(struct kobject *kobj,
6190+ struct kobj_attribute *attr,
6191+ const char *buf, size_t count)
6192+{
6193+ unsigned long msecs;
6194+ int err;
6195+
6196+ err = kstrtoul(buf, 10, &msecs);
6197+ if (err || msecs > MSEC_PER_SEC)
6198+ return -EINVAL;
6199+
6200+ uksm_sleep_jiffies = msecs_to_jiffies(msecs);
6201+ uksm_sleep_saved = uksm_sleep_jiffies;
6202+
6203+ return count;
6204+}
6205+UKSM_ATTR(sleep_millisecs);
6206+
6207+
6208+static ssize_t cpu_governor_show(struct kobject *kobj,
6209+ struct kobj_attribute *attr, char *buf)
6210+{
6211+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6212+ int i;
6213+
6214+ buf[0] = '\0';
6215+ for (i = 0; i < n ; i++) {
6216+ if (uksm_cpu_governor == i)
6217+ strcat(buf, "[");
6218+
6219+ strcat(buf, uksm_cpu_governor_str[i]);
6220+
6221+ if (uksm_cpu_governor == i)
6222+ strcat(buf, "]");
6223+
6224+ strcat(buf, " ");
6225+ }
6226+ strcat(buf, "\n");
6227+
6228+ return strlen(buf);
6229+}
6230+
6231+static inline void init_performance_values(void)
6232+{
6233+ int i;
6234+ struct scan_rung *rung;
6235+ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
6236+
6237+
6238+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6239+ rung = uksm_scan_ladder + i;
6240+ rung->cpu_ratio = preset->cpu_ratio[i];
6241+ rung->cover_msecs = preset->cover_msecs[i];
6242+ }
6243+
6244+ uksm_max_cpu_percentage = preset->max_cpu;
6245+}
6246+
6247+static ssize_t cpu_governor_store(struct kobject *kobj,
6248+ struct kobj_attribute *attr,
6249+ const char *buf, size_t count)
6250+{
6251+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6252+
6253+ for (n--; n >=0 ; n--) {
6254+ if (!strncmp(buf, uksm_cpu_governor_str[n],
6255+ strlen(uksm_cpu_governor_str[n])))
6256+ break;
6257+ }
6258+
6259+ if (n < 0)
6260+ return -EINVAL;
6261+ else
6262+ uksm_cpu_governor = n;
6263+
6264+ init_performance_values();
6265+
6266+ return count;
6267+}
6268+UKSM_ATTR(cpu_governor);
6269+
6270+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
6271+ char *buf)
6272+{
6273+ return sprintf(buf, "%u\n", uksm_run);
6274+}
6275+
6276+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
6277+ const char *buf, size_t count)
6278+{
6279+ int err;
6280+ unsigned long flags;
6281+
6282+ err = kstrtoul(buf, 10, &flags);
6283+ if (err || flags > UINT_MAX)
6284+ return -EINVAL;
6285+ if (flags > UKSM_RUN_MERGE)
6286+ return -EINVAL;
6287+
6288+ mutex_lock(&uksm_thread_mutex);
6289+ if (uksm_run != flags) {
6290+ uksm_run = flags;
6291+ }
6292+ mutex_unlock(&uksm_thread_mutex);
6293+
6294+ if (flags & UKSM_RUN_MERGE)
6295+ wake_up_interruptible(&uksm_thread_wait);
6296+
6297+ return count;
6298+}
6299+UKSM_ATTR(run);
6300+
6301+static ssize_t abundant_threshold_show(struct kobject *kobj,
6302+ struct kobj_attribute *attr, char *buf)
6303+{
6304+ return sprintf(buf, "%u\n", uksm_abundant_threshold);
6305+}
6306+
6307+static ssize_t abundant_threshold_store(struct kobject *kobj,
6308+ struct kobj_attribute *attr,
6309+ const char *buf, size_t count)
6310+{
6311+ int err;
6312+ unsigned long flags;
6313+
6314+ err = kstrtoul(buf, 10, &flags);
6315+ if (err || flags > 99)
6316+ return -EINVAL;
6317+
6318+ uksm_abundant_threshold = flags;
6319+
6320+ return count;
6321+}
6322+UKSM_ATTR(abundant_threshold);
6323+
6324+static ssize_t thrash_threshold_show(struct kobject *kobj,
6325+ struct kobj_attribute *attr, char *buf)
6326+{
6327+ return sprintf(buf, "%u\n", uksm_thrash_threshold);
6328+}
6329+
6330+static ssize_t thrash_threshold_store(struct kobject *kobj,
6331+ struct kobj_attribute *attr,
6332+ const char *buf, size_t count)
6333+{
6334+ int err;
6335+ unsigned long flags;
6336+
6337+ err = kstrtoul(buf, 10, &flags);
6338+ if (err || flags > 99)
6339+ return -EINVAL;
6340+
6341+ uksm_thrash_threshold = flags;
6342+
6343+ return count;
6344+}
6345+UKSM_ATTR(thrash_threshold);
6346+
6347+static ssize_t cpu_ratios_show(struct kobject *kobj,
6348+ struct kobj_attribute *attr, char *buf)
6349+{
6350+ int i, size;
6351+ struct scan_rung *rung;
6352+ char *p = buf;
6353+
6354+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6355+ rung = &uksm_scan_ladder[i];
6356+
6357+ if (rung->cpu_ratio > 0)
6358+ size = sprintf(p, "%d ", rung->cpu_ratio);
6359+ else
6360+ size = sprintf(p, "MAX/%d ",
6361+ TIME_RATIO_SCALE / -rung->cpu_ratio);
6362+
6363+ p += size;
6364+ }
6365+
6366+ *p++ = '\n';
6367+ *p = '\0';
6368+
6369+ return p - buf;
6370+}
6371+
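+/*
+ * Expects one value per rung, space separated: either a plain ratio or
+ * "MAX/<n>" (stored as the negative of TIME_RATIO_SCALE / n). The rungs
+ * are only updated once the whole line has parsed successfully.
+ */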
6372+static ssize_t cpu_ratios_store(struct kobject *kobj,
6373+ struct kobj_attribute *attr,
6374+ const char *buf, size_t count)
6375+{
6376+ int i, cpuratios[SCAN_LADDER_SIZE], err;
6377+ unsigned long value;
6378+ struct scan_rung *rung;
6379+ char *p, *end = NULL;
6380+
6381+ p = kzalloc(count, GFP_KERNEL);
6382+ if (!p)
6383+ return -ENOMEM;
6384+
6385+ memcpy(p, buf, count);
6386+
6387+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6388+ if (i != SCAN_LADDER_SIZE -1) {
6389+ end = strchr(p, ' ');
6390+ if (!end)
6391+ return -EINVAL;
6392+
6393+ *end = '\0';
6394+ }
6395+
6396+ if (strstr(p, "MAX/")) {
6397+ p = strchr(p, '/') + 1;
6398+ err = kstrtoul(p, 10, &value);
6399+ if (err || value > TIME_RATIO_SCALE || !value)
6400+ return -EINVAL;
6401+
6402+ cpuratios[i] = - (int) (TIME_RATIO_SCALE / value);
6403+ } else {
6404+ err = kstrtoul(p, 10, &value);
6405+ if (err || value > TIME_RATIO_SCALE || !value)
6406+ return -EINVAL;
6407+
6408+ cpuratios[i] = value;
6409+ }
6410+
6411+ p = end + 1;
6412+ }
6413+
6414+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6415+ rung = &uksm_scan_ladder[i];
6416+
6417+ rung->cpu_ratio = cpuratios[i];
6418+ }
6419+
6420+ return count;
6421+}
6422+UKSM_ATTR(cpu_ratios);
6423+
6424+static ssize_t eval_intervals_show(struct kobject *kobj,
6425+ struct kobj_attribute *attr, char *buf)
6426+{
6427+ int i, size;
6428+ struct scan_rung *rung;
6429+ char *p = buf;
6430+
6431+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6432+ rung = &uksm_scan_ladder[i];
6433+ size = sprintf(p, "%u ", rung->cover_msecs);
6434+ p += size;
6435+ }
6436+
6437+ *p++ = '\n';
6438+ *p = '\0';
6439+
6440+ return p - buf;
6441+}
6442+
6443+static ssize_t eval_intervals_store(struct kobject *kobj,
6444+ struct kobj_attribute *attr,
6445+ const char *buf, size_t count)
6446+{
6447+ int i, err;
6448+ unsigned long values[SCAN_LADDER_SIZE];
6449+ struct scan_rung *rung;
6450+ char *p, *end = NULL;
6451+
6452+ p = kzalloc(count, GFP_KERNEL);
6453+ if (!p)
6454+ return -ENOMEM;
6455+
6456+ memcpy(p, buf, count);
6457+
6458+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6459+ if (i != SCAN_LADDER_SIZE -1) {
6460+ end = strchr(p, ' ');
6461+ if (!end)
6462+ return -EINVAL;
6463+
6464+ *end = '\0';
6465+ }
6466+
6467+ err = kstrtoul(p, 10, &values[i]);
6468+ if (err)
6469+ return -EINVAL;
6470+
6471+ p = end + 1;
6472+ }
6473+
6474+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6475+ rung = &uksm_scan_ladder[i];
6476+
6477+ rung->cover_msecs = values[i];
6478+ }
6479+
6480+ return count;
6481+}
6482+UKSM_ATTR(eval_intervals);
6483+
6484+static ssize_t ema_per_page_time_show(struct kobject *kobj,
6485+ struct kobj_attribute *attr, char *buf)
6486+{
6487+ return sprintf(buf, "%lu\n", uksm_ema_page_time);
6488+}
6489+UKSM_ATTR_RO(ema_per_page_time);
6490+
6491+static ssize_t pages_shared_show(struct kobject *kobj,
6492+ struct kobj_attribute *attr, char *buf)
6493+{
6494+ return sprintf(buf, "%lu\n", uksm_pages_shared);
6495+}
6496+UKSM_ATTR_RO(pages_shared);
6497+
6498+static ssize_t pages_sharing_show(struct kobject *kobj,
6499+ struct kobj_attribute *attr, char *buf)
6500+{
6501+ return sprintf(buf, "%lu\n", uksm_pages_sharing);
6502+}
6503+UKSM_ATTR_RO(pages_sharing);
6504+
6505+static ssize_t pages_unshared_show(struct kobject *kobj,
6506+ struct kobj_attribute *attr, char *buf)
6507+{
6508+ return sprintf(buf, "%lu\n", uksm_pages_unshared);
6509+}
6510+UKSM_ATTR_RO(pages_unshared);
6511+
6512+static ssize_t full_scans_show(struct kobject *kobj,
6513+ struct kobj_attribute *attr, char *buf)
6514+{
6515+ return sprintf(buf, "%llu\n", fully_scanned_round);
6516+}
6517+UKSM_ATTR_RO(full_scans);
6518+
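+/*
+ * The lifetime scan count is reported as "<value> * 2^<base>" once it no
+ * longer fits in an unsigned long; the stored total and the current
+ * uksm_pages_scanned counter are combined at the same scale.
+ */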
6519+static ssize_t pages_scanned_show(struct kobject *kobj,
6520+ struct kobj_attribute *attr, char *buf)
6521+{
6522+ unsigned long base = 0;
6523+ u64 delta, ret;
6524+
6525+ if (pages_scanned_stored) {
6526+ base = pages_scanned_base;
6527+ ret = pages_scanned_stored;
6528+ delta = uksm_pages_scanned >> base;
6529+ if (CAN_OVERFLOW_U64(ret, delta)) {
6530+ ret >>= 1;
6531+ delta >>= 1;
6532+ base++;
6533+ ret += delta;
6534+ }
6535+ } else {
6536+ ret = uksm_pages_scanned;
6537+ }
6538+
6539+ while (ret > ULONG_MAX) {
6540+ ret >>= 1;
6541+ base++;
6542+ }
6543+
6544+ if (base)
6545+ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
6546+ else
6547+ return sprintf(buf, "%lu\n", (unsigned long)ret);
6548+}
6549+UKSM_ATTR_RO(pages_scanned);
6550+
6551+static ssize_t hash_strength_show(struct kobject *kobj,
6552+ struct kobj_attribute *attr, char *buf)
6553+{
6554+ return sprintf(buf, "%lu\n", hash_strength);
6555+}
6556+UKSM_ATTR_RO(hash_strength);
6557+
6558+static ssize_t sleep_times_show(struct kobject *kobj,
6559+ struct kobj_attribute *attr, char *buf)
6560+{
6561+ return sprintf(buf, "%llu\n", uksm_sleep_times);
6562+}
6563+UKSM_ATTR_RO(sleep_times);
6564+
6565+
6566+static struct attribute *uksm_attrs[] = {
6567+ &max_cpu_percentage_attr.attr,
6568+ &sleep_millisecs_attr.attr,
6569+ &cpu_governor_attr.attr,
6570+ &run_attr.attr,
6571+ &ema_per_page_time_attr.attr,
6572+ &pages_shared_attr.attr,
6573+ &pages_sharing_attr.attr,
6574+ &pages_unshared_attr.attr,
6575+ &full_scans_attr.attr,
6576+ &pages_scanned_attr.attr,
6577+ &hash_strength_attr.attr,
6578+ &sleep_times_attr.attr,
6579+ &thrash_threshold_attr.attr,
6580+ &abundant_threshold_attr.attr,
6581+ &cpu_ratios_attr.attr,
6582+ &eval_intervals_attr.attr,
6583+ NULL,
6584+};
6585+
6586+static struct attribute_group uksm_attr_group = {
6587+ .attrs = uksm_attrs,
6588+ .name = "uksm",
6589+};
6590+#endif /* CONFIG_SYSFS */
6591+
6592+static inline void init_scan_ladder(void)
6593+{
6594+ int i;
6595+ struct scan_rung *rung;
6596+
6597+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6598+ rung = uksm_scan_ladder + i;
6599+ slot_tree_init_root(&rung->vma_root);
6600+ }
6601+
6602+ init_performance_values();
6603+ uksm_calc_scan_pages();
6604+}
6605+
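+/*
+ * Benchmark page_hash() against pages_identical() on two pages that differ
+ * only in their last byte, and record their relative cost in memcmp_cost.
+ */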
6606+static inline int cal_positive_negative_costs(void)
6607+{
6608+ struct page *p1, *p2;
6609+ unsigned char *addr1, *addr2;
6610+ unsigned long i, time_start, hash_cost;
6611+ unsigned long loopnum = 0;
6612+
6613+	/* IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
6614+ volatile u32 hash;
6615+ volatile int ret;
6616+
6617+ p1 = alloc_page(GFP_KERNEL);
6618+ if (!p1)
6619+ return -ENOMEM;
6620+
6621+ p2 = alloc_page(GFP_KERNEL);
6622+ if (!p2)
6623+ return -ENOMEM;
6624+
6625+ addr1 = kmap_atomic(p1);
6626+ addr2 = kmap_atomic(p2);
6627+ memset(addr1, prandom_u32(), PAGE_SIZE);
6628+ memcpy(addr2, addr1, PAGE_SIZE);
6629+
6630+	/* make sure that the two pages differ in the last byte */
6631+ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
6632+ kunmap_atomic(addr2);
6633+ kunmap_atomic(addr1);
6634+
6635+ time_start = jiffies;
6636+ while (jiffies - time_start < 100) {
6637+ for (i = 0; i < 100; i++)
6638+ hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
6639+ loopnum += 100;
6640+ }
6641+ hash_cost = (jiffies - time_start);
6642+
6643+ time_start = jiffies;
6644+ for (i = 0; i < loopnum; i++)
6645+ ret = pages_identical(p1, p2);
6646+ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
6647+ memcmp_cost /= hash_cost;
6648+ printk(KERN_INFO "UKSM: relative memcmp_cost = %lu "
6649+ "hash=%u cmp_ret=%d.\n",
6650+ memcmp_cost, hash, ret);
6651+
6652+ __free_page(p1);
6653+ __free_page(p2);
6654+ return 0;
6655+}
6656+
6657+static int init_zeropage_hash_table(void)
6658+{
6659+ struct page *page;
6660+ char *addr;
6661+ int i;
6662+
6663+ page = alloc_page(GFP_KERNEL);
6664+ if (!page)
6665+ return -ENOMEM;
6666+
6667+ addr = kmap_atomic(page);
6668+ memset(addr, 0, PAGE_SIZE);
6669+ kunmap_atomic(addr);
6670+
6671+ zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
6672+ GFP_KERNEL);
6673+ if (!zero_hash_table)
6674+ return -ENOMEM;
6675+
6676+ for (i = 0; i < HASH_STRENGTH_MAX; i++)
6677+ zero_hash_table[i] = page_hash(page, i, 0);
6678+
6679+ __free_page(page);
6680+
6681+ return 0;
6682+}
6683+
6684+static inline int init_random_sampling(void)
6685+{
6686+ unsigned long i;
6687+ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
6688+ if (!random_nums)
6689+ return -ENOMEM;
6690+
6691+ for (i = 0; i < HASH_STRENGTH_FULL; i++)
6692+ random_nums[i] = i;
6693+
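+	/* Fisher-Yates shuffle: randomly permute the sampling offsets. */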
6694+ for (i = 0; i < HASH_STRENGTH_FULL; i++) {
6695+ unsigned long rand_range, swap_index, tmp;
6696+
6697+ rand_range = HASH_STRENGTH_FULL - i;
6698+ swap_index = i + prandom_u32() % rand_range;
6699+ tmp = random_nums[i];
6700+ random_nums[i] = random_nums[swap_index];
6701+ random_nums[swap_index] = tmp;
6702+ }
6703+
6704+ rshash_state.state = RSHASH_NEW;
6705+ rshash_state.below_count = 0;
6706+ rshash_state.lookup_window_index = 0;
6707+
6708+ return cal_positive_negative_costs();
6709+}
6710+
6711+static int __init uksm_slab_init(void)
6712+{
6713+ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
6714+ if (!rmap_item_cache)
6715+ goto out;
6716+
6717+ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
6718+ if (!stable_node_cache)
6719+ goto out_free1;
6720+
6721+ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
6722+ if (!node_vma_cache)
6723+ goto out_free2;
6724+
6725+ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
6726+ if (!vma_slot_cache)
6727+ goto out_free3;
6728+
6729+ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
6730+ if (!tree_node_cache)
6731+ goto out_free4;
6732+
6733+ return 0;
6734+
6735+out_free4:
6736+ kmem_cache_destroy(vma_slot_cache);
6737+out_free3:
6738+ kmem_cache_destroy(node_vma_cache);
6739+out_free2:
6740+ kmem_cache_destroy(stable_node_cache);
6741+out_free1:
6742+ kmem_cache_destroy(rmap_item_cache);
6743+out:
6744+ return -ENOMEM;
6745+}
6746+
6747+static void __init uksm_slab_free(void)
6748+{
6749+ kmem_cache_destroy(stable_node_cache);
6750+ kmem_cache_destroy(rmap_item_cache);
6751+ kmem_cache_destroy(node_vma_cache);
6752+ kmem_cache_destroy(vma_slot_cache);
6753+ kmem_cache_destroy(tree_node_cache);
6754+}
6755+
6756+/* Common interface with ksm, but the behavior differs. */
6757+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
6758+ unsigned long end, int advice, unsigned long *vm_flags)
6759+{
6760+ int err;
6761+
6762+ switch (advice) {
6763+ case MADV_MERGEABLE:
6764+ return 0; /* just ignore the advice */
6765+
6766+ case MADV_UNMERGEABLE:
6767+ if (!(*vm_flags & VM_MERGEABLE))
6768+ return 0; /* just ignore the advice */
6769+
6770+ if (vma->anon_vma) {
6771+ err = unmerge_uksm_pages(vma, start, end);
6772+ if (err)
6773+ return err;
6774+ }
6775+
6776+ uksm_remove_vma(vma);
6777+ *vm_flags &= ~VM_MERGEABLE;
6778+ break;
6779+ }
6780+
6781+ return 0;
6782+}
6783+
6784+/* Common interface with ksm, identical behavior. */
6785+struct page *ksm_might_need_to_copy(struct page *page,
6786+ struct vm_area_struct *vma, unsigned long address)
6787+{
6788+ struct anon_vma *anon_vma = page_anon_vma(page);
6789+ struct page *new_page;
6790+
6791+ if (PageKsm(page)) {
6792+ if (page_stable_node(page))
6793+ return page; /* no need to copy it */
6794+ } else if (!anon_vma) {
6795+ return page; /* no need to copy it */
6796+ } else if (anon_vma->root == vma->anon_vma->root &&
6797+ page->index == linear_page_index(vma, address)) {
6798+ return page; /* still no need to copy it */
6799+ }
6800+ if (!PageUptodate(page))
6801+ return page; /* let do_swap_page report the error */
6802+
6803+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
6804+ if (new_page) {
6805+ copy_user_highpage(new_page, page, address, vma);
6806+
6807+ SetPageDirty(new_page);
6808+ __SetPageUptodate(new_page);
6809+ __set_page_locked(new_page);
6810+ }
6811+
6812+ return new_page;
6813+}
6814+
6815+static int __init uksm_init(void)
6816+{
6817+ struct task_struct *uksm_thread;
6818+ int err;
6819+
6820+ uksm_sleep_jiffies = msecs_to_jiffies(100);
6821+ uksm_sleep_saved = uksm_sleep_jiffies;
6822+
6823+ slot_tree_init();
6824+ init_scan_ladder();
6825+
6826+
6827+ err = init_random_sampling();
6828+ if (err)
6829+ goto out_free2;
6830+
6831+ err = uksm_slab_init();
6832+ if (err)
6833+ goto out_free1;
6834+
6835+ err = init_zeropage_hash_table();
6836+ if (err)
6837+ goto out_free0;
6838+
6839+ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
6840+ if (IS_ERR(uksm_thread)) {
6841+ printk(KERN_ERR "uksm: creating kthread failed\n");
6842+ err = PTR_ERR(uksm_thread);
6843+ goto out_free;
6844+ }
6845+
6846+#ifdef CONFIG_SYSFS
6847+ err = sysfs_create_group(mm_kobj, &uksm_attr_group);
6848+ if (err) {
6849+ printk(KERN_ERR "uksm: register sysfs failed\n");
6850+ kthread_stop(uksm_thread);
6851+ goto out_free;
6852+ }
6853+#else
6854+ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */
6855+
6856+#endif /* CONFIG_SYSFS */
6857+
6858+#ifdef CONFIG_MEMORY_HOTREMOVE
6859+ /*
6860+ * Choose a high priority since the callback takes uksm_thread_mutex:
6861+ * later callbacks could only be taking locks which nest within that.
6862+ */
6863+ hotplug_memory_notifier(uksm_memory_callback, 100);
6864+#endif
6865+ return 0;
6866+
6867+out_free:
6868+ kfree(zero_hash_table);
6869+out_free0:
6870+ uksm_slab_free();
6871+out_free1:
6872+ kfree(random_nums);
6873+out_free2:
6874+ kfree(uksm_scan_ladder);
6875+ return err;
6876+}
6877+
6878+#ifdef MODULE
6879+subsys_initcall(uksm_init);
6880+#else
6881+late_initcall(uksm_init);
6882+#endif
6883+
6884diff --git a/mm/vmstat.c b/mm/vmstat.c
6885index 1b12d39..24b803f 100644
6886--- a/mm/vmstat.c
6887+++ b/mm/vmstat.c
6888@@ -795,6 +795,9 @@ const char * const vmstat_text[] = {
6889 "nr_anon_transparent_hugepages",
6890 "nr_free_cma",
6891
6892+#ifdef CONFIG_UKSM
6893+ "nr_uksm_zero_pages",
6894+#endif
6895 /* enum writeback_stat_item counters */
6896 "nr_dirty_threshold",
6897 "nr_dirty_background_threshold",