1 | diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX |
2 | index 081c497..75bde3d 100644 | |
3 | --- a/Documentation/vm/00-INDEX | |
4 | +++ b/Documentation/vm/00-INDEX | |
5 | @@ -16,6 +16,8 @@ hwpoison.txt | |
6 | - explains what hwpoison is | |
7 | ksm.txt | |
8 | - how to use the Kernel Samepage Merging feature. | |
9 | +uksm.txt | |
10 | +	- introduction to the Ultra KSM feature. | |
11 | numa | |
12 | - information about NUMA specific code in the Linux vm. | |
13 | numa_memory_policy.txt | |
14 | diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt | |
15 | new file mode 100644 | |
16 | index 0000000..08bd645 | |
17 | --- /dev/null | |
18 | +++ b/Documentation/vm/uksm.txt | |
19 | @@ -0,0 +1,58 @@ | |
20 | +The Ultra Kernel Samepage Merging feature | |
21 | +---------------------------------------------- | |
22 | +/* | |
23 | + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia | |
24 | + * | |
25 | + * This is an improvement upon KSM. Some basic data structures and routines | |
26 | + * are borrowed from ksm.c . | |
27 | + * | |
28 | + * Its new features: | |
29 | + * 1. Full system scan: | |
30 | + * It automatically scans all user processes' anonymous VMAs. Kernel-user | |
31 | + * interaction to submit a memory area to KSM is no longer needed. | |
32 | + * | |
33 | + * 2. Rich area detection: | |
34 | + * It automatically detects rich areas containing abundant duplicated | |
35 | + * pages. Rich areas are given a full scan speed. Poor areas are | |
36 | + * sampled at a reasonable speed with very low CPU consumption. | |
37 | + * | |
38 | + * 3. Ultra per-page scan speed improvement: | |
39 | + * A new hash algorithm is proposed. As a result, on a machine with a | |
40 | + * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it | |
41 | + * can scan memory areas that do not contain duplicated pages at a speed of | |
42 | + * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of | |
43 | + * 477MB/sec ~ 923MB/sec. | |
44 | + * | |
45 | + * 4. Thrashing area avoidance: | |
46 | + * A thrashing area (a VMA that has frequent KSM page break-outs) can be | |
47 | + * filtered out. My benchmark shows it's more efficient than KSM's per-page | |
48 | + * hash-value-based volatile page detection. | |
49 | + * | |
50 | + * | |
51 | + * 5. Misc changes upon KSM: | |
52 | + * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page | |
53 | + * comparison. It's much faster than the default C version on x86. | |
54 | + * * rmap_item now has a struct page * member to loosely cache an | |
55 | + * address-->page mapping, which avoids many time-costly calls to | |
56 | + * follow_page(). | |
57 | + * * The VMA creation/exit procedures are hooked to let Ultra KSM know. | |
58 | + * * try_to_merge_two_pages() can now revert a pte if it fails. No break_ | |
59 | + * ksm is needed for this case. | |
60 | + * | |
61 | + * 6. Full zero page consideration (contributed by Figo Zhang) | |
62 | + * Now uksmd considers full zero pages as special pages and merges them into a | |
63 | + * special unswappable uksm zero page. | |
64 | + */ | |
65 | + | |
66 | +ChangeLog: | |
67 | + | |
68 | +2012-05-05 The creation of this Doc | |
69 | +2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up. | |
70 | +2012-05-28 UKSM 0.1.1.2 bug fix release | |
71 | +2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2 | |
72 | +2012-07-02 UKSM 0.1.2-beta2 | |
73 | +2012-07-10 UKSM 0.1.2-beta3 | |
74 | +2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization. | |
75 | +2012-10-13 UKSM 0.1.2.1 Bug fixes. | |
76 | +2012-12-31 UKSM 0.1.2.2 Minor bug fixes | |
77 | +2014-07-02 UKSM 0.1.2.3 Fix a "__this_cpu_read() in preemptible bug" | |
78 | diff --git a/fs/exec.c b/fs/exec.c | |
79 | index 7302b75..84b0df5 100644 | |
80 | --- a/fs/exec.c | |
81 | +++ b/fs/exec.c | |
82 | @@ -19,7 +19,7 @@ | |
83 | * current->executable is only used by the procfs. This allows a dispatch | |
84 | * table to check for several different types of binary formats. We keep | |
85 | * trying until we recognize the file or we run out of supported binary | |
86 | - * formats. | |
87 | + * formats. | |
88 | */ | |
89 | ||
90 | #include <linux/slab.h> | |
91 | @@ -56,6 +56,7 @@ | |
92 | #include <linux/pipe_fs_i.h> | |
93 | #include <linux/oom.h> | |
94 | #include <linux/compat.h> | |
95 | +#include <linux/ksm.h> | |
96 | ||
97 | #include <asm/uaccess.h> | |
98 | #include <asm/mmu_context.h> | |
99 | @@ -1128,6 +1129,7 @@ void setup_new_exec(struct linux_binprm * bprm) | |
100 | /* An exec changes our domain. We are no longer part of the thread | |
101 | group */ | |
102 | current->self_exec_id++; | |
103 | + | |
104 | flush_signal_handlers(current, 0); | |
105 | do_close_on_exec(current->files); | |
106 | } | |
107 | diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c | |
108 | index aa1eee0..5a9149c 100644 | |
109 | --- a/fs/proc/meminfo.c | |
110 | +++ b/fs/proc/meminfo.c | |
111 | @@ -121,6 +121,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |
112 | "SUnreclaim: %8lu kB\n" | |
113 | "KernelStack: %8lu kB\n" | |
114 | "PageTables: %8lu kB\n" | |
115 | +#ifdef CONFIG_UKSM | |
116 | + "KsmZeroPages: %8lu kB\n" | |
117 | +#endif | |
118 | #ifdef CONFIG_QUICKLIST | |
119 | "Quicklists: %8lu kB\n" | |
120 | #endif | |
121 | @@ -175,6 +178,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |
122 | K(global_page_state(NR_SLAB_UNRECLAIMABLE)), | |
123 | global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, | |
124 | K(global_page_state(NR_PAGETABLE)), | |
125 | +#ifdef CONFIG_UKSM | |
126 | + K(global_page_state(NR_UKSM_ZERO_PAGES)), | |
127 | +#endif | |
128 | #ifdef CONFIG_QUICKLIST | |
129 | K(quicklist_total_size()), | |
130 | #endif | |
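With CONFIG_UKSM enabled, the hunk above adds a KsmZeroPages field to /proc/meminfo. A minimal userspace sketch for reading that counter (illustrative only, not part of the patch):

```c
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;

	/* Prints e.g. "KsmZeroPages:     1234 kB" on a CONFIG_UKSM kernel. */
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "KsmZeroPages:", 13))
			fputs(line, stdout);

	fclose(f);
	return 0;
}
```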
131 | diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h | |
132 | index 752e30d..1e7c826 100644 | |
133 | --- a/include/asm-generic/pgtable.h | |
134 | +++ b/include/asm-generic/pgtable.h | |
135 | @@ -537,12 +537,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, | |
136 | unsigned long size); | |
137 | #endif | |
138 | ||
139 | +#ifdef CONFIG_UKSM | |
140 | +static inline int is_uksm_zero_pfn(unsigned long pfn) | |
141 | +{ | |
142 | + extern unsigned long uksm_zero_pfn; | |
143 | + return pfn == uksm_zero_pfn; | |
144 | +} | |
145 | +#else | |
146 | +static inline int is_uksm_zero_pfn(unsigned long pfn) | |
147 | +{ | |
148 | + return 0; | |
149 | +} | |
150 | +#endif | |
151 | + | |
152 | #ifdef __HAVE_COLOR_ZERO_PAGE | |
153 | static inline int is_zero_pfn(unsigned long pfn) | |
154 | { | |
155 | extern unsigned long zero_pfn; | |
156 | unsigned long offset_from_zero_pfn = pfn - zero_pfn; | |
157 | - return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); | |
158 | + return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); | |
159 | } | |
160 | ||
161 | #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) | |
162 | @@ -551,7 +564,7 @@ static inline int is_zero_pfn(unsigned long pfn) | |
163 | static inline int is_zero_pfn(unsigned long pfn) | |
164 | { | |
165 | extern unsigned long zero_pfn; | |
166 | - return pfn == zero_pfn; | |
167 | + return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); | |
168 | } | |
169 | ||
170 | static inline unsigned long my_zero_pfn(unsigned long addr) | |
171 | diff --git a/include/linux/ksm.h b/include/linux/ksm.h | |
172 | index 3be6bb1..51557d1 100644 | |
173 | --- a/include/linux/ksm.h | |
174 | +++ b/include/linux/ksm.h | |
175 | @@ -19,21 +19,6 @@ struct mem_cgroup; | |
176 | #ifdef CONFIG_KSM | |
177 | int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |
178 | unsigned long end, int advice, unsigned long *vm_flags); | |
179 | -int __ksm_enter(struct mm_struct *mm); | |
180 | -void __ksm_exit(struct mm_struct *mm); | |
181 | - | |
182 | -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) | |
183 | -{ | |
184 | - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) | |
185 | - return __ksm_enter(mm); | |
186 | - return 0; | |
187 | -} | |
188 | - | |
189 | -static inline void ksm_exit(struct mm_struct *mm) | |
190 | -{ | |
191 | - if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) | |
192 | - __ksm_exit(mm); | |
193 | -} | |
194 | ||
195 | /* | |
196 | * A KSM page is one of those write-protected "shared pages" or "merged pages" | |
197 | @@ -76,6 +61,33 @@ struct page *ksm_might_need_to_copy(struct page *page, | |
198 | int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); | |
199 | void ksm_migrate_page(struct page *newpage, struct page *oldpage); | |
200 | ||
201 | +#ifdef CONFIG_KSM_LEGACY | |
202 | +int __ksm_enter(struct mm_struct *mm); | |
203 | +void __ksm_exit(struct mm_struct *mm); | |
204 | +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) | |
205 | +{ | |
206 | + if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) | |
207 | + return __ksm_enter(mm); | |
208 | + return 0; | |
209 | +} | |
210 | + | |
211 | +static inline void ksm_exit(struct mm_struct *mm) | |
212 | +{ | |
213 | + if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) | |
214 | + __ksm_exit(mm); | |
215 | +} | |
216 | + | |
217 | +#elif defined(CONFIG_UKSM) | |
218 | +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) | |
219 | +{ | |
220 | + return 0; | |
221 | +} | |
222 | + | |
223 | +static inline void ksm_exit(struct mm_struct *mm) | |
224 | +{ | |
225 | +} | |
226 | +#endif /* !CONFIG_UKSM */ | |
227 | + | |
228 | #else /* !CONFIG_KSM */ | |
229 | ||
230 | static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) | |
231 | @@ -123,4 +135,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) | |
232 | #endif /* CONFIG_MMU */ | |
233 | #endif /* !CONFIG_KSM */ | |
234 | ||
235 | +#include <linux/uksm.h> | |
236 | + | |
237 | #endif /* __LINUX_KSM_H */ | |
238 | diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h | |
239 | index 6e0b286..ebd6243 100644 | |
240 | --- a/include/linux/mm_types.h | |
241 | +++ b/include/linux/mm_types.h | |
242 | @@ -308,6 +308,9 @@ struct vm_area_struct { | |
243 | #ifdef CONFIG_NUMA | |
244 | struct mempolicy *vm_policy; /* NUMA policy for the VMA */ | |
245 | #endif | |
246 | +#ifdef CONFIG_UKSM | |
247 | + struct vma_slot *uksm_vma_slot; | |
248 | +#endif | |
249 | }; | |
250 | ||
251 | struct core_thread { | |
252 | diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h | |
253 | index ffe66e3..355f0e9 100644 | |
254 | --- a/include/linux/mmzone.h | |
255 | +++ b/include/linux/mmzone.h | |
256 | @@ -157,6 +157,9 @@ enum zone_stat_item { | |
257 | WORKINGSET_NODERECLAIM, | |
258 | NR_ANON_TRANSPARENT_HUGEPAGES, | |
259 | NR_FREE_CMA_PAGES, | |
260 | +#ifdef CONFIG_UKSM | |
261 | + NR_UKSM_ZERO_PAGES, | |
262 | +#endif | |
263 | NR_VM_ZONE_STAT_ITEMS }; | |
264 | ||
265 | /* | |
266 | @@ -865,7 +868,7 @@ static inline int is_highmem_idx(enum zone_type idx) | |
267 | } | |
268 | ||
269 | /** | |
270 | - * is_highmem - helper function to quickly check if a struct zone is a | |
271 | + * is_highmem - helper function to quickly check if a struct zone is a | |
272 | * highmem zone or not. This is an attempt to keep references | |
273 | * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. | |
274 | * @zone - pointer to struct zone variable | |
275 | diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h | |
276 | new file mode 100644 | |
277 | index 0000000..6780fdb | |
278 | --- /dev/null | |
279 | +++ b/include/linux/sradix-tree.h | |
280 | @@ -0,0 +1,77 @@ | |
281 | +#ifndef _LINUX_SRADIX_TREE_H | |
282 | +#define _LINUX_SRADIX_TREE_H | |
283 | + | |
284 | + | |
285 | +#define INIT_SRADIX_TREE(root, mask) \ | |
286 | +do { \ | |
287 | + (root)->height = 0; \ | |
288 | + (root)->gfp_mask = (mask); \ | |
289 | + (root)->rnode = NULL; \ | |
290 | +} while (0) | |
291 | + | |
292 | +#define ULONG_BITS (sizeof(unsigned long) * 8) | |
293 | +#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) | |
294 | +//#define SRADIX_TREE_MAP_SHIFT 6 | |
295 | +//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) | |
296 | +//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) | |
297 | + | |
298 | +struct sradix_tree_node { | |
299 | + unsigned int height; /* Height from the bottom */ | |
300 | + unsigned int count; | |
301 | + unsigned int fulls; /* Number of full sublevel trees */ | |
302 | + struct sradix_tree_node *parent; | |
303 | + void *stores[0]; | |
304 | +}; | |
305 | + | |
306 | +/* A simple radix tree implementation */ | |
307 | +struct sradix_tree_root { | |
308 | + unsigned int height; | |
309 | + struct sradix_tree_node *rnode; | |
310 | + | |
311 | +	/* Where we last found available empty stores in its sublevels */ | |
312 | + struct sradix_tree_node *enter_node; | |
313 | + unsigned int shift; | |
314 | + unsigned int stores_size; | |
315 | + unsigned int mask; | |
316 | + unsigned long min; /* The first hole index */ | |
317 | + unsigned long num; | |
318 | + //unsigned long *height_to_maxindex; | |
319 | + | |
320 | + /* How the node is allocated and freed. */ | |
321 | + struct sradix_tree_node *(*alloc)(void); | |
322 | + void (*free)(struct sradix_tree_node *node); | |
323 | + | |
324 | + /* When a new node is added and removed */ | |
325 | + void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); | |
326 | + void (*assign)(struct sradix_tree_node *node, unsigned index, void *item); | |
327 | + void (*rm)(struct sradix_tree_node *node, unsigned offset); | |
328 | +}; | |
329 | + | |
330 | +struct sradix_tree_path { | |
331 | + struct sradix_tree_node *node; | |
332 | + int offset; | |
333 | +}; | |
334 | + | |
335 | +static inline | |
336 | +void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) | |
337 | +{ | |
338 | + root->height = 0; | |
339 | + root->rnode = NULL; | |
340 | + root->shift = shift; | |
341 | + root->stores_size = 1UL << shift; | |
342 | + root->mask = root->stores_size - 1; | |
343 | +} | |
344 | + | |
345 | + | |
346 | +extern void *sradix_tree_next(struct sradix_tree_root *root, | |
347 | + struct sradix_tree_node *node, unsigned long index, | |
348 | + int (*iter)(void *, unsigned long)); | |
349 | + | |
350 | +extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); | |
351 | + | |
352 | +extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, | |
353 | + struct sradix_tree_node *node, unsigned long index); | |
354 | + | |
355 | +extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); | |
356 | + | |
357 | +#endif /* _LINUX_SRADIX_TREE_H */ | |
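A minimal sketch of how a client of this header wires up a sradix tree, mirroring the slot tree set up later in mm/uksm.c. The wrapper node type, shift value and function names below are illustrative, not part of the patch:

```c
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/sradix-tree.h>

#define EXAMPLE_SHIFT	4		/* 1 << 4 = 16 stores per node */

/* Wrapper node: the trailing array backs snode.stores[]. */
struct example_node {
	struct sradix_tree_node snode;
	void *stores[1UL << EXAMPLE_SHIFT];
};

static struct sradix_tree_node *example_alloc(void)
{
	struct example_node *n = kzalloc(sizeof(*n), GFP_KERNEL);

	return n ? &n->snode : NULL;
}

static void example_free(struct sradix_tree_node *node)
{
	kfree(container_of(node, struct example_node, snode));
}

static struct sradix_tree_root example_root;
static int example_payload;

static int example_use(void)
{
	void *item = &example_payload;

	init_sradix_tree_root(&example_root, EXAMPLE_SHIFT);
	example_root.alloc = example_alloc;
	example_root.free = example_free;
	/* extend/assign/rm callbacks are optional and left NULL here. */

	if (sradix_tree_enter(&example_root, &item, 1))
		return -ENOMEM;

	/* The item went into the first free slot, i.e. index 0. */
	return sradix_tree_lookup(&example_root, 0) == item ? 0 : -EINVAL;
}
```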
358 | diff --git a/include/linux/uksm.h b/include/linux/uksm.h | |
359 | new file mode 100644 | |
360 | index 0000000..a644bca | |
361 | --- /dev/null | |
362 | +++ b/include/linux/uksm.h | |
363 | @@ -0,0 +1,146 @@ | |
364 | +#ifndef __LINUX_UKSM_H | |
365 | +#define __LINUX_UKSM_H | |
366 | +/* | |
367 | + * Memory merging support. | |
368 | + * | |
369 | + * This code enables dynamic sharing of identical pages found in different | |
370 | + * memory areas, even if they are not shared by fork(). | |
371 | + */ | |
372 | + | |
373 | +/* if !CONFIG_UKSM this file should not be compiled at all. */ | |
374 | +#ifdef CONFIG_UKSM | |
375 | + | |
376 | +#include <linux/bitops.h> | |
377 | +#include <linux/mm.h> | |
378 | +#include <linux/pagemap.h> | |
379 | +#include <linux/rmap.h> | |
380 | +#include <linux/sched.h> | |
381 | + | |
382 | +extern unsigned long zero_pfn __read_mostly; | |
383 | +extern unsigned long uksm_zero_pfn __read_mostly; | |
384 | +extern struct page *empty_uksm_zero_page; | |
385 | + | |
386 | +/* must be called before the vma is linked to its mm */ | |
387 | +extern void uksm_vma_add_new(struct vm_area_struct *vma); | |
388 | +extern void uksm_remove_vma(struct vm_area_struct *vma); | |
389 | + | |
390 | +#define UKSM_SLOT_NEED_SORT (1 << 0) | |
391 | +#define UKSM_SLOT_NEED_RERAND (1 << 1) | |
392 | +#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ | |
393 | +#define UKSM_SLOT_FUL_SCANNED (1 << 3) | |
394 | +#define UKSM_SLOT_IN_UKSM (1 << 4) | |
395 | + | |
396 | +struct vma_slot { | |
397 | + struct sradix_tree_node *snode; | |
398 | + unsigned long sindex; | |
399 | + | |
400 | + struct list_head slot_list; | |
401 | + unsigned long fully_scanned_round; | |
402 | + unsigned long dedup_num; | |
403 | + unsigned long pages_scanned; | |
404 | + unsigned long last_scanned; | |
405 | + unsigned long pages_to_scan; | |
406 | + struct scan_rung *rung; | |
407 | + struct page **rmap_list_pool; | |
408 | + unsigned int *pool_counts; | |
409 | + unsigned long pool_size; | |
410 | + struct vm_area_struct *vma; | |
411 | + struct mm_struct *mm; | |
412 | + unsigned long ctime_j; | |
413 | + unsigned long pages; | |
414 | + unsigned long flags; | |
415 | + unsigned long pages_cowed; /* pages cowed this round */ | |
416 | + unsigned long pages_merged; /* pages merged this round */ | |
417 | + unsigned long pages_bemerged; | |
418 | + | |
419 | + /* when it has page merged in this eval round */ | |
420 | + struct list_head dedup_list; | |
421 | +}; | |
422 | + | |
423 | +static inline void uksm_unmap_zero_page(pte_t pte) | |
424 | +{ | |
425 | + if (pte_pfn(pte) == uksm_zero_pfn) | |
426 | + __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); | |
427 | +} | |
428 | + | |
429 | +static inline void uksm_map_zero_page(pte_t pte) | |
430 | +{ | |
431 | + if (pte_pfn(pte) == uksm_zero_pfn) | |
432 | + __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); | |
433 | +} | |
434 | + | |
435 | +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) | |
436 | +{ | |
437 | + if (vma->uksm_vma_slot && PageKsm(page)) | |
438 | + vma->uksm_vma_slot->pages_cowed++; | |
439 | +} | |
440 | + | |
441 | +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) | |
442 | +{ | |
443 | + if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) | |
444 | + vma->uksm_vma_slot->pages_cowed++; | |
445 | +} | |
446 | + | |
447 | +static inline int uksm_flags_can_scan(unsigned long vm_flags) | |
448 | +{ | |
449 | +#ifndef VM_SAO | |
450 | +#define VM_SAO 0 | |
451 | +#endif | |
452 | + return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | | |
453 | + VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP | | |
454 | + VM_SHARED | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN | VM_SAO)); | |
455 | +} | |
456 | + | |
457 | +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) | |
458 | +{ | |
459 | + if (uksm_flags_can_scan(*vm_flags_p)) | |
460 | + *vm_flags_p |= VM_MERGEABLE; | |
461 | +} | |
462 | + | |
463 | +/* | |
464 | + * Just a BUG_ON wrapper for places where the uksm zero page must not appear. | |
465 | + * TODO: it will be removed once the uksm zero page patch is stable enough. | |
466 | + */ | |
467 | +static inline void uksm_bugon_zeropage(pte_t pte) | |
468 | +{ | |
469 | + BUG_ON(pte_pfn(pte) == uksm_zero_pfn); | |
470 | +} | |
471 | +#else | |
472 | +static inline void uksm_vma_add_new(struct vm_area_struct *vma) | |
473 | +{ | |
474 | +} | |
475 | + | |
476 | +static inline void uksm_remove_vma(struct vm_area_struct *vma) | |
477 | +{ | |
478 | +} | |
479 | + | |
480 | +static inline void uksm_unmap_zero_page(pte_t pte) | |
481 | +{ | |
482 | +} | |
483 | + | |
484 | +static inline void uksm_map_zero_page(pte_t pte) | |
485 | +{ | |
486 | +} | |
487 | + | |
488 | +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) | |
489 | +{ | |
490 | +} | |
491 | + | |
492 | +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) | |
493 | +{ | |
494 | +} | |
495 | + | |
496 | +static inline int uksm_flags_can_scan(unsigned long vm_flags) | |
497 | +{ | |
498 | + return 0; | |
499 | +} | |
500 | + | |
501 | +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) | |
502 | +{ | |
503 | +} | |
504 | + | |
505 | +static inline void uksm_bugon_zeropage(pte_t pte) | |
506 | +{ | |
507 | +} | |
508 | +#endif /* !CONFIG_UKSM */ | |
509 | +#endif /* __LINUX_UKSM_H */ | |
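A quick illustration of the filter implemented by uksm_flags_can_scan()/uksm_vm_flags_mod() above; the flag combinations are illustrative, not part of the patch:

```c
#include <linux/mm.h>
#include <linux/uksm.h>

static void uksm_flags_example(void)
{
	/* Private anonymous-style mapping: none of the excluded flags are
	 * set, so uksm_vm_flags_mod() adds VM_MERGEABLE. */
	unsigned long anon_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;

	/* Shared mapping: VM_SHARED|VM_MAYSHARE fail uksm_flags_can_scan(),
	 * so the flags are left untouched. */
	unsigned long shared_flags = VM_READ | VM_SHARED | VM_MAYSHARE;

	uksm_vm_flags_mod(&anon_flags);		/* VM_MERGEABLE is now set */
	uksm_vm_flags_mod(&shared_flags);	/* unchanged */
}
```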
510 | diff --git a/kernel/fork.c b/kernel/fork.c | |
511 | index 9b7d746..73ad90d 100644 | |
512 | --- a/kernel/fork.c | |
513 | +++ b/kernel/fork.c | |
514 | @@ -412,7 +412,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |
515 | goto fail_nomem; | |
516 | charge = len; | |
517 | } | |
518 | - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | |
519 | + tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | |
520 | if (!tmp) | |
521 | goto fail_nomem; | |
522 | *tmp = *mpnt; | |
523 | @@ -467,7 +467,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |
524 | __vma_link_rb(mm, tmp, rb_link, rb_parent); | |
525 | rb_link = &tmp->vm_rb.rb_right; | |
526 | rb_parent = &tmp->vm_rb; | |
527 | - | |
528 | + uksm_vma_add_new(tmp); | |
529 | mm->map_count++; | |
530 | retval = copy_page_range(mm, oldmm, mpnt); | |
531 | ||
532 | diff --git a/lib/Makefile b/lib/Makefile | |
533 | index 0211d2b..426536f 100644 | |
534 | --- a/lib/Makefile | |
535 | +++ b/lib/Makefile | |
536 | @@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) | |
537 | endif | |
538 | ||
539 | lib-y := ctype.o string.o vsprintf.o cmdline.o \ | |
540 | - rbtree.o radix-tree.o dump_stack.o timerqueue.o\ | |
541 | + rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\ | |
542 | idr.o int_sqrt.o extable.o \ | |
543 | sha1.o md5.o irq_regs.o argv_split.o \ | |
544 | proportions.o flex_proportions.o ratelimit.o show_mem.o \ | |
545 | diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c | |
546 | new file mode 100644 | |
547 | index 0000000..8d06329 | |
548 | --- /dev/null | |
549 | +++ b/lib/sradix-tree.c | |
550 | @@ -0,0 +1,476 @@ | |
551 | +#include <linux/errno.h> | |
552 | +#include <linux/mm.h> | |
553 | +#include <linux/mman.h> | |
554 | +#include <linux/spinlock.h> | |
555 | +#include <linux/slab.h> | |
556 | +#include <linux/gcd.h> | |
557 | +#include <linux/sradix-tree.h> | |
558 | + | |
559 | +static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) | |
560 | +{ | |
561 | + return node->fulls == root->stores_size || | |
562 | + (node->height == 1 && node->count == root->stores_size); | |
563 | +} | |
564 | + | |
565 | +/* | |
566 | + * Extend a sradix tree so it can store key @index. | |
567 | + */ | |
568 | +static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) | |
569 | +{ | |
570 | + struct sradix_tree_node *node; | |
571 | + unsigned int height; | |
572 | + | |
573 | + if (unlikely(root->rnode == NULL)) { | |
574 | + if (!(node = root->alloc())) | |
575 | + return -ENOMEM; | |
576 | + | |
577 | + node->height = 1; | |
578 | + root->rnode = node; | |
579 | + root->height = 1; | |
580 | + } | |
581 | + | |
582 | + /* Figure out what the height should be. */ | |
583 | + height = root->height; | |
584 | + index >>= root->shift * height; | |
585 | + | |
586 | + while (index) { | |
587 | + index >>= root->shift; | |
588 | + height++; | |
589 | + } | |
590 | + | |
591 | + while (height > root->height) { | |
592 | + unsigned int newheight; | |
593 | + if (!(node = root->alloc())) | |
594 | + return -ENOMEM; | |
595 | + | |
596 | + /* Increase the height. */ | |
597 | + node->stores[0] = root->rnode; | |
598 | + root->rnode->parent = node; | |
599 | + if (root->extend) | |
600 | + root->extend(node, root->rnode); | |
601 | + | |
602 | + newheight = root->height + 1; | |
603 | + node->height = newheight; | |
604 | + node->count = 1; | |
605 | + if (sradix_node_full(root, root->rnode)) | |
606 | + node->fulls = 1; | |
607 | + | |
608 | + root->rnode = node; | |
609 | + root->height = newheight; | |
610 | + } | |
611 | + | |
612 | + return 0; | |
613 | +} | |
614 | + | |
615 | +/* | |
616 | + * Search the next item from the current node, that is not NULL | |
617 | + * and can satisfy root->iter(). | |
618 | + */ | |
619 | +void *sradix_tree_next(struct sradix_tree_root *root, | |
620 | + struct sradix_tree_node *node, unsigned long index, | |
621 | + int (*iter)(void *item, unsigned long height)) | |
622 | +{ | |
623 | + unsigned long offset; | |
624 | + void *item; | |
625 | + | |
626 | + if (unlikely(node == NULL)) { | |
627 | + node = root->rnode; | |
628 | + for (offset = 0; offset < root->stores_size; offset++) { | |
629 | + item = node->stores[offset]; | |
630 | + if (item && (!iter || iter(item, node->height))) | |
631 | + break; | |
632 | + } | |
633 | + | |
634 | + if (unlikely(offset >= root->stores_size)) | |
635 | + return NULL; | |
636 | + | |
637 | + if (node->height == 1) | |
638 | + return item; | |
639 | + else | |
640 | + goto go_down; | |
641 | + } | |
642 | + | |
643 | + while (node) { | |
644 | + offset = (index & root->mask) + 1; | |
645 | + for (;offset < root->stores_size; offset++) { | |
646 | + item = node->stores[offset]; | |
647 | + if (item && (!iter || iter(item, node->height))) | |
648 | + break; | |
649 | + } | |
650 | + | |
651 | + if (offset < root->stores_size) | |
652 | + break; | |
653 | + | |
654 | + node = node->parent; | |
655 | + index >>= root->shift; | |
656 | + } | |
657 | + | |
658 | + if (!node) | |
659 | + return NULL; | |
660 | + | |
661 | + while (node->height > 1) { | |
662 | +go_down: | |
663 | + node = item; | |
664 | + for (offset = 0; offset < root->stores_size; offset++) { | |
665 | + item = node->stores[offset]; | |
666 | + if (item && (!iter || iter(item, node->height))) | |
667 | + break; | |
668 | + } | |
669 | + | |
670 | + if (unlikely(offset >= root->stores_size)) | |
671 | + return NULL; | |
672 | + } | |
673 | + | |
674 | + BUG_ON(offset > root->stores_size); | |
675 | + | |
676 | + return item; | |
677 | +} | |
678 | + | |
679 | +/* | |
680 | + * Blindly insert the item into the tree. Typically, we reuse the | |
681 | + * first empty store item. | |
682 | + */ | |
683 | +int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) | |
684 | +{ | |
685 | + unsigned long index; | |
686 | + unsigned int height; | |
687 | + struct sradix_tree_node *node, *tmp = NULL; | |
688 | + int offset, offset_saved; | |
689 | + void **store = NULL; | |
690 | + int error, i, j, shift; | |
691 | + | |
692 | +go_on: | |
693 | + index = root->min; | |
694 | + | |
695 | + if (root->enter_node && !sradix_node_full(root, root->enter_node)) { | |
696 | + node = root->enter_node; | |
697 | + BUG_ON((index >> (root->shift * root->height))); | |
698 | + } else { | |
699 | + node = root->rnode; | |
700 | + if (node == NULL || (index >> (root->shift * root->height)) | |
701 | + || sradix_node_full(root, node)) { | |
702 | + error = sradix_tree_extend(root, index); | |
703 | + if (error) | |
704 | + return error; | |
705 | + | |
706 | + node = root->rnode; | |
707 | + } | |
708 | + } | |
709 | + | |
710 | + | |
711 | + height = node->height; | |
712 | + shift = (height - 1) * root->shift; | |
713 | + offset = (index >> shift) & root->mask; | |
714 | + while (shift > 0) { | |
715 | + offset_saved = offset; | |
716 | + for (; offset < root->stores_size; offset++) { | |
717 | + store = &node->stores[offset]; | |
718 | + tmp = *store; | |
719 | + | |
720 | + if (!tmp || !sradix_node_full(root, tmp)) | |
721 | + break; | |
722 | + } | |
723 | + BUG_ON(offset >= root->stores_size); | |
724 | + | |
725 | + if (offset != offset_saved) { | |
726 | + index += (offset - offset_saved) << shift; | |
727 | + index &= ~((1UL << shift) - 1); | |
728 | + } | |
729 | + | |
730 | + if (!tmp) { | |
731 | + if (!(tmp = root->alloc())) | |
732 | + return -ENOMEM; | |
733 | + | |
734 | + tmp->height = shift / root->shift; | |
735 | + *store = tmp; | |
736 | + tmp->parent = node; | |
737 | + node->count++; | |
738 | +// if (root->extend) | |
739 | +// root->extend(node, tmp); | |
740 | + } | |
741 | + | |
742 | + node = tmp; | |
743 | + shift -= root->shift; | |
744 | + offset = (index >> shift) & root->mask; | |
745 | + } | |
746 | + | |
747 | + BUG_ON(node->height != 1); | |
748 | + | |
749 | + | |
750 | + store = &node->stores[offset]; | |
751 | + for (i = 0, j = 0; | |
752 | + j < root->stores_size - node->count && | |
753 | + i < root->stores_size - offset && j < num; i++) { | |
754 | + if (!store[i]) { | |
755 | + store[i] = item[j]; | |
756 | + if (root->assign) | |
757 | + root->assign(node, index + i, item[j]); | |
758 | + j++; | |
759 | + } | |
760 | + } | |
761 | + | |
762 | + node->count += j; | |
763 | + root->num += j; | |
764 | + num -= j; | |
765 | + | |
766 | + while (sradix_node_full(root, node)) { | |
767 | + node = node->parent; | |
768 | + if (!node) | |
769 | + break; | |
770 | + | |
771 | + node->fulls++; | |
772 | + } | |
773 | + | |
774 | + if (unlikely(!node)) { | |
775 | + /* All nodes are full */ | |
776 | + root->min = 1 << (root->height * root->shift); | |
777 | + root->enter_node = NULL; | |
778 | + } else { | |
779 | + root->min = index + i - 1; | |
780 | + root->min |= (1UL << (node->height - 1)) - 1; | |
781 | + root->min++; | |
782 | + root->enter_node = node; | |
783 | + } | |
784 | + | |
785 | + if (num) { | |
786 | + item += j; | |
787 | + goto go_on; | |
788 | + } | |
789 | + | |
790 | + return 0; | |
791 | +} | |
792 | + | |
793 | + | |
794 | +/** | |
795 | + * sradix_tree_shrink - shrink the height of a sradix tree to the minimum | |
796 | + * @root: sradix tree root | |
797 | + * | |
798 | + */ | |
799 | +static inline void sradix_tree_shrink(struct sradix_tree_root *root) | |
800 | +{ | |
801 | + /* try to shrink tree height */ | |
802 | + while (root->height > 1) { | |
803 | + struct sradix_tree_node *to_free = root->rnode; | |
804 | + | |
805 | + /* | |
806 | + * If the candidate node has more than one child, or its child | |
807 | + * is not at the leftmost store, we cannot shrink. | |
808 | + */ | |
809 | + if (to_free->count != 1 || !to_free->stores[0]) | |
810 | + break; | |
811 | + | |
812 | + root->rnode = to_free->stores[0]; | |
813 | + root->rnode->parent = NULL; | |
814 | + root->height--; | |
815 | + if (unlikely(root->enter_node == to_free)) { | |
816 | + root->enter_node = NULL; | |
817 | + } | |
818 | + root->free(to_free); | |
819 | + } | |
820 | +} | |
821 | + | |
822 | +/* | |
823 | + * Delete the item at the known leaf node and index | |
824 | + */ | |
825 | +void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, | |
826 | + struct sradix_tree_node *node, unsigned long index) | |
827 | +{ | |
828 | + unsigned int offset; | |
829 | + struct sradix_tree_node *start, *end; | |
830 | + | |
831 | + BUG_ON(node->height != 1); | |
832 | + | |
833 | + start = node; | |
834 | + while (node && !(--node->count)) | |
835 | + node = node->parent; | |
836 | + | |
837 | + end = node; | |
838 | + if (!node) { | |
839 | + root->rnode = NULL; | |
840 | + root->height = 0; | |
841 | + root->min = 0; | |
842 | + root->num = 0; | |
843 | + root->enter_node = NULL; | |
844 | + } else { | |
845 | + offset = (index >> (root->shift * (node->height - 1))) & root->mask; | |
846 | + if (root->rm) | |
847 | + root->rm(node, offset); | |
848 | + node->stores[offset] = NULL; | |
849 | + root->num--; | |
850 | + if (root->min > index) { | |
851 | + root->min = index; | |
852 | + root->enter_node = node; | |
853 | + } | |
854 | + } | |
855 | + | |
856 | + if (start != end) { | |
857 | + do { | |
858 | + node = start; | |
859 | + start = start->parent; | |
860 | + if (unlikely(root->enter_node == node)) | |
861 | + root->enter_node = end; | |
862 | + root->free(node); | |
863 | + } while (start != end); | |
864 | + | |
865 | + /* | |
866 | + * Note that shrink may free "end", so enter_node still needs to | |
867 | + * be checked inside. | |
868 | + */ | |
869 | + sradix_tree_shrink(root); | |
870 | + } else if (node->count == root->stores_size - 1) { | |
871 | + /* It WAS a full leaf node. Update the ancestors */ | |
872 | + node = node->parent; | |
873 | + while (node) { | |
874 | + node->fulls--; | |
875 | + if (node->fulls != root->stores_size - 1) | |
876 | + break; | |
877 | + | |
878 | + node = node->parent; | |
879 | + } | |
880 | + } | |
881 | +} | |
882 | + | |
883 | +void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) | |
884 | +{ | |
885 | + unsigned int height, offset; | |
886 | + struct sradix_tree_node *node; | |
887 | + int shift; | |
888 | + | |
889 | + node = root->rnode; | |
890 | + if (node == NULL || (index >> (root->shift * root->height))) | |
891 | + return NULL; | |
892 | + | |
893 | + height = root->height; | |
894 | + shift = (height - 1) * root->shift; | |
895 | + | |
896 | + do { | |
897 | + offset = (index >> shift) & root->mask; | |
898 | + node = node->stores[offset]; | |
899 | + if (!node) | |
900 | + return NULL; | |
901 | + | |
902 | + shift -= root->shift; | |
903 | + } while (shift >= 0); | |
904 | + | |
905 | + return node; | |
906 | +} | |
907 | + | |
908 | +/* | |
909 | + * Return the item if it exists, otherwise create it in place | |
910 | + * and return the created item. | |
911 | + */ | |
912 | +void *sradix_tree_lookup_create(struct sradix_tree_root *root, | |
913 | + unsigned long index, void *(*item_alloc)(void)) | |
914 | +{ | |
915 | + unsigned int height, offset; | |
916 | + struct sradix_tree_node *node, *tmp; | |
917 | + void *item; | |
918 | + int shift, error; | |
919 | + | |
920 | + if (root->rnode == NULL || (index >> (root->shift * root->height))) { | |
921 | + if (item_alloc) { | |
922 | + error = sradix_tree_extend(root, index); | |
923 | + if (error) | |
924 | + return NULL; | |
925 | + } else { | |
926 | + return NULL; | |
927 | + } | |
928 | + } | |
929 | + | |
930 | + node = root->rnode; | |
931 | + height = root->height; | |
932 | + shift = (height - 1) * root->shift; | |
933 | + | |
934 | + do { | |
935 | + offset = (index >> shift) & root->mask; | |
936 | + if (!node->stores[offset]) { | |
937 | + if (!(tmp = root->alloc())) | |
938 | + return NULL; | |
939 | + | |
940 | + tmp->height = shift / root->shift; | |
941 | + node->stores[offset] = tmp; | |
942 | + tmp->parent = node; | |
943 | + node->count++; | |
944 | + node = tmp; | |
945 | + } else { | |
946 | + node = node->stores[offset]; | |
947 | + } | |
948 | + | |
949 | + shift -= root->shift; | |
950 | + } while (shift > 0); | |
951 | + | |
952 | + BUG_ON(node->height != 1); | |
953 | + offset = index & root->mask; | |
954 | + if (node->stores[offset]) { | |
955 | + return node->stores[offset]; | |
956 | + } else if (item_alloc) { | |
957 | + if (!(item = item_alloc())) | |
958 | + return NULL; | |
959 | + | |
960 | + node->stores[offset] = item; | |
961 | + | |
962 | + /* | |
963 | + * NOTE: we do NOT call root->assign here, since this item is | |
964 | + * newly created by us having no meaning. Caller can call this | |
965 | + * if it's necessary to do so. | |
966 | + */ | |
967 | + | |
968 | + node->count++; | |
969 | + root->num++; | |
970 | + | |
971 | + while (sradix_node_full(root, node)) { | |
972 | + node = node->parent; | |
973 | + if (!node) | |
974 | + break; | |
975 | + | |
976 | + node->fulls++; | |
977 | + } | |
978 | + | |
979 | + if (unlikely(!node)) { | |
980 | + /* All nodes are full */ | |
981 | + root->min = 1 << (root->height * root->shift); | |
982 | + } else { | |
983 | + if (root->min == index) { | |
984 | + root->min |= (1UL << (node->height - 1)) - 1; | |
985 | + root->min++; | |
986 | + root->enter_node = node; | |
987 | + } | |
988 | + } | |
989 | + | |
990 | + return item; | |
991 | + } else { | |
992 | + return NULL; | |
993 | + } | |
994 | + | |
995 | +} | |
996 | + | |
997 | +int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) | |
998 | +{ | |
999 | + unsigned int height, offset; | |
1000 | + struct sradix_tree_node *node; | |
1001 | + int shift; | |
1002 | + | |
1003 | + node = root->rnode; | |
1004 | + if (node == NULL || (index >> (root->shift * root->height))) | |
1005 | + return -ENOENT; | |
1006 | + | |
1007 | + height = root->height; | |
1008 | + shift = (height - 1) * root->shift; | |
1009 | + | |
1010 | + do { | |
1011 | + offset = (index >> shift) & root->mask; | |
1012 | + node = node->stores[offset]; | |
1013 | + if (!node) | |
1014 | + return -ENOENT; | |
1015 | + | |
1016 | + shift -= root->shift; | |
1017 | + } while (shift > 0); | |
1018 | + | |
1019 | + offset = index & root->mask; | |
1020 | + if (!node->stores[offset]) | |
1021 | + return -ENOENT; | |
1022 | + | |
1023 | + sradix_tree_delete_from_leaf(root, node, index); | |
1024 | + | |
1025 | + return 0; | |
1026 | +} | |
1027 | diff --git a/mm/Kconfig b/mm/Kconfig | |
1028 | index 1d1ae6b..511dde6 100644 | |
1029 | --- a/mm/Kconfig | |
1030 | +++ b/mm/Kconfig | |
1031 | @@ -339,6 +339,32 @@ config KSM | |
1032 | See Documentation/vm/ksm.txt for more information: KSM is inactive | |
1033 | until a program has madvised that an area is MADV_MERGEABLE, and | |
1034 | root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). | |
1035 | +choice | |
1036 | + prompt "Choose UKSM/KSM strategy" | |
1037 | + default UKSM | |
1038 | + depends on KSM | |
1039 | + help | |
1040 | + This option allows you to select a UKSM/KSM strategy. | |
1041 | + | |
1042 | +config UKSM | |
1043 | + bool "Ultra-KSM for page merging" | |
1044 | + depends on KSM | |
1045 | + help | |
1046 | + UKSM is inspired by the Linux kernel's KSM (Kernel Samepage | |
1047 | + Merging), but with a fundamentally rewritten core algorithm. With | |
1048 | + an advanced algorithm, UKSM can now transparently scan all anonymously | |
1049 | + mapped user space applications with significantly improved scan speed | |
1050 | + and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from | |
1051 | + UKSM. UKSM now has its first stable release and its first real-world enterprise user. | |
1052 | + For more information, please visit its project page | |
1053 | + (www.kerneldedup.org). | |
1054 | + | |
1055 | +config KSM_LEGACY | |
1056 | + bool "Legacy KSM implementation" | |
1057 | + depends on KSM | |
1058 | + help | |
1059 | + The legacy KSM implementation from Red Hat. | |
1060 | +endchoice | |
1061 | ||
1062 | config DEFAULT_MMAP_MIN_ADDR | |
1063 | int "Low address space to protect from user allocation" | |
1064 | diff --git a/mm/Makefile b/mm/Makefile | |
1065 | index 8405eb0..7689f0c 100644 | |
1066 | --- a/mm/Makefile | |
1067 | +++ b/mm/Makefile | |
1068 | @@ -44,7 +44,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o | |
1069 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | |
1070 | obj-$(CONFIG_SLOB) += slob.o | |
1071 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | |
1072 | -obj-$(CONFIG_KSM) += ksm.o | |
1073 | +obj-$(CONFIG_KSM_LEGACY) += ksm.o | |
1074 | +obj-$(CONFIG_UKSM) += uksm.o | |
1075 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | |
1076 | obj-$(CONFIG_SLAB) += slab.o | |
1077 | obj-$(CONFIG_SLUB) += slub.o | |
1078 | diff --git a/mm/memory.c b/mm/memory.c | |
1079 | index d5f2ae9..86b5d09 100644 | |
1080 | --- a/mm/memory.c | |
1081 | +++ b/mm/memory.c | |
1082 | @@ -120,6 +120,28 @@ unsigned long highest_memmap_pfn __read_mostly; | |
1083 | ||
1084 | EXPORT_SYMBOL(zero_pfn); | |
1085 | ||
1086 | +#ifdef CONFIG_UKSM | |
1087 | +unsigned long uksm_zero_pfn __read_mostly; | |
1088 | +EXPORT_SYMBOL_GPL(uksm_zero_pfn); | |
1089 | +struct page *empty_uksm_zero_page; | |
1090 | + | |
1091 | +static int __init setup_uksm_zero_page(void) | |
1092 | +{ | |
1093 | + unsigned long addr; | |
1094 | + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0); | |
1095 | + if (!addr) | |
1096 | + panic("Oh boy, that early out of memory?"); | |
1097 | + | |
1098 | + empty_uksm_zero_page = virt_to_page((void *) addr); | |
1099 | + SetPageReserved(empty_uksm_zero_page); | |
1100 | + | |
1101 | + uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); | |
1102 | + | |
1103 | + return 0; | |
1104 | +} | |
1105 | +core_initcall(setup_uksm_zero_page); | |
1106 | +#endif | |
1107 | + | |
1108 | /* | |
1109 | * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() | |
1110 | */ | |
1111 | @@ -131,6 +153,7 @@ static int __init init_zero_pfn(void) | |
1112 | core_initcall(init_zero_pfn); | |
1113 | ||
1114 | ||
1115 | + | |
1116 | #if defined(SPLIT_RSS_COUNTING) | |
1117 | ||
1118 | void sync_mm_rss(struct mm_struct *mm) | |
1119 | @@ -878,6 +901,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |
1120 | rss[MM_ANONPAGES]++; | |
1121 | else | |
1122 | rss[MM_FILEPAGES]++; | |
1123 | + | |
1124 | + /* Should return NULL in vm_normal_page() */ | |
1125 | + uksm_bugon_zeropage(pte); | |
1126 | + } else { | |
1127 | + uksm_map_zero_page(pte); | |
1128 | } | |
1129 | ||
1130 | out_set_pte: | |
1131 | @@ -1120,8 +1148,10 @@ again: | |
1132 | ptent = ptep_get_and_clear_full(mm, addr, pte, | |
1133 | tlb->fullmm); | |
1134 | tlb_remove_tlb_entry(tlb, pte, addr); | |
1135 | - if (unlikely(!page)) | |
1136 | + if (unlikely(!page)) { | |
1137 | + uksm_unmap_zero_page(ptent); | |
1138 | continue; | |
1139 | + } | |
1140 | if (unlikely(details) && details->nonlinear_vma | |
1141 | && linear_page_index(details->nonlinear_vma, | |
1142 | addr) != page->index) { | |
1143 | @@ -1983,8 +2013,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo | |
1144 | clear_page(kaddr); | |
1145 | kunmap_atomic(kaddr); | |
1146 | flush_dcache_page(dst); | |
1147 | - } else | |
1148 | + } else { | |
1149 | copy_user_highpage(dst, src, va, vma); | |
1150 | + uksm_cow_page(vma, src); | |
1151 | + } | |
1152 | } | |
1153 | ||
1154 | /* | |
1155 | @@ -2198,6 +2230,7 @@ gotten: | |
1156 | new_page = alloc_zeroed_user_highpage_movable(vma, address); | |
1157 | if (!new_page) | |
1158 | goto oom; | |
1159 | + uksm_cow_pte(vma, orig_pte); | |
1160 | } else { | |
1161 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | |
1162 | if (!new_page) | |
1163 | @@ -2223,8 +2256,11 @@ gotten: | |
1164 | dec_mm_counter_fast(mm, MM_FILEPAGES); | |
1165 | inc_mm_counter_fast(mm, MM_ANONPAGES); | |
1166 | } | |
1167 | - } else | |
1168 | + uksm_bugon_zeropage(orig_pte); | |
1169 | + } else { | |
1170 | + uksm_unmap_zero_page(orig_pte); | |
1171 | inc_mm_counter_fast(mm, MM_ANONPAGES); | |
1172 | + } | |
1173 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | |
1174 | entry = mk_pte(new_page, vma->vm_page_prot); | |
1175 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | |
1176 | diff --git a/mm/mmap.c b/mm/mmap.c | |
1177 | index ae91989..844f366 100644 | |
1178 | --- a/mm/mmap.c | |
1179 | +++ b/mm/mmap.c | |
1180 | @@ -41,6 +41,7 @@ | |
1181 | #include <linux/notifier.h> | |
1182 | #include <linux/memory.h> | |
1183 | #include <linux/printk.h> | |
1184 | +#include <linux/ksm.h> | |
1185 | ||
1186 | #include <asm/uaccess.h> | |
1187 | #include <asm/cacheflush.h> | |
1188 | @@ -279,6 +280,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | |
1189 | if (vma->vm_file) | |
1190 | fput(vma->vm_file); | |
1191 | mpol_put(vma_policy(vma)); | |
1192 | + uksm_remove_vma(vma); | |
1193 | kmem_cache_free(vm_area_cachep, vma); | |
1194 | return next; | |
1195 | } | |
1196 | @@ -739,9 +741,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |
1197 | long adjust_next = 0; | |
1198 | int remove_next = 0; | |
1199 | ||
1200 | +/* | |
1201 | + * To avoid deadlock, uksm_remove_vma() must be called before any spinlock | |
1202 | + * is acquired. | |
1203 | + */ | |
1204 | + uksm_remove_vma(vma); | |
1205 | + | |
1206 | if (next && !insert) { | |
1207 | struct vm_area_struct *exporter = NULL; | |
1208 | ||
1209 | + uksm_remove_vma(next); | |
1210 | if (end >= next->vm_end) { | |
1211 | /* | |
1212 | * vma expands, overlapping all the next, and | |
1213 | @@ -838,6 +847,7 @@ again: remove_next = 1 + (end > next->vm_end); | |
1214 | end_changed = true; | |
1215 | } | |
1216 | vma->vm_pgoff = pgoff; | |
1217 | + | |
1218 | if (adjust_next) { | |
1219 | next->vm_start += adjust_next << PAGE_SHIFT; | |
1220 | next->vm_pgoff += adjust_next; | |
1221 | @@ -908,16 +918,22 @@ again: remove_next = 1 + (end > next->vm_end); | |
1222 | * up the code too much to do both in one go. | |
1223 | */ | |
1224 | next = vma->vm_next; | |
1225 | - if (remove_next == 2) | |
1226 | + if (remove_next == 2) { | |
1227 | + uksm_remove_vma(next); | |
1228 | goto again; | |
1229 | - else if (next) | |
1230 | + } else if (next) { | |
1231 | vma_gap_update(next); | |
1232 | - else | |
1233 | + } else { | |
1234 | mm->highest_vm_end = end; | |
1235 | + } | |
1236 | + } else { | |
1237 | + if (next && !insert) | |
1238 | + uksm_vma_add_new(next); | |
1239 | } | |
1240 | if (insert && file) | |
1241 | uprobe_mmap(insert); | |
1242 | ||
1243 | + uksm_vma_add_new(vma); | |
1244 | validate_mm(mm); | |
1245 | ||
1246 | return 0; | |
1247 | @@ -1310,6 +1326,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |
1248 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | | |
1249 | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; | |
1250 | ||
1251 | + /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */ | |
1252 | + uksm_vm_flags_mod(&vm_flags); | |
1253 | + | |
1254 | if (flags & MAP_LOCKED) | |
1255 | if (!can_do_mlock()) | |
1256 | return -EPERM; | |
1257 | @@ -1651,6 +1670,7 @@ munmap_back: | |
1258 | allow_write_access(file); | |
1259 | } | |
1260 | file = vma->vm_file; | |
1261 | + uksm_vma_add_new(vma); | |
1262 | out: | |
1263 | perf_event_mmap(vma); | |
1264 | ||
1265 | @@ -1692,6 +1712,7 @@ allow_write_and_free_vma: | |
1266 | if (vm_flags & VM_DENYWRITE) | |
1267 | allow_write_access(file); | |
1268 | free_vma: | |
1269 | + uksm_remove_vma(vma); | |
1270 | kmem_cache_free(vm_area_cachep, vma); | |
1271 | unacct_error: | |
1272 | if (charged) | |
1273 | @@ -2488,6 +2509,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | |
1274 | else | |
1275 | err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); | |
1276 | ||
1277 | + uksm_vma_add_new(new); | |
1278 | + | |
1279 | /* Success. */ | |
1280 | if (!err) | |
1281 | return 0; | |
1282 | @@ -2654,6 +2677,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |
1283 | return addr; | |
1284 | ||
1285 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | |
1286 | + uksm_vm_flags_mod(&flags); | |
1287 | ||
1288 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); | |
1289 | if (error & ~PAGE_MASK) | |
1290 | @@ -2712,6 +2736,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |
1291 | vma->vm_flags = flags; | |
1292 | vma->vm_page_prot = vm_get_page_prot(flags); | |
1293 | vma_link(mm, vma, prev, rb_link, rb_parent); | |
1294 | + uksm_vma_add_new(vma); | |
1295 | out: | |
1296 | perf_event_mmap(vma); | |
1297 | mm->total_vm += len >> PAGE_SHIFT; | |
1298 | @@ -2747,6 +2772,12 @@ void exit_mmap(struct mm_struct *mm) | |
1299 | /* mm's last user has gone, and its about to be pulled down */ | |
1300 | mmu_notifier_release(mm); | |
1301 | ||
1302 | + /* | |
1303 | + * Taking write lock on mmap_sem does not harm others, | |
1304 | + * but it's crucial for uksm to avoid races. | |
1305 | + */ | |
1306 | + down_write(&mm->mmap_sem); | |
1307 | + | |
1308 | if (mm->locked_vm) { | |
1309 | vma = mm->mmap; | |
1310 | while (vma) { | |
1311 | @@ -2783,6 +2814,11 @@ void exit_mmap(struct mm_struct *mm) | |
1312 | } | |
1313 | vm_unacct_memory(nr_accounted); | |
1314 | ||
1315 | + mm->mmap = NULL; | |
1316 | + mm->mm_rb = RB_ROOT; | |
1317 | + vmacache_invalidate(mm); | |
1318 | + up_write(&mm->mmap_sem); | |
1319 | + | |
1320 | WARN_ON(atomic_long_read(&mm->nr_ptes) > | |
1321 | (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); | |
1322 | } | |
1323 | @@ -2891,6 +2927,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |
1324 | new_vma->vm_ops->open(new_vma); | |
1325 | vma_link(mm, new_vma, prev, rb_link, rb_parent); | |
1326 | *need_rmap_locks = false; | |
1327 | + uksm_vma_add_new(new_vma); | |
1328 | } | |
1329 | } | |
1330 | return new_vma; | |
1331 | @@ -3004,10 +3041,10 @@ static struct vm_area_struct *__install_special_mapping( | |
1332 | ret = insert_vm_struct(mm, vma); | |
1333 | if (ret) | |
1334 | goto out; | |
1335 | - | |
1336 | mm->total_vm += len >> PAGE_SHIFT; | |
1337 | ||
1338 | perf_event_mmap(vma); | |
1339 | + uksm_vma_add_new(vma); | |
1340 | ||
1341 | return vma; | |
1342 | ||
1343 | diff --git a/mm/rmap.c b/mm/rmap.c | |
1344 | index 3e4c721..d39d8a3 100644 | |
1345 | --- a/mm/rmap.c | |
1346 | +++ b/mm/rmap.c | |
1347 | @@ -908,9 +908,9 @@ void page_move_anon_rmap(struct page *page, | |
1348 | ||
1349 | /** | |
1350 | * __page_set_anon_rmap - set up new anonymous rmap | |
1351 | - * @page: Page to add to rmap | |
1352 | + * @page: Page to add to rmap | |
1353 | * @vma: VM area to add page to. | |
1354 | - * @address: User virtual address of the mapping | |
1355 | + * @address: User virtual address of the mapping | |
1356 | * @exclusive: the page is exclusively owned by the current process | |
1357 | */ | |
1358 | static void __page_set_anon_rmap(struct page *page, | |
1359 | diff --git a/mm/uksm.c b/mm/uksm.c | |
1360 | new file mode 100644 | |
1361 | index 0000000..c76fcfc | |
1362 | --- /dev/null | |
1363 | +++ b/mm/uksm.c | |
1364 | @@ -0,0 +1,5519 @@ | |
1365 | +/* | |
1366 | + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia | |
1367 | + * | |
1368 | + * This is an improvement upon KSM. Some basic data structures and routines | |
1369 | + * are borrowed from ksm.c . | |
1370 | + * | |
1371 | + * Its new features: | |
1372 | + * 1. Full system scan: | |
1373 | + * It automatically scans all user processes' anonymous VMAs. Kernel-user | |
1374 | + * interaction to submit a memory area to KSM is no longer needed. | |
1375 | + * | |
1376 | + * 2. Rich area detection: | |
1377 | + * It automatically detects rich areas containing abundant duplicated | |
1378 | + * pages based. Rich areas are given a full scan speed. Poor areas are | |
1379 | + * sampled at a reasonable speed with very low CPU consumption. | |
1380 | + * | |
1381 | + * 3. Ultra per-page scan speed improvement: | |
1382 | + * A new hash algorithm is proposed. As a result, on a machine with a | |
1383 | + * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it | |
1384 | + * can scan memory areas that do not contain duplicated pages at a speed of | |
1385 | + * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of | |
1386 | + * 477MB/sec ~ 923MB/sec. | |
1387 | + * | |
1388 | + * 4. Thrashing area avoidance: | |
1389 | + * A thrashing area (a VMA that has frequent KSM page break-outs) can be | |
1390 | + * filtered out. My benchmark shows it's more efficient than KSM's per-page | |
1391 | + * hash-value-based volatile page detection. | |
1392 | + * | |
1393 | + * | |
1394 | + * 5. Misc changes upon KSM: | |
1395 | + * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page | |
1396 | + * comparison. It's much faster than the default C version on x86. | |
1397 | + * * rmap_item now has a struct page * member to loosely cache an | |
1398 | + * address-->page mapping, which avoids many time-costly calls to | |
1399 | + * follow_page(). | |
1400 | + * * The VMA creation/exit procedures are hooked to let Ultra KSM know. | |
1401 | + * * try_to_merge_two_pages() can now revert a pte if it fails. No break_ | |
1402 | + * ksm is needed for this case. | |
1403 | + * | |
1404 | + * 6. Full zero page consideration (contributed by Figo Zhang) | |
1405 | + * Now uksmd considers full zero pages as special pages and merges them into a | |
1406 | + * special unswappable uksm zero page. | |
1407 | + */ | |
1408 | + | |
1409 | +#include <linux/errno.h> | |
1410 | +#include <linux/mm.h> | |
1411 | +#include <linux/fs.h> | |
1412 | +#include <linux/mman.h> | |
1413 | +#include <linux/sched.h> | |
1414 | +#include <linux/rwsem.h> | |
1415 | +#include <linux/pagemap.h> | |
1416 | +#include <linux/rmap.h> | |
1417 | +#include <linux/spinlock.h> | |
1418 | +#include <linux/jhash.h> | |
1419 | +#include <linux/delay.h> | |
1420 | +#include <linux/kthread.h> | |
1421 | +#include <linux/wait.h> | |
1422 | +#include <linux/slab.h> | |
1423 | +#include <linux/rbtree.h> | |
1424 | +#include <linux/memory.h> | |
1425 | +#include <linux/mmu_notifier.h> | |
1426 | +#include <linux/swap.h> | |
1427 | +#include <linux/ksm.h> | |
1428 | +#include <linux/crypto.h> | |
1429 | +#include <linux/scatterlist.h> | |
1430 | +#include <crypto/hash.h> | |
1431 | +#include <linux/random.h> | |
1432 | +#include <linux/math64.h> | |
1433 | +#include <linux/gcd.h> | |
1434 | +#include <linux/freezer.h> | |
1435 | +#include <linux/sradix-tree.h> | |
1436 | + | |
1437 | +#include <asm/tlbflush.h> | |
1438 | +#include "internal.h" | |
1439 | + | |
1440 | +#ifdef CONFIG_X86 | |
1441 | +#undef memcmp | |
1442 | + | |
1443 | +#ifdef CONFIG_X86_32 | |
1444 | +#define memcmp memcmpx86_32 | |
1445 | +/* | |
1446 | + * Compare 4-byte-aligned addresses s1 and s2, of length n | |
1447 | + */ | |
1448 | +int memcmpx86_32(void *s1, void *s2, size_t n) | |
1449 | +{ | |
1450 | + size_t num = n / 4; | |
1451 | + register int res; | |
1452 | + | |
1453 | + __asm__ __volatile__ | |
1454 | + ( | |
1455 | + "testl %3,%3\n\t" | |
1456 | + "repe; cmpsd\n\t" | |
1457 | + "je 1f\n\t" | |
1458 | + "sbbl %0,%0\n\t" | |
1459 | + "orl $1,%0\n" | |
1460 | + "1:" | |
1461 | + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) | |
1462 | + : "0" (0) | |
1463 | + : "cc"); | |
1464 | + | |
1465 | + return res; | |
1466 | +} | |
1467 | + | |
1468 | +/* | |
1469 | + * Check whether the page is all zero | |
1470 | + */ | |
1471 | +static int is_full_zero(const void *s1, size_t len) | |
1472 | +{ | |
1473 | + unsigned char same; | |
1474 | + | |
1475 | + len /= 4; | |
1476 | + | |
1477 | + __asm__ __volatile__ | |
1478 | + ("repe; scasl;" | |
1479 | + "sete %0" | |
1480 | + : "=qm" (same), "+D" (s1), "+c" (len) | |
1481 | + : "a" (0) | |
1482 | + : "cc"); | |
1483 | + | |
1484 | + return same; | |
1485 | +} | |
1486 | + | |
1487 | + | |
1488 | +#elif defined(CONFIG_X86_64) | |
1489 | +#define memcmp memcmpx86_64 | |
1490 | +/* | |
1491 | + * Compare 8-byte-aligned addresses s1 and s2, of length n | |
1492 | + */ | |
1493 | +int memcmpx86_64(void *s1, void *s2, size_t n) | |
1494 | +{ | |
1495 | + size_t num = n / 8; | |
1496 | + register int res; | |
1497 | + | |
1498 | + __asm__ __volatile__ | |
1499 | + ( | |
1500 | + "testq %q3,%q3\n\t" | |
1501 | + "repe; cmpsq\n\t" | |
1502 | + "je 1f\n\t" | |
1503 | + "sbbq %q0,%q0\n\t" | |
1504 | + "orq $1,%q0\n" | |
1505 | + "1:" | |
1506 | + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) | |
1507 | + : "0" (0) | |
1508 | + : "cc"); | |
1509 | + | |
1510 | + return res; | |
1511 | +} | |
1512 | + | |
1513 | +static int is_full_zero(const void *s1, size_t len) | |
1514 | +{ | |
1515 | + unsigned char same; | |
1516 | + | |
1517 | + len /= 8; | |
1518 | + | |
1519 | + __asm__ __volatile__ | |
1520 | + ("repe; scasq;" | |
1521 | + "sete %0" | |
1522 | + : "=qm" (same), "+D" (s1), "+c" (len) | |
1523 | + : "a" (0) | |
1524 | + : "cc"); | |
1525 | + | |
1526 | + return same; | |
1527 | +} | |
1528 | + | |
1529 | +#endif | |
1530 | +#else | |
1531 | +static int is_full_zero(const void *s1, size_t len) | |
1532 | +{ | |
1533 | +	const unsigned long *src = s1; | |
1534 | + int i; | |
1535 | + | |
1536 | + len /= sizeof(*src); | |
1537 | + | |
1538 | + for (i = 0; i < len; i++) { | |
1539 | + if (src[i]) | |
1540 | + return 0; | |
1541 | + } | |
1542 | + | |
1543 | + return 1; | |
1544 | +} | |
1545 | +#endif | |
1546 | + | |
1547 | +#define UKSM_RUNG_ROUND_FINISHED (1 << 0) | |
1548 | +#define TIME_RATIO_SCALE 10000 | |
1549 | + | |
1550 | +#define SLOT_TREE_NODE_SHIFT 8 | |
1551 | +#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) | |
1552 | +struct slot_tree_node { | |
1553 | + unsigned long size; | |
1554 | + struct sradix_tree_node snode; | |
1555 | + void *stores[SLOT_TREE_NODE_STORE_SIZE]; | |
1556 | +}; | |
1557 | + | |
1558 | +static struct kmem_cache *slot_tree_node_cachep; | |
1559 | + | |
1560 | +static struct sradix_tree_node *slot_tree_node_alloc(void) | |
1561 | +{ | |
1562 | + struct slot_tree_node *p; | |
1563 | + p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL); | |
1564 | + if (!p) | |
1565 | + return NULL; | |
1566 | + | |
1567 | + return &p->snode; | |
1568 | +} | |
1569 | + | |
1570 | +static void slot_tree_node_free(struct sradix_tree_node *node) | |
1571 | +{ | |
1572 | + struct slot_tree_node *p; | |
1573 | + | |
1574 | + p = container_of(node, struct slot_tree_node, snode); | |
1575 | + kmem_cache_free(slot_tree_node_cachep, p); | |
1576 | +} | |
1577 | + | |
1578 | +static void slot_tree_node_extend(struct sradix_tree_node *parent, | |
1579 | + struct sradix_tree_node *child) | |
1580 | +{ | |
1581 | + struct slot_tree_node *p, *c; | |
1582 | + | |
1583 | + p = container_of(parent, struct slot_tree_node, snode); | |
1584 | + c = container_of(child, struct slot_tree_node, snode); | |
1585 | + | |
1586 | + p->size += c->size; | |
1587 | +} | |
1588 | + | |
1589 | +void slot_tree_node_assign(struct sradix_tree_node *node, | |
1590 | + unsigned index, void *item) | |
1591 | +{ | |
1592 | + struct vma_slot *slot = item; | |
1593 | + struct slot_tree_node *cur; | |
1594 | + | |
1595 | + slot->snode = node; | |
1596 | + slot->sindex = index; | |
1597 | + | |
1598 | + while (node) { | |
1599 | + cur = container_of(node, struct slot_tree_node, snode); | |
1600 | + cur->size += slot->pages; | |
1601 | + node = node->parent; | |
1602 | + } | |
1603 | +} | |
1604 | + | |
1605 | +void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset) | |
1606 | +{ | |
1607 | + struct vma_slot *slot; | |
1608 | + struct slot_tree_node *cur; | |
1609 | + unsigned long pages; | |
1610 | + | |
1611 | + if (node->height == 1) { | |
1612 | + slot = node->stores[offset]; | |
1613 | + pages = slot->pages; | |
1614 | + } else { | |
1615 | + cur = container_of(node->stores[offset], | |
1616 | + struct slot_tree_node, snode); | |
1617 | + pages = cur->size; | |
1618 | + } | |
1619 | + | |
1620 | + while (node) { | |
1621 | + cur = container_of(node, struct slot_tree_node, snode); | |
1622 | + cur->size -= pages; | |
1623 | + node = node->parent; | |
1624 | + } | |
1625 | +} | |
1626 | + | |
1627 | +unsigned long slot_iter_index; | |
1628 | +int slot_iter(void *item, unsigned long height) | |
1629 | +{ | |
1630 | + struct slot_tree_node *node; | |
1631 | + struct vma_slot *slot; | |
1632 | + | |
1633 | + if (height == 1) { | |
1634 | + slot = item; | |
1635 | + if (slot_iter_index < slot->pages) { | |
1636 | + /*in this one*/ | |
1637 | + return 1; | |
1638 | + } else { | |
1639 | + slot_iter_index -= slot->pages; | |
1640 | + return 0; | |
1641 | + } | |
1642 | + | |
1643 | + } else { | |
1644 | + node = container_of(item, struct slot_tree_node, snode); | |
1645 | + if (slot_iter_index < node->size) { | |
1646 | + /*in this one*/ | |
1647 | + return 1; | |
1648 | + } else { | |
1649 | + slot_iter_index -= node->size; | |
1650 | + return 0; | |
1651 | + } | |
1652 | + } | |
1653 | +} | |
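+
+/*
+ * slot_iter() is the per-level predicate for that walk: it keeps subtracting
+ * the page counts of the children it skips from slot_iter_index and returns
+ * 1 for the child covering the remaining index, until a leaf vma_slot is hit.
+ */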
1654 | + | |
1655 | + | |
1656 | +static inline void slot_tree_init_root(struct sradix_tree_root *root) | |
1657 | +{ | |
1658 | + init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); | |
1659 | + root->alloc = slot_tree_node_alloc; | |
1660 | + root->free = slot_tree_node_free; | |
1661 | + root->extend = slot_tree_node_extend; | |
1662 | + root->assign = slot_tree_node_assign; | |
1663 | + root->rm = slot_tree_node_rm; | |
1664 | +} | |
1665 | + | |
1666 | +void slot_tree_init(void) | |
1667 | +{ | |
1668 | + slot_tree_node_cachep = kmem_cache_create("slot_tree_node", | |
1669 | + sizeof(struct slot_tree_node), 0, | |
1670 | + SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, | |
1671 | + NULL); | |
1672 | +} | |
1673 | + | |
1674 | + | |
1675 | +/* Each rung of this ladder is a list of VMAs having a same scan ratio */ | |
1676 | +struct scan_rung { | |
1677 | + //struct list_head scanned_list; | |
1678 | + struct sradix_tree_root vma_root; | |
1679 | + struct sradix_tree_root vma_root2; | |
1680 | + | |
1681 | + struct vma_slot *current_scan; | |
1682 | + unsigned long current_offset; | |
1683 | + | |
1684 | + /* | |
1685 | + * The initial value for current_offset; it should loop over | |
1686 | + * [0, step - 1] so that every slot gets a chance to be scanned. | |
1687 | + */ | |
1688 | + unsigned long offset_init; | |
1689 | + unsigned long step; /* dynamic step for current_offset */ | |
1690 | + unsigned int flags; | |
1691 | + unsigned long pages_to_scan; | |
1692 | + //unsigned long fully_scanned_slots; | |
1693 | + /* | |
1694 | + * A little bit tricky - if cpu_ratio > 0, the value is the | |
1695 | + * cpu time ratio this rung can spend for every scan | |
1696 | + * period. If < 0, it is the cpu time ratio relative to the | |
1697 | + * max cpu percentage the user specified. Both are in units of | |
1698 | + * 1/TIME_RATIO_SCALE. | |
1699 | + */ | |
1700 | + int cpu_ratio; | |
1701 | + | |
1702 | + /* | |
1703 | + * How long will it take for all slots in this rung to be fully | |
1704 | + * scanned? If it's zero, we don't care about the cover time: | |
1705 | + * the rung is considered fully scanned. | |
1706 | + */ | |
1707 | + unsigned int cover_msecs; | |
1708 | + //unsigned long vma_num; | |
1709 | + //unsigned long pages; /* Sum of all slot's pages in rung */ | |
1710 | +}; | |
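+
+/*
+ * Example of the cpu_ratio convention above: with TIME_RATIO_SCALE == 10000,
+ * cpu_ratio = 40 lets a rung spend 40/10000 = 0.4% of cpu time per scan
+ * period, while cpu_ratio = -2500 means 2500/10000 = 25% of whatever max cpu
+ * percentage the user configured.
+ */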
1711 | + | |
1712 | +/** | |
1713 | + * node of either the stable or unstable rbtree | |
1714 | + * | |
1715 | + */ | |
1716 | +struct tree_node { | |
1717 | + struct rb_node node; /* link in the main (un)stable rbtree */ | |
1718 | + struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ | |
1719 | + u32 hash; | |
1720 | + unsigned long count; /* TODO: merged with sub_root */ | |
1721 | + struct list_head all_list; /* all tree nodes in stable/unstable tree */ | |
1722 | +}; | |
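+
+/*
+ * The trees are thus two-level: the main rbtree is keyed by the sampled hash
+ * kept in tree_node, and pages whose hashes collide are told apart inside
+ * tree_node->sub_root (using hash_max or a full compare, see check_collision()
+ * below).
+ */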
1723 | + | |
1724 | +/** | |
1725 | + * struct stable_node - node of the stable rbtree | |
1726 | + * @node: rb node of this ksm page in the stable tree | |
1727 | + * @hlist: hlist head of rmap_items using this ksm page | |
1728 | + * @kpfn: page frame number of this ksm page | |
1729 | + */ | |
1730 | +struct stable_node { | |
1731 | + struct rb_node node; /* link in sub-rbtree */ | |
1732 | + struct tree_node *tree_node; /* its tree_node root in the stable tree, NULL if it's in the hell list */ | |
1733 | + struct hlist_head hlist; | |
1734 | + unsigned long kpfn; | |
1735 | + u32 hash_max; /* if == 0, it has not been calculated yet */ | |
1736 | + struct list_head all_list; /* in a list for all stable nodes */ | |
1737 | +}; | |
1738 | + | |
1739 | +/** | |
1740 | + * struct node_vma - group rmap_items linked in a same stable | |
1741 | + * node together. | |
1742 | + */ | |
1743 | +struct node_vma { | |
1744 | + union { | |
1745 | + struct vma_slot *slot; | |
1746 | + unsigned long key; /* slot is used as key sorted on hlist */ | |
1747 | + }; | |
1748 | + struct hlist_node hlist; | |
1749 | + struct hlist_head rmap_hlist; | |
1750 | + struct stable_node *head; | |
1751 | +}; | |
1752 | + | |
1753 | +/** | |
1754 | + * struct rmap_item - reverse mapping item for virtual addresses | |
1755 | + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list | |
1756 | + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree | |
1757 | + * @mm: the memory structure this rmap_item is pointing into | |
1758 | + * @address: the virtual address this rmap_item tracks (+ flags in low bits) | |
1759 | + * @node: rb node of this rmap_item in the unstable tree | |
1760 | + * @head: pointer to stable_node heading this list in the stable tree | |
1761 | + * @hlist: link into hlist of rmap_items hanging off that stable_node | |
1762 | + */ | |
1763 | +struct rmap_item { | |
1764 | + struct vma_slot *slot; | |
1765 | + struct page *page; | |
1766 | + unsigned long address; /* + low bits used for flags below */ | |
1767 | + unsigned long hash_round; | |
1768 | + unsigned long entry_index; | |
1769 | + union { | |
1770 | + struct {/* when in unstable tree */ | |
1771 | + struct rb_node node; | |
1772 | + struct tree_node *tree_node; | |
1773 | + u32 hash_max; | |
1774 | + }; | |
1775 | + struct { /* when in stable tree */ | |
1776 | + struct node_vma *head; | |
1777 | + struct hlist_node hlist; | |
1778 | + struct anon_vma *anon_vma; | |
1779 | + }; | |
1780 | + }; | |
1781 | +} __attribute__((aligned(4))); | |
1782 | + | |
1783 | +struct rmap_list_entry { | |
1784 | + union { | |
1785 | + struct rmap_item *item; | |
1786 | + unsigned long addr; | |
1787 | + }; | |
1788 | + /* lowest bit is used for is_addr tag */ | |
1789 | +} __attribute__((aligned(4))); /* 4 aligned to fit in to pages*/ | |
1790 | + | |
1791 | + | |
1792 | +/* Basic data structure definition ends */ | |
1793 | + | |
1794 | + | |
1795 | +/* | |
1796 | + * Flags for an rmap_item indicating whether it is listed in the stable/unstable tree. | |
1797 | + * The flags use the low bits of rmap_item.address | |
1798 | + */ | |
1799 | +#define UNSTABLE_FLAG 0x1 | |
1800 | +#define STABLE_FLAG 0x2 | |
1801 | +#define get_rmap_addr(x) ((x)->address & PAGE_MASK) | |
1802 | + | |
1803 | +/* | |
1804 | + * rmap_list_entry helpers | |
1805 | + */ | |
1806 | +#define IS_ADDR_FLAG 1 | |
1807 | +#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) | |
1808 | +#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) | |
1809 | +#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) | |
1810 | + | |
1811 | + | |
1812 | +/* | |
1813 | + * High speed caches for frequently allocated and freed structs | |
1814 | + */ | |
1815 | +static struct kmem_cache *rmap_item_cache; | |
1816 | +static struct kmem_cache *stable_node_cache; | |
1817 | +static struct kmem_cache *node_vma_cache; | |
1818 | +static struct kmem_cache *vma_slot_cache; | |
1819 | +static struct kmem_cache *tree_node_cache; | |
1820 | +#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ | |
1821 | + sizeof(struct __struct), __alignof__(struct __struct),\ | |
1822 | + (__flags), NULL) | |
1823 | + | |
1824 | +/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ | |
1825 | +#define SCAN_LADDER_SIZE 4 | |
1826 | +static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; | |
1827 | + | |
1828 | +/* The evaluation rounds uksmd has finished */ | |
1829 | +static unsigned long long uksm_eval_round = 1; | |
1830 | + | |
1831 | +/* | |
1832 | + * We add 1 to this var whenever we decide the whole unstable tree | |
1833 | + * should be rebuilt. | |
1834 | + */ | |
1835 | +static unsigned long uksm_hash_round = 1; | |
1836 | + | |
1837 | +/* | |
1838 | + * How many times the whole memory has been scanned. | |
1839 | + */ | |
1840 | +static unsigned long long fully_scanned_round = 1; | |
1841 | + | |
1842 | +/* The total number of virtual pages of all vma slots */ | |
1843 | +static u64 uksm_pages_total; | |
1844 | + | |
1845 | +/* The number of pages that have been scanned since startup */ | |
1846 | +static u64 uksm_pages_scanned; | |
1847 | + | |
1848 | +static u64 scanned_virtual_pages; | |
1849 | + | |
1850 | +/* The number of pages scanned as of the last encode_benefit call */ | |
1851 | +static u64 uksm_pages_scanned_last; | |
1852 | + | |
1853 | +/* If the scanned number is too large, we encode it here */ | |
1854 | +static u64 pages_scanned_stored; | |
1855 | + | |
1856 | +static unsigned long pages_scanned_base; | |
1857 | + | |
1858 | +/* The number of nodes in the stable tree */ | |
1859 | +static unsigned long uksm_pages_shared; | |
1860 | + | |
1861 | +/* The number of page slots additionally sharing those nodes */ | |
1862 | +static unsigned long uksm_pages_sharing; | |
1863 | + | |
1864 | +/* The number of nodes in the unstable tree */ | |
1865 | +static unsigned long uksm_pages_unshared; | |
1866 | + | |
1867 | +/* | |
1868 | + * Milliseconds ksmd should sleep between scans, | |
1869 | + * >= 100ms to be consistent with | |
1870 | + * scan_time_to_sleep_msec() | |
1871 | + */ | |
1872 | +static unsigned int uksm_sleep_jiffies; | |
1873 | + | |
1874 | +/* The real value for the uksmd next sleep */ | |
1875 | +static unsigned int uksm_sleep_real; | |
1876 | + | |
1877 | +/* Saved value for user input uksm_sleep_jiffies when it's enlarged */ | |
1878 | +static unsigned int uksm_sleep_saved; | |
1879 | + | |
1880 | +/* Max percentage of cpu utilization ksmd can take to scan in one batch */ | |
1881 | +static unsigned int uksm_max_cpu_percentage; | |
1882 | + | |
1883 | +static int uksm_cpu_governor; | |
1884 | + | |
1885 | +static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; | |
1886 | + | |
1887 | +struct uksm_cpu_preset_s { | |
1888 | + int cpu_ratio[SCAN_LADDER_SIZE]; | |
1889 | + unsigned int cover_msecs[SCAN_LADDER_SIZE]; | |
1890 | + unsigned int max_cpu; /* percentage */ | |
1891 | +}; | |
1892 | + | |
1893 | +struct uksm_cpu_preset_s uksm_cpu_preset[4] = { | |
1894 | + { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, | |
1895 | + { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, | |
1896 | + { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, | |
1897 | + { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, | |
1898 | +}; | |
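+
+/*
+ * Reading the table with the conventions above (presets listed in governor
+ * order): "full" allows up to 95% cpu and lets the two fastest rungs take
+ * 25% and 100% of that cap (-2500, -10000), while "quiet" caps uksmd at 1%
+ * cpu and uses small absolute ratios (10/20/40/75 of TIME_RATIO_SCALE).
+ */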
1899 | + | |
1900 | +/* The default value for uksm_ema_page_time if it's not initialized */ | |
1901 | +#define UKSM_PAGE_TIME_DEFAULT 500 | |
1902 | + | |
1903 | +/* Cost to scan one page, as an exponential moving average, in nsecs */ | |
1904 | +static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; | |
1905 | + | |
1906 | +/* The exponential moving average alpha weight, in percentage. */ | |
1907 | +#define EMA_ALPHA 20 | |
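+
+/*
+ * The usual EMA update this weight presumably feeds is, roughly:
+ * ema = (EMA_ALPHA * sample + (100 - EMA_ALPHA) * ema) / 100,
+ * i.e. the most recent per-page scan cost contributes 20%.
+ */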
1908 | + | |
1909 | +/* | |
1910 | + * The threshold used to filter out thrashing areas. | |
1911 | + * If it == 0, filtering is disabled; otherwise it is the percentage upper | |
1912 | + * bound of the thrashing ratio of all areas. Any area with a bigger thrashing | |
1913 | + * ratio will be considered to have a zero duplication ratio. | |
1914 | + */ | |
1915 | +static unsigned int uksm_thrash_threshold = 50; | |
1916 | + | |
1917 | +/* The dedup ratio above which an area is considered abundant */ | |
1918 | +static unsigned int uksm_abundant_threshold = 10; | |
1919 | + | |
1920 | +/* All slots having merged pages in this eval round. */ | |
1921 | +struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup); | |
1922 | + | |
1923 | +/* How many times the ksmd has slept since startup */ | |
1924 | +static unsigned long long uksm_sleep_times; | |
1925 | + | |
1926 | +#define UKSM_RUN_STOP 0 | |
1927 | +#define UKSM_RUN_MERGE 1 | |
1928 | +static unsigned int uksm_run = 1; | |
1929 | + | |
1930 | +static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait); | |
1931 | +static DEFINE_MUTEX(uksm_thread_mutex); | |
1932 | + | |
1933 | +/* | |
1934 | + * List vma_slot_new is for newly created vma_slot waiting to be added by | |
1935 | + * ksmd. If one cannot be added (e.g. because it's too small), it's moved to | |
1936 | + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding | |
1937 | + * VMA has been removed/freed. | |
1938 | + */ | |
1939 | +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); | |
1940 | +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); | |
1941 | +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); | |
1942 | +static DEFINE_SPINLOCK(vma_slot_list_lock); | |
1943 | + | |
1944 | +/* The unstable tree heads */ | |
1945 | +static struct rb_root root_unstable_tree = RB_ROOT; | |
1946 | + | |
1947 | +/* | |
1948 | + * All tree_nodes are in a list to be freed at once when unstable tree is | |
1949 | + * freed after each scan round. | |
1950 | + */ | |
1951 | +static struct list_head unstable_tree_node_list = | |
1952 | + LIST_HEAD_INIT(unstable_tree_node_list); | |
1953 | + | |
1954 | +/* List contains all stable nodes */ | |
1955 | +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); | |
1956 | + | |
1957 | +/* | |
1958 | + * When the hash strength is changed, the stable tree must be delta_hashed and | |
1959 | + * re-structured. We use the two sets of structs below to speed up the | |
1960 | + * re-structuring of the stable tree. | |
1961 | + */ | |
1962 | +static struct list_head | |
1963 | +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), | |
1964 | + LIST_HEAD_INIT(stable_tree_node_list[1])}; | |
1965 | + | |
1966 | +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; | |
1967 | +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; | |
1968 | +static struct rb_root *root_stable_treep = &root_stable_tree[0]; | |
1969 | +static unsigned long stable_tree_index; | |
1970 | + | |
1971 | +/* The hash strength needed to hash a full page */ | |
1972 | +#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) | |
1973 | + | |
1974 | +/* The hash strength needed for loop-back hashing */ | |
1975 | +#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) | |
1976 | + | |
1977 | +/* The random offsets in a page */ | |
1978 | +static u32 *random_nums; | |
1979 | + | |
1980 | +/* The hash strength */ | |
1981 | +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; | |
1982 | + | |
1983 | +/* The delta value each time the hash strength increases or decreases */ | |
1984 | +static unsigned long hash_strength_delta; | |
1985 | +#define HASH_STRENGTH_DELTA_MAX 5 | |
1986 | + | |
1987 | +/* The time we have saved due to random_sample_hash */ | |
1988 | +static u64 rshash_pos; | |
1989 | + | |
1990 | +/* The time we have wasted due to hash collision */ | |
1991 | +static u64 rshash_neg; | |
1992 | + | |
1993 | +struct uksm_benefit { | |
1994 | + u64 pos; | |
1995 | + u64 neg; | |
1996 | + u64 scanned; | |
1997 | + unsigned long base; | |
1998 | +} benefit; | |
1999 | + | |
2000 | +/* | |
2001 | + * The relative cost of memcmp, compared to 1 time unit of random sample | |
2002 | + * hash, this value is tested when ksm module is initialized | |
2003 | + */ | |
2004 | +static unsigned long memcmp_cost; | |
2005 | + | |
2006 | +static unsigned long rshash_neg_cont_zero; | |
2007 | +static unsigned long rshash_cont_obscure; | |
2008 | + | |
2009 | +/* The possible states of hash strength adjustment heuristic */ | |
2010 | +enum rshash_states { | |
2011 | + RSHASH_STILL, | |
2012 | + RSHASH_TRYUP, | |
2013 | + RSHASH_TRYDOWN, | |
2014 | + RSHASH_NEW, | |
2015 | + RSHASH_PRE_STILL, | |
2016 | +}; | |
2017 | + | |
2018 | +/* The possible direction we are about to adjust hash strength */ | |
2019 | +enum rshash_direct { | |
2020 | + GO_UP, | |
2021 | + GO_DOWN, | |
2022 | + OBSCURE, | |
2023 | + STILL, | |
2024 | +}; | |
2025 | + | |
2026 | +/* random sampling hash state machine */ | |
2027 | +static struct { | |
2028 | + enum rshash_states state; | |
2029 | + enum rshash_direct pre_direct; | |
2030 | + u8 below_count; | |
2031 | + /* Keep a lookup window of size 5; if above_count/below_count > 3 | |
2032 | + * in this window, we stop trying. | |
2033 | + */ | |
2034 | + u8 lookup_window_index; | |
2035 | + u64 stable_benefit; | |
2036 | + unsigned long turn_point_down; | |
2037 | + unsigned long turn_benefit_down; | |
2038 | + unsigned long turn_point_up; | |
2039 | + unsigned long turn_benefit_up; | |
2040 | + unsigned long stable_point; | |
2041 | +} rshash_state; | |
2042 | + | |
2043 | +/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ | |
2044 | +static u32 *zero_hash_table; | |
2045 | + | |
2046 | +static inline struct node_vma *alloc_node_vma(void) | |
2047 | +{ | |
2048 | + struct node_vma *node_vma; | |
2049 | + node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL); | |
2050 | + if (node_vma) { | |
2051 | + INIT_HLIST_HEAD(&node_vma->rmap_hlist); | |
2052 | + INIT_HLIST_NODE(&node_vma->hlist); | |
2053 | + } | |
2054 | + return node_vma; | |
2055 | +} | |
2056 | + | |
2057 | +static inline void free_node_vma(struct node_vma *node_vma) | |
2058 | +{ | |
2059 | + kmem_cache_free(node_vma_cache, node_vma); | |
2060 | +} | |
2061 | + | |
2062 | + | |
2063 | +static inline struct vma_slot *alloc_vma_slot(void) | |
2064 | +{ | |
2065 | + struct vma_slot *slot; | |
2066 | + | |
2067 | + /* | |
2068 | + * In case uksm has not been initialized by now. | |
2069 | + * Oops, we need to consider the call site of uksm_init() in the future. | |
2070 | + */ | |
2071 | + if (!vma_slot_cache) | |
2072 | + return NULL; | |
2073 | + | |
2074 | + slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL); | |
2075 | + if (slot) { | |
2076 | + INIT_LIST_HEAD(&slot->slot_list); | |
2077 | + INIT_LIST_HEAD(&slot->dedup_list); | |
2078 | + slot->flags |= UKSM_SLOT_NEED_RERAND; | |
2079 | + } | |
2080 | + return slot; | |
2081 | +} | |
2082 | + | |
2083 | +static inline void free_vma_slot(struct vma_slot *vma_slot) | |
2084 | +{ | |
2085 | + kmem_cache_free(vma_slot_cache, vma_slot); | |
2086 | +} | |
2087 | + | |
2088 | + | |
2089 | + | |
2090 | +static inline struct rmap_item *alloc_rmap_item(void) | |
2091 | +{ | |
2092 | + struct rmap_item *rmap_item; | |
2093 | + | |
2094 | + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); | |
2095 | + if (rmap_item) { | |
2096 | + /* the lowest bit must be clear; it is reserved for flag use */ | |
2097 | + BUG_ON(is_addr(rmap_item)); | |
2098 | + } | |
2099 | + return rmap_item; | |
2100 | +} | |
2101 | + | |
2102 | +static inline void free_rmap_item(struct rmap_item *rmap_item) | |
2103 | +{ | |
2104 | + rmap_item->slot = NULL; /* debug safety */ | |
2105 | + kmem_cache_free(rmap_item_cache, rmap_item); | |
2106 | +} | |
2107 | + | |
2108 | +static inline struct stable_node *alloc_stable_node(void) | |
2109 | +{ | |
2110 | + struct stable_node *node; | |
2111 | + node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC); | |
2112 | + if (!node) | |
2113 | + return NULL; | |
2114 | + | |
2115 | + INIT_HLIST_HEAD(&node->hlist); | |
2116 | + list_add(&node->all_list, &stable_node_list); | |
2117 | + return node; | |
2118 | +} | |
2119 | + | |
2120 | +static inline void free_stable_node(struct stable_node *stable_node) | |
2121 | +{ | |
2122 | + list_del(&stable_node->all_list); | |
2123 | + kmem_cache_free(stable_node_cache, stable_node); | |
2124 | +} | |
2125 | + | |
2126 | +static inline struct tree_node *alloc_tree_node(struct list_head *list) | |
2127 | +{ | |
2128 | + struct tree_node *node; | |
2129 | + node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC); | |
2130 | + if (!node) | |
2131 | + return NULL; | |
2132 | + | |
2133 | + list_add(&node->all_list, list); | |
2134 | + return node; | |
2135 | +} | |
2136 | + | |
2137 | +static inline void free_tree_node(struct tree_node *node) | |
2138 | +{ | |
2139 | + list_del(&node->all_list); | |
2140 | + kmem_cache_free(tree_node_cache, node); | |
2141 | +} | |
2142 | + | |
2143 | +static void uksm_drop_anon_vma(struct rmap_item *rmap_item) | |
2144 | +{ | |
2145 | + struct anon_vma *anon_vma = rmap_item->anon_vma; | |
2146 | + | |
2147 | + put_anon_vma(anon_vma); | |
2148 | +} | |
2149 | + | |
2150 | + | |
2151 | +/** | |
2152 | + * Remove a stable node from stable_tree, may unlink from its tree_node and | |
2153 | + * may remove its parent tree_node if no other stable node is pending. | |
2154 | + * | |
2155 | + * @stable_node The node to be removed | |
2156 | + * @unlink_rb Will this node be unlinked from the rbtree? | |
2157 | + * @remove_tree_node Will its tree_node be removed if empty? | |
2158 | + */ | |
2159 | +static void remove_node_from_stable_tree(struct stable_node *stable_node, | |
2160 | + int unlink_rb, int remove_tree_node) | |
2161 | +{ | |
2162 | + struct node_vma *node_vma; | |
2163 | + struct rmap_item *rmap_item; | |
2164 | + struct hlist_node *n; | |
2165 | + | |
2166 | + if (!hlist_empty(&stable_node->hlist)) { | |
2167 | + hlist_for_each_entry_safe(node_vma, n, | |
2168 | + &stable_node->hlist, hlist) { | |
2169 | + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { | |
2170 | + uksm_pages_sharing--; | |
2171 | + | |
2172 | + uksm_drop_anon_vma(rmap_item); | |
2173 | + rmap_item->address &= PAGE_MASK; | |
2174 | + } | |
2175 | + free_node_vma(node_vma); | |
2176 | + cond_resched(); | |
2177 | + } | |
2178 | + | |
2179 | + /* the last one is counted as shared */ | |
2180 | + uksm_pages_shared--; | |
2181 | + uksm_pages_sharing++; | |
2182 | + } | |
2183 | + | |
2184 | + if (stable_node->tree_node && unlink_rb) { | |
2185 | + rb_erase(&stable_node->node, | |
2186 | + &stable_node->tree_node->sub_root); | |
2187 | + | |
2188 | + if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && | |
2189 | + remove_tree_node) { | |
2190 | + rb_erase(&stable_node->tree_node->node, | |
2191 | + root_stable_treep); | |
2192 | + free_tree_node(stable_node->tree_node); | |
2193 | + } else { | |
2194 | + stable_node->tree_node->count--; | |
2195 | + } | |
2196 | + } | |
2197 | + | |
2198 | + free_stable_node(stable_node); | |
2199 | +} | |
2200 | + | |
2201 | + | |
2202 | +/* | |
2203 | + * get_uksm_page: checks if the page indicated by the stable node | |
2204 | + * is still its ksm page, despite having held no reference to it. | |
2205 | + * In which case we can trust the content of the page, and it | |
2206 | + * returns the gotten page; but if the page has now been zapped, | |
2207 | + * remove the stale node from the stable tree and return NULL. | |
2208 | + * | |
2209 | + * You would expect the stable_node to hold a reference to the ksm page. | |
2210 | + * But if it increments the page's count, swapping out has to wait for | |
2211 | + * ksmd to come around again before it can free the page, which may take | |
2212 | + * seconds or even minutes: much too unresponsive. So instead we use a | |
2213 | + * "keyhole reference": access to the ksm page from the stable node peeps | |
2214 | + * out through its keyhole to see if that page still holds the right key, | |
2215 | + * pointing back to this stable node. This relies on freeing a PageAnon | |
2216 | + * page to reset its page->mapping to NULL, and relies on no other use of | |
2217 | + * a page to put something that might look like our key in page->mapping. | |
2218 | + * | |
2219 | + * include/linux/pagemap.h page_cache_get_speculative() is a good reference, | |
2220 | + * but this is different - made simpler by uksm_thread_mutex being held, but | |
2221 | + * interesting for assuming that no other use of the struct page could ever | |
2222 | + * put our expected_mapping into page->mapping (or a field of the union which | |
2223 | + * coincides with page->mapping). The RCU calls are not for KSM at all, but | |
2224 | + * to keep the page_count protocol described with page_cache_get_speculative. | |
2225 | + * | |
2226 | + * Note: it is possible that get_uksm_page() will return NULL one moment, | |
2227 | + * then page the next, if the page is in between page_freeze_refs() and | |
2228 | + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page | |
2229 | + * is on its way to being freed; but it is an anomaly to bear in mind. | |
2230 | + * | |
2231 | + * @unlink_rb: if the removal of this node will firstly unlink from | |
2232 | + * its rbtree. stable_node_reinsert will prevent this when restructuring the | |
2233 | + * node from its old tree. | |
2234 | + * | |
2235 | + * @remove_tree_node: if this is the last one of its tree_node, will the | |
2236 | + * tree_node be freed ? If we are inserting stable node, this tree_node may | |
2237 | + * be reused, so don't free it. | |
2238 | + */ | |
2239 | +static struct page *get_uksm_page(struct stable_node *stable_node, | |
2240 | + int unlink_rb, int remove_tree_node) | |
2241 | +{ | |
2242 | + struct page *page; | |
2243 | + void *expected_mapping; | |
2244 | + | |
2245 | + page = pfn_to_page(stable_node->kpfn); | |
2246 | + expected_mapping = (void *)stable_node + | |
2247 | + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | |
2248 | + rcu_read_lock(); | |
2249 | + if (page->mapping != expected_mapping) | |
2250 | + goto stale; | |
2251 | + if (!get_page_unless_zero(page)) | |
2252 | + goto stale; | |
2253 | + if (page->mapping != expected_mapping) { | |
2254 | + put_page(page); | |
2255 | + goto stale; | |
2256 | + } | |
2257 | + rcu_read_unlock(); | |
2258 | + return page; | |
2259 | +stale: | |
2260 | + rcu_read_unlock(); | |
2261 | + remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); | |
2262 | + | |
2263 | + return NULL; | |
2264 | +} | |
2265 | + | |
2266 | +/* | |
2267 | + * Removing rmap_item from stable or unstable tree. | |
2268 | + * This function will clean the information from the stable/unstable tree. | |
2269 | + */ | |
2270 | +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |
2271 | +{ | |
2272 | + if (rmap_item->address & STABLE_FLAG) { | |
2273 | + struct stable_node *stable_node; | |
2274 | + struct node_vma *node_vma; | |
2275 | + struct page *page; | |
2276 | + | |
2277 | + node_vma = rmap_item->head; | |
2278 | + stable_node = node_vma->head; | |
2279 | + page = get_uksm_page(stable_node, 1, 1); | |
2280 | + if (!page) | |
2281 | + goto out; | |
2282 | + | |
2283 | + /* | |
2284 | + * page lock is needed because it's racing with | |
2285 | + * try_to_unmap_ksm(), etc. | |
2286 | + */ | |
2287 | + lock_page(page); | |
2288 | + hlist_del(&rmap_item->hlist); | |
2289 | + | |
2290 | + if (hlist_empty(&node_vma->rmap_hlist)) { | |
2291 | + hlist_del(&node_vma->hlist); | |
2292 | + free_node_vma(node_vma); | |
2293 | + } | |
2294 | + unlock_page(page); | |
2295 | + | |
2296 | + put_page(page); | |
2297 | + if (hlist_empty(&stable_node->hlist)) { | |
2298 | + /* do NOT call remove_node_from_stable_tree() here, | |
2299 | + * it's possible for a forked rmap_item not to be in | |
2300 | + * the stable tree while the in-tree rmap_items have | |
2301 | + * been deleted. | |
2302 | + */ | |
2303 | + uksm_pages_shared--; | |
2304 | + } else | |
2305 | + uksm_pages_sharing--; | |
2306 | + | |
2307 | + | |
2308 | + uksm_drop_anon_vma(rmap_item); | |
2309 | + } else if (rmap_item->address & UNSTABLE_FLAG) { | |
2310 | + if (rmap_item->hash_round == uksm_hash_round) { | |
2311 | + | |
2312 | + rb_erase(&rmap_item->node, | |
2313 | + &rmap_item->tree_node->sub_root); | |
2314 | + if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { | |
2315 | + rb_erase(&rmap_item->tree_node->node, | |
2316 | + &root_unstable_tree); | |
2317 | + | |
2318 | + free_tree_node(rmap_item->tree_node); | |
2319 | + } else | |
2320 | + rmap_item->tree_node->count--; | |
2321 | + } | |
2322 | + uksm_pages_unshared--; | |
2323 | + } | |
2324 | + | |
2325 | + rmap_item->address &= PAGE_MASK; | |
2326 | + rmap_item->hash_max = 0; | |
2327 | + | |
2328 | +out: | |
2329 | + cond_resched(); /* we're called from many long loops */ | |
2330 | +} | |
2331 | + | |
2332 | +static inline int slot_in_uksm(struct vma_slot *slot) | |
2333 | +{ | |
2334 | + return list_empty(&slot->slot_list); | |
2335 | +} | |
2336 | + | |
2337 | +/* | |
2338 | + * Test if the mm is exiting | |
2339 | + */ | |
2340 | +static inline bool uksm_test_exit(struct mm_struct *mm) | |
2341 | +{ | |
2342 | + return atomic_read(&mm->mm_users) == 0; | |
2343 | +} | |
2344 | + | |
2345 | +/** | |
2346 | + * Need to do two things: | |
2347 | + * 1. check if slot was moved to del list | |
2348 | + * 2. make sure the mmap_sem is manipulated under valid vma. | |
2349 | + * | |
2350 | + * My concern here is that in some cases, this may cause | |
2351 | + * vma_slot_list_lock() waiters to be serialized further by some | |
2352 | + * sem->wait_lock; can this really be expensive? | |
2353 | + * | |
2354 | + * | |
2355 | + * @return | |
2356 | + * 0: if successfully locked mmap_sem | |
2357 | + * -ENOENT: this slot was moved to del list | |
2358 | + * -EBUSY: vma lock failed | |
2359 | + */ | |
2360 | +static int try_down_read_slot_mmap_sem(struct vma_slot *slot) | |
2361 | +{ | |
2362 | + struct vm_area_struct *vma; | |
2363 | + struct mm_struct *mm; | |
2364 | + struct rw_semaphore *sem; | |
2365 | + | |
2366 | + spin_lock(&vma_slot_list_lock); | |
2367 | + | |
2368 | + /* The slot_list was removed and re-initialized from the new list when the | |
2369 | + * slot entered uksm. If it's not empty now, it must have been moved to the del list. | |
2370 | + */ | |
2371 | + if (!slot_in_uksm(slot)) { | |
2372 | + spin_unlock(&vma_slot_list_lock); | |
2373 | + return -ENOENT; | |
2374 | + } | |
2375 | + | |
2376 | + BUG_ON(slot->pages != vma_pages(slot->vma)); | |
2377 | + /* Ok, vma still valid */ | |
2378 | + vma = slot->vma; | |
2379 | + mm = vma->vm_mm; | |
2380 | + sem = &mm->mmap_sem; | |
2381 | + | |
2382 | + if (uksm_test_exit(mm)) { | |
2383 | + spin_unlock(&vma_slot_list_lock); | |
2384 | + return -ENOENT; | |
2385 | + } | |
2386 | + | |
2387 | + if (down_read_trylock(sem)) { | |
2388 | + spin_unlock(&vma_slot_list_lock); | |
2389 | + return 0; | |
2390 | + } | |
2391 | + | |
2392 | + spin_unlock(&vma_slot_list_lock); | |
2393 | + return -EBUSY; | |
2394 | +} | |
2395 | + | |
2396 | +static inline unsigned long | |
2397 | +vma_page_address(struct page *page, struct vm_area_struct *vma) | |
2398 | +{ | |
2399 | + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | |
2400 | + unsigned long address; | |
2401 | + | |
2402 | + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | |
2403 | + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { | |
2404 | + /* page should be within @vma mapping range */ | |
2405 | + return -EFAULT; | |
2406 | + } | |
2407 | + return address; | |
2408 | +} | |
2409 | + | |
2410 | + | |
2411 | +/* return 0 on success with the item's mmap_sem locked */ | |
2412 | +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item) | |
2413 | +{ | |
2414 | + struct mm_struct *mm; | |
2415 | + struct vma_slot *slot = item->slot; | |
2416 | + int err = -EINVAL; | |
2417 | + | |
2418 | + struct page *page; | |
2419 | + | |
2420 | + /* | |
2421 | + * try_down_read_slot_mmap_sem() returns non-zero if the slot | |
2422 | + * has been removed by uksm_remove_vma(). | |
2423 | + */ | |
2424 | + if (try_down_read_slot_mmap_sem(slot)) | |
2425 | + return -EBUSY; | |
2426 | + | |
2427 | + mm = slot->vma->vm_mm; | |
2428 | + | |
2429 | + if (uksm_test_exit(mm)) | |
2430 | + goto failout_up; | |
2431 | + | |
2432 | + page = item->page; | |
2433 | + rcu_read_lock(); | |
2434 | + if (!get_page_unless_zero(page)) { | |
2435 | + rcu_read_unlock(); | |
2436 | + goto failout_up; | |
2437 | + } | |
2438 | + | |
2439 | + /* No need to consider huge page here. */ | |
2440 | + if (item->slot->vma->anon_vma != page_anon_vma(page) || | |
2441 | + vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) { | |
2442 | + /* | |
2443 | + * TODO: | |
2444 | + * should we release this item because of its stale page | |
2445 | + * mapping? | |
2446 | + */ | |
2447 | + put_page(page); | |
2448 | + rcu_read_unlock(); | |
2449 | + goto failout_up; | |
2450 | + } | |
2451 | + rcu_read_unlock(); | |
2452 | + return 0; | |
2453 | + | |
2454 | +failout_up: | |
2455 | + up_read(&mm->mmap_sem); | |
2456 | + return err; | |
2457 | +} | |
2458 | + | |
2459 | +/* | |
2460 | + * What kind of VMA is considered? | |
2461 | + */ | |
2462 | +static inline int vma_can_enter(struct vm_area_struct *vma) | |
2463 | +{ | |
2464 | + return uksm_flags_can_scan(vma->vm_flags); | |
2465 | +} | |
2466 | + | |
2467 | +/* | |
2468 | + * Called whenever a fresh new vma is created. A new vma_slot | |
2469 | + * is created and inserted into a global list. Must be called | |
2470 | + * after the vma is inserted into its mm. | |
2471 | + */ | |
2472 | +void uksm_vma_add_new(struct vm_area_struct *vma) | |
2473 | +{ | |
2474 | + struct vma_slot *slot; | |
2475 | + | |
2476 | + if (!vma_can_enter(vma)) { | |
2477 | + vma->uksm_vma_slot = NULL; | |
2478 | + return; | |
2479 | + } | |
2480 | + | |
2481 | + slot = alloc_vma_slot(); | |
2482 | + if (!slot) { | |
2483 | + vma->uksm_vma_slot = NULL; | |
2484 | + return; | |
2485 | + } | |
2486 | + | |
2487 | + vma->uksm_vma_slot = slot; | |
2488 | + vma->vm_flags |= VM_MERGEABLE; | |
2489 | + slot->vma = vma; | |
2490 | + slot->mm = vma->vm_mm; | |
2491 | + slot->ctime_j = jiffies; | |
2492 | + slot->pages = vma_pages(vma); | |
2493 | + spin_lock(&vma_slot_list_lock); | |
2494 | + list_add_tail(&slot->slot_list, &vma_slot_new); | |
2495 | + spin_unlock(&vma_slot_list_lock); | |
2496 | +} | |
2497 | + | |
2498 | +/* | |
2499 | + * Called after vma is unlinked from its mm | |
2500 | + */ | |
2501 | +void uksm_remove_vma(struct vm_area_struct *vma) | |
2502 | +{ | |
2503 | + struct vma_slot *slot; | |
2504 | + | |
2505 | + if (!vma->uksm_vma_slot) | |
2506 | + return; | |
2507 | + | |
2508 | + slot = vma->uksm_vma_slot; | |
2509 | + spin_lock(&vma_slot_list_lock); | |
2510 | + if (slot_in_uksm(slot)) { | |
2511 | + /** | |
2512 | + * This slot has been added by ksmd, so move to the del list | |
2513 | + * waiting for ksmd to free it. | |
2514 | + */ | |
2515 | + list_add_tail(&slot->slot_list, &vma_slot_del); | |
2516 | + } else { | |
2517 | + /** | |
2518 | + * It's still on new list. It's ok to free slot directly. | |
2519 | + */ | |
2520 | + list_del(&slot->slot_list); | |
2521 | + free_vma_slot(slot); | |
2522 | + } | |
2523 | + spin_unlock(&vma_slot_list_lock); | |
2524 | + vma->uksm_vma_slot = NULL; | |
2525 | +} | |
2526 | + | |
2527 | +/* 32/3 < they < 32/2 */ | |
2528 | +#define shiftl 8 | |
2529 | +#define shiftr 12 | |
2530 | + | |
2531 | +#define HASH_FROM_TO(from, to) \ | |
2532 | +for (index = from; index < to; index++) { \ | |
2533 | + pos = random_nums[index]; \ | |
2534 | + hash += key[pos]; \ | |
2535 | + hash += (hash << shiftl); \ | |
2536 | + hash ^= (hash >> shiftr); \ | |
2537 | +} | |
2538 | + | |
2539 | + | |
2540 | +#define HASH_FROM_DOWN_TO(from, to) \ | |
2541 | +for (index = from - 1; index >= to; index--) { \ | |
2542 | + hash ^= (hash >> shiftr); \ | |
2543 | + hash ^= (hash >> (shiftr*2)); \ | |
2544 | + hash -= (hash << shiftl); \ | |
2545 | + hash += (hash << (shiftl*2)); \ | |
2546 | + pos = random_nums[index]; \ | |
2547 | + hash -= key[pos]; \ | |
2548 | +} | |
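+
+/*
+ * HASH_FROM_DOWN_TO is intended as the inverse of HASH_FROM_TO: it unwinds
+ * the mixing steps and subtracts the sampled words again, so delta_hash()
+ * below can reduce the strength of an existing hash value without rehashing
+ * the whole page.
+ */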
2549 | + | |
2550 | +/* | |
2551 | + * The main random sample hash function. | |
2552 | + */ | |
2553 | +static u32 random_sample_hash(void *addr, u32 hash_strength) | |
2554 | +{ | |
2555 | + u32 hash = 0xdeadbeef; | |
2556 | + int index, pos, loop = hash_strength; | |
2557 | + u32 *key = (u32 *)addr; | |
2558 | + | |
2559 | + if (loop > HASH_STRENGTH_FULL) | |
2560 | + loop = HASH_STRENGTH_FULL; | |
2561 | + | |
2562 | + HASH_FROM_TO(0, loop); | |
2563 | + | |
2564 | + if (hash_strength > HASH_STRENGTH_FULL) { | |
2565 | + loop = hash_strength - HASH_STRENGTH_FULL; | |
2566 | + HASH_FROM_TO(0, loop); | |
2567 | + } | |
2568 | + | |
2569 | + return hash; | |
2570 | +} | |
2571 | + | |
2572 | + | |
2573 | +/** | |
2574 | + * It's used when hash strength is adjusted | |
2575 | + * | |
2576 | + * @addr The page's virtual address | |
2577 | + * @from The original hash strength | |
2578 | + * @to The hash strength changed to | |
2579 | + * @hash The hash value generated with the "from" hash strength | |
2580 | + * | |
2581 | + * return the hash value | |
2582 | + */ | |
2583 | +static u32 delta_hash(void *addr, int from, int to, u32 hash) | |
2584 | +{ | |
2585 | + u32 *key = (u32 *)addr; | |
2586 | + int index, pos; /* make sure they are int type */ | |
2587 | + | |
2588 | + if (to > from) { | |
2589 | + if (from >= HASH_STRENGTH_FULL) { | |
2590 | + from -= HASH_STRENGTH_FULL; | |
2591 | + to -= HASH_STRENGTH_FULL; | |
2592 | + HASH_FROM_TO(from, to); | |
2593 | + } else if (to <= HASH_STRENGTH_FULL) { | |
2594 | + HASH_FROM_TO(from, to); | |
2595 | + } else { | |
2596 | + HASH_FROM_TO(from, HASH_STRENGTH_FULL); | |
2597 | + HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); | |
2598 | + } | |
2599 | + } else { | |
2600 | + if (from <= HASH_STRENGTH_FULL) { | |
2601 | + HASH_FROM_DOWN_TO(from, to); | |
2602 | + } else if (to >= HASH_STRENGTH_FULL) { | |
2603 | + from -= HASH_STRENGTH_FULL; | |
2604 | + to -= HASH_STRENGTH_FULL; | |
2605 | + HASH_FROM_DOWN_TO(from, to); | |
2606 | + } else { | |
2607 | + HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); | |
2608 | + HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); | |
2609 | + } | |
2610 | + } | |
2611 | + | |
2612 | + return hash; | |
2613 | +} | |
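+
+/*
+ * For example, when the strength grows from 128 to 256 sampled words, only
+ * the 128 newly sampled positions are mixed into the old hash; shrinking
+ * goes through HASH_FROM_DOWN_TO instead. Strengths above HASH_STRENGTH_FULL
+ * loop back and re-sample positions from the start.
+ */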
2614 | + | |
2615 | + | |
2616 | + | |
2617 | + | |
2618 | +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) | |
2619 | + | |
2620 | +/** | |
2621 | + * | |
2622 | + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round | |
2623 | + * has finished. | |
2624 | + * | |
2625 | + * return 0 if no page has been scanned since last call, 1 otherwise. | |
2626 | + */ | |
2627 | +static inline int encode_benefit(void) | |
2628 | +{ | |
2629 | + u64 scanned_delta, pos_delta, neg_delta; | |
2630 | + unsigned long base = benefit.base; | |
2631 | + | |
2632 | + scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; | |
2633 | + | |
2634 | + if (!scanned_delta) | |
2635 | + return 0; | |
2636 | + | |
2637 | + scanned_delta >>= base; | |
2638 | + pos_delta = rshash_pos >> base; | |
2639 | + neg_delta = rshash_neg >> base; | |
2640 | + | |
2641 | + if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || | |
2642 | + CAN_OVERFLOW_U64(benefit.neg, neg_delta) || | |
2643 | + CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { | |
2644 | + benefit.scanned >>= 1; | |
2645 | + benefit.neg >>= 1; | |
2646 | + benefit.pos >>= 1; | |
2647 | + benefit.base++; | |
2648 | + scanned_delta >>= 1; | |
2649 | + pos_delta >>= 1; | |
2650 | + neg_delta >>= 1; | |
2651 | + } | |
2652 | + | |
2653 | + benefit.pos += pos_delta; | |
2654 | + benefit.neg += neg_delta; | |
2655 | + benefit.scanned += scanned_delta; | |
2656 | + | |
2657 | + BUG_ON(!benefit.scanned); | |
2658 | + | |
2659 | + rshash_pos = rshash_neg = 0; | |
2660 | + uksm_pages_scanned_last = uksm_pages_scanned; | |
2661 | + | |
2662 | + return 1; | |
2663 | +} | |
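+
+/*
+ * In other words, benefit.pos/neg/scanned are stored right-shifted by
+ * benefit.base (a shared exponent); whenever an addition would overflow u64,
+ * all three are halved and base is incremented, which preserves their
+ * relative ratios.
+ */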
2664 | + | |
2665 | +static inline void reset_benefit(void) | |
2666 | +{ | |
2667 | + benefit.pos = 0; | |
2668 | + benefit.neg = 0; | |
2669 | + benefit.base = 0; | |
2670 | + benefit.scanned = 0; | |
2671 | +} | |
2672 | + | |
2673 | +static inline void inc_rshash_pos(unsigned long delta) | |
2674 | +{ | |
2675 | + if (CAN_OVERFLOW_U64(rshash_pos, delta)) | |
2676 | + encode_benefit(); | |
2677 | + | |
2678 | + rshash_pos += delta; | |
2679 | +} | |
2680 | + | |
2681 | +static inline void inc_rshash_neg(unsigned long delta) | |
2682 | +{ | |
2683 | + if (CAN_OVERFLOW_U64(rshash_neg, delta)) | |
2684 | + encode_benefit(); | |
2685 | + | |
2686 | + rshash_neg += delta; | |
2687 | +} | |
2688 | + | |
2689 | + | |
2690 | +static inline u32 page_hash(struct page *page, unsigned long hash_strength, | |
2691 | + int cost_accounting) | |
2692 | +{ | |
2693 | + u32 val; | |
2694 | + unsigned long delta; | |
2695 | + | |
2696 | + void *addr = kmap_atomic(page); | |
2697 | + | |
2698 | + val = random_sample_hash(addr, hash_strength); | |
2699 | + kunmap_atomic(addr); | |
2700 | + | |
2701 | + if (cost_accounting) { | |
2702 | + if (HASH_STRENGTH_FULL > hash_strength) | |
2703 | + delta = HASH_STRENGTH_FULL - hash_strength; | |
2704 | + else | |
2705 | + delta = 0; | |
2706 | + | |
2707 | + inc_rshash_pos(delta); | |
2708 | + } | |
2709 | + | |
2710 | + return val; | |
2711 | +} | |
2712 | + | |
2713 | +static int memcmp_pages(struct page *page1, struct page *page2, | |
2714 | + int cost_accounting) | |
2715 | +{ | |
2716 | + char *addr1, *addr2; | |
2717 | + int ret; | |
2718 | + | |
2719 | + addr1 = kmap_atomic(page1); | |
2720 | + addr2 = kmap_atomic(page2); | |
2721 | + ret = memcmp(addr1, addr2, PAGE_SIZE); | |
2722 | + kunmap_atomic(addr2); | |
2723 | + kunmap_atomic(addr1); | |
2724 | + | |
2725 | + if (cost_accounting) | |
2726 | + inc_rshash_neg(memcmp_cost); | |
2727 | + | |
2728 | + return ret; | |
2729 | +} | |
2730 | + | |
2731 | +static inline int pages_identical(struct page *page1, struct page *page2) | |
2732 | +{ | |
2733 | + return !memcmp_pages(page1, page2, 0); | |
2734 | +} | |
2735 | + | |
2736 | +static inline int is_page_full_zero(struct page *page) | |
2737 | +{ | |
2738 | + char *addr; | |
2739 | + int ret; | |
2740 | + | |
2741 | + addr = kmap_atomic(page); | |
2742 | + ret = is_full_zero(addr, PAGE_SIZE); | |
2743 | + kunmap_atomic(addr); | |
2744 | + | |
2745 | + return ret; | |
2746 | +} | |
2747 | + | |
2748 | +static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |
2749 | + pte_t *orig_pte, pte_t *old_pte) | |
2750 | +{ | |
2751 | + struct mm_struct *mm = vma->vm_mm; | |
2752 | + unsigned long addr; | |
2753 | + pte_t *ptep; | |
2754 | + spinlock_t *ptl; | |
2755 | + int swapped; | |
2756 | + int err = -EFAULT; | |
2757 | + unsigned long mmun_start; /* For mmu_notifiers */ | |
2758 | + unsigned long mmun_end; /* For mmu_notifiers */ | |
2759 | + | |
2760 | + addr = page_address_in_vma(page, vma); | |
2761 | + if (addr == -EFAULT) | |
2762 | + goto out; | |
2763 | + | |
2764 | + BUG_ON(PageTransCompound(page)); | |
2765 | + | |
2766 | + mmun_start = addr; | |
2767 | + mmun_end = addr + PAGE_SIZE; | |
2768 | + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | |
2769 | + | |
2770 | + ptep = page_check_address(page, mm, addr, &ptl, 0); | |
2771 | + if (!ptep) | |
2772 | + goto out_mn; | |
2773 | + | |
2774 | + if (old_pte) | |
2775 | + *old_pte = *ptep; | |
2776 | + | |
2777 | + if (pte_write(*ptep) || pte_dirty(*ptep)) { | |
2778 | + pte_t entry; | |
2779 | + | |
2780 | + swapped = PageSwapCache(page); | |
2781 | + flush_cache_page(vma, addr, page_to_pfn(page)); | |
2782 | + /* | |
2783 | + * Ok this is tricky, when get_user_pages_fast() runs it doesn't | |
2784 | + * take any lock, therefore the check that we are going to make | |
2785 | + * with the pagecount against the mapcount is racy and | |
2786 | + * O_DIRECT can happen right after the check. | |
2787 | + * So we clear the pte and flush the tlb before the check; | |
2788 | + * this assures us that no O_DIRECT can happen after the check | |
2789 | + * or in the middle of the check. | |
2790 | + */ | |
2791 | + entry = ptep_clear_flush(vma, addr, ptep); | |
2792 | + /* | |
2793 | + * Check that no O_DIRECT or similar I/O is in progress on the | |
2794 | + * page | |
2795 | + */ | |
2796 | + if (page_mapcount(page) + 1 + swapped != page_count(page)) { | |
2797 | + set_pte_at(mm, addr, ptep, entry); | |
2798 | + goto out_unlock; | |
2799 | + } | |
2800 | + if (pte_dirty(entry)) | |
2801 | + set_page_dirty(page); | |
2802 | + entry = pte_mkclean(pte_wrprotect(entry)); | |
2803 | + set_pte_at_notify(mm, addr, ptep, entry); | |
2804 | + } | |
2805 | + *orig_pte = *ptep; | |
2806 | + err = 0; | |
2807 | + | |
2808 | +out_unlock: | |
2809 | + pte_unmap_unlock(ptep, ptl); | |
2810 | +out_mn: | |
2811 | + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | |
2812 | +out: | |
2813 | + return err; | |
2814 | +} | |
2815 | + | |
2816 | +#define MERGE_ERR_PGERR 1 /* the page is invalid, cannot continue */ | |
2817 | +#define MERGE_ERR_COLLI 2 /* there is a collision */ | |
2818 | +#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ | |
2819 | +#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ | |
2820 | + | |
2821 | + | |
2822 | +/** | |
2823 | + * replace_page - replace page in vma by new ksm page | |
2824 | + * @vma: vma that holds the pte pointing to page | |
2825 | + * @page: the page we are replacing by kpage | |
2826 | + * @kpage: the ksm page we replace page by | |
2827 | + * @orig_pte: the original value of the pte | |
2828 | + * | |
2829 | + * Returns 0 on success, MERGE_ERR_PGERR on failure. | |
2830 | + */ | |
2831 | +static int replace_page(struct vm_area_struct *vma, struct page *page, | |
2832 | + struct page *kpage, pte_t orig_pte) | |
2833 | +{ | |
2834 | + struct mm_struct *mm = vma->vm_mm; | |
2835 | + pgd_t *pgd; | |
2836 | + pud_t *pud; | |
2837 | + pmd_t *pmd; | |
2838 | + pte_t *ptep; | |
2839 | + spinlock_t *ptl; | |
2840 | + pte_t entry; | |
2841 | + | |
2842 | + unsigned long addr; | |
2843 | + int err = MERGE_ERR_PGERR; | |
2844 | + unsigned long mmun_start; /* For mmu_notifiers */ | |
2845 | + unsigned long mmun_end; /* For mmu_notifiers */ | |
2846 | + | |
2847 | + addr = page_address_in_vma(page, vma); | |
2848 | + if (addr == -EFAULT) | |
2849 | + goto out; | |
2850 | + | |
2851 | + pgd = pgd_offset(mm, addr); | |
2852 | + if (!pgd_present(*pgd)) | |
2853 | + goto out; | |
2854 | + | |
2855 | + pud = pud_offset(pgd, addr); | |
2856 | + if (!pud_present(*pud)) | |
2857 | + goto out; | |
2858 | + | |
2859 | + pmd = pmd_offset(pud, addr); | |
2860 | + BUG_ON(pmd_trans_huge(*pmd)); | |
2861 | + if (!pmd_present(*pmd)) | |
2862 | + goto out; | |
2863 | + | |
2864 | + mmun_start = addr; | |
2865 | + mmun_end = addr + PAGE_SIZE; | |
2866 | + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | |
2867 | + | |
2868 | + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | |
2869 | + if (!pte_same(*ptep, orig_pte)) { | |
2870 | + pte_unmap_unlock(ptep, ptl); | |
2871 | + goto out_mn; | |
2872 | + } | |
2873 | + | |
2874 | + flush_cache_page(vma, addr, pte_pfn(*ptep)); | |
2875 | + ptep_clear_flush(vma, addr, ptep); | |
2876 | + entry = mk_pte(kpage, vma->vm_page_prot); | |
2877 | + | |
2878 | + /* special treatment is needed for zero_page */ | |
2879 | + if ((page_to_pfn(kpage) == uksm_zero_pfn) || | |
2880 | + (page_to_pfn(kpage) == zero_pfn)) | |
2881 | + entry = pte_mkspecial(entry); | |
2882 | + else { | |
2883 | + get_page(kpage); | |
2884 | + page_add_anon_rmap(kpage, vma, addr); | |
2885 | + } | |
2886 | + | |
2887 | + set_pte_at_notify(mm, addr, ptep, entry); | |
2888 | + | |
2889 | + page_remove_rmap(page); | |
2890 | + if (!page_mapped(page)) | |
2891 | + try_to_free_swap(page); | |
2892 | + put_page(page); | |
2893 | + | |
2894 | + pte_unmap_unlock(ptep, ptl); | |
2895 | + err = 0; | |
2896 | +out_mn: | |
2897 | + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | |
2898 | +out: | |
2899 | + return err; | |
2900 | +} | |
2901 | + | |
2902 | + | |
2903 | +/** | |
2904 | + * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value. The | |
2905 | + * zero hash value at HASH_STRENGTH_MAX is used to indicate that its | |
2906 | + * hash_max member has not been calculated. | |
2907 | + * | |
2908 | + * @page The page to be hashed | |
2909 | + * @hash_old The hash value calculated with current hash strength | |
2910 | + * | |
2911 | + * return the new hash value calculated at HASH_STRENGTH_MAX | |
2912 | + */ | |
2913 | +static inline u32 page_hash_max(struct page *page, u32 hash_old) | |
2914 | +{ | |
2915 | + u32 hash_max = 0; | |
2916 | + void *addr; | |
2917 | + | |
2918 | + addr = kmap_atomic(page); | |
2919 | + hash_max = delta_hash(addr, hash_strength, | |
2920 | + HASH_STRENGTH_MAX, hash_old); | |
2921 | + | |
2922 | + kunmap_atomic(addr); | |
2923 | + | |
2924 | + if (!hash_max) | |
2925 | + hash_max = 1; | |
2926 | + | |
2927 | + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); | |
2928 | + return hash_max; | |
2929 | +} | |
2930 | + | |
2931 | +/* | |
2932 | + * We compare the hash again, to ensure that it is really a hash collision | |
2933 | + * instead of being caused by page write. | |
2934 | + */ | |
2935 | +static inline int check_collision(struct rmap_item *rmap_item, | |
2936 | + u32 hash) | |
2937 | +{ | |
2938 | + int err; | |
2939 | + struct page *page = rmap_item->page; | |
2940 | + | |
2941 | + /* If this rmap_item has already been hash_maxed, then the collision | |
2942 | + * must have appeared in the second-level rbtree search. In this case we check | |
2943 | + * whether its hash_max value has changed. Otherwise, the collision | |
2944 | + * happened in the first-level rbtree search, so we check against its | |
2945 | + * current hash value. | |
2946 | + */ | |
2947 | + if (rmap_item->hash_max) { | |
2948 | + inc_rshash_neg(memcmp_cost); | |
2949 | + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); | |
2950 | + | |
2951 | + if (rmap_item->hash_max == page_hash_max(page, hash)) | |
2952 | + err = MERGE_ERR_COLLI; | |
2953 | + else | |
2954 | + err = MERGE_ERR_CHANGED; | |
2955 | + } else { | |
2956 | + inc_rshash_neg(memcmp_cost + hash_strength); | |
2957 | + | |
2958 | + if (page_hash(page, hash_strength, 0) == hash) | |
2959 | + err = MERGE_ERR_COLLI; | |
2960 | + else | |
2961 | + err = MERGE_ERR_CHANGED; | |
2962 | + } | |
2963 | + | |
2964 | + return err; | |
2965 | +} | |
2966 | + | |
2967 | +static struct page *page_trans_compound_anon(struct page *page) | |
2968 | +{ | |
2969 | + if (PageTransCompound(page)) { | |
2970 | + struct page *head = compound_head(page); | |
2971 | + /* | |
2972 | + * head may actually be split and freed from under | |
2973 | + * us but it's ok here. | |
2974 | + */ | |
2975 | + if (PageAnon(head)) | |
2976 | + return head; | |
2977 | + } | |
2978 | + return NULL; | |
2979 | +} | |
2980 | + | |
2981 | +static int page_trans_compound_anon_split(struct page *page) | |
2982 | +{ | |
2983 | + int ret = 0; | |
2984 | + struct page *transhuge_head = page_trans_compound_anon(page); | |
2985 | + if (transhuge_head) { | |
2986 | + /* Get the reference on the head to split it. */ | |
2987 | + if (get_page_unless_zero(transhuge_head)) { | |
2988 | + /* | |
2989 | + * Recheck we got the reference while the head | |
2990 | + * was still anonymous. | |
2991 | + */ | |
2992 | + if (PageAnon(transhuge_head)) | |
2993 | + ret = split_huge_page(transhuge_head); | |
2994 | + else | |
2995 | + /* | |
2996 | + * Retry later if split_huge_page ran | |
2997 | + * from under us. | |
2998 | + */ | |
2999 | + ret = 1; | |
3000 | + put_page(transhuge_head); | |
3001 | + } else | |
3002 | + /* Retry later if split_huge_page ran from under us. */ | |
3003 | + ret = 1; | |
3004 | + } | |
3005 | + return ret; | |
3006 | +} | |
3007 | + | |
3008 | +/** | |
3009 | + * Try to merge a rmap_item.page with a kpage in stable node. kpage must | |
3010 | + * already be a ksm page. | |
3011 | + * | |
3012 | + * @return 0 if the pages were merged, -EFAULT otherwise. | |
3013 | + */ | |
3014 | +static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, | |
3015 | + struct page *kpage, u32 hash) | |
3016 | +{ | |
3017 | + struct vm_area_struct *vma = rmap_item->slot->vma; | |
3018 | + struct mm_struct *mm = vma->vm_mm; | |
3019 | + pte_t orig_pte = __pte(0); | |
3020 | + int err = MERGE_ERR_PGERR; | |
3021 | + struct page *page; | |
3022 | + | |
3023 | + if (uksm_test_exit(mm)) | |
3024 | + goto out; | |
3025 | + | |
3026 | + page = rmap_item->page; | |
3027 | + | |
3028 | + if (page == kpage) { /* ksm page forked */ | |
3029 | + err = 0; | |
3030 | + goto out; | |
3031 | + } | |
3032 | + | |
3033 | + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) | |
3034 | + goto out; | |
3035 | + BUG_ON(PageTransCompound(page)); | |
3036 | + | |
3037 | + if (!PageAnon(page) || !PageKsm(kpage)) | |
3038 | + goto out; | |
3039 | + | |
3040 | + /* | |
3041 | + * We need the page lock to read a stable PageSwapCache in | |
3042 | + * write_protect_page(). We use trylock_page() instead of | |
3043 | + * lock_page() because we don't want to wait here - we | |
3044 | + * prefer to continue scanning and merging different pages, | |
3045 | + * then come back to this page when it is unlocked. | |
3046 | + */ | |
3047 | + if (!trylock_page(page)) | |
3048 | + goto out; | |
3049 | + /* | |
3050 | + * If this anonymous page is mapped only here, its pte may need | |
3051 | + * to be write-protected. If it's mapped elsewhere, all of its | |
3052 | + * ptes are necessarily already write-protected. But in either | |
3053 | + * case, we need to lock and check page_count is not raised. | |
3054 | + */ | |
3055 | + if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { | |
3056 | + if (pages_identical(page, kpage)) | |
3057 | + err = replace_page(vma, page, kpage, orig_pte); | |
3058 | + else | |
3059 | + err = check_collision(rmap_item, hash); | |
3060 | + } | |
3061 | + | |
3062 | + if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { | |
3063 | + munlock_vma_page(page); | |
3064 | + if (!PageMlocked(kpage)) { | |
3065 | + unlock_page(page); | |
3066 | + lock_page(kpage); | |
3067 | + mlock_vma_page(kpage); | |
3068 | + page = kpage; /* for final unlock */ | |
3069 | + } | |
3070 | + } | |
3071 | + | |
3072 | + unlock_page(page); | |
3073 | +out: | |
3074 | + return err; | |
3075 | +} | |
3076 | + | |
3077 | + | |
3078 | + | |
3079 | +/** | |
3080 | + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance | |
3081 | + * to restore a page mapping that has been changed in try_to_merge_two_pages. | |
3082 | + * | |
3083 | + * @return 0 on success. | |
3084 | + */ | |
3085 | +static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, | |
3086 | + pte_t orig_pte, pte_t wprt_pte) | |
3087 | +{ | |
3088 | + struct mm_struct *mm = vma->vm_mm; | |
3089 | + pgd_t *pgd; | |
3090 | + pud_t *pud; | |
3091 | + pmd_t *pmd; | |
3092 | + pte_t *ptep; | |
3093 | + spinlock_t *ptl; | |
3094 | + | |
3095 | + int err = -EFAULT; | |
3096 | + | |
3097 | + pgd = pgd_offset(mm, addr); | |
3098 | + if (!pgd_present(*pgd)) | |
3099 | + goto out; | |
3100 | + | |
3101 | + pud = pud_offset(pgd, addr); | |
3102 | + if (!pud_present(*pud)) | |
3103 | + goto out; | |
3104 | + | |
3105 | + pmd = pmd_offset(pud, addr); | |
3106 | + if (!pmd_present(*pmd)) | |
3107 | + goto out; | |
3108 | + | |
3109 | + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | |
3110 | + if (!pte_same(*ptep, wprt_pte)) { | |
3111 | + /* already copied, let it be */ | |
3112 | + pte_unmap_unlock(ptep, ptl); | |
3113 | + goto out; | |
3114 | + } | |
3115 | + | |
3116 | + /* | |
3117 | + * Good, the pte is still as we left it. While we hold a reference on the | |
3118 | + * ksm page it cannot return to the free page pool, so the pte cannot have | |
3119 | + * been changed to another page and then back to this one. Remember also | |
3120 | + * that ksm pages are never reused by do_wp_page(). It is therefore safe | |
3121 | + * to restore the original pte. | |
3122 | + */ | |
3123 | + flush_cache_page(vma, addr, pte_pfn(*ptep)); | |
3124 | + ptep_clear_flush(vma, addr, ptep); | |
3125 | + set_pte_at_notify(mm, addr, ptep, orig_pte); | |
3126 | + | |
3127 | + pte_unmap_unlock(ptep, ptl); | |
3128 | + err = 0; | |
3129 | +out: | |
3130 | + return err; | |
3131 | +} | |
3132 | + | |
3133 | +/** | |
3134 | + * try_to_merge_two_pages() - take two identical pages and prepare | |
3135 | + * them to be merged into one page (rmap_item->page). | |
3136 | + * | |
3137 | + * @return 0 if we successfully merged the two identical pages into | |
3138 | + * one ksm page. MERGE_ERR_COLLI if it was only a hash collision in the | |
3139 | + * rbtree search. MERGE_ERR_CHANGED if rmap_item->page has changed since | |
3140 | + * it was hashed. MERGE_ERR_PGERR otherwise. | |
3141 | + * | |
3142 | + */ | |
3143 | +static int try_to_merge_two_pages(struct rmap_item *rmap_item, | |
3144 | + struct rmap_item *tree_rmap_item, | |
3145 | + u32 hash) | |
3146 | +{ | |
3147 | + pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); | |
3148 | + pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); | |
3149 | + struct vm_area_struct *vma1 = rmap_item->slot->vma; | |
3150 | + struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; | |
3151 | + struct page *page = rmap_item->page; | |
3152 | + struct page *tree_page = tree_rmap_item->page; | |
3153 | + int err = MERGE_ERR_PGERR; | |
3154 | + struct address_space *saved_mapping; | |
3155 | + | |
3156 | + | |
3157 | + if (rmap_item->page == tree_rmap_item->page) | |
3158 | + goto out; | |
3159 | + | |
3160 | + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) | |
3161 | + goto out; | |
3162 | + BUG_ON(PageTransCompound(page)); | |
3163 | + | |
3164 | + if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page)) | |
3165 | + goto out; | |
3166 | + BUG_ON(PageTransCompound(tree_page)); | |
3167 | + | |
3168 | + if (!PageAnon(page) || !PageAnon(tree_page)) | |
3169 | + goto out; | |
3170 | + | |
3171 | + if (!trylock_page(page)) | |
3172 | + goto out; | |
3173 | + | |
3174 | + | |
3175 | + if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { | |
3176 | + unlock_page(page); | |
3177 | + goto out; | |
3178 | + } | |
3179 | + | |
3180 | + /* | |
3181 | + * While we hold page lock, upgrade page from | |
3182 | + * PageAnon+anon_vma to PageKsm+NULL stable_node: | |
3183 | + * stable_tree_insert() will update stable_node. | |
3184 | + */ | |
3185 | + saved_mapping = page->mapping; | |
3186 | + set_page_stable_node(page, NULL); | |
3187 | + mark_page_accessed(page); | |
3188 | + unlock_page(page); | |
3189 | + | |
3190 | + if (!trylock_page(tree_page)) | |
3191 | + goto restore_out; | |
3192 | + | |
3193 | + if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { | |
3194 | + unlock_page(tree_page); | |
3195 | + goto restore_out; | |
3196 | + } | |
3197 | + | |
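| + /* | |
| + * Both ptes are now write-protected, so neither page can change under | |
| + * us; a final byte-for-byte comparison decides whether to merge. | |
| + */ | |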
3198 | + if (pages_identical(page, tree_page)) { | |
3199 | + err = replace_page(vma2, tree_page, page, wprt_pte2); | |
3200 | + if (err) { | |
3201 | + unlock_page(tree_page); | |
3202 | + goto restore_out; | |
3203 | + } | |
3204 | + | |
3205 | + if ((vma2->vm_flags & VM_LOCKED)) { | |
3206 | + munlock_vma_page(tree_page); | |
3207 | + if (!PageMlocked(page)) { | |
3208 | + unlock_page(tree_page); | |
3209 | + lock_page(page); | |
3210 | + mlock_vma_page(page); | |
3211 | + tree_page = page; /* for final unlock */ | |
3212 | + } | |
3213 | + } | |
3214 | + | |
3215 | + unlock_page(tree_page); | |
3216 | + | |
3217 | + goto out; /* success */ | |
3218 | + | |
3219 | + } else { | |
3220 | + if (tree_rmap_item->hash_max && | |
3221 | + tree_rmap_item->hash_max == rmap_item->hash_max) { | |
3222 | + err = MERGE_ERR_COLLI_MAX; | |
3223 | + } else if (page_hash(page, hash_strength, 0) == | |
3224 | + page_hash(tree_page, hash_strength, 0)) { | |
3225 | + inc_rshash_neg(memcmp_cost + hash_strength * 2); | |
3226 | + err = MERGE_ERR_COLLI; | |
3227 | + } else { | |
3228 | + err = MERGE_ERR_CHANGED; | |
3229 | + } | |
3230 | + | |
3231 | + unlock_page(tree_page); | |
3232 | + } | |
3233 | + | |
3234 | +restore_out: | |
3235 | + lock_page(page); | |
3236 | + if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), | |
3237 | + orig_pte1, wprt_pte1)) | |
3238 | + page->mapping = saved_mapping; | |
3239 | + | |
3240 | + unlock_page(page); | |
3241 | +out: | |
3242 | + return err; | |
3243 | +} | |
3244 | + | |
3245 | +static inline int hash_cmp(u32 new_val, u32 node_val) | |
3246 | +{ | |
3247 | + if (new_val > node_val) | |
3248 | + return 1; | |
3249 | + else if (new_val < node_val) | |
3250 | + return -1; | |
3251 | + else | |
3252 | + return 0; | |
3253 | +} | |
3254 | + | |
3255 | +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) | |
3256 | +{ | |
3257 | + u32 hash_max = item->hash_max; | |
3258 | + | |
3259 | + if (!hash_max) { | |
3260 | + hash_max = page_hash_max(item->page, hash); | |
3261 | + | |
3262 | + item->hash_max = hash_max; | |
3263 | + } | |
3264 | + | |
3265 | + return hash_max; | |
3266 | +} | |
3267 | + | |
3268 | + | |
3269 | + | |
3270 | +/** | |
3271 | + * stable_tree_search() - search the stable tree for a page | |
3272 | + * | |
3273 | + * @item: the rmap_item we are comparing with | |
3274 | + * @hash: the hash value of this item->page already calculated | |
3275 | + * | |
3276 | + * @return the page we have found, NULL otherwise. The returned page has | |
3277 | + * already had its refcount raised (gotten) for the caller. | |
3278 | + */ | |
3279 | +static struct page *stable_tree_search(struct rmap_item *item, u32 hash) | |
3280 | +{ | |
3281 | + struct rb_node *node = root_stable_treep->rb_node; | |
3282 | + struct tree_node *tree_node; | |
3283 | + unsigned long hash_max; | |
3284 | + struct page *page = item->page; | |
3285 | + struct stable_node *stable_node; | |
3286 | + | |
3287 | + stable_node = page_stable_node(page); | |
3288 | + if (stable_node) { | |
3289 | + /* ksm page forked, that is | |
3290 | + * PageKsm(page) && !in_stable_tree(rmap_item). | |
3291 | + * The page is already held once outside; take our own reference. | |
3292 | + */ | |
3293 | + get_page(page); | |
3294 | + return page; | |
3295 | + } | |
3296 | + | |
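| + /* | |
| + * The stable tree has two levels: the first is an rbtree of tree_nodes | |
| + * keyed by the sampled-strength hash; each tree_node holds a sub-rbtree | |
| + * of stable_nodes keyed by the full-strength hash_max. | |
| + */ | |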
3297 | + while (node) { | |
3298 | + int cmp; | |
3299 | + | |
3300 | + tree_node = rb_entry(node, struct tree_node, node); | |
3301 | + | |
3302 | + cmp = hash_cmp(hash, tree_node->hash); | |
3303 | + | |
3304 | + if (cmp < 0) | |
3305 | + node = node->rb_left; | |
3306 | + else if (cmp > 0) | |
3307 | + node = node->rb_right; | |
3308 | + else | |
3309 | + break; | |
3310 | + } | |
3311 | + | |
3312 | + if (!node) | |
3313 | + return NULL; | |
3314 | + | |
3315 | + if (tree_node->count == 1) { | |
3316 | + stable_node = rb_entry(tree_node->sub_root.rb_node, | |
3317 | + struct stable_node, node); | |
3318 | + BUG_ON(!stable_node); | |
3319 | + | |
3320 | + goto get_page_out; | |
3321 | + } | |
3322 | + | |
3323 | + /* | |
3324 | + * ok, we have to search the second | |
3325 | + * level subtree; hash the page at | |
3326 | + * full strength. | |
3327 | + */ | |
3328 | + node = tree_node->sub_root.rb_node; | |
3329 | + BUG_ON(!node); | |
3330 | + hash_max = rmap_item_hash_max(item, hash); | |
3331 | + | |
3332 | + while (node) { | |
3333 | + int cmp; | |
3334 | + | |
3335 | + stable_node = rb_entry(node, struct stable_node, node); | |
3336 | + | |
3337 | + cmp = hash_cmp(hash_max, stable_node->hash_max); | |
3338 | + | |
3339 | + if (cmp < 0) | |
3340 | + node = node->rb_left; | |
3341 | + else if (cmp > 0) | |
3342 | + node = node->rb_right; | |
3343 | + else | |
3344 | + goto get_page_out; | |
3345 | + } | |
3346 | + | |
3347 | + return NULL; | |
3348 | + | |
3349 | +get_page_out: | |
3350 | + page = get_uksm_page(stable_node, 1, 1); | |
3351 | + return page; | |
3352 | +} | |
3353 | + | |
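| +/** | |
| + * try_merge_rmap_item() - switch one rmap_item's pte from kpage over to | |
| + * tree_page. The caller has already verified that the pages are identical, | |
| + * so we only re-check that the pte still maps kpage and is still | |
| + * write-protected before rewriting it. Returns 1 on success, 0 if the pte | |
| + * has changed under us. | |
| + */ | |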
3354 | +static int try_merge_rmap_item(struct rmap_item *item, | |
3355 | + struct page *kpage, | |
3356 | + struct page *tree_page) | |
3357 | +{ | |
3358 | + spinlock_t *ptl; | |
3359 | + pte_t *ptep; | |
3360 | + unsigned long addr; | |
3361 | + struct vm_area_struct *vma = item->slot->vma; | |
3362 | + | |
3363 | + addr = get_rmap_addr(item); | |
3364 | + ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0); | |
3365 | + if (!ptep) | |
3366 | + return 0; | |
3367 | + | |
3368 | + if (pte_write(*ptep)) { | |
3369 | + /* has changed, abort! */ | |
3370 | + pte_unmap_unlock(ptep, ptl); | |
3371 | + return 0; | |
3372 | + } | |
3373 | + | |
3374 | + get_page(tree_page); | |
3375 | + page_add_anon_rmap(tree_page, vma, addr); | |
3376 | + | |
3377 | + flush_cache_page(vma, addr, pte_pfn(*ptep)); | |
3378 | + ptep_clear_flush(vma, addr, ptep); | |
3379 | + set_pte_at_notify(vma->vm_mm, addr, ptep, | |
3380 | + mk_pte(tree_page, vma->vm_page_prot)); | |
3381 | + | |
3382 | + page_remove_rmap(kpage); | |
3383 | + put_page(kpage); | |
3384 | + | |
3385 | + pte_unmap_unlock(ptep, ptl); | |
3386 | + | |
3387 | + return 1; | |
3388 | +} | |
3389 | + | |
3390 | +/** | |
3391 | + * try_merge_with_stable() - when two rmap_items are about to be inserted | |
3392 | + * into the stable tree but their page turns out to be identical to an | |
3393 | + * existing stable ksm page, this is the last chance to merge them into it. | |
3394 | + * | |
3395 | + * @item1: the rmap_item holding the page we wanted to insert into the stable tree | |
3396 | + * @item2: the other rmap_item we found in the unstable tree search | |
3397 | + * @kpage: in/out: the page currently mapped by the two rmap_items; on | |
3398 | + * success it is replaced with tree_page for the caller | |
3399 | + * @tree_page: the identical page we found in the stable tree node | |
3400 | + * @success1: set if item1 is successfully merged | |
3401 | + * @success2: set if item2 is successfully merged | |
3402 | + */ | |
3403 | +static void try_merge_with_stable(struct rmap_item *item1, | |
3404 | + struct rmap_item *item2, | |
3405 | + struct page **kpage, | |
3406 | + struct page *tree_page, | |
3407 | + int *success1, int *success2) | |
3408 | +{ | |
3409 | + struct vm_area_struct *vma1 = item1->slot->vma; | |
3410 | + struct vm_area_struct *vma2 = item2->slot->vma; | |
3411 | + *success1 = 0; | |
3412 | + *success2 = 0; | |
3413 | + | |
3414 | + if (unlikely(*kpage == tree_page)) { | |
3415 | + /* I don't think this can really happen */ | |
3416 | + printk(KERN_WARNING "UKSM: unexpected condition detected in " | |
3417 | + "try_merge_with_stable() -- *kpage == tree_page !\n"); | |
3418 | + *success1 = 1; | |
3419 | + *success2 = 1; | |
3420 | + return; | |
3421 | + } | |
3422 | + | |
3423 | + if (!PageAnon(*kpage) || !PageKsm(*kpage)) | |
3424 | + goto failed; | |
3425 | + | |
3426 | + if (!trylock_page(tree_page)) | |
3427 | + goto failed; | |
3428 | + | |
3429 | + /* If the old page is still a ksm page, still mapped | |
3430 | + * at the right place, and still write protected, we | |
3431 | + * are confident it has not changed; no memcmp is | |
3432 | + * needed anymore. | |
3433 | + * Beware: we cannot take nested pte locks here, that | |
3434 | + * would risk deadlock. | |
3435 | + */ | |
3436 | + if (!try_merge_rmap_item(item1, *kpage, tree_page)) | |
3437 | + goto unlock_failed; | |
3438 | + | |
3439 | + /* ok, now vma2; remember that pte1 has already been switched over */ | |
3440 | + if (!try_merge_rmap_item(item2, *kpage, tree_page)) | |
3441 | + goto success_1; | |
3442 | + | |
3443 | + *success2 = 1; | |
3444 | +success_1: | |
3445 | + *success1 = 1; | |
3446 | + | |
3447 | + | |
3448 | + if ((*success1 && vma1->vm_flags & VM_LOCKED) || | |
3449 | + (*success2 && vma2->vm_flags & VM_LOCKED)) { | |
3450 | + munlock_vma_page(*kpage); | |
3451 | + if (!PageMlocked(tree_page)) | |
3452 | + mlock_vma_page(tree_page); | |
3453 | + } | |
3454 | + | |
3455 | + /* | |
3456 | + * We do not need the old page any more in the caller, so we can drop its | |
3457 | + * lock now. | |
3458 | + */ | |
3459 | + unlock_page(*kpage); | |
3460 | + *kpage = tree_page; /* Get unlocked outside. */ | |
3461 | + return; | |
3462 | + | |
3463 | +unlock_failed: | |
3464 | + unlock_page(tree_page); | |
3465 | +failed: | |
3466 | + return; | |
3467 | +} | |
3468 | + | |
3469 | +static inline void stable_node_hash_max(struct stable_node *node, | |
3470 | + struct page *page, u32 hash) | |
3471 | +{ | |
3472 | + u32 hash_max = node->hash_max; | |
3473 | + | |
3474 | + if (!hash_max) { | |
3475 | + hash_max = page_hash_max(page, hash); | |
3476 | + node->hash_max = hash_max; | |
3477 | + } | |
3478 | +} | |
3479 | + | |
3480 | +static inline | |
3481 | +struct stable_node *new_stable_node(struct tree_node *tree_node, | |
3482 | + struct page *kpage, u32 hash_max) | |
3483 | +{ | |
3484 | + struct stable_node *new_stable_node; | |
3485 | + | |
3486 | + new_stable_node = alloc_stable_node(); | |
3487 | + if (!new_stable_node) | |
3488 | + return NULL; | |
3489 | + | |
3490 | + new_stable_node->kpfn = page_to_pfn(kpage); | |
3491 | + new_stable_node->hash_max = hash_max; | |
3492 | + new_stable_node->tree_node = tree_node; | |
3493 | + set_page_stable_node(kpage, new_stable_node); | |
3494 | + | |
3495 | + return new_stable_node; | |
3496 | +} | |
3497 | + | |
3498 | +static inline | |
3499 | +struct stable_node *first_level_insert(struct tree_node *tree_node, | |
3500 | + struct rmap_item *rmap_item, | |
3501 | + struct rmap_item *tree_rmap_item, | |
3502 | + struct page **kpage, u32 hash, | |
3503 | + int *success1, int *success2) | |
3504 | +{ | |
3505 | + int cmp; | |
3506 | + struct page *tree_page; | |
3507 | + u32 hash_max = 0; | |
3508 | + struct stable_node *stable_node, *new_snode; | |
3509 | + struct rb_node *parent = NULL, **new; | |
3510 | + | |
3511 | + /* this tree node contains no sub-tree yet */ | |
3512 | + stable_node = rb_entry(tree_node->sub_root.rb_node, | |
3513 | + struct stable_node, node); | |
3514 | + | |
3515 | + tree_page = get_uksm_page(stable_node, 1, 0); | |
3516 | + if (tree_page) { | |
3517 | + cmp = memcmp_pages(*kpage, tree_page, 1); | |
3518 | + if (!cmp) { | |
3519 | + try_merge_with_stable(rmap_item, tree_rmap_item, kpage, | |
3520 | + tree_page, success1, success2); | |
3521 | + put_page(tree_page); | |
3522 | + if (!*success1 && !*success2) | |
3523 | + goto failed; | |
3524 | + | |
3525 | + return stable_node; | |
3526 | + | |
3527 | + } else { | |
3528 | + /* | |
3529 | + * collision at the first level; try to create a subtree. | |
3530 | + * A new node needs to be created. | |
3531 | + */ | |
3532 | + put_page(tree_page); | |
3533 | + | |
3534 | + stable_node_hash_max(stable_node, tree_page, | |
3535 | + tree_node->hash); | |
3536 | + hash_max = rmap_item_hash_max(rmap_item, hash); | |
3537 | + cmp = hash_cmp(hash_max, stable_node->hash_max); | |
3538 | + | |
3539 | + parent = &stable_node->node; | |
3540 | + if (cmp < 0) { | |
3541 | + new = &parent->rb_left; | |
3542 | + } else if (cmp > 0) { | |
3543 | + new = &parent->rb_right; | |
3544 | + } else { | |
3545 | + goto failed; | |
3546 | + } | |
3547 | + } | |
3548 | + | |
3549 | + } else { | |
3550 | + /* the only stable_node has been deleted, we reuse its tree_node. | |
3551 | + */ | |
3552 | + parent = NULL; | |
3553 | + new = &tree_node->sub_root.rb_node; | |
3554 | + } | |
3555 | + | |
3556 | + new_snode = new_stable_node(tree_node, *kpage, hash_max); | |
3557 | + if (!new_snode) | |
3558 | + goto failed; | |
3559 | + | |
3560 | + rb_link_node(&new_snode->node, parent, new); | |
3561 | + rb_insert_color(&new_snode->node, &tree_node->sub_root); | |
3562 | + tree_node->count++; | |
3563 | + *success1 = *success2 = 1; | |
3564 | + | |
3565 | + return new_snode; | |
3566 | + | |
3567 | +failed: | |
3568 | + return NULL; | |
3569 | +} | |
3570 | + | |
3571 | +static inline | |
3572 | +struct stable_node *stable_subtree_insert(struct tree_node *tree_node, | |
3573 | + struct rmap_item *rmap_item, | |
3574 | + struct rmap_item *tree_rmap_item, | |
3575 | + struct page **kpage, u32 hash, | |
3576 | + int *success1, int *success2) | |
3577 | +{ | |
3578 | + struct page *tree_page; | |
3579 | + u32 hash_max; | |
3580 | + struct stable_node *stable_node, *new_snode; | |
3581 | + struct rb_node *parent, **new; | |
3582 | + | |
3583 | +research: | |
3584 | + parent = NULL; | |
3585 | + new = &tree_node->sub_root.rb_node; | |
3586 | + BUG_ON(!*new); | |
3587 | + hash_max = rmap_item_hash_max(rmap_item, hash); | |
3588 | + while (*new) { | |
3589 | + int cmp; | |
3590 | + | |
3591 | + stable_node = rb_entry(*new, struct stable_node, node); | |
3592 | + | |
3593 | + cmp = hash_cmp(hash_max, stable_node->hash_max); | |
3594 | + | |
3595 | + if (cmp < 0) { | |
3596 | + parent = *new; | |
3597 | + new = &parent->rb_left; | |
3598 | + } else if (cmp > 0) { | |
3599 | + parent = *new; | |
3600 | + new = &parent->rb_right; | |
3601 | + } else { | |
3602 | + tree_page = get_uksm_page(stable_node, 1, 0); | |
3603 | + if (tree_page) { | |
3604 | + cmp = memcmp_pages(*kpage, tree_page, 1); | |
3605 | + if (!cmp) { | |
3606 | + try_merge_with_stable(rmap_item, | |
3607 | + tree_rmap_item, kpage, | |
3608 | + tree_page, success1, success2); | |
3609 | + | |
3610 | + put_page(tree_page); | |
3611 | + if (!*success1 && !*success2) | |
3612 | + goto failed; | |
3613 | + /* | |
3614 | + * successfully merged with a stable | |
3615 | + * node | |
3616 | + */ | |
3617 | + return stable_node; | |
3618 | + } else { | |
3619 | + put_page(tree_page); | |
3620 | + goto failed; | |
3621 | + } | |
3622 | + } else { | |
3623 | + /* | |
3624 | + * the stable node may have been | |
3625 | + * deleted and the subtree | |
3626 | + * restructured; we cannot | |
3627 | + * continue, re-search it. | |
3628 | + */ | |
3629 | + if (tree_node->count) { | |
3630 | + goto research; | |
3631 | + } else { | |
3632 | + /* reuse the tree node */ | |
3633 | + parent = NULL; | |
3634 | + new = &tree_node->sub_root.rb_node; | |
3635 | + } | |
3636 | + } | |
3637 | + } | |
3638 | + } | |
3639 | + | |
3640 | + new_snode = new_stable_node(tree_node, *kpage, hash_max); | |
3641 | + if (!new_snode) | |
3642 | + goto failed; | |
3643 | + | |
3644 | + rb_link_node(&new_snode->node, parent, new); | |
3645 | + rb_insert_color(&new_snode->node, &tree_node->sub_root); | |
3646 | + tree_node->count++; | |
3647 | + *success1 = *success2 = 1; | |
3648 | + | |
3649 | + return new_snode; | |
3650 | + | |
3651 | +failed: | |
3652 | + return NULL; | |
3653 | +} | |
3654 | + | |
3655 | + | |
3656 | +/** | |
3657 | + * stable_tree_insert() - try to insert a page just merged in the unstable | |
3658 | + * tree into the stable tree | |
3659 | + * | |
3660 | + * @kpage: the page that needs to be inserted | |
3661 | + * @hash: the current hash of this page | |
3662 | + * @rmap_item: the rmap_item being scanned | |
3663 | + * @tree_rmap_item: the rmap_item found in the unstable tree | |
3664 | + * @success1: set if rmap_item is merged | |
3665 | + * @success2: set if tree_rmap_item is merged | |
3666 | + * | |
3667 | + * @return the stable_node in the stable tree if at least one | |
3668 | + * rmap_item was inserted into the stable tree, NULL | |
3669 | + * otherwise. | |
3670 | + */ | |
3671 | +static struct stable_node * | |
3672 | +stable_tree_insert(struct page **kpage, u32 hash, | |
3673 | + struct rmap_item *rmap_item, | |
3674 | + struct rmap_item *tree_rmap_item, | |
3675 | + int *success1, int *success2) | |
3676 | +{ | |
3677 | + struct rb_node **new = &root_stable_treep->rb_node; | |
3678 | + struct rb_node *parent = NULL; | |
3679 | + struct stable_node *stable_node; | |
3680 | + struct tree_node *tree_node; | |
3681 | + u32 hash_max = 0; | |
3682 | + | |
3683 | + *success1 = *success2 = 0; | |
3684 | + | |
3685 | + while (*new) { | |
3686 | + int cmp; | |
3687 | + | |
3688 | + tree_node = rb_entry(*new, struct tree_node, node); | |
3689 | + | |
3690 | + cmp = hash_cmp(hash, tree_node->hash); | |
3691 | + | |
3692 | + if (cmp < 0) { | |
3693 | + parent = *new; | |
3694 | + new = &parent->rb_left; | |
3695 | + } else if (cmp > 0) { | |
3696 | + parent = *new; | |
3697 | + new = &parent->rb_right; | |
3698 | + } else | |
3699 | + break; | |
3700 | + } | |
3701 | + | |
3702 | + if (*new) { | |
3703 | + if (tree_node->count == 1) { | |
3704 | + stable_node = first_level_insert(tree_node, rmap_item, | |
3705 | + tree_rmap_item, kpage, | |
3706 | + hash, success1, success2); | |
3707 | + } else { | |
3708 | + stable_node = stable_subtree_insert(tree_node, | |
3709 | + rmap_item, tree_rmap_item, kpage, | |
3710 | + hash, success1, success2); | |
3711 | + } | |
3712 | + } else { | |
3713 | + | |
3714 | + /* no tree node found */ | |
3715 | + tree_node = alloc_tree_node(stable_tree_node_listp); | |
3716 | + if (!tree_node) { | |
3717 | + stable_node = NULL; | |
3718 | + goto out; | |
3719 | + } | |
3720 | + | |
3721 | + stable_node = new_stable_node(tree_node, *kpage, hash_max); | |
3722 | + if (!stable_node) { | |
3723 | + free_tree_node(tree_node); | |
3724 | + goto out; | |
3725 | + } | |
3726 | + | |
3727 | + tree_node->hash = hash; | |
3728 | + rb_link_node(&tree_node->node, parent, new); | |
3729 | + rb_insert_color(&tree_node->node, root_stable_treep); | |
3730 | + parent = NULL; | |
3731 | + new = &tree_node->sub_root.rb_node; | |
3732 | + | |
3733 | + rb_link_node(&stable_node->node, parent, new); | |
3734 | + rb_insert_color(&stable_node->node, &tree_node->sub_root); | |
3735 | + tree_node->count++; | |
3736 | + *success1 = *success2 = 1; | |
3737 | + } | |
3738 | + | |
3739 | +out: | |
3740 | + return stable_node; | |
3741 | +} | |
3742 | + | |
3743 | + | |
3744 | +/** | |
3745 | + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem | |
3746 | + * | |
3747 | + * @return 0 on success, -EBUSY if unable to lock the mmap_sem, | |
3748 | + * -EINVAL if the page mapping has been changed. | |
3749 | + */ | |
3750 | +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) | |
3751 | +{ | |
3752 | + int err; | |
3753 | + | |
3754 | + err = get_mergeable_page_lock_mmap(tree_rmap_item); | |
3755 | + | |
3756 | + if (err == -EINVAL) { | |
3757 | + /* its page mapping has changed, remove it from the tree */ | |
3758 | + remove_rmap_item_from_tree(tree_rmap_item); | |
3759 | + } | |
3760 | + | |
3761 | + /* On success, the page is now held and mmap_sem is read-locked. */ | |
3762 | + return err; | |
3763 | +} | |
3764 | + | |
3765 | + | |
3766 | +/** | |
3767 | + * unstable_tree_search_insert() - search the unstable tree for an rmap_item | |
3768 | + * with the same hash value; get its page and trylock the mmap_sem if found. | |
3769 | + */ | |
3770 | +static inline | |
3771 | +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | |
3772 | + u32 hash) | |
3773 | + | |
3774 | +{ | |
3775 | + struct rb_node **new = &root_unstable_tree.rb_node; | |
3776 | + struct rb_node *parent = NULL; | |
3777 | + struct tree_node *tree_node; | |
3778 | + u32 hash_max; | |
3779 | + struct rmap_item *tree_rmap_item; | |
3780 | + | |
3781 | + while (*new) { | |
3782 | + int cmp; | |
3783 | + | |
3784 | + tree_node = rb_entry(*new, struct tree_node, node); | |
3785 | + | |
3786 | + cmp = hash_cmp(hash, tree_node->hash); | |
3787 | + | |
3788 | + if (cmp < 0) { | |
3789 | + parent = *new; | |
3790 | + new = &parent->rb_left; | |
3791 | + } else if (cmp > 0) { | |
3792 | + parent = *new; | |
3793 | + new = &parent->rb_right; | |
3794 | + } else | |
3795 | + break; | |
3796 | + } | |
3797 | + | |
3798 | + if (*new) { | |
3799 | + /* got the tree_node */ | |
3800 | + if (tree_node->count == 1) { | |
3801 | + tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, | |
3802 | + struct rmap_item, node); | |
3803 | + BUG_ON(!tree_rmap_item); | |
3804 | + | |
3805 | + goto get_page_out; | |
3806 | + } | |
3807 | + | |
3808 | + /* well, search the collision subtree */ | |
3809 | + new = &tree_node->sub_root.rb_node; | |
3810 | + BUG_ON(!*new); | |
3811 | + hash_max = rmap_item_hash_max(rmap_item, hash); | |
3812 | + | |
3813 | + while (*new) { | |
3814 | + int cmp; | |
3815 | + | |
3816 | + tree_rmap_item = rb_entry(*new, struct rmap_item, | |
3817 | + node); | |
3818 | + | |
3819 | + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); | |
3820 | + parent = *new; | |
3821 | + if (cmp < 0) | |
3822 | + new = &parent->rb_left; | |
3823 | + else if (cmp > 0) | |
3824 | + new = &parent->rb_right; | |
3825 | + else | |
3826 | + goto get_page_out; | |
3827 | + } | |
3828 | + } else { | |
3829 | + /* alloc a new tree_node */ | |
3830 | + tree_node = alloc_tree_node(&unstable_tree_node_list); | |
3831 | + if (!tree_node) | |
3832 | + return NULL; | |
3833 | + | |
3834 | + tree_node->hash = hash; | |
3835 | + rb_link_node(&tree_node->node, parent, new); | |
3836 | + rb_insert_color(&tree_node->node, &root_unstable_tree); | |
3837 | + parent = NULL; | |
3838 | + new = &tree_node->sub_root.rb_node; | |
3839 | + } | |
3840 | + | |
3841 | + /* not found even in the sub-tree, insert this rmap_item */ | |
3842 | + rmap_item->tree_node = tree_node; | |
3843 | + rmap_item->address |= UNSTABLE_FLAG; | |
3844 | + rmap_item->hash_round = uksm_hash_round; | |
3845 | + rb_link_node(&rmap_item->node, parent, new); | |
3846 | + rb_insert_color(&rmap_item->node, &tree_node->sub_root); | |
3847 | + | |
3848 | + uksm_pages_unshared++; | |
3849 | + return NULL; | |
3850 | + | |
3851 | +get_page_out: | |
3852 | + if (tree_rmap_item->page == rmap_item->page) | |
3853 | + return NULL; | |
3854 | + | |
3855 | + if (get_tree_rmap_item_page(tree_rmap_item)) | |
3856 | + return NULL; | |
3857 | + | |
3858 | + return tree_rmap_item; | |
3859 | +} | |
3860 | + | |
3861 | +static void hold_anon_vma(struct rmap_item *rmap_item, | |
3862 | + struct anon_vma *anon_vma) | |
3863 | +{ | |
3864 | + rmap_item->anon_vma = anon_vma; | |
3865 | + get_anon_vma(anon_vma); | |
3866 | +} | |
3867 | + | |
3868 | + | |
3869 | +/** | |
3870 | + * stable_tree_append() - append a rmap_item to a stable node. Deduplication | |
3871 | + * ratio statistics are also updated in this function. | |
3872 | + * | |
3873 | + */ | |
3874 | +static void stable_tree_append(struct rmap_item *rmap_item, | |
3875 | + struct stable_node *stable_node, int logdedup) | |
3876 | +{ | |
3877 | + struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; | |
3878 | + unsigned long key = (unsigned long)rmap_item->slot; | |
3879 | + unsigned long factor = rmap_item->slot->rung->step; | |
3880 | + | |
3881 | + BUG_ON(!stable_node); | |
3882 | + rmap_item->address |= STABLE_FLAG; | |
3883 | + | |
3884 | + if (hlist_empty(&stable_node->hlist)) { | |
3885 | + uksm_pages_shared++; | |
3886 | + goto node_vma_new; | |
3887 | + } else { | |
3888 | + uksm_pages_sharing++; | |
3889 | + } | |
3890 | + | |
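| + /* | |
| + * node_vmas hang off the stable node in ascending key (slot address) | |
| + * order. Walk to our slot's position; when logdedup is set, every slot | |
| + * we pass gets deduplication statistics credited to it. | |
| + */ | |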
3891 | + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { | |
3892 | + if (node_vma->key >= key) | |
3893 | + break; | |
3894 | + | |
3895 | + if (logdedup) { | |
3896 | + node_vma->slot->pages_bemerged += factor; | |
3897 | + if (list_empty(&node_vma->slot->dedup_list)) | |
3898 | + list_add(&node_vma->slot->dedup_list, | |
3899 | + &vma_slot_dedup); | |
3900 | + } | |
3901 | + } | |
3902 | + | |
3903 | + if (node_vma) { | |
3904 | + if (node_vma->key == key) { | |
3905 | + node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); | |
3906 | + goto node_vma_ok; | |
3907 | + } else if (node_vma->key > key) { | |
3908 | + node_vma_cont = node_vma; | |
3909 | + } | |
3910 | + } | |
3911 | + | |
3912 | +node_vma_new: | |
3913 | + /* no node_vma for this slot in the node yet, alloc a new node_vma */ | |
3914 | + new_node_vma = alloc_node_vma(); | |
3915 | + BUG_ON(!new_node_vma); | |
3916 | + new_node_vma->head = stable_node; | |
3917 | + new_node_vma->slot = rmap_item->slot; | |
3918 | + | |
3919 | + if (!node_vma) { | |
3920 | + hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); | |
3921 | + } else if (node_vma->key != key) { | |
3922 | + if (node_vma->key < key) | |
3923 | + hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); | |
3924 | + else { | |
3925 | + hlist_add_before(&new_node_vma->hlist, | |
3926 | + &node_vma->hlist); | |
3927 | + } | |
3928 | + | |
3929 | + } | |
3930 | + node_vma = new_node_vma; | |
3931 | + | |
3932 | +node_vma_ok: /* ok, ready to add to the list */ | |
3933 | + rmap_item->head = node_vma; | |
3934 | + hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); | |
3935 | + hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); | |
3936 | + if (logdedup) { | |
3937 | + rmap_item->slot->pages_merged++; | |
3938 | + if (node_vma_cont) { | |
3939 | + node_vma = node_vma_cont; | |
3940 | + hlist_for_each_entry_continue(node_vma, hlist) { | |
3941 | + node_vma->slot->pages_bemerged += factor; | |
3942 | + if (list_empty(&node_vma->slot->dedup_list)) | |
3943 | + list_add(&node_vma->slot->dedup_list, | |
3944 | + &vma_slot_dedup); | |
3945 | + } | |
3946 | + } | |
3947 | + } | |
3948 | +} | |
3949 | + | |
3950 | +/* | |
3951 | + * We use break_ksm to break COW on a ksm page: it's a stripped down | |
3952 | + * | |
3953 | + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) | |
3954 | + * put_page(page); | |
3955 | + * | |
3956 | + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, | |
3957 | + * in case the application has unmapped and remapped mm,addr meanwhile. | |
3958 | + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP | |
3959 | + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. | |
3960 | + */ | |
3961 | +static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |
3962 | +{ | |
3963 | + struct page *page; | |
3964 | + int ret = 0; | |
3965 | + | |
3966 | + do { | |
3967 | + cond_resched(); | |
3968 | + page = follow_page(vma, addr, FOLL_GET); | |
3969 | + if (IS_ERR_OR_NULL(page)) | |
3970 | + break; | |
3971 | + if (PageKsm(page)) { | |
3972 | + ret = handle_mm_fault(vma->vm_mm, vma, addr, | |
3973 | + FAULT_FLAG_WRITE); | |
3974 | + } else | |
3975 | + ret = VM_FAULT_WRITE; | |
3976 | + put_page(page); | |
3977 | + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); | |
3978 | + /* | |
3979 | + * We must loop because handle_mm_fault() may back out if there's | |
3980 | + * any difficulty e.g. if pte accessed bit gets updated concurrently. | |
3981 | + * | |
3982 | + * VM_FAULT_WRITE is what we have been hoping for: it indicates that | |
3983 | + * COW has been broken, even if the vma does not permit VM_WRITE; | |
3984 | + * but note that a concurrent fault might break PageKsm for us. | |
3985 | + * | |
3986 | + * VM_FAULT_SIGBUS could occur if we race with truncation of the | |
3987 | + * backing file, which also invalidates anonymous pages: that's | |
3988 | + * okay, that truncation will have unmapped the PageKsm for us. | |
3989 | + * | |
3990 | + * VM_FAULT_OOM: at the time of writing (late July 2009), setting | |
3991 | + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the | |
3992 | + * current task has TIF_MEMDIE set, and will be OOM killed on return | |
3993 | + * to user; and ksmd, having no mm, would never be chosen for that. | |
3994 | + * | |
3995 | + * But if the mm is in a limited mem_cgroup, then the fault may fail | |
3996 | + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and | |
3997 | + * even ksmd can fail in this way - though it's usually breaking ksm | |
3998 | + * just to undo a merge it made a moment before, so unlikely to oom. | |
3999 | + * | |
4000 | + * That's a pity: we might therefore have more kernel pages allocated | |
4001 | + * than we're counting as nodes in the stable tree; but uksm_do_scan | |
4002 | + * will retry to break_cow on each pass, so should recover the page | |
4003 | + * in due course. The important thing is to not let VM_MERGEABLE | |
4004 | + * be cleared while any such pages might remain in the area. | |
4005 | + */ | |
4006 | + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; | |
4007 | +} | |
4008 | + | |
4009 | +static void break_cow(struct rmap_item *rmap_item) | |
4010 | +{ | |
4011 | + struct vm_area_struct *vma = rmap_item->slot->vma; | |
4012 | + struct mm_struct *mm = vma->vm_mm; | |
4013 | + unsigned long addr = get_rmap_addr(rmap_item); | |
4014 | + | |
4015 | + if (uksm_test_exit(mm)) | |
4016 | + goto out; | |
4017 | + | |
4018 | + break_ksm(vma, addr); | |
4019 | +out: | |
4020 | + return; | |
4021 | +} | |
4022 | + | |
4023 | +/* | |
4024 | + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather | |
4025 | + * than check every pte of a given vma, the locking doesn't quite work for | |
4026 | + * that - an rmap_item is assigned to the stable tree after inserting ksm | |
4027 | + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing | |
4028 | + * rmap_items from parent to child at fork time (so as not to waste time | |
4029 | + * if exit comes before the next scan reaches it). | |
4030 | + * | |
4031 | + * Similarly, although we'd like to remove rmap_items (so updating counts | |
4032 | + * and freeing memory) when unmerging an area, it's easier to leave that | |
4033 | + * to the next pass of ksmd - consider, for example, how ksmd might be | |
4034 | + * in cmp_and_merge_page on one of the rmap_items we would be removing. | |
4035 | + */ | |
4036 | +inline int unmerge_uksm_pages(struct vm_area_struct *vma, | |
4037 | + unsigned long start, unsigned long end) | |
4038 | +{ | |
4039 | + unsigned long addr; | |
4040 | + int err = 0; | |
4041 | + | |
4042 | + for (addr = start; addr < end && !err; addr += PAGE_SIZE) { | |
4043 | + if (uksm_test_exit(vma->vm_mm)) | |
4044 | + break; | |
4045 | + if (signal_pending(current)) | |
4046 | + err = -ERESTARTSYS; | |
4047 | + else | |
4048 | + err = break_ksm(vma, addr); | |
4049 | + } | |
4050 | + return err; | |
4051 | +} | |
4052 | + | |
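| +/* | |
| + * inc_uksm_pages_scanned() - bump the scanned-pages counter. When the | |
| + * counter is about to wrap, its value is folded into pages_scanned_stored, | |
| + * scaled down by 2^pages_scanned_base, so the long-term total survives the | |
| + * overflow at reduced precision. | |
| + */ | |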
4053 | +static inline void inc_uksm_pages_scanned(void) | |
4054 | +{ | |
4055 | + u64 delta; | |
4056 | + | |
4057 | + | |
4058 | + if (uksm_pages_scanned == U64_MAX) { | |
4059 | + encode_benefit(); | |
4060 | + | |
4061 | + delta = uksm_pages_scanned >> pages_scanned_base; | |
4062 | + | |
4063 | + if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { | |
4064 | + pages_scanned_stored >>= 1; | |
4065 | + delta >>= 1; | |
4066 | + pages_scanned_base++; | |
4067 | + } | |
4068 | + | |
4069 | + pages_scanned_stored += delta; | |
4070 | + | |
4071 | + uksm_pages_scanned = uksm_pages_scanned_last = 0; | |
4072 | + } | |
4073 | + | |
4074 | + uksm_pages_scanned++; | |
4075 | +} | |
4076 | + | |
4077 | +static inline int find_zero_page_hash(int strength, u32 hash) | |
4078 | +{ | |
4079 | + return (zero_hash_table[strength] == hash); | |
4080 | +} | |
4081 | + | |
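| +/* | |
| + * cmp_and_merge_zero_page() - a page whose sampled hash matched the | |
| + * zero-page hash is only a candidate: it is write-protected, fully | |
| + * verified with is_page_full_zero(), and only then remapped to the | |
| + * special unswappable uksm zero page. | |
| + */ | |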
4082 | +static | |
4083 | +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) | |
4084 | +{ | |
4085 | + struct page *zero_page = empty_uksm_zero_page; | |
4086 | + struct mm_struct *mm = vma->vm_mm; | |
4087 | + pte_t orig_pte = __pte(0); | |
4088 | + int err = -EFAULT; | |
4089 | + | |
4090 | + if (uksm_test_exit(mm)) | |
4091 | + goto out; | |
4092 | + | |
4093 | + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) | |
4094 | + goto out; | |
4095 | + BUG_ON(PageTransCompound(page)); | |
4096 | + | |
4097 | + if (!PageAnon(page)) | |
4098 | + goto out; | |
4099 | + | |
4100 | + if (!trylock_page(page)) | |
4101 | + goto out; | |
4102 | + | |
4103 | + if (write_protect_page(vma, page, &orig_pte, 0) == 0) { | |
4104 | + if (is_page_full_zero(page)) | |
4105 | + err = replace_page(vma, page, zero_page, orig_pte); | |
4106 | + } | |
4107 | + | |
4108 | + unlock_page(page); | |
4109 | +out: | |
4110 | + return err; | |
4111 | +} | |
4112 | + | |
4113 | +/* | |
4114 | + * cmp_and_merge_page() - first see if the page can be merged into the stable | |
4115 | + * tree; if not, see if it can be inserted into the unstable tree, or merged | |
4116 | + * with a page already there and both then transferred to the stable tree. | |
4118 | + * | |
4119 | + * @rmap_item: the reverse mapping item whose page we are trying to merge | |
4120 | + * @hash: the hash value of rmap_item->page, already calculated | |
4121 | + */ | |
4122 | +static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) | |
4123 | +{ | |
4124 | + struct rmap_item *tree_rmap_item; | |
4125 | + struct page *page; | |
4126 | + struct page *kpage = NULL; | |
4127 | + u32 hash_max; | |
4128 | + int err; | |
4129 | + unsigned int success1, success2; | |
4130 | + struct stable_node *snode; | |
4131 | + int cmp; | |
4132 | + struct rb_node *parent = NULL, **new; | |
4133 | + | |
4134 | + remove_rmap_item_from_tree(rmap_item); | |
4135 | + page = rmap_item->page; | |
4136 | + | |
4137 | + /* We first start with searching the page inside the stable tree */ | |
4138 | + kpage = stable_tree_search(rmap_item, hash); | |
4139 | + if (kpage) { | |
4140 | + err = try_to_merge_with_uksm_page(rmap_item, kpage, | |
4141 | + hash); | |
4142 | + if (!err) { | |
4143 | + /* | |
4144 | + * The page was successfully merged, add | |
4145 | + * its rmap_item to the stable tree. | |
4146 | + * page lock is needed because it's | |
4147 | + * racing with try_to_unmap_ksm(), etc. | |
4148 | + */ | |
4149 | + lock_page(kpage); | |
4150 | + snode = page_stable_node(kpage); | |
4151 | + stable_tree_append(rmap_item, snode, 1); | |
4152 | + unlock_page(kpage); | |
4153 | + put_page(kpage); | |
4154 | + return; /* success */ | |
4155 | + } | |
4156 | + put_page(kpage); | |
4157 | + | |
4158 | + /* | |
4159 | + * If it was a collision and the sub-rbtree has already been | |
4160 | + * searched (hash_max != 0), we want to abort, because if it | |
4161 | + * were merged in the unstable tree instead, the collision | |
4162 | + * would tend to happen again. | |
4163 | + */ | |
4164 | + if (err == MERGE_ERR_COLLI && rmap_item->hash_max) | |
4165 | + return; | |
4166 | + } | |
4167 | + | |
4168 | + tree_rmap_item = | |
4169 | + unstable_tree_search_insert(rmap_item, hash); | |
4170 | + if (tree_rmap_item) { | |
4171 | + err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); | |
4172 | + /* | |
4173 | + * As soon as we merge this page, we want to remove the | |
4174 | + * rmap_item of the page we have merged with from the unstable | |
4175 | + * tree, and insert it instead as new node in the stable tree. | |
4176 | + */ | |
4177 | + if (!err) { | |
4178 | + kpage = page; | |
4179 | + remove_rmap_item_from_tree(tree_rmap_item); | |
4180 | + lock_page(kpage); | |
4181 | + snode = stable_tree_insert(&kpage, hash, | |
4182 | + rmap_item, tree_rmap_item, | |
4183 | + &success1, &success2); | |
4184 | + | |
4185 | + /* | |
4186 | + * Do not log dedup for tree item, it's not counted as | |
4187 | + * scanned in this round. | |
4188 | + */ | |
4189 | + if (success2) | |
4190 | + stable_tree_append(tree_rmap_item, snode, 0); | |
4191 | + | |
4192 | + /* | |
4193 | + * The order of these two stable_tree_append() calls is | |
4194 | + * important: we are scanning rmap_item. | |
4195 | + */ | |
4196 | + if (success1) | |
4197 | + stable_tree_append(rmap_item, snode, 1); | |
4198 | + | |
4199 | + /* | |
4200 | + * The original kpage may be unlocked inside | |
4201 | + * stable_tree_insert() already. This page | |
4202 | + * should be unlocked before doing | |
4203 | + * break_cow(). | |
4204 | + */ | |
4205 | + unlock_page(kpage); | |
4206 | + | |
4207 | + if (!success1) | |
4208 | + break_cow(rmap_item); | |
4209 | + | |
4210 | + if (!success2) | |
4211 | + break_cow(tree_rmap_item); | |
4212 | + | |
4213 | + } else if (err == MERGE_ERR_COLLI) { | |
4214 | + BUG_ON(tree_rmap_item->tree_node->count > 1); | |
4215 | + | |
4216 | + rmap_item_hash_max(tree_rmap_item, | |
4217 | + tree_rmap_item->tree_node->hash); | |
4218 | + | |
4219 | + hash_max = rmap_item_hash_max(rmap_item, hash); | |
4220 | + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); | |
4221 | + parent = &tree_rmap_item->node; | |
4222 | + if (cmp < 0) | |
4223 | + new = &parent->rb_left; | |
4224 | + else if (cmp > 0) | |
4225 | + new = &parent->rb_right; | |
4226 | + else | |
4227 | + goto put_up_out; | |
4228 | + | |
4229 | + rmap_item->tree_node = tree_rmap_item->tree_node; | |
4230 | + rmap_item->address |= UNSTABLE_FLAG; | |
4231 | + rmap_item->hash_round = uksm_hash_round; | |
4232 | + rb_link_node(&rmap_item->node, parent, new); | |
4233 | + rb_insert_color(&rmap_item->node, | |
4234 | + &tree_rmap_item->tree_node->sub_root); | |
4235 | + rmap_item->tree_node->count++; | |
4236 | + } else { | |
4237 | + /* | |
4238 | + * Either one of the pages has changed or they collide | |
4239 | + * at the max hash; we consider them ill items. | |
4240 | + */ | |
4241 | + remove_rmap_item_from_tree(tree_rmap_item); | |
4242 | + } | |
4243 | +put_up_out: | |
4244 | + put_page(tree_rmap_item->page); | |
4245 | + up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); | |
4246 | + } | |
4247 | +} | |
4248 | + | |
4249 | + | |
4250 | + | |
4251 | + | |
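| +/* | |
| + * A slot's rmap list is not one flat array: its rmap_list_entry slots | |
| + * (each holding either an rmap_item pointer or an encoded address) are | |
| + * packed into individually allocated pool pages in slot->rmap_list_pool[]. | |
| + * The helpers below convert a virtual index into a (pool page, in-page | |
| + * offset) pair. | |
| + */ | |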
4252 | +static inline unsigned long get_pool_index(struct vma_slot *slot, | |
4253 | + unsigned long index) | |
4254 | +{ | |
4255 | + unsigned long pool_index; | |
4256 | + | |
4257 | + pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; | |
4258 | + if (pool_index >= slot->pool_size) | |
4259 | + BUG(); | |
4260 | + return pool_index; | |
4261 | +} | |
4262 | + | |
4263 | +static inline unsigned long index_page_offset(unsigned long index) | |
4264 | +{ | |
4265 | + return offset_in_page(sizeof(struct rmap_list_entry *) * index); | |
4266 | +} | |
4267 | + | |
4268 | +static inline | |
4269 | +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, | |
4270 | + unsigned long index, int need_alloc) | |
4271 | +{ | |
4272 | + unsigned long pool_index; | |
4273 | + struct page *page; | |
4274 | + void *addr; | |
4275 | + | |
4276 | + | |
4277 | + pool_index = get_pool_index(slot, index); | |
4278 | + if (!slot->rmap_list_pool[pool_index]) { | |
4279 | + if (!need_alloc) | |
4280 | + return NULL; | |
4281 | + | |
4282 | + page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); | |
4283 | + if (!page) | |
4284 | + return NULL; | |
4285 | + | |
4286 | + slot->rmap_list_pool[pool_index] = page; | |
4287 | + } | |
4288 | + | |
4289 | + addr = kmap(slot->rmap_list_pool[pool_index]); | |
4290 | + addr += index_page_offset(index); | |
4291 | + | |
4292 | + return addr; | |
4293 | +} | |
4294 | + | |
4295 | +static inline void put_rmap_list_entry(struct vma_slot *slot, | |
4296 | + unsigned long index) | |
4297 | +{ | |
4298 | + unsigned long pool_index; | |
4299 | + | |
4300 | + pool_index = get_pool_index(slot, index); | |
4301 | + BUG_ON(!slot->rmap_list_pool[pool_index]); | |
4302 | + kunmap(slot->rmap_list_pool[pool_index]); | |
4303 | +} | |
4304 | + | |
4305 | +static inline int entry_is_new(struct rmap_list_entry *entry) | |
4306 | +{ | |
4307 | + return !entry->item; | |
4308 | +} | |
4309 | + | |
4310 | +static inline unsigned long get_index_orig_addr(struct vma_slot *slot, | |
4311 | + unsigned long index) | |
4312 | +{ | |
4313 | + return slot->vma->vm_start + (index << PAGE_SHIFT); | |
4314 | +} | |
4315 | + | |
4316 | +static inline unsigned long get_entry_address(struct rmap_list_entry *entry) | |
4317 | +{ | |
4318 | + unsigned long addr; | |
4319 | + | |
4320 | + if (is_addr(entry->addr)) | |
4321 | + addr = get_clean_addr(entry->addr); | |
4322 | + else if (entry->item) | |
4323 | + addr = get_rmap_addr(entry->item); | |
4324 | + else | |
4325 | + BUG(); | |
4326 | + | |
4327 | + return addr; | |
4328 | +} | |
4329 | + | |
4330 | +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) | |
4331 | +{ | |
4332 | + if (is_addr(entry->addr)) | |
4333 | + return NULL; | |
4334 | + | |
4335 | + return entry->item; | |
4336 | +} | |
4337 | + | |
4338 | +static inline void inc_rmap_list_pool_count(struct vma_slot *slot, | |
4339 | + unsigned long index) | |
4340 | +{ | |
4341 | + unsigned long pool_index; | |
4342 | + | |
4343 | + pool_index = get_pool_index(slot, index); | |
4344 | + BUG_ON(!slot->rmap_list_pool[pool_index]); | |
4345 | + slot->pool_counts[pool_index]++; | |
4346 | +} | |
4347 | + | |
4348 | +static inline void dec_rmap_list_pool_count(struct vma_slot *slot, | |
4349 | + unsigned long index) | |
4350 | +{ | |
4351 | + unsigned long pool_index; | |
4352 | + | |
4353 | + pool_index = get_pool_index(slot, index); | |
4354 | + BUG_ON(!slot->rmap_list_pool[pool_index]); | |
4355 | + BUG_ON(!slot->pool_counts[pool_index]); | |
4356 | + slot->pool_counts[pool_index]--; | |
4357 | +} | |
4358 | + | |
4359 | +static inline int entry_has_rmap(struct rmap_list_entry *entry) | |
4360 | +{ | |
4361 | + return !is_addr(entry->addr) && entry->item; | |
4362 | +} | |
4363 | + | |
4364 | +static inline void swap_entries(struct rmap_list_entry *entry1, | |
4365 | + unsigned long index1, | |
4366 | + struct rmap_list_entry *entry2, | |
4367 | + unsigned long index2) | |
4368 | +{ | |
4369 | + struct rmap_list_entry tmp; | |
4370 | + | |
4371 | + /* swapping two new entries is meaningless */ | |
4372 | + BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); | |
4373 | + | |
4374 | + tmp = *entry1; | |
4375 | + *entry1 = *entry2; | |
4376 | + *entry2 = tmp; | |
4377 | + | |
4378 | + if (entry_has_rmap(entry1)) | |
4379 | + entry1->item->entry_index = index1; | |
4380 | + | |
4381 | + if (entry_has_rmap(entry2)) | |
4382 | + entry2->item->entry_index = index2; | |
4383 | + | |
4384 | + if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { | |
4385 | + inc_rmap_list_pool_count(entry1->item->slot, index1); | |
4386 | + dec_rmap_list_pool_count(entry1->item->slot, index2); | |
4387 | + } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { | |
4388 | + inc_rmap_list_pool_count(entry2->item->slot, index2); | |
4389 | + dec_rmap_list_pool_count(entry2->item->slot, index1); | |
4390 | + } | |
4391 | +} | |
4392 | + | |
4393 | +static inline void free_entry_item(struct rmap_list_entry *entry) | |
4394 | +{ | |
4395 | + unsigned long index; | |
4396 | + struct rmap_item *item; | |
4397 | + | |
4398 | + if (!is_addr(entry->addr)) { | |
4399 | + BUG_ON(!entry->item); | |
4400 | + item = entry->item; | |
4401 | + entry->addr = get_rmap_addr(item); | |
4402 | + set_is_addr(entry->addr); | |
4403 | + index = item->entry_index; | |
4404 | + remove_rmap_item_from_tree(item); | |
4405 | + dec_rmap_list_pool_count(item->slot, index); | |
4406 | + free_rmap_item(item); | |
4407 | + } | |
4408 | +} | |
4409 | + | |
4410 | +static inline int pool_entry_boundary(unsigned long index) | |
4411 | +{ | |
4412 | + unsigned long linear_addr; | |
4413 | + | |
4414 | + linear_addr = sizeof(struct rmap_list_entry *) * index; | |
4415 | + return index && !offset_in_page(linear_addr); | |
4416 | +} | |
4417 | + | |
4418 | +static inline void try_free_last_pool(struct vma_slot *slot, | |
4419 | + unsigned long index) | |
4420 | +{ | |
4421 | + unsigned long pool_index; | |
4422 | + | |
4423 | + pool_index = get_pool_index(slot, index); | |
4424 | + if (slot->rmap_list_pool[pool_index] && | |
4425 | + !slot->pool_counts[pool_index]) { | |
4426 | + __free_page(slot->rmap_list_pool[pool_index]); | |
4427 | + slot->rmap_list_pool[pool_index] = NULL; | |
4428 | + slot->flags |= UKSM_SLOT_NEED_SORT; | |
4429 | + } | |
4430 | + | |
4431 | +} | |
4432 | + | |
4433 | +static inline unsigned long vma_item_index(struct vm_area_struct *vma, | |
4434 | + struct rmap_item *item) | |
4435 | +{ | |
4436 | + return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; | |
4437 | +} | |
4438 | + | |
4439 | +static int within_same_pool(struct vma_slot *slot, | |
4440 | + unsigned long i, unsigned long j) | |
4441 | +{ | |
4442 | + unsigned long pool_i, pool_j; | |
4443 | + | |
4444 | + pool_i = get_pool_index(slot, i); | |
4445 | + pool_j = get_pool_index(slot, j); | |
4446 | + | |
4447 | + return (pool_i == pool_j); | |
4448 | +} | |
4449 | + | |
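| +/* | |
| + * sort_rmap_entry_list() - move every entry back to its natural index | |
| + * (the page offset of its address within the VMA) by swapping entries in | |
| + * place, then free pool pages that no longer hold any rmap_item. | |
| + */ | |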
4450 | +static void sort_rmap_entry_list(struct vma_slot *slot) | |
4451 | +{ | |
4452 | + unsigned long i, j; | |
4453 | + struct rmap_list_entry *entry, *swap_entry; | |
4454 | + | |
4455 | + entry = get_rmap_list_entry(slot, 0, 0); | |
4456 | + for (i = 0; i < slot->pages; ) { | |
4457 | + | |
4458 | + if (!entry) | |
4459 | + goto skip_whole_pool; | |
4460 | + | |
4461 | + if (entry_is_new(entry)) | |
4462 | + goto next_entry; | |
4463 | + | |
4464 | + if (is_addr(entry->addr)) { | |
4465 | + entry->addr = 0; | |
4466 | + goto next_entry; | |
4467 | + } | |
4468 | + | |
4469 | + j = vma_item_index(slot->vma, entry->item); | |
4470 | + if (j == i) | |
4471 | + goto next_entry; | |
4472 | + | |
4473 | + if (within_same_pool(slot, i, j)) | |
4474 | + swap_entry = entry + j - i; | |
4475 | + else | |
4476 | + swap_entry = get_rmap_list_entry(slot, j, 1); | |
4477 | + | |
4478 | + swap_entries(entry, i, swap_entry, j); | |
4479 | + if (!within_same_pool(slot, i, j)) | |
4480 | + put_rmap_list_entry(slot, j); | |
4481 | + continue; | |
4482 | + | |
4483 | +skip_whole_pool: | |
4484 | + i += PAGE_SIZE / sizeof(*entry); | |
4485 | + if (i < slot->pages) | |
4486 | + entry = get_rmap_list_entry(slot, i, 0); | |
4487 | + continue; | |
4488 | + | |
4489 | +next_entry: | |
4490 | + if (i >= slot->pages - 1 || | |
4491 | + !within_same_pool(slot, i, i + 1)) { | |
4492 | + put_rmap_list_entry(slot, i); | |
4493 | + if (i + 1 < slot->pages) | |
4494 | + entry = get_rmap_list_entry(slot, i + 1, 0); | |
4495 | + } else | |
4496 | + entry++; | |
4497 | + i++; | |
4498 | + continue; | |
4499 | + } | |
4500 | + | |
4501 | + /* free empty pool pages which contain no rmap_item */ | |
4502 | + /* TODO: can be simplified to rely on pool_counts alone once verified bug-free */ | |
4503 | + for (i = 0; i < slot->pool_size; i++) { | |
4504 | + unsigned char has_rmap; | |
4505 | + void *addr; | |
4506 | + | |
4507 | + if (!slot->rmap_list_pool[i]) | |
4508 | + continue; | |
4509 | + | |
4510 | + has_rmap = 0; | |
4511 | + addr = kmap(slot->rmap_list_pool[i]); | |
4512 | + BUG_ON(!addr); | |
4513 | + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { | |
4514 | + entry = (struct rmap_list_entry *)addr + j; | |
4515 | + if (is_addr(entry->addr)) | |
4516 | + continue; | |
4517 | + if (!entry->item) | |
4518 | + continue; | |
4519 | + has_rmap = 1; | |
4520 | + } | |
4521 | + kunmap(slot->rmap_list_pool[i]); | |
4522 | + if (!has_rmap) { | |
4523 | + BUG_ON(slot->pool_counts[i]); | |
4524 | + __free_page(slot->rmap_list_pool[i]); | |
4525 | + slot->rmap_list_pool[i] = NULL; | |
4526 | + } | |
4527 | + } | |
4528 | + | |
4529 | + slot->flags &= ~UKSM_SLOT_NEED_SORT; | |
4530 | +} | |
4531 | + | |
4532 | +/* | |
4533 | + * vma_fully_scanned() - return whether all the pages in this slot have been scanned. | |
4534 | + */ | |
4535 | +static inline int vma_fully_scanned(struct vma_slot *slot) | |
4536 | +{ | |
4537 | + return slot->pages_scanned == slot->pages; | |
4538 | +} | |
4539 | + | |
4540 | +/** | |
4541 | + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to | |
4542 | + * its random permutation. The random permutation index management code is | |
4543 | + * embedded in this function as well. | |
4544 | + */ | |
4545 | +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) | |
4546 | +{ | |
4547 | + unsigned long rand_range, addr, swap_index, scan_index; | |
4548 | + struct rmap_item *item = NULL; | |
4549 | + struct rmap_list_entry *scan_entry, *swap_entry = NULL; | |
4550 | + struct page *page; | |
4551 | + | |
4552 | + scan_index = swap_index = slot->pages_scanned % slot->pages; | |
4553 | + | |
4554 | + if (pool_entry_boundary(scan_index)) | |
4555 | + try_free_last_pool(slot, scan_index - 1); | |
4556 | + | |
4557 | + if (vma_fully_scanned(slot)) { | |
4558 | + if (slot->flags & UKSM_SLOT_NEED_SORT) | |
4559 | + slot->flags |= UKSM_SLOT_NEED_RERAND; | |
4560 | + else | |
4561 | + slot->flags &= ~UKSM_SLOT_NEED_RERAND; | |
4562 | + if (slot->flags & UKSM_SLOT_NEED_SORT) | |
4563 | + sort_rmap_entry_list(slot); | |
4564 | + } | |
4565 | + | |
4566 | + scan_entry = get_rmap_list_entry(slot, scan_index, 1); | |
4567 | + if (!scan_entry) | |
4568 | + return NULL; | |
4569 | + | |
4570 | + if (entry_is_new(scan_entry)) { | |
4571 | + scan_entry->addr = get_index_orig_addr(slot, scan_index); | |
4572 | + set_is_addr(scan_entry->addr); | |
4573 | + } | |
4574 | + | |
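| + /* | |
| + * When the permutation needs re-randomizing, pair the current index | |
| + * with a uniformly chosen index in [scan_index, pages) and swap the | |
| + * two entries, Fisher-Yates style, so a full pass visits the pages | |
| + * in random order. | |
| + */ | |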
4575 | + if (slot->flags & UKSM_SLOT_NEED_RERAND) { | |
4576 | + rand_range = slot->pages - scan_index; | |
4577 | + BUG_ON(!rand_range); | |
4578 | + swap_index = scan_index + (prandom_u32() % rand_range); | |
4579 | + } | |
4580 | + | |
4581 | + if (swap_index != scan_index) { | |
4582 | + swap_entry = get_rmap_list_entry(slot, swap_index, 1); | |
4583 | + if (entry_is_new(swap_entry)) { | |
4584 | + swap_entry->addr = get_index_orig_addr(slot, | |
4585 | + swap_index); | |
4586 | + set_is_addr(swap_entry->addr); | |
4587 | + } | |
4588 | + swap_entries(scan_entry, scan_index, swap_entry, swap_index); | |
4589 | + } | |
4590 | + | |
4591 | + addr = get_entry_address(scan_entry); | |
4592 | + item = get_entry_item(scan_entry); | |
4593 | + BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); | |
4594 | + | |
4595 | + page = follow_page(slot->vma, addr, FOLL_GET); | |
4596 | + if (IS_ERR_OR_NULL(page)) | |
4597 | + goto nopage; | |
4598 | + | |
4599 | + if (!PageAnon(page) && !page_trans_compound_anon(page)) | |
4600 | + goto putpage; | |
4601 | + | |
4602 | + /* check whether it is the zero page pfn or the uksm zero page */ | |
4603 | + if ((page_to_pfn(page) == zero_pfn) | |
4604 | + || (page_to_pfn(page) == uksm_zero_pfn)) | |
4605 | + goto putpage; | |
4606 | + | |
4607 | + flush_anon_page(slot->vma, page, addr); | |
4608 | + flush_dcache_page(page); | |
4609 | + | |
4610 | + | |
4611 | + *hash = page_hash(page, hash_strength, 1); | |
4612 | + inc_uksm_pages_scanned(); | |
4613 | + /* if the page content is all zero, re-map it to the zero page */ | |
4614 | + if (find_zero_page_hash(hash_strength, *hash)) { | |
4615 | + if (!cmp_and_merge_zero_page(slot->vma, page)) { | |
4616 | + slot->pages_merged++; | |
4617 | + inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); | |
4618 | + dec_mm_counter(slot->mm, MM_ANONPAGES); | |
4619 | + | |
4620 | + /* For full-zero pages, no need to create rmap item */ | |
4621 | + goto putpage; | |
4622 | + } else { | |
4623 | + inc_rshash_neg(memcmp_cost / 2); | |
4624 | + } | |
4625 | + } | |
4626 | + | |
4627 | + if (!item) { | |
4628 | + item = alloc_rmap_item(); | |
4629 | + if (item) { | |
4630 | + /* It has already been zeroed */ | |
4631 | + item->slot = slot; | |
4632 | + item->address = addr; | |
4633 | + item->entry_index = scan_index; | |
4634 | + scan_entry->item = item; | |
4635 | + inc_rmap_list_pool_count(slot, scan_index); | |
4636 | + } else | |
4637 | + goto putpage; | |
4638 | + } | |
4639 | + | |
4640 | + BUG_ON(item->slot != slot); | |
4641 | + /* the page may have changed */ | |
4642 | + item->page = page; | |
4643 | + put_rmap_list_entry(slot, scan_index); | |
4644 | + if (swap_entry) | |
4645 | + put_rmap_list_entry(slot, swap_index); | |
4646 | + return item; | |
4647 | + | |
4648 | +putpage: | |
4649 | + put_page(page); | |
4650 | + page = NULL; | |
4651 | +nopage: | |
4652 | + /* no page, store addr back and free rmap_item if possible */ | |
4653 | + free_entry_item(scan_entry); | |
4654 | + put_rmap_list_entry(slot, scan_index); | |
4655 | + if (swap_entry) | |
4656 | + put_rmap_list_entry(slot, swap_index); | |
4657 | + return NULL; | |
4658 | +} | |
4659 | + | |
4660 | +static inline int in_stable_tree(struct rmap_item *rmap_item) | |
4661 | +{ | |
4662 | + return rmap_item->address & STABLE_FLAG; | |
4663 | +} | |
4664 | + | |
4665 | +/** | |
4666 | + * scan_vma_one_page() - scan the next page in a vma_slot. Called with | |
4667 | + * mmap_sem locked. | |
4668 | + */ | |
4669 | +static noinline void scan_vma_one_page(struct vma_slot *slot) | |
4670 | +{ | |
4671 | + u32 hash; | |
4672 | + struct mm_struct *mm; | |
4673 | + struct rmap_item *rmap_item = NULL; | |
4674 | + struct vm_area_struct *vma = slot->vma; | |
4675 | + | |
4676 | + mm = vma->vm_mm; | |
4677 | + BUG_ON(!mm); | |
4678 | + BUG_ON(!slot); | |
4679 | + | |
4680 | + rmap_item = get_next_rmap_item(slot, &hash); | |
4681 | + if (!rmap_item) | |
4682 | + goto out1; | |
4683 | + | |
4684 | + if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) | |
4685 | + goto out2; | |
4686 | + | |
4687 | + cmp_and_merge_page(rmap_item, hash); | |
4688 | +out2: | |
4689 | + put_page(rmap_item->page); | |
4690 | +out1: | |
4691 | + slot->pages_scanned++; | |
4692 | + if (slot->fully_scanned_round != fully_scanned_round) | |
4693 | + scanned_virtual_pages++; | |
4694 | + | |
4695 | + if (vma_fully_scanned(slot)) | |
4696 | + slot->fully_scanned_round = fully_scanned_round; | |
4697 | +} | |
4698 | + | |
4699 | +static inline unsigned long rung_get_pages(struct scan_rung *rung) | |
4700 | +{ | |
4701 | + struct slot_tree_node *node; | |
4702 | + | |
4703 | + if (!rung->vma_root.rnode) | |
4704 | + return 0; | |
4705 | + | |
4706 | + node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); | |
4707 | + | |
4708 | + return node->size; | |
4709 | +} | |
4710 | + | |
4711 | +#define RUNG_SAMPLED_MIN 3 | |
4712 | + | |
4713 | +static inline | |
4714 | +void uksm_calc_rung_step(struct scan_rung *rung, | |
4715 | + unsigned long page_time, unsigned long ratio) | |
4716 | +{ | |
4717 | + unsigned long sampled, pages; | |
4718 | + | |
4719 | + /* will this rung be fully scanned? */ | |
4720 | + if (!rung->cover_msecs) { | |
4721 | + rung->step = 1; | |
4722 | + return; | |
4723 | + } | |
4724 | + | |
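| + /* | |
| +  * Pages this rung can sample within one cover period: the cpu time | |
| +  * available is cover_msecs * NSEC_PER_MSEC * ratio / TIME_RATIO_SCALE | |
| +  * (ratio is expressed in 1/TIME_RATIO_SCALE units); dividing by the | |
| +  * estimated per-page scan time gives the sample count. | |
| +  */ | |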
4725 | + sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) | |
4726 | + * ratio / page_time; | |
4727 | + | |
4728 | + /* | |
4729 | + * Before we finish a scan round and its expensive per-round jobs, | |
4730 | + * we need a chance to estimate the per-page time, so | |
4731 | + * the sampled count cannot be too small. | |
4732 | + */ | |
4733 | + if (sampled < RUNG_SAMPLED_MIN) | |
4734 | + sampled = RUNG_SAMPLED_MIN; | |
4735 | + | |
4736 | + pages = rung_get_pages(rung); | |
4737 | + if (likely(pages > sampled)) | |
4738 | + rung->step = pages / sampled; | |
4739 | + else | |
4740 | + rung->step = 1; | |
4741 | +} | |
4742 | + | |
4743 | +static inline int step_need_recalc(struct scan_rung *rung) | |
4744 | +{ | |
4745 | + unsigned long pages, stepmax; | |
4746 | + | |
4747 | + pages = rung_get_pages(rung); | |
4748 | + stepmax = pages / RUNG_SAMPLED_MIN; | |
4749 | + | |
4750 | + return pages && (rung->step > pages || | |
4751 | + (stepmax && rung->step > stepmax)); | |
4752 | +} | |
4753 | + | |
4754 | +static inline | |
4755 | +void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) | |
4756 | +{ | |
4757 | + struct vma_slot *slot; | |
4758 | + | |
4759 | + if (finished) | |
4760 | + rung->flags |= UKSM_RUNG_ROUND_FINISHED; | |
4761 | + | |
4762 | + if (step_recalc || step_need_recalc(rung)) { | |
4763 | + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); | |
4764 | + BUG_ON(step_need_recalc(rung)); | |
4765 | + } | |
4766 | + | |
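| + /* | |
| +  * Restart sampling from a random offset within the first step, so | |
| +  * that successive resets do not always probe the same pages. | |
| +  */ | |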
4767 | + slot_iter_index = prandom_u32() % rung->step; | |
4768 | + BUG_ON(!rung->vma_root.rnode); | |
4769 | + slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); | |
4770 | + BUG_ON(!slot); | |
4771 | + | |
4772 | + rung->current_scan = slot; | |
4773 | + rung->current_offset = slot_iter_index; | |
4774 | +} | |
4775 | + | |
4776 | +static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) | |
4777 | +{ | |
4778 | + return &slot->rung->vma_root; | |
4779 | +} | |
4780 | + | |
4781 | +/* | |
4782 | + * Return 1 if the rung's scan position was reset, 0 otherwise. | |
4783 | + */ | |
4784 | +static int advance_current_scan(struct scan_rung *rung) | |
4785 | +{ | |
4786 | + unsigned short n; | |
4787 | + struct vma_slot *slot, *next = NULL; | |
4788 | + | |
4789 | + BUG_ON(!rung->vma_root.num); | |
4790 | + | |
4791 | + slot = rung->current_scan; | |
4792 | + n = (slot->pages - rung->current_offset) % rung->step; | |
4793 | + slot_iter_index = rung->step - n; | |
4794 | + next = sradix_tree_next(&rung->vma_root, slot->snode, | |
4795 | + slot->sindex, slot_iter); | |
4796 | + | |
4797 | + if (next) { | |
4798 | + rung->current_offset = slot_iter_index; | |
4799 | + rung->current_scan = next; | |
4800 | + return 0; | |
4801 | + } else { | |
4802 | + reset_current_scan(rung, 1, 0); | |
4803 | + return 1; | |
4804 | + } | |
4805 | +} | |
4806 | + | |
4807 | +static inline void rung_rm_slot(struct vma_slot *slot) | |
4808 | +{ | |
4809 | + struct scan_rung *rung = slot->rung; | |
4810 | + struct sradix_tree_root *root; | |
4811 | + | |
4812 | + if (rung->current_scan == slot) | |
4813 | + advance_current_scan(rung); | |
4814 | + | |
4815 | + root = slot_get_root(slot); | |
4816 | + sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); | |
4817 | + slot->snode = NULL; | |
4818 | + if (step_need_recalc(rung)) { | |
4819 | + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); | |
4820 | + BUG_ON(step_need_recalc(rung)); | |
4821 | + } | |
4822 | + | |
4823 | + /* In case advance_current_scan() loops back to this slot again */ | |
4824 | + if (rung->vma_root.num && rung->current_scan == slot) | |
4825 | + reset_current_scan(slot->rung, 1, 0); | |
4826 | +} | |
4827 | + | |
4828 | +static inline void rung_add_new_slots(struct scan_rung *rung, | |
4829 | + struct vma_slot **slots, unsigned long num) | |
4830 | +{ | |
4831 | + int err; | |
4832 | + struct vma_slot *slot; | |
4833 | + unsigned long i; | |
4834 | + struct sradix_tree_root *root = &rung->vma_root; | |
4835 | + | |
4836 | + err = sradix_tree_enter(root, (void **)slots, num); | |
4837 | + BUG_ON(err); | |
4838 | + | |
4839 | + for (i = 0; i < num; i++) { | |
4840 | + slot = slots[i]; | |
4841 | + slot->rung = rung; | |
4842 | + BUG_ON(vma_fully_scanned(slot)); | |
4843 | + } | |
4844 | + | |
4845 | + if (rung->vma_root.num == num) | |
4846 | + reset_current_scan(rung, 0, 1); | |
4847 | +} | |
4848 | + | |
4849 | +static inline int rung_add_one_slot(struct scan_rung *rung, | |
4850 | + struct vma_slot *slot) | |
4851 | +{ | |
4852 | + int err; | |
4853 | + | |
4854 | + err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); | |
4855 | + if (err) | |
4856 | + return err; | |
4857 | + | |
4858 | + slot->rung = rung; | |
4859 | + if (rung->vma_root.num == 1) | |
4860 | + reset_current_scan(rung, 0, 1); | |
4861 | + | |
4862 | + return 0; | |
4863 | +} | |
4864 | + | |
4865 | +/* | |
4866 | + * Return true if the slot is deleted from its rung. | |
4867 | + */ | |
4868 | +static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) | |
4869 | +{ | |
4870 | + struct scan_rung *old_rung = slot->rung; | |
4871 | + int err; | |
4872 | + | |
4873 | + if (old_rung == rung) | |
4874 | + return 0; | |
4875 | + | |
4876 | + rung_rm_slot(slot); | |
4877 | + err = rung_add_one_slot(rung, slot); | |
4878 | + if (err) { | |
4879 | + err = rung_add_one_slot(old_rung, slot); | |
4880 | + WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ | |
4881 | + } | |
4882 | + | |
4883 | + return 1; | |
4884 | +} | |
4885 | + | |
4886 | +static inline int vma_rung_up(struct vma_slot *slot) | |
4887 | +{ | |
4888 | + struct scan_rung *rung; | |
4889 | + | |
4890 | + rung = slot->rung; | |
4891 | + if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) | |
4892 | + rung++; | |
4893 | + | |
4894 | + return vma_rung_enter(slot, rung); | |
4895 | +} | |
4896 | + | |
4897 | +static inline int vma_rung_down(struct vma_slot *slot) | |
4898 | +{ | |
4899 | + struct scan_rung *rung; | |
4900 | + | |
4901 | + rung = slot->rung; | |
4902 | + if (slot->rung != &uksm_scan_ladder[0]) | |
4903 | + rung--; | |
4904 | + | |
4905 | + return vma_rung_enter(slot, rung); | |
4906 | +} | |
4907 | + | |
4908 | +/** | |
4909 | + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. | |
4910 | + */ | |
4911 | +static unsigned long cal_dedup_ratio(struct vma_slot *slot) | |
4912 | +{ | |
4913 | + unsigned long ret; | |
4914 | + | |
4915 | + BUG_ON(slot->pages_scanned == slot->last_scanned); | |
4916 | + | |
4917 | + ret = slot->pages_merged; | |
4918 | + | |
4919 | + /* Thrashing area filtering */ | |
4920 | + if (ret && uksm_thrash_threshold) { | |
4921 | + if (slot->pages_cowed * 100 / slot->pages_merged | |
4922 | + > uksm_thrash_threshold) { | |
4923 | + ret = 0; | |
4924 | + } else { | |
4925 | + ret = slot->pages_merged - slot->pages_cowed; | |
4926 | + } | |
4927 | + } | |
4928 | + | |
4929 | + return ret; | |
4930 | +} | |
4931 | + | |
4932 | +/** | |
4933 | + * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot. | |
4934 | + */ | |
4935 | +static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) | |
4936 | +{ | |
4937 | + unsigned long ret; | |
4938 | + unsigned long pages_scanned; | |
4939 | + | |
4940 | + pages_scanned = slot->pages_scanned; | |
4941 | + if (!pages_scanned) { | |
4942 | + if (uksm_thrash_threshold) | |
4943 | + return 0; | |
4944 | + else | |
4945 | + pages_scanned = slot->pages_scanned; | |
4946 | + } | |
4947 | + | |
4948 | + ret = slot->pages_bemerged * 100 / pages_scanned; | |
4949 | + | |
4950 | + /* Thrashing area filtering */ | |
4951 | + if (ret && uksm_thrash_threshold) { | |
4952 | + if (slot->pages_cowed * 100 / slot->pages_bemerged | |
4953 | + > uksm_thrash_threshold) { | |
4954 | + ret = 0; | |
4955 | + } else { | |
4956 | + ret = slot->pages_bemerged - slot->pages_cowed; | |
4957 | + } | |
4958 | + } | |
4959 | + | |
4960 | + return ret; | |
4961 | +} | |
4962 | + | |
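| +/* | |
| + * A note on the stable tree layout assumed below: it is two-level. The | |
| + * rb_root holds tree_nodes keyed by the page hash under the current | |
| + * hash_strength; each tree_node holds a sub_root of stable_nodes keyed | |
| + * by hash_max, a second-level hash that is only computed once a | |
| + * first-level collision has been found. | |
| + */ | |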
4963 | +/** | |
4964 | + * stable_node_reinsert() - When the hash_strength has been adjusted, the | |
4965 | + * stable tree needs to be restructured; this is the function that | |
4966 | + * re-inserts a stable node. | |
4967 | + */ | |
4968 | +static inline void stable_node_reinsert(struct stable_node *new_node, | |
4969 | + struct page *page, | |
4970 | + struct rb_root *root_treep, | |
4971 | + struct list_head *tree_node_listp, | |
4972 | + u32 hash) | |
4973 | +{ | |
4974 | + struct rb_node **new = &root_treep->rb_node; | |
4975 | + struct rb_node *parent = NULL; | |
4976 | + struct stable_node *stable_node; | |
4977 | + struct tree_node *tree_node; | |
4978 | + struct page *tree_page; | |
4979 | + int cmp; | |
4980 | + | |
4981 | + while (*new) { | |
4982 | + int cmp; | |
4983 | + | |
4984 | + tree_node = rb_entry(*new, struct tree_node, node); | |
4985 | + | |
4986 | + cmp = hash_cmp(hash, tree_node->hash); | |
4987 | + | |
4988 | + if (cmp < 0) { | |
4989 | + parent = *new; | |
4990 | + new = &parent->rb_left; | |
4991 | + } else if (cmp > 0) { | |
4992 | + parent = *new; | |
4993 | + new = &parent->rb_right; | |
4994 | + } else | |
4995 | + break; | |
4996 | + } | |
4997 | + | |
4998 | + if (*new) { | |
4999 | + /* found a stable tree node with the same first-level hash value */ | |
5000 | + stable_node_hash_max(new_node, page, hash); | |
5001 | + if (tree_node->count == 1) { | |
5002 | + stable_node = rb_entry(tree_node->sub_root.rb_node, | |
5003 | + struct stable_node, node); | |
5004 | + tree_page = get_uksm_page(stable_node, 1, 0); | |
5005 | + if (tree_page) { | |
5006 | + stable_node_hash_max(stable_node, | |
5007 | + tree_page, hash); | |
5008 | + put_page(tree_page); | |
5009 | + | |
5010 | + /* prepare for stable node insertion */ | |
5011 | + | |
5012 | + cmp = hash_cmp(new_node->hash_max, | |
5013 | + stable_node->hash_max); | |
5014 | + parent = &stable_node->node; | |
5015 | + if (cmp < 0) | |
5016 | + new = &parent->rb_left; | |
5017 | + else if (cmp > 0) | |
5018 | + new = &parent->rb_right; | |
5019 | + else | |
5020 | + goto failed; | |
5021 | + | |
5022 | + goto add_node; | |
5023 | + } else { | |
5024 | + /* the only stable_node was deleted, but the tree | |
5025 | + * node was not; reuse it. | |
5026 | + */ | |
5027 | + goto tree_node_reuse; | |
5028 | + } | |
5029 | + } | |
5030 | + | |
5031 | + /* well, search the collision subtree */ | |
5032 | + new = &tree_node->sub_root.rb_node; | |
5033 | + parent = NULL; | |
5034 | + BUG_ON(!*new); | |
5035 | + while (*new) { | |
5036 | + int cmp; | |
5037 | + | |
5038 | + stable_node = rb_entry(*new, struct stable_node, node); | |
5039 | + | |
5040 | + cmp = hash_cmp(new_node->hash_max, | |
5041 | + stable_node->hash_max); | |
5042 | + | |
5043 | + if (cmp < 0) { | |
5044 | + parent = *new; | |
5045 | + new = &parent->rb_left; | |
5046 | + } else if (cmp > 0) { | |
5047 | + parent = *new; | |
5048 | + new = &parent->rb_right; | |
5049 | + } else { | |
5050 | + /* oh, no, still a collision */ | |
5051 | + goto failed; | |
5052 | + } | |
5053 | + } | |
5054 | + | |
5055 | + goto add_node; | |
5056 | + } | |
5057 | + | |
5058 | + /* no tree node found */ | |
5059 | + tree_node = alloc_tree_node(tree_node_listp); | |
5060 | + if (!tree_node) { | |
5061 | + printk(KERN_ERR "UKSM: memory allocation error!\n"); | |
5062 | + goto failed; | |
5063 | + } else { | |
5064 | + tree_node->hash = hash; | |
5065 | + rb_link_node(&tree_node->node, parent, new); | |
5066 | + rb_insert_color(&tree_node->node, root_treep); | |
5067 | + | |
5068 | +tree_node_reuse: | |
5069 | + /* prepare for stable node insertion */ | |
5070 | + parent = NULL; | |
5071 | + new = &tree_node->sub_root.rb_node; | |
5072 | + } | |
5073 | + | |
5074 | +add_node: | |
5075 | + rb_link_node(&new_node->node, parent, new); | |
5076 | + rb_insert_color(&new_node->node, &tree_node->sub_root); | |
5077 | + new_node->tree_node = tree_node; | |
5078 | + tree_node->count++; | |
5079 | + return; | |
5080 | + | |
5081 | +failed: | |
5082 | + /* This can only happen when two nodes have collided | |
5083 | + * at both hash levels. | |
5084 | + */ | |
5085 | + new_node->tree_node = NULL; | |
5086 | + return; | |
5087 | +} | |
5088 | + | |
5089 | +static inline void free_all_tree_nodes(struct list_head *list) | |
5090 | +{ | |
5091 | + struct tree_node *node, *tmp; | |
5092 | + | |
5093 | + list_for_each_entry_safe(node, tmp, list, all_list) { | |
5094 | + free_tree_node(node); | |
5095 | + } | |
5096 | +} | |
5097 | + | |
5098 | +/** | |
5099 | + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash | |
5100 | + * strength to the current hash_strength. It re-structures the hole tree. | |
5101 | + * strength to the current hash_strength. It re-structures the whole tree. | |
5102 | +static inline void stable_tree_delta_hash(u32 prev_hash_strength) | |
5103 | +{ | |
5104 | + struct stable_node *node, *tmp; | |
5105 | + struct rb_root *root_new_treep; | |
5106 | + struct list_head *new_tree_node_listp; | |
5107 | + | |
5108 | + stable_tree_index = (stable_tree_index + 1) % 2; | |
5109 | + root_new_treep = &root_stable_tree[stable_tree_index]; | |
5110 | + new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; | |
5111 | + *root_new_treep = RB_ROOT; | |
5112 | + BUG_ON(!list_empty(new_tree_node_listp)); | |
5113 | + | |
5114 | + /* | |
5115 | + * We must iterate safely: a node could be removed by get_uksm_page(). | |
5116 | + */ | |
5117 | + list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { | |
5118 | + void *addr; | |
5119 | + struct page *node_page; | |
5120 | + u32 hash; | |
5121 | + | |
5122 | + /* | |
5123 | + * We are completely re-structuring the stable nodes into a new | |
5124 | + * stable tree. We do not bother unlinking them from the old tree | |
5125 | + * or touching the old tree_nodes; those will all be freed at once. | |
5126 | + */ | |
5127 | + node_page = get_uksm_page(node, 0, 0); | |
5128 | + if (!node_page) | |
5129 | + continue; | |
5130 | + | |
5131 | + if (node->tree_node) { | |
5132 | + hash = node->tree_node->hash; | |
5133 | + | |
5134 | + addr = kmap_atomic(node_page); | |
5135 | + | |
5136 | + hash = delta_hash(addr, prev_hash_strength, | |
5137 | + hash_strength, hash); | |
5138 | + kunmap_atomic(addr); | |
5139 | + } else { | |
5140 | + /* | |
5141 | + * It was not inserted into the rbtree due to a collision in | |
5142 | + * the last scan round. | |
5143 | + */ | |
5144 | + hash = page_hash(node_page, hash_strength, 0); | |
5145 | + } | |
5146 | + | |
5147 | + stable_node_reinsert(node, node_page, root_new_treep, | |
5148 | + new_tree_node_listp, hash); | |
5149 | + put_page(node_page); | |
5150 | + } | |
5151 | + | |
5152 | + root_stable_treep = root_new_treep; | |
5153 | + free_all_tree_nodes(stable_tree_node_listp); | |
5154 | + BUG_ON(!list_empty(stable_tree_node_listp)); | |
5155 | + stable_tree_node_listp = new_tree_node_listp; | |
5156 | +} | |
5157 | + | |
5158 | +static inline void inc_hash_strength(unsigned long delta) | |
5159 | +{ | |
5160 | + hash_strength += 1 << delta; | |
5161 | + if (hash_strength > HASH_STRENGTH_MAX) | |
5162 | + hash_strength = HASH_STRENGTH_MAX; | |
5163 | +} | |
5164 | + | |
5165 | +static inline void dec_hash_strength(unsigned long delta) | |
5166 | +{ | |
5167 | + unsigned long change = 1 << delta; | |
5168 | + | |
5169 | + if (hash_strength <= change + 1) | |
5170 | + hash_strength = 1; | |
5171 | + else | |
5172 | + hash_strength -= change; | |
5173 | +} | |
5174 | + | |
5175 | +static inline void inc_hash_strength_delta(void) | |
5176 | +{ | |
5177 | + hash_strength_delta++; | |
5178 | + if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) | |
5179 | + hash_strength_delta = HASH_STRENGTH_DELTA_MAX; | |
5180 | +} | |
5181 | + | |
5182 | +/* | |
5183 | +static inline unsigned long get_current_neg_ratio(void) | |
5184 | +{ | |
5185 | + if (!rshash_pos || rshash_neg > rshash_pos) | |
5186 | + return 100; | |
5187 | + | |
5188 | + return div64_u64(100 * rshash_neg , rshash_pos); | |
5189 | +} | |
5190 | +*/ | |
5191 | + | |
5192 | +static inline unsigned long get_current_neg_ratio(void) | |
5193 | +{ | |
5194 | + u64 pos = benefit.pos; | |
5195 | + u64 neg = benefit.neg; | |
5196 | + | |
5197 | + if (!neg) | |
5198 | + return 0; | |
5199 | + | |
5200 | + if (!pos || neg > pos) | |
5201 | + return 100; | |
5202 | + | |
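| + /* Compute neg * 100 / pos without overflowing 64 bits. */ | |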
5203 | + if (neg > div64_u64(U64_MAX, 100)) | |
5204 | + pos = div64_u64(pos, 100); | |
5205 | + else | |
5206 | + neg *= 100; | |
5207 | + | |
5208 | + return div64_u64(neg, pos); | |
5209 | +} | |
5210 | + | |
5211 | +static inline unsigned long get_current_benefit(void) | |
5212 | +{ | |
5213 | + u64 pos = benefit.pos; | |
5214 | + u64 neg = benefit.neg; | |
5215 | + u64 scanned = benefit.scanned; | |
5216 | + | |
5217 | + if (neg > pos) | |
5218 | + return 0; | |
5219 | + | |
5220 | + return div64_u64((pos - neg), scanned); | |
5221 | +} | |
5222 | + | |
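| +/* | |
| + * Decide which way hash_strength should move: GO_DOWN when the negative | |
| + * ratio has stayed at zero for several rounds, GO_UP when it exceeds 90%, | |
| + * OBSCURE when it is time to re-probe, no stable benefit is known yet, or | |
| + * the benefit keeps deviating by more than 50%, and STILL otherwise. | |
| + */ | |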
5223 | +static inline int judge_rshash_direction(void) | |
5224 | +{ | |
5225 | + u64 current_neg_ratio, stable_benefit; | |
5226 | + u64 current_benefit, delta = 0; | |
5227 | + int ret = STILL; | |
5228 | + | |
5229 | + /* Try to probe a value right after boot, and again in case the | |
5230 | + system stays unchanged for a long time. */ | |
5231 | + if ((fully_scanned_round & 0xFFULL) == 10) { | |
5232 | + ret = OBSCURE; | |
5233 | + goto out; | |
5234 | + } | |
5235 | + | |
5236 | + current_neg_ratio = get_current_neg_ratio(); | |
5237 | + | |
5238 | + if (current_neg_ratio == 0) { | |
5239 | + rshash_neg_cont_zero++; | |
5240 | + if (rshash_neg_cont_zero > 2) | |
5241 | + return GO_DOWN; | |
5242 | + else | |
5243 | + return STILL; | |
5244 | + } | |
5245 | + rshash_neg_cont_zero = 0; | |
5246 | + | |
5247 | + if (current_neg_ratio > 90) { | |
5248 | + ret = GO_UP; | |
5249 | + goto out; | |
5250 | + } | |
5251 | + | |
5252 | + current_benefit = get_current_benefit(); | |
5253 | + stable_benefit = rshash_state.stable_benefit; | |
5254 | + | |
5255 | + if (!stable_benefit) { | |
5256 | + ret = OBSCURE; | |
5257 | + goto out; | |
5258 | + } | |
5259 | + | |
5260 | + if (current_benefit > stable_benefit) | |
5261 | + delta = current_benefit - stable_benefit; | |
5262 | + else if (current_benefit < stable_benefit) | |
5263 | + delta = stable_benefit - current_benefit; | |
5264 | + | |
5265 | + delta = div64_u64(100 * delta, stable_benefit); | |
5266 | + | |
5267 | + if (delta > 50) { | |
5268 | + rshash_cont_obscure++; | |
5269 | + if (rshash_cont_obscure > 2) | |
5270 | + return OBSCURE; | |
5271 | + else | |
5272 | + return STILL; | |
5273 | + } | |
5274 | + | |
5275 | +out: | |
5276 | + rshash_cont_obscure = 0; | |
5277 | + return ret; | |
5278 | +} | |
5279 | + | |
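| +/* | |
| + * Rough sketch of the state machine below: in RSHASH_STILL the strength is | |
| + * nudged up or down following judge_rshash_direction(); once the direction | |
| + * becomes OBSCURE we remember the current stable point, probe downwards | |
| + * (RSHASH_TRYDOWN) and then upwards (RSHASH_TRYUP) while tracking the best | |
| + * turning point in each direction, settle on the turning point with the | |
| + * larger benefit, and return to RSHASH_STILL via RSHASH_PRE_STILL. | |
| + */ | |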
5280 | +/** | |
5281 | + * rshash_adjust() - The main function to control the random sampling state | |
5282 | + * machine that adapts the hash strength. | |
5283 | + * | |
5284 | + * return true if hash_strength has changed. | |
5285 | + */ | |
5286 | +static inline int rshash_adjust(void) | |
5287 | +{ | |
5288 | + unsigned long prev_hash_strength = hash_strength; | |
5289 | + | |
5290 | + if (!encode_benefit()) | |
5291 | + return 0; | |
5292 | + | |
5293 | + switch (rshash_state.state) { | |
5294 | + case RSHASH_STILL: | |
5295 | + switch (judge_rshash_direction()) { | |
5296 | + case GO_UP: | |
5297 | + if (rshash_state.pre_direct == GO_DOWN) | |
5298 | + hash_strength_delta = 0; | |
5299 | + | |
5300 | + inc_hash_strength(hash_strength_delta); | |
5301 | + inc_hash_strength_delta(); | |
5302 | + rshash_state.stable_benefit = get_current_benefit(); | |
5303 | + rshash_state.pre_direct = GO_UP; | |
5304 | + break; | |
5305 | + | |
5306 | + case GO_DOWN: | |
5307 | + if (rshash_state.pre_direct == GO_UP) | |
5308 | + hash_strength_delta = 0; | |
5309 | + | |
5310 | + dec_hash_strength(hash_strength_delta); | |
5311 | + inc_hash_strength_delta(); | |
5312 | + rshash_state.stable_benefit = get_current_benefit(); | |
5313 | + rshash_state.pre_direct = GO_DOWN; | |
5314 | + break; | |
5315 | + | |
5316 | + case OBSCURE: | |
5317 | + rshash_state.stable_point = hash_strength; | |
5318 | + rshash_state.turn_point_down = hash_strength; | |
5319 | + rshash_state.turn_point_up = hash_strength; | |
5320 | + rshash_state.turn_benefit_down = get_current_benefit(); | |
5321 | + rshash_state.turn_benefit_up = get_current_benefit(); | |
5322 | + rshash_state.lookup_window_index = 0; | |
5323 | + rshash_state.state = RSHASH_TRYDOWN; | |
5324 | + dec_hash_strength(hash_strength_delta); | |
5325 | + inc_hash_strength_delta(); | |
5326 | + break; | |
5327 | + | |
5328 | + case STILL: | |
5329 | + break; | |
5330 | + default: | |
5331 | + BUG(); | |
5332 | + } | |
5333 | + break; | |
5334 | + | |
5335 | + case RSHASH_TRYDOWN: | |
5336 | + if (rshash_state.lookup_window_index++ % 5 == 0) | |
5337 | + rshash_state.below_count = 0; | |
5338 | + | |
5339 | + if (get_current_benefit() < rshash_state.stable_benefit) | |
5340 | + rshash_state.below_count++; | |
5341 | + else if (get_current_benefit() > | |
5342 | + rshash_state.turn_benefit_down) { | |
5343 | + rshash_state.turn_point_down = hash_strength; | |
5344 | + rshash_state.turn_benefit_down = get_current_benefit(); | |
5345 | + } | |
5346 | + | |
5347 | + if (rshash_state.below_count >= 3 || | |
5348 | + judge_rshash_direction() == GO_UP || | |
5349 | + hash_strength == 1) { | |
5350 | + hash_strength = rshash_state.stable_point; | |
5351 | + hash_strength_delta = 0; | |
5352 | + inc_hash_strength(hash_strength_delta); | |
5353 | + inc_hash_strength_delta(); | |
5354 | + rshash_state.lookup_window_index = 0; | |
5355 | + rshash_state.state = RSHASH_TRYUP; | |
5356 | + hash_strength_delta = 0; | |
5357 | + } else { | |
5358 | + dec_hash_strength(hash_strength_delta); | |
5359 | + inc_hash_strength_delta(); | |
5360 | + } | |
5361 | + break; | |
5362 | + | |
5363 | + case RSHASH_TRYUP: | |
5364 | + if (rshash_state.lookup_window_index++ % 5 == 0) | |
5365 | + rshash_state.below_count = 0; | |
5366 | + | |
5367 | + if (get_current_benefit() < rshash_state.turn_benefit_down) | |
5368 | + rshash_state.below_count++; | |
5369 | + else if (get_current_benefit() > rshash_state.turn_benefit_up) { | |
5370 | + rshash_state.turn_point_up = hash_strength; | |
5371 | + rshash_state.turn_benefit_up = get_current_benefit(); | |
5372 | + } | |
5373 | + | |
5374 | + if (rshash_state.below_count >= 3 || | |
5375 | + judge_rshash_direction() == GO_DOWN || | |
5376 | + hash_strength == HASH_STRENGTH_MAX) { | |
5377 | + hash_strength = rshash_state.turn_benefit_up > | |
5378 | + rshash_state.turn_benefit_down ? | |
5379 | + rshash_state.turn_point_up : | |
5380 | + rshash_state.turn_point_down; | |
5381 | + | |
5382 | + rshash_state.state = RSHASH_PRE_STILL; | |
5383 | + } else { | |
5384 | + inc_hash_strength(hash_strength_delta); | |
5385 | + inc_hash_strength_delta(); | |
5386 | + } | |
5387 | + | |
5388 | + break; | |
5389 | + | |
5390 | + case RSHASH_NEW: | |
5391 | + case RSHASH_PRE_STILL: | |
5392 | + rshash_state.stable_benefit = get_current_benefit(); | |
5393 | + rshash_state.state = RSHASH_STILL; | |
5394 | + hash_strength_delta = 0; | |
5395 | + break; | |
5396 | + default: | |
5397 | + BUG(); | |
5398 | + } | |
5399 | + | |
5400 | + /* rshash_neg = rshash_pos = 0; */ | |
5401 | + reset_benefit(); | |
5402 | + | |
5403 | + if (prev_hash_strength != hash_strength) | |
5404 | + stable_tree_delta_hash(prev_hash_strength); | |
5405 | + | |
5406 | + return prev_hash_strength != hash_strength; | |
5407 | +} | |
5408 | + | |
5409 | +/** | |
5410 | + * round_update_ladder() - Update all the per-slot and per-rung | |
5411 | + * adjustments whenever a scan round is finished. | |
5412 | + */ | |
5413 | +static noinline void round_update_ladder(void) | |
5414 | +{ | |
5415 | + int i; | |
5416 | + unsigned long dedup; | |
5417 | + struct vma_slot *slot, *tmp_slot; | |
5418 | + | |
5419 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
5420 | + uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; | |
5421 | + } | |
5422 | + | |
5423 | + list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { | |
5424 | + | |
5425 | + /* the slot may already have been removed by rung_rm_slot() when its mm exited */ | |
5426 | + if (slot->snode) { | |
5427 | + dedup = cal_dedup_ratio_old(slot); | |
5428 | + if (dedup && dedup >= uksm_abundant_threshold) | |
5429 | + vma_rung_up(slot); | |
5430 | + } | |
5431 | + | |
5432 | + slot->pages_bemerged = 0; | |
5433 | + slot->pages_cowed = 0; | |
5434 | + | |
5435 | + list_del_init(&slot->dedup_list); | |
5436 | + } | |
5437 | +} | |
5438 | + | |
5439 | +static void uksm_del_vma_slot(struct vma_slot *slot) | |
5440 | +{ | |
5441 | + int i, j; | |
5442 | + struct rmap_list_entry *entry; | |
5443 | + | |
5444 | + if (slot->snode) { | |
5445 | + /* | |
5446 | + * If the slot failed while entering the rung, this removal is | |
5447 | + * not necessary. | |
5448 | + */ | |
5449 | + rung_rm_slot(slot); | |
5450 | + } | |
5451 | + | |
5452 | + if (!list_empty(&slot->dedup_list)) | |
5453 | + list_del(&slot->dedup_list); | |
5454 | + | |
5455 | + if (!slot->rmap_list_pool || !slot->pool_counts) { | |
5456 | + /* In case it OOMed in uksm_vma_enter() */ | |
5457 | + goto out; | |
5458 | + } | |
5459 | + | |
5460 | + for (i = 0; i < slot->pool_size; i++) { | |
5461 | + void *addr; | |
5462 | + | |
5463 | + if (!slot->rmap_list_pool[i]) | |
5464 | + continue; | |
5465 | + | |
5466 | + addr = kmap(slot->rmap_list_pool[i]); | |
5467 | + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { | |
5468 | + entry = (struct rmap_list_entry *)addr + j; | |
5469 | + if (is_addr(entry->addr)) | |
5470 | + continue; | |
5471 | + if (!entry->item) | |
5472 | + continue; | |
5473 | + | |
5474 | + remove_rmap_item_from_tree(entry->item); | |
5475 | + free_rmap_item(entry->item); | |
5476 | + slot->pool_counts[i]--; | |
5477 | + } | |
5478 | + BUG_ON(slot->pool_counts[i]); | |
5479 | + kunmap(slot->rmap_list_pool[i]); | |
5480 | + __free_page(slot->rmap_list_pool[i]); | |
5481 | + } | |
5482 | + kfree(slot->rmap_list_pool); | |
5483 | + kfree(slot->pool_counts); | |
5484 | + | |
5485 | +out: | |
5486 | + slot->rung = NULL; | |
5487 | + BUG_ON(uksm_pages_total < slot->pages); | |
5488 | + if (slot->flags & UKSM_SLOT_IN_UKSM) | |
5489 | + uksm_pages_total -= slot->pages; | |
5490 | + | |
5491 | + if (slot->fully_scanned_round == fully_scanned_round) | |
5492 | + scanned_virtual_pages -= slot->pages; | |
5493 | + else | |
5494 | + scanned_virtual_pages -= slot->pages_scanned; | |
5495 | + free_vma_slot(slot); | |
5496 | +} | |
5497 | + | |
5498 | + | |
5499 | +#define SPIN_LOCK_PERIOD 32 | |
5500 | +static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; | |
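| +/* | |
| + * Drain vma_slot_del in batches of SPIN_LOCK_PERIOD entries; the list lock | |
| + * is dropped between batches so the heavier slot teardown in | |
| + * uksm_del_vma_slot() happens outside the spinlock. | |
| + */ | |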
5501 | +static inline void cleanup_vma_slots(void) | |
5502 | +{ | |
5503 | + struct vma_slot *slot; | |
5504 | + int i; | |
5505 | + | |
5506 | + i = 0; | |
5507 | + spin_lock(&vma_slot_list_lock); | |
5508 | + while (!list_empty(&vma_slot_del)) { | |
5509 | + slot = list_entry(vma_slot_del.next, | |
5510 | + struct vma_slot, slot_list); | |
5511 | + list_del(&slot->slot_list); | |
5512 | + cleanup_slots[i++] = slot; | |
5513 | + if (i == SPIN_LOCK_PERIOD) { | |
5514 | + spin_unlock(&vma_slot_list_lock); | |
5515 | + while (--i >= 0) | |
5516 | + uksm_del_vma_slot(cleanup_slots[i]); | |
5517 | + i = 0; | |
5518 | + spin_lock(&vma_slot_list_lock); | |
5519 | + } | |
5520 | + } | |
5521 | + spin_unlock(&vma_slot_list_lock); | |
5522 | + | |
5523 | + while (--i >= 0) | |
5524 | + uksm_del_vma_slot(cleanup_slots[i]); | |
5525 | +} | |
5526 | + | |
5527 | +/* | |
5528 | + * Exponential moving average formula | |
5529 | +*/ | |
5530 | +static inline unsigned long ema(unsigned long curr, unsigned long last_ema) | |
5531 | +{ | |
5532 | + /* | |
5533 | + * For a very high burst, even the EMA cannot work well: a falsely | |
5534 | + * high per-page time estimate feeds back as very high context-switch | |
5535 | + * and rung-update overhead, which in turn drives the per-page time | |
5536 | + * even higher, so it may never converge. | |
5537 | + * | |
5538 | + * Instead, we try to approach this value in a binary manner. | |
5539 | + */ | |
5540 | + if (curr > last_ema * 10) | |
5541 | + return last_ema * 2; | |
5542 | + | |
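| + /* | |
| +  * The usual EMA step: EMA_ALPHA percent weight on the new sample, e.g. | |
| +  * with EMA_ALPHA == 20 this is (20 * curr + 80 * last_ema) / 100. | |
| +  */ | |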
5543 | + return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100; | |
5544 | +} | |
5545 | + | |
5546 | +/* | |
5547 | + * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to | |
5548 | + * nanoseconds based on current uksm_sleep_jiffies. | |
5549 | + */ | |
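| +/* | |
| + * Derivation sketch: with sleep time S nanoseconds per cycle and a target | |
| + * cpu share of ratio / TIME_RATIO_SCALE, the scan time T must satisfy | |
| + * T / (T + S) = ratio / TIME_RATIO_SCALE, i.e. | |
| + * T = S * ratio / (TIME_RATIO_SCALE - ratio), which is computed below. | |
| + */ | |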
5550 | +static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio) | |
5551 | +{ | |
5552 | + return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) / | |
5553 | + (TIME_RATIO_SCALE - ratio) * ratio; | |
5554 | +} | |
5555 | + | |
5556 | + | |
5557 | +static inline unsigned long rung_real_ratio(int cpu_time_ratio) | |
5558 | +{ | |
5559 | + unsigned long ret; | |
5560 | + | |
5561 | + BUG_ON(!cpu_time_ratio); | |
5562 | + | |
5563 | + if (cpu_time_ratio > 0) | |
5564 | + ret = cpu_time_ratio; | |
5565 | + else | |
5566 | + ret = (unsigned long)(-cpu_time_ratio) * | |
5567 | + uksm_max_cpu_percentage / 100UL; | |
5568 | + | |
5569 | + return ret ? ret : 1; | |
5570 | +} | |
5571 | + | |
5572 | +static noinline void uksm_calc_scan_pages(void) | |
5573 | +{ | |
5574 | + struct scan_rung *ladder = uksm_scan_ladder; | |
5575 | + unsigned long sleep_usecs, nsecs; | |
5576 | + unsigned long ratio; | |
5577 | + int i; | |
5578 | + unsigned long per_page; | |
5579 | + | |
5580 | + if (uksm_ema_page_time > 100000 || | |
5581 | + (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) | |
5582 | + uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; | |
5583 | + | |
5584 | + per_page = uksm_ema_page_time; | |
5585 | + BUG_ON(!per_page); | |
5586 | + | |
5587 | + /* | |
5588 | + * Every 8 eval rounds, we try to probe a uksm_sleep_jiffies value | |
5589 | + * based on saved user input. | |
5590 | + */ | |
5591 | + if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) | |
5592 | + uksm_sleep_jiffies = uksm_sleep_saved; | |
5593 | + | |
5594 | + /* We require each rung to scan at least 1 page per period. */ | |
5595 | + nsecs = per_page; | |
5596 | + ratio = rung_real_ratio(ladder[0].cpu_ratio); | |
5597 | + if (cpu_ratio_to_nsec(ratio) < nsecs) { | |
5598 | + sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio | |
5599 | + / NSEC_PER_USEC; | |
5600 | + uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; | |
5601 | + } | |
5602 | + | |
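| + /* | |
| +  * Per-rung page budget for one sleep cycle: the cpu time allotted to | |
| +  * the rung divided by the estimated per-page cost. | |
| +  */ | |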
5603 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
5604 | + ratio = rung_real_ratio(ladder[i].cpu_ratio); | |
5605 | + ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / | |
5606 | + per_page; | |
5607 | + BUG_ON(!ladder[i].pages_to_scan); | |
5608 | + uksm_calc_rung_step(&ladder[i], per_page, ratio); | |
5609 | + } | |
5610 | +} | |
5611 | + | |
5612 | +/* | |
5613 | + * From the scan time of this round (ns) to next expected min sleep time | |
5614 | + * (ms), be careful of the possible overflows. ratio is taken from | |
5615 | + * rung_real_ratio() | |
5616 | + */ | |
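| +/* | |
| + * The counterpart of cpu_ratio_to_nsec(): given the time actually spent | |
| + * scanning, sleeping for scan_time * (TIME_RATIO_SCALE - ratio) / ratio | |
| + * keeps the cpu share at roughly ratio / TIME_RATIO_SCALE. | |
| + */ | |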
5617 | +static inline | |
5618 | +unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) | |
5619 | +{ | |
5620 | + scan_time >>= 20; /* to msec level now */ | |
5621 | + BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); | |
5622 | + | |
5623 | + return (unsigned int) ((unsigned long) scan_time * | |
5624 | + (TIME_RATIO_SCALE - ratio) / ratio); | |
5625 | +} | |
5626 | + | |
5627 | +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) | |
5628 | +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) | |
5629 | + | |
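| +/* Pool pages needed to hold one rmap_list_entry per page of the vma. */ | |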
5630 | +static inline unsigned long vma_pool_size(struct vma_slot *slot) | |
5631 | +{ | |
5632 | + return round_up(sizeof(struct rmap_list_entry) * slot->pages, | |
5633 | + PAGE_SIZE) >> PAGE_SHIFT; | |
5634 | +} | |
5635 | + | |
5636 | +static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) | |
5637 | +{ | |
5638 | + struct scan_rung *rung; | |
5639 | + unsigned long pool_size, i; | |
5640 | + struct vma_slot *slot; | |
5641 | + int failed; | |
5642 | + | |
5643 | + rung = &uksm_scan_ladder[0]; | |
5644 | + | |
5645 | + failed = 0; | |
5646 | + for (i = 0; i < num; i++) { | |
5647 | + slot = slots[i]; | |
5648 | + | |
5649 | + pool_size = vma_pool_size(slot); | |
5650 | + slot->rmap_list_pool = kzalloc(sizeof(struct page *) * | |
5651 | + pool_size, GFP_KERNEL); | |
5652 | + if (!slot->rmap_list_pool) | |
5653 | + break; | |
5654 | + | |
5655 | + slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size, | |
5656 | + GFP_KERNEL); | |
5657 | + if (!slot->pool_counts) { | |
5658 | + kfree(slot->rmap_list_pool); | |
5659 | + break; | |
5660 | + } | |
5661 | + | |
5662 | + slot->pool_size = pool_size; | |
5663 | + BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); | |
5664 | + slot->flags |= UKSM_SLOT_IN_UKSM; | |
5665 | + uksm_pages_total += slot->pages; | |
5666 | + } | |
5667 | + | |
5668 | + if (i) | |
5669 | + rung_add_new_slots(rung, slots, i); | |
5670 | + | |
5671 | + return; | |
5672 | +} | |
5673 | + | |
5674 | +static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; | |
5675 | + | |
5676 | +static void uksm_enter_all_slots(void) | |
5677 | +{ | |
5678 | + struct vma_slot *slot; | |
5679 | + unsigned long index; | |
5680 | + struct list_head empty_vma_list; | |
5681 | + int i; | |
5682 | + | |
5683 | + i = 0; | |
5684 | + index = 0; | |
5685 | + INIT_LIST_HEAD(&empty_vma_list); | |
5686 | + | |
5687 | + spin_lock(&vma_slot_list_lock); | |
5688 | + while (!list_empty(&vma_slot_new)) { | |
5689 | + slot = list_entry(vma_slot_new.next, | |
5690 | + struct vma_slot, slot_list); | |
5691 | + | |
5692 | + if (!slot->vma->anon_vma) { | |
5693 | + list_move(&slot->slot_list, &empty_vma_list); | |
5694 | + } else if (vma_can_enter(slot->vma)) { | |
5695 | + batch_slots[index++] = slot; | |
5696 | + list_del_init(&slot->slot_list); | |
5697 | + } else { | |
5698 | + list_move(&slot->slot_list, &vma_slot_noadd); | |
5699 | + } | |
5700 | + | |
5701 | + if (++i == SPIN_LOCK_PERIOD || | |
5702 | + (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { | |
5703 | + spin_unlock(&vma_slot_list_lock); | |
5704 | + | |
5705 | + if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { | |
5706 | + uksm_vma_enter(batch_slots, index); | |
5707 | + index = 0; | |
5708 | + } | |
5709 | + i = 0; | |
5710 | + cond_resched(); | |
5711 | + spin_lock(&vma_slot_list_lock); | |
5712 | + } | |
5713 | + } | |
5714 | + | |
5715 | + list_splice(&empty_vma_list, &vma_slot_new); | |
5716 | + | |
5717 | + spin_unlock(&vma_slot_list_lock); | |
5718 | + | |
5719 | + if (index) | |
5720 | + uksm_vma_enter(batch_slots, index); | |
5721 | + | |
5722 | +} | |
5723 | + | |
5724 | +static inline int rung_round_finished(struct scan_rung *rung) | |
5725 | +{ | |
5726 | + return rung->flags & UKSM_RUNG_ROUND_FINISHED; | |
5727 | +} | |
5728 | + | |
5729 | +static inline void judge_slot(struct vma_slot *slot) | |
5730 | +{ | |
5731 | + struct scan_rung *rung = slot->rung; | |
5732 | + unsigned long dedup; | |
5733 | + int deleted; | |
5734 | + | |
5735 | + dedup = cal_dedup_ratio(slot); | |
5736 | + if (vma_fully_scanned(slot) && uksm_thrash_threshold) | |
5737 | + deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]); | |
5738 | + else if (dedup && dedup >= uksm_abundant_threshold) | |
5739 | + deleted = vma_rung_up(slot); | |
5740 | + else | |
5741 | + deleted = vma_rung_down(slot); | |
5742 | + | |
5743 | + slot->pages_merged = 0; | |
5744 | + slot->pages_cowed = 0; | |
5745 | + | |
5746 | + if (vma_fully_scanned(slot)) | |
5747 | + slot->pages_scanned = 0; | |
5748 | + | |
5749 | + slot->last_scanned = slot->pages_scanned; | |
5750 | + | |
5751 | + /* If it was deleted above, the rung has already been advanced. */ | |
5752 | + if (!deleted) | |
5753 | + advance_current_scan(rung); | |
5754 | +} | |
5755 | + | |
5756 | + | |
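| +/* | |
| + * A hash round is considered finished once the pages scanned in it exceed | |
| + * a quarter of all pages tracked by uksm; fully_scanned_round is bumped | |
| + * and rshash_adjust() is only attempted on such boundaries. | |
| + */ | |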
5757 | +static inline int hash_round_finished(void) | |
5758 | +{ | |
5759 | + if (scanned_virtual_pages > (uksm_pages_total >> 2)) { | |
5760 | + scanned_virtual_pages = 0; | |
5761 | + if (uksm_pages_scanned) | |
5762 | + fully_scanned_round++; | |
5763 | + | |
5764 | + return 1; | |
5765 | + } else { | |
5766 | + return 0; | |
5767 | + } | |
5768 | +} | |
5769 | + | |
5770 | +#define UKSM_MMSEM_BATCH 5 | |
5771 | +#define BUSY_RETRY 100 | |
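| +/* | |
| + * The mmap_sem of a slot is held across a small batch of consecutive page | |
| + * scans rather than re-taken per page; UKSM_MMSEM_BATCH bounds that batch. | |
| + * BUSY_RETRY bounds how many slots of a busy mm are skipped over before | |
| + * this rung is given up for the current pass. | |
| + */ | |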
5772 | + | |
5773 | +/** | |
5774 | + * uksm_do_scan() - the main worker function. | |
5775 | + */ | |
5776 | +static noinline void uksm_do_scan(void) | |
5777 | +{ | |
5778 | + struct vma_slot *slot, *iter; | |
5779 | + struct mm_struct *busy_mm; | |
5780 | + unsigned char round_finished, all_rungs_empty; | |
5781 | + int i, err, mmsem_batch; | |
5782 | + unsigned long pcost; | |
5783 | + long long delta_exec; | |
5784 | + unsigned long vpages, max_cpu_ratio; | |
5785 | + unsigned long long start_time, end_time, scan_time; | |
5786 | + unsigned int expected_jiffies; | |
5787 | + | |
5788 | + might_sleep(); | |
5789 | + | |
5790 | + vpages = 0; | |
5791 | + | |
5792 | + start_time = task_sched_runtime(current); | |
5793 | + max_cpu_ratio = 0; | |
5794 | + mmsem_batch = 0; | |
5795 | + | |
5796 | + for (i = 0; i < SCAN_LADDER_SIZE;) { | |
5797 | + struct scan_rung *rung = &uksm_scan_ladder[i]; | |
5798 | + unsigned long ratio; | |
5799 | + int busy_retry; | |
5800 | + | |
5801 | + if (!rung->pages_to_scan) { | |
5802 | + i++; | |
5803 | + continue; | |
5804 | + } | |
5805 | + | |
5806 | + if (!rung->vma_root.num) { | |
5807 | + rung->pages_to_scan = 0; | |
5808 | + i++; | |
5809 | + continue; | |
5810 | + } | |
5811 | + | |
5812 | + ratio = rung_real_ratio(rung->cpu_ratio); | |
5813 | + if (ratio > max_cpu_ratio) | |
5814 | + max_cpu_ratio = ratio; | |
5815 | + | |
5816 | + busy_retry = BUSY_RETRY; | |
5817 | + /* | |
5818 | + * Do not consider rung_round_finished() here, just use up the | |
5819 | + * rung->pages_to_scan quota. | |
5820 | + */ | |
5821 | + while (rung->pages_to_scan && rung->vma_root.num && | |
5822 | + likely(!freezing(current))) { | |
5823 | + int reset = 0; | |
5824 | + | |
5825 | + slot = rung->current_scan; | |
5826 | + | |
5827 | + BUG_ON(vma_fully_scanned(slot)); | |
5828 | + | |
5829 | + if (mmsem_batch) { | |
5830 | + err = 0; | |
5831 | + } else { | |
5832 | + err = try_down_read_slot_mmap_sem(slot); | |
5833 | + } | |
5834 | + | |
5835 | + if (err == -ENOENT) { | |
5836 | +rm_slot: | |
5837 | + rung_rm_slot(slot); | |
5838 | + continue; | |
5839 | + } | |
5840 | + | |
5841 | + busy_mm = slot->mm; | |
5842 | + | |
5843 | + if (err == -EBUSY) { | |
5844 | + /* skip other vmas on the same mm */ | |
5845 | + do { | |
5846 | + reset = advance_current_scan(rung); | |
5847 | + iter = rung->current_scan; | |
5848 | + busy_retry--; | |
5849 | + if (iter->vma->vm_mm != busy_mm || | |
5850 | + !busy_retry || reset) | |
5851 | + break; | |
5852 | + } while (1); | |
5853 | + | |
5854 | + if (iter->vma->vm_mm != busy_mm) { | |
5855 | + continue; | |
5856 | + } else { | |
5857 | + /* scan round finished */ | |
5858 | + break; | |
5859 | + } | |
5860 | + } | |
5861 | + | |
5862 | + BUG_ON(!vma_can_enter(slot->vma)); | |
5863 | + if (uksm_test_exit(slot->vma->vm_mm)) { | |
5864 | + mmsem_batch = 0; | |
5865 | + up_read(&slot->vma->vm_mm->mmap_sem); | |
5866 | + goto rm_slot; | |
5867 | + } | |
5868 | + | |
5869 | + if (mmsem_batch) | |
5870 | + mmsem_batch--; | |
5871 | + else | |
5872 | + mmsem_batch = UKSM_MMSEM_BATCH; | |
5873 | + | |
5874 | + /* OK, we have taken the mmap_sem, ready to scan */ | |
5875 | + scan_vma_one_page(slot); | |
5876 | + rung->pages_to_scan--; | |
5877 | + vpages++; | |
5878 | + | |
5879 | + if (rung->current_offset + rung->step > slot->pages - 1 | |
5880 | + || vma_fully_scanned(slot)) { | |
5881 | + up_read(&slot->vma->vm_mm->mmap_sem); | |
5882 | + judge_slot(slot); | |
5883 | + mmsem_batch = 0; | |
5884 | + } else { | |
5885 | + rung->current_offset += rung->step; | |
5886 | + if (!mmsem_batch) | |
5887 | + up_read(&slot->vma->vm_mm->mmap_sem); | |
5888 | + } | |
5889 | + | |
5890 | + busy_retry = BUSY_RETRY; | |
5891 | + cond_resched(); | |
5892 | + } | |
5893 | + | |
5894 | + if (mmsem_batch) { | |
5895 | + up_read(&slot->vma->vm_mm->mmap_sem); | |
5896 | + mmsem_batch = 0; | |
5897 | + } | |
5898 | + | |
5899 | + if (freezing(current)) | |
5900 | + break; | |
5901 | + | |
5902 | + cond_resched(); | |
5903 | + } | |
5904 | + end_time = task_sched_runtime(current); | |
5905 | + delta_exec = end_time - start_time; | |
5906 | + | |
5907 | + if (freezing(current)) | |
5908 | + return; | |
5909 | + | |
5910 | + cleanup_vma_slots(); | |
5911 | + uksm_enter_all_slots(); | |
5912 | + | |
5913 | + round_finished = 1; | |
5914 | + all_rungs_empty = 1; | |
5915 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
5916 | + struct scan_rung *rung = &uksm_scan_ladder[i]; | |
5917 | + | |
5918 | + if (rung->vma_root.num) { | |
5919 | + all_rungs_empty = 0; | |
5920 | + if (!rung_round_finished(rung)) | |
5921 | + round_finished = 0; | |
5922 | + } | |
5923 | + } | |
5924 | + | |
5925 | + if (all_rungs_empty) | |
5926 | + round_finished = 0; | |
5927 | + | |
5928 | + if (round_finished) { | |
5929 | + round_update_ladder(); | |
5930 | + uksm_eval_round++; | |
5931 | + | |
5932 | + if (hash_round_finished() && rshash_adjust()) { | |
5933 | + /* Reset the unstable root iff hash strength changed */ | |
5934 | + uksm_hash_round++; | |
5935 | + root_unstable_tree = RB_ROOT; | |
5936 | + free_all_tree_nodes(&unstable_tree_node_list); | |
5937 | + } | |
5938 | + | |
5939 | + /* | |
5940 | + * A number of pages can hang around indefinitely on per-cpu | |
5941 | + * pagevecs, raised page count preventing write_protect_page | |
5942 | + * from merging them. Though it doesn't really matter much, | |
5943 | + * it is puzzling to see some stuck in pages_volatile until | |
5944 | + * other activity jostles them out, and they also prevented | |
5945 | + * LTP's KSM test from succeeding deterministically; so drain | |
5946 | + * them here (here rather than on entry to uksm_do_scan(), | |
5947 | + * so we don't IPI too often when pages_to_scan is set low). | |
5948 | + */ | |
5949 | + lru_add_drain_all(); | |
5950 | + } | |
5951 | + | |
5952 | + | |
5953 | + if (vpages && delta_exec > 0) { | |
5954 | + pcost = (unsigned long) delta_exec / vpages; | |
5955 | + if (likely(uksm_ema_page_time)) | |
5956 | + uksm_ema_page_time = ema(pcost, uksm_ema_page_time); | |
5957 | + else | |
5958 | + uksm_ema_page_time = pcost; | |
5959 | + } | |
5960 | + | |
5961 | + uksm_calc_scan_pages(); | |
5962 | + uksm_sleep_real = uksm_sleep_jiffies; | |
5963 | + /* in case of radical cpu bursts, apply the upper bound */ | |
5964 | + end_time = task_sched_runtime(current); | |
5965 | + if (max_cpu_ratio && end_time > start_time) { | |
5966 | + scan_time = end_time - start_time; | |
5967 | + expected_jiffies = msecs_to_jiffies( | |
5968 | + scan_time_to_sleep(scan_time, max_cpu_ratio)); | |
5969 | + | |
5970 | + if (expected_jiffies > uksm_sleep_real) | |
5971 | + uksm_sleep_real = expected_jiffies; | |
5972 | + | |
5973 | + /* We cap the sleep at 1 second for responsiveness. */ | |
5974 | + if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) | |
5975 | + uksm_sleep_real = msecs_to_jiffies(1000); | |
5976 | + } | |
5977 | + | |
5978 | + return; | |
5979 | +} | |
5980 | + | |
5981 | +static int ksmd_should_run(void) | |
5982 | +{ | |
5983 | + return uksm_run & UKSM_RUN_MERGE; | |
5984 | +} | |
5985 | + | |
5986 | +static int uksm_scan_thread(void *nothing) | |
5987 | +{ | |
5988 | + set_freezable(); | |
5989 | + set_user_nice(current, 5); | |
5990 | + | |
5991 | + while (!kthread_should_stop()) { | |
5992 | + mutex_lock(&uksm_thread_mutex); | |
5993 | + if (ksmd_should_run()) { | |
5994 | + uksm_do_scan(); | |
5995 | + } | |
5996 | + mutex_unlock(&uksm_thread_mutex); | |
5997 | + | |
5998 | + try_to_freeze(); | |
5999 | + | |
6000 | + if (ksmd_should_run()) { | |
6001 | + schedule_timeout_interruptible(uksm_sleep_real); | |
6002 | + uksm_sleep_times++; | |
6003 | + } else { | |
6004 | + wait_event_freezable(uksm_thread_wait, | |
6005 | + ksmd_should_run() || kthread_should_stop()); | |
6006 | + } | |
6007 | + } | |
6008 | + return 0; | |
6009 | +} | |
6010 | + | |
6011 | +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) | |
6012 | +{ | |
6013 | + struct stable_node *stable_node; | |
6014 | + struct node_vma *node_vma; | |
6015 | + struct rmap_item *rmap_item; | |
6016 | + int ret = SWAP_AGAIN; | |
6017 | + int search_new_forks = 0; | |
6018 | + unsigned long address; | |
6019 | + | |
6020 | + VM_BUG_ON_PAGE(!PageKsm(page), page); | |
6021 | + VM_BUG_ON_PAGE(!PageLocked(page), page); | |
6022 | + | |
6023 | + stable_node = page_stable_node(page); | |
6024 | + if (!stable_node) | |
6025 | + return ret; | |
6026 | +again: | |
6027 | + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { | |
6028 | + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { | |
6029 | + struct anon_vma *anon_vma = rmap_item->anon_vma; | |
6030 | + struct anon_vma_chain *vmac; | |
6031 | + struct vm_area_struct *vma; | |
6032 | + | |
6033 | + anon_vma_lock_read(anon_vma); | |
6034 | + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | |
6035 | + 0, ULONG_MAX) { | |
6036 | + vma = vmac->vma; | |
6037 | + address = get_rmap_addr(rmap_item); | |
6038 | + | |
6039 | + if (address < vma->vm_start || | |
6040 | + address >= vma->vm_end) | |
6041 | + continue; | |
6042 | + | |
6043 | + if ((rmap_item->slot->vma == vma) == | |
6044 | + search_new_forks) | |
6045 | + continue; | |
6046 | + | |
6047 | + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | |
6048 | + continue; | |
6049 | + | |
6050 | + ret = rwc->rmap_one(page, vma, address, rwc->arg); | |
6051 | + if (ret != SWAP_AGAIN) { | |
6052 | + anon_vma_unlock_read(anon_vma); | |
6053 | + goto out; | |
6054 | + } | |
6055 | + | |
6056 | + if (rwc->done && rwc->done(page)) { | |
6057 | + anon_vma_unlock_read(anon_vma); | |
6058 | + goto out; | |
6059 | + } | |
6060 | + } | |
6061 | + anon_vma_unlock_read(anon_vma); | |
6062 | + } | |
6063 | + } | |
6064 | + if (!search_new_forks++) | |
6065 | + goto again; | |
6066 | +out: | |
6067 | + return ret; | |
6068 | +} | |
6069 | + | |
6070 | +#ifdef CONFIG_MIGRATION | |
6071 | +/* Common ksm interface but may be specific to uksm */ | |
6072 | +void ksm_migrate_page(struct page *newpage, struct page *oldpage) | |
6073 | +{ | |
6074 | + struct stable_node *stable_node; | |
6075 | + | |
6076 | + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); | |
6077 | + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); | |
6078 | + VM_BUG_ON(newpage->mapping != oldpage->mapping); | |
6079 | + | |
6080 | + stable_node = page_stable_node(newpage); | |
6081 | + if (stable_node) { | |
6082 | + VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); | |
6083 | + stable_node->kpfn = page_to_pfn(newpage); | |
6084 | + } | |
6085 | +} | |
6086 | +#endif /* CONFIG_MIGRATION */ | |
6087 | + | |
6088 | +#ifdef CONFIG_MEMORY_HOTREMOVE | |
6089 | +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, | |
6090 | + unsigned long end_pfn) | |
6091 | +{ | |
6092 | + struct rb_node *node; | |
6093 | + | |
6094 | + for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { | |
6095 | + struct stable_node *stable_node; | |
6096 | + | |
6097 | + stable_node = rb_entry(node, struct stable_node, node); | |
6098 | + if (stable_node->kpfn >= start_pfn && | |
6099 | + stable_node->kpfn < end_pfn) | |
6100 | + return stable_node; | |
6101 | + } | |
6102 | + return NULL; | |
6103 | +} | |
6104 | + | |
6105 | +static int uksm_memory_callback(struct notifier_block *self, | |
6106 | + unsigned long action, void *arg) | |
6107 | +{ | |
6108 | + struct memory_notify *mn = arg; | |
6109 | + struct stable_node *stable_node; | |
6110 | + | |
6111 | + switch (action) { | |
6112 | + case MEM_GOING_OFFLINE: | |
6113 | + /* | |
6114 | + * Keep it very simple for now: just lock out ksmd and | |
6115 | + * MADV_UNMERGEABLE while any memory is going offline. | |
6116 | + * mutex_lock_nested() is necessary because lockdep was alarmed | |
6117 | + * that here we take uksm_thread_mutex inside notifier chain | |
6118 | + * mutex, and later take notifier chain mutex inside | |
6119 | + * uksm_thread_mutex to unlock it. But that's safe because both | |
6120 | + * are inside mem_hotplug_mutex. | |
6121 | + */ | |
6122 | + mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); | |
6123 | + break; | |
6124 | + | |
6125 | + case MEM_OFFLINE: | |
6126 | + /* | |
6127 | + * Most of the work is done by page migration; but there might | |
6128 | + * be a few stable_nodes left over, still pointing to struct | |
6129 | + * pages which have been offlined: prune those from the tree. | |
6130 | + */ | |
6131 | + while ((stable_node = uksm_check_stable_tree(mn->start_pfn, | |
6132 | + mn->start_pfn + mn->nr_pages)) != NULL) | |
6133 | + remove_node_from_stable_tree(stable_node, 1, 1); | |
6134 | + /* fallthrough */ | |
6135 | + | |
6136 | + case MEM_CANCEL_OFFLINE: | |
6137 | + mutex_unlock(&uksm_thread_mutex); | |
6138 | + break; | |
6139 | + } | |
6140 | + return NOTIFY_OK; | |
6141 | +} | |
6142 | +#endif /* CONFIG_MEMORY_HOTREMOVE */ | |
6143 | + | |
6144 | +#ifdef CONFIG_SYSFS | |
6145 | +/* | |
6146 | + * This all compiles without CONFIG_SYSFS, but is a waste of space. | |
6147 | + */ | |
6148 | + | |
6149 | +#define UKSM_ATTR_RO(_name) \ | |
6150 | + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | |
6151 | +#define UKSM_ATTR(_name) \ | |
6152 | + static struct kobj_attribute _name##_attr = \ | |
6153 | + __ATTR(_name, 0644, _name##_show, _name##_store) | |
6154 | + | |
6155 | +static ssize_t max_cpu_percentage_show(struct kobject *kobj, | |
6156 | + struct kobj_attribute *attr, char *buf) | |
6157 | +{ | |
6158 | + return sprintf(buf, "%u\n", uksm_max_cpu_percentage); | |
6159 | +} | |
6160 | + | |
6161 | +static ssize_t max_cpu_percentage_store(struct kobject *kobj, | |
6162 | + struct kobj_attribute *attr, | |
6163 | + const char *buf, size_t count) | |
6164 | +{ | |
6165 | + unsigned long max_cpu_percentage; | |
6166 | + int err; | |
6167 | + | |
6168 | + err = kstrtoul(buf, 10, &max_cpu_percentage); | |
6169 | + if (err || max_cpu_percentage > 100) | |
6170 | + return -EINVAL; | |
6171 | + | |
6172 | + if (max_cpu_percentage == 100) | |
6173 | + max_cpu_percentage = 99; | |
6174 | + else if (max_cpu_percentage < 10) | |
6175 | + max_cpu_percentage = 10; | |
6176 | + | |
6177 | + uksm_max_cpu_percentage = max_cpu_percentage; | |
6178 | + | |
6179 | + return count; | |
6180 | +} | |
6181 | +UKSM_ATTR(max_cpu_percentage); | |
6182 | + | |
6183 | +static ssize_t sleep_millisecs_show(struct kobject *kobj, | |
6184 | + struct kobj_attribute *attr, char *buf) | |
6185 | +{ | |
6186 | + return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); | |
6187 | +} | |
6188 | + | |
6189 | +static ssize_t sleep_millisecs_store(struct kobject *kobj, | |
6190 | + struct kobj_attribute *attr, | |
6191 | + const char *buf, size_t count) | |
6192 | +{ | |
6193 | + unsigned long msecs; | |
6194 | + int err; | |
6195 | + | |
6196 | + err = kstrtoul(buf, 10, &msecs); | |
6197 | + if (err || msecs > MSEC_PER_SEC) | |
6198 | + return -EINVAL; | |
6199 | + | |
6200 | + uksm_sleep_jiffies = msecs_to_jiffies(msecs); | |
6201 | + uksm_sleep_saved = uksm_sleep_jiffies; | |
6202 | + | |
6203 | + return count; | |
6204 | +} | |
6205 | +UKSM_ATTR(sleep_millisecs); | |
6206 | + | |
6207 | + | |
6208 | +static ssize_t cpu_governor_show(struct kobject *kobj, | |
6209 | + struct kobj_attribute *attr, char *buf) | |
6210 | +{ | |
6211 | + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); | |
6212 | + int i; | |
6213 | + | |
6214 | + buf[0] = '\0'; | |
6215 | + for (i = 0; i < n ; i++) { | |
6216 | + if (uksm_cpu_governor == i) | |
6217 | + strcat(buf, "["); | |
6218 | + | |
6219 | + strcat(buf, uksm_cpu_governor_str[i]); | |
6220 | + | |
6221 | + if (uksm_cpu_governor == i) | |
6222 | + strcat(buf, "]"); | |
6223 | + | |
6224 | + strcat(buf, " "); | |
6225 | + } | |
6226 | + strcat(buf, "\n"); | |
6227 | + | |
6228 | + return strlen(buf); | |
6229 | +} | |
6230 | + | |
6231 | +static inline void init_performance_values(void) | |
6232 | +{ | |
6233 | + int i; | |
6234 | + struct scan_rung *rung; | |
6235 | + struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; | |
6236 | + | |
6237 | + | |
6238 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
6239 | + rung = uksm_scan_ladder + i; | |
6240 | + rung->cpu_ratio = preset->cpu_ratio[i]; | |
6241 | + rung->cover_msecs = preset->cover_msecs[i]; | |
6242 | + } | |
6243 | + | |
6244 | + uksm_max_cpu_percentage = preset->max_cpu; | |
6245 | +} | |
6246 | + | |
6247 | +static ssize_t cpu_governor_store(struct kobject *kobj, | |
6248 | + struct kobj_attribute *attr, | |
6249 | + const char *buf, size_t count) | |
6250 | +{ | |
6251 | + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); | |
6252 | + | |
6253 | + for (n--; n >= 0; n--) { | |
6254 | + if (!strncmp(buf, uksm_cpu_governor_str[n], | |
6255 | + strlen(uksm_cpu_governor_str[n]))) | |
6256 | + break; | |
6257 | + } | |
6258 | + | |
6259 | + if (n < 0) | |
6260 | + return -EINVAL; | |
6261 | + else | |
6262 | + uksm_cpu_governor = n; | |
6263 | + | |
6264 | + init_performance_values(); | |
6265 | + | |
6266 | + return count; | |
6267 | +} | |
6268 | +UKSM_ATTR(cpu_governor); | |
6269 | + | |
6270 | +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, | |
6271 | + char *buf) | |
6272 | +{ | |
6273 | + return sprintf(buf, "%u\n", uksm_run); | |
6274 | +} | |
6275 | + | |
6276 | +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |
6277 | + const char *buf, size_t count) | |
6278 | +{ | |
6279 | + int err; | |
6280 | + unsigned long flags; | |
6281 | + | |
6282 | + err = kstrtoul(buf, 10, &flags); | |
6283 | + if (err || flags > UINT_MAX) | |
6284 | + return -EINVAL; | |
6285 | + if (flags > UKSM_RUN_MERGE) | |
6286 | + return -EINVAL; | |
6287 | + | |
6288 | + mutex_lock(&uksm_thread_mutex); | |
6289 | + if (uksm_run != flags) { | |
6290 | + uksm_run = flags; | |
6291 | + } | |
6292 | + mutex_unlock(&uksm_thread_mutex); | |
6293 | + | |
6294 | + if (flags & UKSM_RUN_MERGE) | |
6295 | + wake_up_interruptible(&uksm_thread_wait); | |
6296 | + | |
6297 | + return count; | |
6298 | +} | |
6299 | +UKSM_ATTR(run); | |
6300 | + | |
6301 | +static ssize_t abundant_threshold_show(struct kobject *kobj, | |
6302 | + struct kobj_attribute *attr, char *buf) | |
6303 | +{ | |
6304 | + return sprintf(buf, "%u\n", uksm_abundant_threshold); | |
6305 | +} | |
6306 | + | |
6307 | +static ssize_t abundant_threshold_store(struct kobject *kobj, | |
6308 | + struct kobj_attribute *attr, | |
6309 | + const char *buf, size_t count) | |
6310 | +{ | |
6311 | + int err; | |
6312 | + unsigned long flags; | |
6313 | + | |
6314 | + err = kstrtoul(buf, 10, &flags); | |
6315 | + if (err || flags > 99) | |
6316 | + return -EINVAL; | |
6317 | + | |
6318 | + uksm_abundant_threshold = flags; | |
6319 | + | |
6320 | + return count; | |
6321 | +} | |
6322 | +UKSM_ATTR(abundant_threshold); | |
6323 | + | |
6324 | +static ssize_t thrash_threshold_show(struct kobject *kobj, | |
6325 | + struct kobj_attribute *attr, char *buf) | |
6326 | +{ | |
6327 | + return sprintf(buf, "%u\n", uksm_thrash_threshold); | |
6328 | +} | |
6329 | + | |
6330 | +static ssize_t thrash_threshold_store(struct kobject *kobj, | |
6331 | + struct kobj_attribute *attr, | |
6332 | + const char *buf, size_t count) | |
6333 | +{ | |
6334 | + int err; | |
6335 | + unsigned long flags; | |
6336 | + | |
6337 | + err = kstrtoul(buf, 10, &flags); | |
6338 | + if (err || flags > 99) | |
6339 | + return -EINVAL; | |
6340 | + | |
6341 | + uksm_thrash_threshold = flags; | |
6342 | + | |
6343 | + return count; | |
6344 | +} | |
6345 | +UKSM_ATTR(thrash_threshold); | |
6346 | + | |
6347 | +static ssize_t cpu_ratios_show(struct kobject *kobj, | |
6348 | + struct kobj_attribute *attr, char *buf) | |
6349 | +{ | |
6350 | + int i, size; | |
6351 | + struct scan_rung *rung; | |
6352 | + char *p = buf; | |
6353 | + | |
6354 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
6355 | + rung = &uksm_scan_ladder[i]; | |
6356 | + | |
6357 | + if (rung->cpu_ratio > 0) | |
6358 | + size = sprintf(p, "%d ", rung->cpu_ratio); | |
6359 | + else | |
6360 | + size = sprintf(p, "MAX/%d ", | |
6361 | + TIME_RATIO_SCALE / -rung->cpu_ratio); | |
6362 | + | |
6363 | + p += size; | |
6364 | + } | |
6365 | + | |
6366 | + *p++ = '\n'; | |
6367 | + *p = '\0'; | |
6368 | + | |
6369 | + return p - buf; | |
6370 | +} | |
6371 | + | |
6372 | +static ssize_t cpu_ratios_store(struct kobject *kobj, | |
6373 | + struct kobj_attribute *attr, | |
6374 | + const char *buf, size_t count) | |
6375 | +{ | |
6376 | + int i, cpuratios[SCAN_LADDER_SIZE], err; | |
6377 | + unsigned long value; | |
6378 | + struct scan_rung *rung; | |
6379 | + char *p, *begin, *end = NULL; | |
6380 | + | |
6381 | + begin = p = kzalloc(count + 1, GFP_KERNEL); | |
6382 | + if (!p) | |
6383 | + return -ENOMEM; | |
6384 | + | |
6385 | + memcpy(p, buf, count); | |
6386 | + | |
6387 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
6388 | + if (i != SCAN_LADDER_SIZE - 1) { | |
6389 | + end = strchr(p, ' '); | |
6390 | + if (!end) | |
6391 | + goto out_inval; | |
6392 | + | |
6393 | + *end = '\0'; | |
6394 | + } | |
6395 | + | |
6396 | + if (strstr(p, "MAX/")) { | |
6397 | + p = strchr(p, '/') + 1; | |
6398 | + err = kstrtoul(p, 10, &value); | |
6399 | + if (err || value > TIME_RATIO_SCALE || !value) | |
6400 | + goto out_inval; | |
6401 | + | |
6402 | + cpuratios[i] = -(int)(TIME_RATIO_SCALE / value); | |
6403 | + } else { | |
6404 | + err = kstrtoul(p, 10, &value); | |
6405 | + if (err || value > TIME_RATIO_SCALE || !value) | |
6406 | + goto out_inval; | |
6407 | + | |
6408 | + cpuratios[i] = value; | |
6409 | + } | |
6410 | + | |
6411 | + p = end + 1; | |
6412 | + } | |
6413 | + | |
6414 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
6415 | + rung = &uksm_scan_ladder[i]; | |
6416 | + | |
6417 | + rung->cpu_ratio = cpuratios[i]; | |
6418 | + } | |
6419 | + | |
6420 | + kfree(begin); | |
 | + return count; | |
 | + | |
 | +out_inval: | |
 | + kfree(begin); | |
 | + return -EINVAL; | |
6421 | +} | |
6422 | +UKSM_ATTR(cpu_ratios); | |
6423 | + | |
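cpu_ratios_store() accepts SCAN_LADDER_SIZE space-separated entries, each either a plain ratio or "MAX/n"; the latter is stored as the negative value -(TIME_RATIO_SCALE / n), which cpu_ratios_show() converts back. A standalone sketch of that round trip, assuming TIME_RATIO_SCALE is 1000 purely for illustration:

    #include <stdio.h>

    #define TIME_RATIO_SCALE 1000	/* assumed value, for illustration only */

    /* Encode a "MAX/n" entry the way cpu_ratios_store() does. */
    static int encode_max_ratio(unsigned long n)
    {
    	return -(int)(TIME_RATIO_SCALE / n);
    }

    int main(void)
    {
    	int ratio = encode_max_ratio(4);	/* user wrote "MAX/4" */

    	if (ratio > 0)
    		printf("%d\n", ratio);
    	else	/* mirror of cpu_ratios_show(): prints "MAX/4" */
    		printf("MAX/%d\n", TIME_RATIO_SCALE / -ratio);
    	return 0;
    }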
6424 | +static ssize_t eval_intervals_show(struct kobject *kobj, | |
6425 | + struct kobj_attribute *attr, char *buf) | |
6426 | +{ | |
6427 | + int i, size; | |
6428 | + struct scan_rung *rung; | |
6429 | + char *p = buf; | |
6430 | + | |
6431 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
6432 | + rung = &uksm_scan_ladder[i]; | |
6433 | + size = sprintf(p, "%u ", rung->cover_msecs); | |
6434 | + p += size; | |
6435 | + } | |
6436 | + | |
6437 | + *p++ = '\n'; | |
6438 | + *p = '\0'; | |
6439 | + | |
6440 | + return p - buf; | |
6441 | +} | |
6442 | + | |
6443 | +static ssize_t eval_intervals_store(struct kobject *kobj, | |
6444 | + struct kobj_attribute *attr, | |
6445 | + const char *buf, size_t count) | |
6446 | +{ | |
6447 | + int i, err; | |
6448 | + unsigned long values[SCAN_LADDER_SIZE]; | |
6449 | + struct scan_rung *rung; | |
6450 | + char *p, *begin, *end = NULL; | |
6451 | + | |
6452 | + begin = p = kzalloc(count + 1, GFP_KERNEL); | |
6453 | + if (!p) | |
6454 | + return -ENOMEM; | |
6455 | + | |
6456 | + memcpy(p, buf, count); | |
6457 | + | |
6458 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
6459 | + if (i != SCAN_LADDER_SIZE - 1) { | |
6460 | + end = strchr(p, ' '); | |
6461 | + if (!end) | |
6462 | + goto out_inval; | |
6463 | + | |
6464 | + *end = '\0'; | |
6465 | + } | |
6466 | + | |
6467 | + err = kstrtoul(p, 10, &values[i]); | |
6468 | + if (err) | |
6469 | + goto out_inval; | |
6470 | + | |
6471 | + p = end + 1; | |
6472 | + } | |
6473 | + | |
6474 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
6475 | + rung = &uksm_scan_ladder[i]; | |
6476 | + | |
6477 | + rung->cover_msecs = values[i]; | |
6478 | + } | |
6479 | + | |
6480 | + kfree(begin); | |
 | + return count; | |
 | + | |
 | +out_inval: | |
 | + kfree(begin); | |
 | + return -EINVAL; | |
6481 | +} | |
6482 | +UKSM_ATTR(eval_intervals); | |
6483 | + | |
6484 | +static ssize_t ema_per_page_time_show(struct kobject *kobj, | |
6485 | + struct kobj_attribute *attr, char *buf) | |
6486 | +{ | |
6487 | + return sprintf(buf, "%lu\n", uksm_ema_page_time); | |
6488 | +} | |
6489 | +UKSM_ATTR_RO(ema_per_page_time); | |
6490 | + | |
6491 | +static ssize_t pages_shared_show(struct kobject *kobj, | |
6492 | + struct kobj_attribute *attr, char *buf) | |
6493 | +{ | |
6494 | + return sprintf(buf, "%lu\n", uksm_pages_shared); | |
6495 | +} | |
6496 | +UKSM_ATTR_RO(pages_shared); | |
6497 | + | |
6498 | +static ssize_t pages_sharing_show(struct kobject *kobj, | |
6499 | + struct kobj_attribute *attr, char *buf) | |
6500 | +{ | |
6501 | + return sprintf(buf, "%lu\n", uksm_pages_sharing); | |
6502 | +} | |
6503 | +UKSM_ATTR_RO(pages_sharing); | |
6504 | + | |
6505 | +static ssize_t pages_unshared_show(struct kobject *kobj, | |
6506 | + struct kobj_attribute *attr, char *buf) | |
6507 | +{ | |
6508 | + return sprintf(buf, "%lu\n", uksm_pages_unshared); | |
6509 | +} | |
6510 | +UKSM_ATTR_RO(pages_unshared); | |
6511 | + | |
6512 | +static ssize_t full_scans_show(struct kobject *kobj, | |
6513 | + struct kobj_attribute *attr, char *buf) | |
6514 | +{ | |
6515 | + return sprintf(buf, "%llu\n", fully_scanned_round); | |
6516 | +} | |
6517 | +UKSM_ATTR_RO(full_scans); | |
6518 | + | |
6519 | +static ssize_t pages_scanned_show(struct kobject *kobj, | |
6520 | + struct kobj_attribute *attr, char *buf) | |
6521 | +{ | |
6522 | + unsigned long base = 0; | |
6523 | + u64 delta, ret; | |
6524 | + | |
6525 | + if (pages_scanned_stored) { | |
6526 | + base = pages_scanned_base; | |
6527 | + ret = pages_scanned_stored; | |
6528 | + delta = uksm_pages_scanned >> base; | |
6529 | + if (CAN_OVERFLOW_U64(ret, delta)) { | |
6530 | + ret >>= 1; | |
6531 | + delta >>= 1; | |
6532 | + base++; | |
6533 | + ret += delta; | |
6534 | + } | |
6535 | + } else { | |
6536 | + ret = uksm_pages_scanned; | |
6537 | + } | |
6538 | + | |
6539 | + while (ret > ULONG_MAX) { | |
6540 | + ret >>= 1; | |
6541 | + base++; | |
6542 | + } | |
6543 | + | |
6544 | + if (base) | |
6545 | + return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); | |
6546 | + else | |
6547 | + return sprintf(buf, "%lu\n", (unsigned long)ret); | |
6548 | +} | |
6549 | +UKSM_ATTR_RO(pages_scanned); | |
6550 | + | |
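pages_scanned_show() reports the total as value * 2^base: pages_scanned_stored and pages_scanned_base hold an already-scaled count, the fresh uksm_pages_scanned delta is shifted down to the same scale, both are halved (bumping base) when the addition would overflow, and a final loop halves again until the value fits an unsigned long. A user-space sketch of the same idea, using a plain overflow test in place of the patch's CAN_OVERFLOW_U64() macro:

    #include <stdint.h>
    #include <stdio.h>

    /* Fold a delta into a (value, base) pair that represents value * 2^base. */
    static void scaled_add(uint64_t *value, unsigned int *base, uint64_t delta)
    {
    	delta >>= *base;			/* bring delta to the current scale */
    	while (*value > UINT64_MAX - delta) {	/* would overflow: halve both */
    		*value >>= 1;
    		delta >>= 1;
    		(*base)++;
    	}
    	*value += delta;
    }

    int main(void)
    {
    	uint64_t value = UINT64_MAX - 10;	/* an almost-full counter */
    	unsigned int base = 0;

    	scaled_add(&value, &base, 1000);
    	printf("%llu * 2^%u\n", (unsigned long long)value, base);
    	return 0;
    }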
6551 | +static ssize_t hash_strength_show(struct kobject *kobj, | |
6552 | + struct kobj_attribute *attr, char *buf) | |
6553 | +{ | |
6554 | + return sprintf(buf, "%lu\n", hash_strength); | |
6555 | +} | |
6556 | +UKSM_ATTR_RO(hash_strength); | |
6557 | + | |
6558 | +static ssize_t sleep_times_show(struct kobject *kobj, | |
6559 | + struct kobj_attribute *attr, char *buf) | |
6560 | +{ | |
6561 | + return sprintf(buf, "%llu\n", uksm_sleep_times); | |
6562 | +} | |
6563 | +UKSM_ATTR_RO(sleep_times); | |
6564 | + | |
6565 | + | |
6566 | +static struct attribute *uksm_attrs[] = { | |
6567 | + &max_cpu_percentage_attr.attr, | |
6568 | + &sleep_millisecs_attr.attr, | |
6569 | + &cpu_governor_attr.attr, | |
6570 | + &run_attr.attr, | |
6571 | + &ema_per_page_time_attr.attr, | |
6572 | + &pages_shared_attr.attr, | |
6573 | + &pages_sharing_attr.attr, | |
6574 | + &pages_unshared_attr.attr, | |
6575 | + &full_scans_attr.attr, | |
6576 | + &pages_scanned_attr.attr, | |
6577 | + &hash_strength_attr.attr, | |
6578 | + &sleep_times_attr.attr, | |
6579 | + &thrash_threshold_attr.attr, | |
6580 | + &abundant_threshold_attr.attr, | |
6581 | + &cpu_ratios_attr.attr, | |
6582 | + &eval_intervals_attr.attr, | |
6583 | + NULL, | |
6584 | +}; | |
6585 | + | |
6586 | +static struct attribute_group uksm_attr_group = { | |
6587 | + .attrs = uksm_attrs, | |
6588 | + .name = "uksm", | |
6589 | +}; | |
6590 | +#endif /* CONFIG_SYSFS */ | |
6591 | + | |
6592 | +static inline void init_scan_ladder(void) | |
6593 | +{ | |
6594 | + int i; | |
6595 | + struct scan_rung *rung; | |
6596 | + | |
6597 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
6598 | + rung = uksm_scan_ladder + i; | |
6599 | + slot_tree_init_root(&rung->vma_root); | |
6600 | + } | |
6601 | + | |
6602 | + init_performance_values(); | |
6603 | + uksm_calc_scan_pages(); | |
6604 | +} | |
6605 | + | |
6606 | +static inline int cal_positive_negative_costs(void) | |
6607 | +{ | |
6608 | + struct page *p1, *p2; | |
6609 | + unsigned char *addr1, *addr2; | |
6610 | + unsigned long i, time_start, hash_cost; | |
6611 | + unsigned long loopnum = 0; | |
6612 | + | |
6613 | + /* IMPORTANT: volatile is needed so gcc cannot optimize the benchmark loops away. */ | |
6614 | + volatile u32 hash; | |
6615 | + volatile int ret; | |
6616 | + | |
6617 | + p1 = alloc_page(GFP_KERNEL); | |
6618 | + if (!p1) | |
6619 | + return -ENOMEM; | |
6620 | + | |
6621 | + p2 = alloc_page(GFP_KERNEL); | |
6622 | + if (!p2) { | |
6623 | + __free_page(p1); | |
 | + return -ENOMEM; | |
 | + } | |
6624 | + | |
6625 | + addr1 = kmap_atomic(p1); | |
6626 | + addr2 = kmap_atomic(p2); | |
6627 | + memset(addr1, prandom_u32(), PAGE_SIZE); | |
6628 | + memcpy(addr2, addr1, PAGE_SIZE); | |
6629 | + | |
6630 | + /* make sure that the two pages differ in last byte */ | |
6631 | + addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; | |
6632 | + kunmap_atomic(addr2); | |
6633 | + kunmap_atomic(addr1); | |
6634 | + | |
6635 | + time_start = jiffies; | |
6636 | + while (jiffies - time_start < 100) { | |
6637 | + for (i = 0; i < 100; i++) | |
6638 | + hash = page_hash(p1, HASH_STRENGTH_FULL, 0); | |
6639 | + loopnum += 100; | |
6640 | + } | |
6641 | + hash_cost = (jiffies - time_start); | |
6642 | + | |
6643 | + time_start = jiffies; | |
6644 | + for (i = 0; i < loopnum; i++) | |
6645 | + ret = pages_identical(p1, p2); | |
6646 | + memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); | |
6647 | + memcmp_cost /= hash_cost; | |
6648 | + printk(KERN_INFO "UKSM: relative memcmp_cost = %lu " | |
6649 | + "hash=%u cmp_ret=%d.\n", | |
6650 | + memcmp_cost, hash, ret); | |
6651 | + | |
6652 | + __free_page(p1); | |
6653 | + __free_page(p2); | |
6654 | + return 0; | |
6655 | +} | |
6656 | + | |
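cal_positive_negative_costs() times roughly 100 jiffies of full-strength page hashing, runs the same number of pages_identical() comparisons, and stores HASH_STRENGTH_FULL * t_memcmp / t_hash as the relative memcmp_cost printed above. A worked example with made-up timings, assuming HASH_STRENGTH_FULL is PAGE_SIZE / sizeof(u32) = 1024 for 4 KiB pages (its exact definition is earlier in the patch):

    #include <stdio.h>

    int main(void)
    {
    	/* Made-up timings; hash_strength_full is an assumed value. */
    	unsigned long hash_strength_full = 1024;
    	unsigned long hash_jiffies = 100;	/* time to hash loopnum pages */
    	unsigned long memcmp_jiffies = 40;	/* time to compare the same pages */
    	unsigned long memcmp_cost;

    	memcmp_cost = hash_strength_full * memcmp_jiffies / hash_jiffies;
    	printf("relative memcmp_cost = %lu\n", memcmp_cost);	/* prints 409 */
    	return 0;
    }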
6657 | +static int init_zeropage_hash_table(void) | |
6658 | +{ | |
6659 | + struct page *page; | |
6660 | + char *addr; | |
6661 | + int i; | |
6662 | + | |
6663 | + page = alloc_page(GFP_KERNEL); | |
6664 | + if (!page) | |
6665 | + return -ENOMEM; | |
6666 | + | |
6667 | + addr = kmap_atomic(page); | |
6668 | + memset(addr, 0, PAGE_SIZE); | |
6669 | + kunmap_atomic(addr); | |
6670 | + | |
6671 | + zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32), | |
6672 | + GFP_KERNEL); | |
6673 | + if (!zero_hash_table) { | |
6674 | + __free_page(page); | |
 | + return -ENOMEM; | |
 | + } | |
6675 | + | |
6676 | + for (i = 0; i < HASH_STRENGTH_MAX; i++) | |
6677 | + zero_hash_table[i] = page_hash(page, i, 0); | |
6678 | + | |
6679 | + __free_page(page); | |
6680 | + | |
6681 | + return 0; | |
6682 | +} | |
6683 | + | |
6684 | +static inline int init_random_sampling(void) | |
6685 | +{ | |
6686 | + unsigned long i; | |
6687 | + random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); | |
6688 | + if (!random_nums) | |
6689 | + return -ENOMEM; | |
6690 | + | |
6691 | + for (i = 0; i < HASH_STRENGTH_FULL; i++) | |
6692 | + random_nums[i] = i; | |
6693 | + | |
6694 | + for (i = 0; i < HASH_STRENGTH_FULL; i++) { | |
6695 | + unsigned long rand_range, swap_index, tmp; | |
6696 | + | |
6697 | + rand_range = HASH_STRENGTH_FULL - i; | |
6698 | + swap_index = i + prandom_u32() % rand_range; | |
6699 | + tmp = random_nums[i]; | |
6700 | + random_nums[i] = random_nums[swap_index]; | |
6701 | + random_nums[swap_index] = tmp; | |
6702 | + } | |
6703 | + | |
6704 | + rshash_state.state = RSHASH_NEW; | |
6705 | + rshash_state.below_count = 0; | |
6706 | + rshash_state.lookup_window_index = 0; | |
6707 | + | |
6708 | + return cal_positive_negative_costs(); | |
6709 | +} | |
6710 | + | |
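init_random_sampling() fills random_nums with the identity permutation 0..HASH_STRENGTH_FULL-1 and then shuffles it with the classic Fisher-Yates algorithm, so that partial-strength hashes sample page words in a random but fixed order. The same shuffle in isolation, as a user-space sketch:

    #include <stdio.h>
    #include <stdlib.h>

    /* In-place Fisher-Yates shuffle, the same scheme applied to random_nums[]. */
    static void shuffle(unsigned long *a, unsigned long n)
    {
    	unsigned long i;

    	for (i = 0; i < n; i++) {
    		unsigned long swap_index = i + rand() % (n - i);
    		unsigned long tmp = a[i];

    		a[i] = a[swap_index];
    		a[swap_index] = tmp;
    	}
    }

    int main(void)
    {
    	unsigned long i, a[8];

    	for (i = 0; i < 8; i++)
    		a[i] = i;
    	shuffle(a, 8);
    	for (i = 0; i < 8; i++)
    		printf("%lu ", a[i]);
    	printf("\n");
    	return 0;
    }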
6711 | +static int __init uksm_slab_init(void) | |
6712 | +{ | |
6713 | + rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); | |
6714 | + if (!rmap_item_cache) | |
6715 | + goto out; | |
6716 | + | |
6717 | + stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); | |
6718 | + if (!stable_node_cache) | |
6719 | + goto out_free1; | |
6720 | + | |
6721 | + node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); | |
6722 | + if (!node_vma_cache) | |
6723 | + goto out_free2; | |
6724 | + | |
6725 | + vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); | |
6726 | + if (!vma_slot_cache) | |
6727 | + goto out_free3; | |
6728 | + | |
6729 | + tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); | |
6730 | + if (!tree_node_cache) | |
6731 | + goto out_free4; | |
6732 | + | |
6733 | + return 0; | |
6734 | + | |
6735 | +out_free4: | |
6736 | + kmem_cache_destroy(vma_slot_cache); | |
6737 | +out_free3: | |
6738 | + kmem_cache_destroy(node_vma_cache); | |
6739 | +out_free2: | |
6740 | + kmem_cache_destroy(stable_node_cache); | |
6741 | +out_free1: | |
6742 | + kmem_cache_destroy(rmap_item_cache); | |
6743 | +out: | |
6744 | + return -ENOMEM; | |
6745 | +} | |
6746 | + | |
6747 | +static void __init uksm_slab_free(void) | |
6748 | +{ | |
6749 | + kmem_cache_destroy(stable_node_cache); | |
6750 | + kmem_cache_destroy(rmap_item_cache); | |
6751 | + kmem_cache_destroy(node_vma_cache); | |
6752 | + kmem_cache_destroy(vma_slot_cache); | |
6753 | + kmem_cache_destroy(tree_node_cache); | |
6754 | +} | |
6755 | + | |
6756 | +/* Interface shared with ksm, but the behaviour here differs from ksm.c. */ | |
6757 | +int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |
6758 | + unsigned long end, int advice, unsigned long *vm_flags) | |
6759 | +{ | |
6760 | + int err; | |
6761 | + | |
6762 | + switch (advice) { | |
6763 | + case MADV_MERGEABLE: | |
6764 | + return 0; /* just ignore the advice */ | |
6765 | + | |
6766 | + case MADV_UNMERGEABLE: | |
6767 | + if (!(*vm_flags & VM_MERGEABLE)) | |
6768 | + return 0; /* just ignore the advice */ | |
6769 | + | |
6770 | + if (vma->anon_vma) { | |
6771 | + err = unmerge_uksm_pages(vma, start, end); | |
6772 | + if (err) | |
6773 | + return err; | |
6774 | + } | |
6775 | + | |
6776 | + uksm_remove_vma(vma); | |
6777 | + *vm_flags &= ~VM_MERGEABLE; | |
6778 | + break; | |
6779 | + } | |
6780 | + | |
6781 | + return 0; | |
6782 | +} | |
6783 | + | |
6784 | +/* Interface shared with ksm; essentially the same as the ksm.c version. */ | |
6785 | +struct page *ksm_might_need_to_copy(struct page *page, | |
6786 | + struct vm_area_struct *vma, unsigned long address) | |
6787 | +{ | |
6788 | + struct anon_vma *anon_vma = page_anon_vma(page); | |
6789 | + struct page *new_page; | |
6790 | + | |
6791 | + if (PageKsm(page)) { | |
6792 | + if (page_stable_node(page)) | |
6793 | + return page; /* no need to copy it */ | |
6794 | + } else if (!anon_vma) { | |
6795 | + return page; /* no need to copy it */ | |
6796 | + } else if (anon_vma->root == vma->anon_vma->root && | |
6797 | + page->index == linear_page_index(vma, address)) { | |
6798 | + return page; /* still no need to copy it */ | |
6799 | + } | |
6800 | + if (!PageUptodate(page)) | |
6801 | + return page; /* let do_swap_page report the error */ | |
6802 | + | |
6803 | + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | |
6804 | + if (new_page) { | |
6805 | + copy_user_highpage(new_page, page, address, vma); | |
6806 | + | |
6807 | + SetPageDirty(new_page); | |
6808 | + __SetPageUptodate(new_page); | |
6809 | + __set_page_locked(new_page); | |
6810 | + } | |
6811 | + | |
6812 | + return new_page; | |
6813 | +} | |
6814 | + | |
6815 | +static int __init uksm_init(void) | |
6816 | +{ | |
6817 | + struct task_struct *uksm_thread; | |
6818 | + int err; | |
6819 | + | |
6820 | + uksm_sleep_jiffies = msecs_to_jiffies(100); | |
6821 | + uksm_sleep_saved = uksm_sleep_jiffies; | |
6822 | + | |
6823 | + slot_tree_init(); | |
6824 | + init_scan_ladder(); | |
6825 | + | |
6826 | + | |
6827 | + err = init_random_sampling(); | |
6828 | + if (err) | |
6829 | + goto out_free2; | |
6830 | + | |
6831 | + err = uksm_slab_init(); | |
6832 | + if (err) | |
6833 | + goto out_free1; | |
6834 | + | |
6835 | + err = init_zeropage_hash_table(); | |
6836 | + if (err) | |
6837 | + goto out_free0; | |
6838 | + | |
6839 | + uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); | |
6840 | + if (IS_ERR(uksm_thread)) { | |
6841 | + printk(KERN_ERR "uksm: creating kthread failed\n"); | |
6842 | + err = PTR_ERR(uksm_thread); | |
6843 | + goto out_free; | |
6844 | + } | |
6845 | + | |
6846 | +#ifdef CONFIG_SYSFS | |
6847 | + err = sysfs_create_group(mm_kobj, &uksm_attr_group); | |
6848 | + if (err) { | |
6849 | + printk(KERN_ERR "uksm: register sysfs failed\n"); | |
6850 | + kthread_stop(uksm_thread); | |
6851 | + goto out_free; | |
6852 | + } | |
6853 | +#else | |
6854 | + uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ | |
6855 | + | |
6856 | +#endif /* CONFIG_SYSFS */ | |
6857 | + | |
6858 | +#ifdef CONFIG_MEMORY_HOTREMOVE | |
6859 | + /* | |
6860 | + * Choose a high priority since the callback takes uksm_thread_mutex: | |
6861 | + * later callbacks could only be taking locks which nest within that. | |
6862 | + */ | |
6863 | + hotplug_memory_notifier(uksm_memory_callback, 100); | |
6864 | +#endif | |
6865 | + return 0; | |
6866 | + | |
6867 | +out_free: | |
6868 | + kfree(zero_hash_table); | |
6869 | +out_free0: | |
6870 | + uksm_slab_free(); | |
6871 | +out_free1: | |
6872 | + kfree(random_nums); | |
6873 | +out_free2: | |
6874 | + kfree(uksm_scan_ladder); | |
6875 | + return err; | |
6876 | +} | |
6877 | + | |
6878 | +#ifdef MODULE | |
6879 | +subsys_initcall(uksm_init); | |
6880 | +#else | |
6881 | +late_initcall(uksm_init); | |
6882 | +#endif | |
6883 | + | |
6884 | diff --git a/mm/vmstat.c b/mm/vmstat.c | |
6885 | index 1b12d39..24b803f 100644 | |
6886 | --- a/mm/vmstat.c | |
6887 | +++ b/mm/vmstat.c | |
6888 | @@ -795,6 +795,9 @@ const char * const vmstat_text[] = { | |
6889 | "nr_anon_transparent_hugepages", | |
6890 | "nr_free_cma", | |
6891 | ||
6892 | +#ifdef CONFIG_UKSM | |
6893 | + "nr_uksm_zero_pages", | |
6894 | +#endif | |
6895 | /* enum writeback_stat_item counters */ | |
6896 | "nr_dirty_threshold", | |
6897 | "nr_dirty_background_threshold", |
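With CONFIG_UKSM enabled, the new counter appears as the "nr_uksm_zero_pages" line in /proc/vmstat. A tiny user-space reader, for illustration:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
    	char line[128];
    	FILE *f = fopen("/proc/vmstat", "r");

    	if (!f)
    		return 1;
    	while (fgets(line, sizeof(line), f))
    		if (!strncmp(line, "nr_uksm_zero_pages", 18))
    			fputs(line, stdout);	/* print only the UKSM counter */
    	fclose(f);
    	return 0;
    }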