diff --git a/fs/exec.c b/fs/exec.c
index f9e8f6f..b060dce 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -51,6 +51,7 @@
 #include
 #include
 #include
+#include <linux/prefetch_core.h>
 #include
 #include
 #include
@@ -1167,6 +1168,8 @@ int do_execve(char * filename,
 	if (IS_ERR(file))
 		goto out_kfree;
 
+	prefetch_exec_hook(filename);
+
 	sched_exec();
 
 	bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
diff --git a/include/linux/prefetch_core.h b/include/linux/prefetch_core.h
new file mode 100644
index 0000000..a5fbd56
--- /dev/null
+++ b/include/linux/prefetch_core.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2007 Krzysztof Lichota
+ *
+ * This is prefetch core - common code used for tracing and saving trace files.
+ * It is used by prefetching modules, such as boot and app.
+ */
+
+#ifndef _LINUX_PREFETCH_CORE_H
+#define _LINUX_PREFETCH_CORE_H
+
+#include
+#include
+
+/**
+ * Trace record; describes one range of pages of an inode put into the trace.
+*/
+struct prefetch_trace_record {
+	dev_t device;
+	unsigned long inode_no;
+	pgoff_t range_start;
+	pgoff_t range_length;
+};
+
+extern char trace_file_magic[4];
+
+enum {
+	PREFETCH_FORMAT_VERSION_MAJOR = 1,
+	PREFETCH_FORMAT_VERSION_MINOR = 0
+};
+
+/**
+ * Trace on-disk header.
+ * The major version is increased with major changes of the format.
+ * If you do not support this format explicitly, do not read other fields.
+ * The minor version is increased with backward compatible changes, and
+ * you can read other fields and raw data, provided that you read
+ * trace data from the @data_start offset in the file.
+*/
+struct prefetch_trace_header {
+	char magic[4];		/* Trace file signature - should contain trace_file_magic */
+	u16 version_major;	/* Major version of trace file format */
+	u16 version_minor;	/* Minor version of trace file format */
+	u16 data_start;		/* Start of raw trace data in the file */
+};
+
+struct trace_marker {
+	unsigned position;
+	unsigned generation;
+};
+
+int prefetch_start_trace(struct trace_marker *marker);
+int prefetch_continue_trace(struct trace_marker *marker);
+int prefetch_stop_trace(struct trace_marker *marker);
+int prefetch_release_trace(struct trace_marker end_marker);
+
+int prefetch_trace_fragment_size(struct trace_marker start_marker,
+				 struct trace_marker end_marker);
+
+int get_prefetch_trace_fragment(struct trace_marker start_marker,
+				struct trace_marker end_marker,
+				void **fragment_result,
+				int *fragment_size_result);
+
+void *alloc_trace_buffer(int len);
+void free_trace_buffer(void *buffer, int len);
+void sort_trace_fragment(void *trace, int trace_size);
+
+int prefetch_save_trace_between_markers(char *filename,
+					struct trace_marker start_marker,
+					struct trace_marker end_marker);
+int prefetch_save_trace_fragment(char *filename,
+				 void *trace_buffer, int trace_size);
+int prefetch_load_trace_fragment(char *filename,
+				 void **trace_buffer, int *trace_size);
+
+int prefetch_start_prefetch(void *trace, int trace_size, int async);
+int do_prefetch_from_file(char *filename);
+
+void print_marker(char *msg, struct trace_marker marker);
+
+/* Hook for mm page release code */
+#ifdef CONFIG_PREFETCH_CORE
+void prefetch_page_release_hook(struct page *page);
+#else
+#define prefetch_page_release_hook(param) do {} while (0)
+#endif
+
+struct proc_dir_entry;
+extern struct proc_dir_entry *prefetch_proc_dir;
+
+int param_match(char *line, char *param_name);
+int param_match_prefix(char *line, char *param_name);
+
+/* Auxiliary functions for reading and writing files in kernel */
+struct file *kernel_open(char const *file_name, int flags, int mode);
+int kernel_write(struct file *file, unsigned long offset, const char *addr,
+		 unsigned long count);
+/* NOTE: kernel_read is already available in the kernel */
+int kernel_close(struct file *file);
+
+/* App prefetching hooks */
+#ifdef CONFIG_PREFETCH_APP
+void prefetch_exec_hook(char *filename);
+void prefetch_exit_hook(pid_t pid);
+#else
+#define prefetch_exec_hook(param) do {} while (0)
+#define prefetch_exit_hook(param) do {} while (0)
+#endif
+
+#endif /* _LINUX_PREFETCH_CORE_H */
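The trace-marker calls above are easiest to see end to end. The boot and app modules below follow roughly this pattern; the sketch is illustrative only, not part of the patch - the trace path is made up and error handling is trimmed:

/* Illustrative sketch: record a workload and persist its trace. */
static int example_trace_and_save(void)
{
	struct trace_marker start, end;
	int ret;

	ret = prefetch_start_trace(&start);	/* begin recording referenced pages */
	if (ret < 0)
		return ret;

	/* ... the workload to be traced runs here ... */

	ret = prefetch_stop_trace(&end);	/* trace covers [start, end) */
	if (ret < 0)
		return ret;

	ret = prefetch_save_trace_between_markers("/.prefetch/example",
						  start, end);

	/* allow the core to reclaim the trace data up to @end */
	prefetch_release_trace(end);
	return ret;
}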
diff --git a/init/Kconfig b/init/Kconfig
index a9e99f8..df3d532 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -104,6 +104,38 @@ config SWAP
 	  for so called swap devices or swap files in your kernel that are
 	  used to provide more virtual memory than the actual RAM present
 	  in your computer.  If unsure say Y.
 
+config PREFETCH_CORE
+	bool "Prefetching support (core)"
+	default n
+	depends on MMU && BLOCK && EXPERIMENTAL
+	select TASK_DELAY_ACCT
+	help
+	  This option enables the core of the tracing and prefetching
+	  facility.  The core provides functions used by the actual
+	  prefetching modules, so you have to enable one of them as well.
+
+config PREFETCH_BOOT
+	tristate "Boot prefetching support"
+	default n
+	depends on PREFETCH_CORE && PROC_FS && EXPERIMENTAL
+	help
+	  This option enables a facility for tracing and prefetching during
+	  system boot.  In order to use it you have to install the
+	  appropriate prefetch init scripts.
+
+config PREFETCH_APP
+	bool "Application prefetching support"
+	default n
+	depends on PREFETCH_CORE && PROC_FS && EXPERIMENTAL
+	help
+	  This option enables a facility for tracing and prefetching during
+	  application start.  Upon application start, tracing begins; after a
+	  configurable time it is stopped and the trace is written to a file.
+	  Upon the next start, the files recorded in the saved trace are
+	  prefetched.
+
+config PREFETCH_DEBUG
+	bool "Prefetching debug interface and debugging facilities"
+	default n
+	depends on PREFETCH_CORE && PROC_FS
+	help
+	  This option enables facilities for testing and debugging the
+	  tracing and prefetching code.  Do not enable it on production
+	  systems.
+
 config SYSVIPC
 	bool "System V IPC"
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b888c2..c136765 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -44,6 +44,7 @@
 #include
 #include
 #include /* for audit_free() */
+#include <linux/prefetch_core.h>
 #include
 #include
 #include
@@ -864,6 +865,8 @@ fastcall NORET_TYPE void do_exit(long co
 	struct task_struct *tsk = current;
 	int group_dead;
 
+	prefetch_exit_hook(tsk->pid);
+
 	profile_task_exit(tsk);
 
 	WARN_ON(atomic_read(&tsk->fs_excl));
diff --git a/mm/Makefile b/mm/Makefile
index a9148ea..5433e6e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -31,4 +31,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_PREFETCH_CORE) += prefetch_core.o
+obj-$(CONFIG_PREFETCH_BOOT) += prefetch_boot.o
+obj-$(CONFIG_PREFETCH_APP) += prefetch_app.o
diff --git a/mm/filemap.c b/mm/filemap.c
index edb1b0b..405487c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include <linux/prefetch_core.h>
 #include "filemap.h"
 #include "internal.h"
@@ -115,7 +116,9 @@ generic_file_direct_IO(int rw, struct ki
 void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
-
+
+	prefetch_page_release_hook(page);
+
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
diff --git a/mm/prefetch_app.c b/mm/prefetch_app.c
new file mode 100644
index 0000000..b7f3d43
--- /dev/null
+++ b/mm/prefetch_app.c
@@ -0,0 +1,1071 @@
+/*
+ * linux/mm/prefetch_app.c
+ *
+ * Copyright (C) 2007 Krzysztof Lichota
+ *
+ * This is the application tracing and prefetching module.  It traces an
+ * application start for a specified time, then upon the next start it
+ * prefetches the traced files.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* Enables/disables whole functionality of the module */
+static int enabled = 1;
+module_param(enabled, bool, 0);
+MODULE_PARM_DESC(enabled,
+		 "Enables or disables whole app prefetching module functionality (tracing and prefetching)");
+
+static int initialized = 0;
+
+/* Controls whether prefetching should be done along with tracing. */
+static int prefetch_enabled = 1;
+module_param(prefetch_enabled, bool, 0);
+MODULE_PARM_DESC(prefetch_enabled,
+		 "Enables or disables prefetching during app start. If disabled, only tracing will be done");
+
+/* Size of shortened name; together with the hash it should be <= DNAME_INLINE_LEN_MIN */
+static int short_name_len = 10;
+module_param(short_name_len, int, 0);	/* a length, so int rather than bool */
+MODULE_PARM_DESC(short_name_len,
+		 "Length of shortened file name, used to name prefetch file together with hash of whole name");
+
+#define DEFAULT_APP_TRACE_FILENAME_TEMPLATE "/.prefetch/%s"
+static char *filename_template = DEFAULT_APP_TRACE_FILENAME_TEMPLATE;
+module_param(filename_template, charp, 0);
+MODULE_PARM_DESC(filename_template,
+		 "Template for application trace name, where trace will be saved and read from. %s will be replaced with name of application and hash.
The default is: " + DEFAULT_APP_TRACE_FILENAME_TEMPLATE); + +/*Size of hashtable for filenames*/ +static int filename_hashtable_size = 128; +module_param(filename_hashtable_size, uint, 0); +MODULE_PARM_DESC(filename_hashtable_size, "Size of hashtable for filenames"); + +/** + * Time (in seconds) after which app tracing is stopped. +*/ +static int tracing_timeout = 10; +module_param(tracing_timeout, uint, 0); +MODULE_PARM_DESC(tracing_timeout, + "Time (in seconds) after which app tracing is stopped"); + +/** + * IO ticks (in centisecs) threshold above which application will be traced and prefetching done. +*/ +static int tracing_ticks_threshold = 200; +module_param(tracing_ticks_threshold, uint, 0); +MODULE_PARM_DESC(tracing_ticks_threshold, + "IO ticks (in centisecs) threshold above which application will be traced and prefetching done"); + +/** + * Hashtable of apps names blacklisted from tracing/prefetching. + * If filename is on this list, it will not be traced. + * Protected by prefetch_apps_blacklist_mutex. +*/ +struct hlist_head *prefetch_apps_blacklist; +DEFINE_MUTEX(prefetch_apps_blacklist_mutex); + +/** + * Hashtable of apps names which should be traced/prefetched. + * If filename is on this list, it means it has been decided that tracing/prefetching + * should be done for it. + * This list is protected by prefetch_apps_list_mutex. +*/ +struct hlist_head *prefetch_apps_list; +DEFINE_MUTEX(prefetch_apps_list_mutex); + +/** + * Entry in filename hashtable list. +*/ +struct filename_entry { + struct hlist_node entries_list; + char *filename; +}; + +struct trace_job; + +/** + * Entry in traced pids hashtable list. +*/ +struct traced_pid_entry { + struct hlist_node entries_list; + pid_t pid; + struct trace_job *trace_job; +}; + +#define TRACED_HASH_SIZE 16 +/** + * Hashtable of concurrently traced applications. + * The key is pid. + * Protected by traced_pids_mutex. +*/ +struct hlist_head *traced_pids; + +DEFINE_MUTEX(traced_pids_mutex); + +/** + * Frees filename entry contents and entry itself. 
+*/ +void free_filename_entry(struct filename_entry *entry) +{ + kfree(entry->filename); + kfree(entry); +} + +void __clear_hashtable(struct hlist_head *list, int hashtable_size) +{ + struct filename_entry *entry; + struct hlist_node *cursor; + struct hlist_node *tmp; + int i; + + for (i = 0; i < hashtable_size; ++i) { + hlist_for_each_entry_safe(entry, cursor, tmp, &list[i], + entries_list) { + free_filename_entry(entry); + } + /* clear whole list at once */ + INIT_HLIST_HEAD(&list[i]); + } +} + +void clear_hashtable(struct hlist_head *list, int hashtable_size, + struct mutex *mutex) +{ + mutex_lock(mutex); + __clear_hashtable(list, hashtable_size); + mutex_unlock(mutex); +} + +int initialize_hashtable(struct hlist_head **list, int hashtable_size) +{ + struct hlist_head *h; + int i; + + h = kmalloc(sizeof(struct hlist_head) * hashtable_size, GFP_KERNEL); + if (h == NULL) + return -ENOMEM; + + for (i = 0; i < hashtable_size; ++i) { + INIT_HLIST_HEAD(&h[i]); + } + + *list = h; + return 0; +} + +u32 filename_hash(char *s) +{ + return crc32_le(0, s, strlen(s)); +} + +static inline unsigned filename_hashtable_index(char *filename) +{ + return filename_hash(filename) % filename_hashtable_size; +} + +/** + * Checks if filename @filename is in hashtable @list + */ +int filename_on_list(char *filename, struct hlist_head *list) +{ + struct filename_entry *entry; + struct hlist_node *cursor; + unsigned hashtable_index = filename_hashtable_index(filename); + + hlist_for_each_entry(entry, cursor, &list[hashtable_index], + entries_list) { + if (strcmp(entry->filename, filename) == 0) + return 1; + } + return 0; +} + +/** + * Adds filename @filename to hashtable @list + * Filename contents is copied. + * Proper mutex must be held. + */ +static int __add_filename_to_list(char *filename, struct hlist_head *list) +{ + int ret = 0; + struct filename_entry *entry = NULL; + unsigned hashtable_index = filename_hashtable_index(filename); + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (entry == NULL) { + ret = -ENOMEM; + goto out_error; + } + INIT_HLIST_NODE(&entry->entries_list); + + entry->filename = kstrdup(filename, GFP_KERNEL); + if (entry->filename == NULL) { + ret = -ENOMEM; + goto out_error; + } + + hlist_add_head(&entry->entries_list, &list[hashtable_index]); + + return ret; + + out_error: + if (entry != NULL) { + if (entry->filename != NULL) + kfree(entry->filename); + kfree(entry); + } + return ret; +} + +static int add_filename_to_list_unique(char *filename, struct hlist_head *list, + struct mutex *mutex) +{ + int ret = 0; + + mutex_lock(mutex); + if (!filename_on_list(filename, list)) + ret = __add_filename_to_list(filename, list); + mutex_unlock(mutex); + + return ret; +} + +/** + * Removes filename @filename from hashtable @list + * Frees filename entry and its contents. + * Returns true (non-zero) if entry was found and removed. + */ +int remove_filename_from_list(char *filename, struct hlist_head *list) +{ + struct filename_entry *entry; + struct hlist_node *cursor; + unsigned hashtable_index = filename_hashtable_index(filename); + + hlist_for_each_entry(entry, cursor, &list[hashtable_index], + entries_list) { + if (strcmp(entry->filename, filename) == 0) { + hlist_del(&entry->entries_list); + free_filename_entry(entry); + return 1; + } + } + return 0; +} + +static inline unsigned traced_pid_hash(pid_t pid) +{ + return pid % TRACED_HASH_SIZE; +} + +/** + * Adds pid @pid to traced pids with trace job @job. 
+ */ +int add_traced_pid(pid_t pid, struct trace_job *job, + struct hlist_head *hashtable) +{ + int ret = 0; + struct traced_pid_entry *entry = NULL; + unsigned hashtable_index = traced_pid_hash(pid); + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (entry == NULL) { + ret = -ENOMEM; + goto out_error; + } + INIT_HLIST_NODE(&entry->entries_list); + entry->trace_job = job; + entry->pid = pid; + + hlist_add_head(&entry->entries_list, &hashtable[hashtable_index]); + + return ret; + + out_error: + kfree(entry); + return ret; +} + +/** + * Removes trace job for pid @pid. + * Frees entry and its contents. + * Does not free job. + */ +int remove_traced_pid(pid_t pid, struct hlist_head *hashtable) +{ + struct traced_pid_entry *entry = NULL; + unsigned hashtable_index = traced_pid_hash(pid); + struct hlist_node *cursor; + + hlist_for_each_entry(entry, cursor, &hashtable[hashtable_index], + entries_list) { + if (entry->pid == pid) { + hlist_del(&entry->entries_list); + kfree(entry); + return 1; + } + } + return 0; +} + +struct traced_pid_entry *find_traced_pid(pid_t pid, + struct hlist_head *hashtable) +{ + struct traced_pid_entry *entry = NULL; + unsigned hashtable_index = traced_pid_hash(pid); + struct hlist_node *cursor; + + hlist_for_each_entry(entry, cursor, &hashtable[hashtable_index], + entries_list) { + if (entry->pid == pid) + return entry; + } + return NULL; +} + +/** + Structure describing tracing or monitoring job. +*/ +struct trace_job { + struct delayed_work work; + char *filename; + pid_t pid; + struct trace_marker start_marker; +}; + +char *create_trace_filename(char *filename) +{ + char *basename = NULL; + u32 hash; + int filename_len = strlen(filename); + char *file_name = NULL; + char *short_name = NULL; + char *slash_pos; + + hash = crc32_le(0, filename, filename_len); + + slash_pos = strrchr(filename, '/'); + if (slash_pos == NULL) { + printk(KERN_WARNING "File name does not contain slash\n"); + goto out; + } + + basename = kmalloc(short_name_len + 1, GFP_KERNEL); + + if (basename == NULL) { + printk(KERN_WARNING "Cannot allocate memory for basename\n"); + goto out; + } + strncpy(basename, slash_pos + 1, short_name_len); + basename[short_name_len] = '\0'; + + file_name = kasprintf(GFP_KERNEL, "%s-%x", basename, hash); + if (file_name == NULL) { + printk(KERN_WARNING "Cannot allocate memory for file name\n"); + goto out; + } + + short_name = kasprintf(GFP_KERNEL, filename_template, file_name); + if (short_name == NULL) { + printk(KERN_WARNING "Cannot allocate memory for short name\n"); + goto out; + } + + out: + if (file_name != NULL) + kfree(file_name); + if (basename != NULL) + kfree(basename); + return short_name; +} + +static void do_finish_monitoring(struct trace_job *trace_job) +{ + struct task_struct *process = NULL; + int ticks = -1; + + read_lock(&tasklist_lock); + process = find_task_by_pid(trace_job->pid); + if (process != NULL) + ticks = delayacct_blkio_ticks(process); + read_unlock(&tasklist_lock); + + if (ticks == -1) { + /* Process was terminated earlier than our timeout, stopping monitoring was handled by exit hook */ + goto out; + } + + if (ticks > tracing_ticks_threshold) { + /* Add app to tracing list if it does not appear there yet */ +#ifdef CONFIG_PREFETCH_DEBUG + printk(KERN_INFO + "Application %s qualifies for prefetching, ticks=%d\n", + trace_job->filename, ticks); +#endif + mutex_lock(&prefetch_apps_list_mutex); + if (!filename_on_list(trace_job->filename, prefetch_apps_list)) { + __add_filename_to_list(trace_job->filename, + prefetch_apps_list); 
+#ifdef CONFIG_PREFETCH_DEBUG
+			printk(KERN_INFO
+			       "Added application %s to prefetching list\n",
+			       trace_job->filename);
+#endif
+		}
+		mutex_unlock(&prefetch_apps_list_mutex);
+	} else {
+		/* App does not require prefetching; remove it from the tracing list if it is there */
+		mutex_lock(&prefetch_apps_list_mutex);
+		remove_filename_from_list(trace_job->filename,
+					  prefetch_apps_list);
+		mutex_unlock(&prefetch_apps_list_mutex);
+	}
+ out:
+	return;
+}
+
+static void finish_trace_job(struct trace_job *trace_job)
+{
+	mutex_lock(&traced_pids_mutex);
+	if (!remove_traced_pid(trace_job->pid, traced_pids))
+		printk(KERN_WARNING
+		       "Did not remove pid %d from traced pids, inconsistency in pids handling, filename for job=%s\n",
+		       trace_job->pid, trace_job->filename);
+	mutex_unlock(&traced_pids_mutex);
+
+	kfree(trace_job->filename);
+	kfree(trace_job);
+}
+
+static void finish_monitoring(struct work_struct *work)
+{
+	struct trace_job *trace_job =
+	    container_of(container_of(work, struct delayed_work, work),
+			 struct trace_job, work);
+	do_finish_monitoring(trace_job);
+	finish_trace_job(trace_job);
+}
+
+static void finish_tracing(struct work_struct *work)
+{
+	struct trace_marker end_marker;
+	void *trace_fragment = NULL;
+	int trace_fragment_size = 0;
+	int ret;
+	struct trace_job *trace_job =
+	    container_of(container_of(work, struct delayed_work, work),
+			 struct trace_job, work);
+	char *trace_filename = NULL;
+
+	do_finish_monitoring(trace_job);
+
+	ret = prefetch_stop_trace(&end_marker);
+
+	if (ret < 0) {
+		printk(KERN_WARNING "Failed to stop trace for application %s\n",
+		       trace_job->filename);
+		/* at least release as much of the trace as we can */
+		end_marker = trace_job->start_marker;
+		goto out_release;
+	}
+
+	ret = get_prefetch_trace_fragment(trace_job->start_marker,
+					  end_marker,
+					  &trace_fragment,
+					  &trace_fragment_size);
+	if (ret < 0) {
+		printk(KERN_WARNING
+		       "Failed to fetch trace fragment for application %s, error=%d\n",
+		       trace_job->filename, ret);
+		goto out_release;
+	}
+
+	if (trace_fragment_size <= 0) {
+		printk(KERN_WARNING "Empty trace for application %s\n",
+		       trace_job->filename);
+		goto out_release;
+	}
+
+	trace_filename = create_trace_filename(trace_job->filename);
+	if (trace_filename == NULL) {
+		printk(KERN_WARNING
+		       "Cannot allocate memory for short filename, trace for application %s not saved\n",
+		       trace_job->filename);
+		goto out_free_release;
+	}
+
+	sort_trace_fragment(trace_fragment, trace_fragment_size);
+	/*
+	 * NOTE: a race between saving and loading the trace is possible, but
+	 * at worst it makes reading the prefetch file fail or prefetching
+	 * less efficient.
+	 */
+	ret = prefetch_save_trace_fragment(trace_filename, trace_fragment,
+					   trace_fragment_size);
+	if (ret < 0) {
+		printk(KERN_WARNING
+		       "Failed to save trace for application %s to file %s, error=%d\n",
+		       trace_job->filename, trace_filename, ret);
+		goto out_free_release;
+	}
+
+ out_free_release:
+	free_trace_buffer(trace_fragment, trace_fragment_size);
+
+ out_release:
+	ret = prefetch_release_trace(end_marker);
+	if (ret < 0)
+		printk(KERN_WARNING
+		       "Releasing trace for app tracing returned error, error=%d\n",
+		       ret);
+	if (trace_filename != NULL)
+		kfree(trace_filename);
+	finish_trace_job(trace_job);
+}
+
+static int start_tracing_job(char *filename)
+{
+	int ret = 0;
+	struct trace_job *trace_job;
+
+	trace_job = kzalloc(sizeof(*trace_job), GFP_KERNEL);
+
+	if (trace_job == NULL) {
+		printk(KERN_WARNING
+		       "Cannot allocate memory to start tracing for app %s\n",
+		       filename);
+		ret = -ENOMEM;
+		goto out_error;
+	}
+
+	trace_job->filename = kstrdup(filename, GFP_KERNEL);
+
+	if (trace_job->filename == NULL) {
+		printk(KERN_WARNING
+		       "Cannot allocate memory for filename to start tracing for app %s\n",
+		       filename);
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	ret = prefetch_start_trace(&trace_job->start_marker);
+	if (ret < 0) {
+		printk(KERN_WARNING "Failed to start tracing for app %s\n",
+		       filename);
+		goto out_free;
+	}
+
+	trace_job->pid = current->pid;
+
+	mutex_lock(&traced_pids_mutex);
+	add_traced_pid(trace_job->pid, trace_job, traced_pids);
+	mutex_unlock(&traced_pids_mutex);
+
+	INIT_DELAYED_WORK(&trace_job->work, finish_tracing);
+	schedule_delayed_work(&trace_job->work, HZ * tracing_timeout);
+
+#ifdef CONFIG_PREFETCH_DEBUG
+	printk(KERN_INFO "Successfully started tracing for application %s\n",
+	       filename);
+#endif
+
+	return 0;
+
+ out_free:
+	if (trace_job != NULL) {
+		if (trace_job->filename != NULL)
+			kfree(trace_job->filename);
+		kfree(trace_job);
+	}
+ out_error:
+	return ret;
+}
+
+static int start_monitoring_job(char *filename)
+{
+	int ret = 0;
+	struct trace_job *trace_job;
+
+	trace_job = kzalloc(sizeof(*trace_job), GFP_KERNEL);
+
+	if (trace_job == NULL) {
+		printk(KERN_WARNING
+		       "Cannot allocate memory to start monitoring for app %s\n",
+		       filename);
+		ret = -ENOMEM;
+		goto out_error;
+	}
+
+	trace_job->filename = kstrdup(filename, GFP_KERNEL);
+
+	if (trace_job->filename == NULL) {
+		printk(KERN_WARNING
+		       "Cannot allocate memory for filename to start monitoring for app %s\n",
+		       filename);
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	trace_job->pid = current->pid;
+
+	mutex_lock(&traced_pids_mutex);
+	add_traced_pid(trace_job->pid, trace_job, traced_pids);
+	mutex_unlock(&traced_pids_mutex);
+
+	INIT_DELAYED_WORK(&trace_job->work, finish_monitoring);
+	schedule_delayed_work(&trace_job->work, HZ * tracing_timeout);
+
+	return 0;
+
+ out_free:
+	if (trace_job != NULL) {
+		if (trace_job->filename != NULL)
+			kfree(trace_job->filename);
+		kfree(trace_job);
+	}
+ out_error:
+	return ret;
+}
+
+int start_app_prefetch(char *filename)
+{
+	char *trace_filename = NULL;
+	int ret = 0;
+
+	trace_filename = create_trace_filename(filename);
+	if (trace_filename == NULL) {
+		printk(KERN_WARNING
+		       "Cannot allocate memory for short filename, cannot start prefetching for application %s\n",
+		       filename);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = do_prefetch_from_file(trace_filename);
+	if (ret < 0) {
+		printk(KERN_WARNING
+		       "Failed to start prefetching for application %s, error=%d\n",
+		       filename, ret);
+		goto out_free;
+	}
+
+ out_free:
+	kfree(trace_filename);
+
+ out:
+	return ret;
+} + +void try_app_prefetch(char *filename) +{ + int app_on_list; + + if (!enabled) + return; + + mutex_lock(&prefetch_apps_blacklist_mutex); + if (filename_on_list(filename, prefetch_apps_blacklist)) { +#ifdef CONFIG_PREFETCH_DEBUG + printk(KERN_INFO + "Not doing tracing nor prefetching for blacklisted file %s\n", + filename); +#endif + mutex_unlock(&prefetch_apps_blacklist_mutex); + return; + } + mutex_unlock(&prefetch_apps_blacklist_mutex); + + mutex_lock(&prefetch_apps_list_mutex); + app_on_list = filename_on_list(filename, prefetch_apps_list); + mutex_unlock(&prefetch_apps_list_mutex); + + if (app_on_list) { + /* Start tracing and schedule end tracing work */ + start_tracing_job(filename); + + if (prefetch_enabled) { + start_app_prefetch(filename); + } + } else { + start_monitoring_job(filename); + } +} + +void prefetch_exec_hook(char *filename) +{ + try_app_prefetch(filename); +} + +/** + Prefetch hook for intercepting exit() of process. +*/ +void prefetch_exit_hook(pid_t pid) +{ + struct traced_pid_entry *entry = NULL; + if (!initialized || !enabled) + return; + + mutex_lock(&traced_pids_mutex); + entry = find_traced_pid(pid, traced_pids); + if (entry != NULL) + do_finish_monitoring(entry->trace_job); + mutex_unlock(&traced_pids_mutex); + /*NOTE: job is not cancelled, it will wake up and clean up after itself */ +} + +#define PREFETCH_PATH_MAX 512 +#define PREFETCH_PATH_MAX_S "512" + +ssize_t app_prefetch_proc_write(struct file *proc_file, + const char __user * buffer, size_t count, + loff_t * ppos) +{ + char *name; + int e = 0; + int tmp; + int r; + char *s = NULL; + + if (count >= PATH_MAX) + return -ENAMETOOLONG; + + name = kmalloc(count + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + + if (copy_from_user(name, buffer, count)) { + e = -EFAULT; + goto out; + } + + /* strip the optional newline */ + if (count && name[count - 1] == '\n') + name[count - 1] = '\0'; + else + name[count] = '\0'; + + if (param_match(name, "prefetch enable")) { + printk(KERN_INFO "Prefetching for apps enabled\n"); + prefetch_enabled = 1; + goto out; + } + + if (param_match(name, "prefetch disable")) { + printk(KERN_INFO "Prefetching for apps disabled\n"); + prefetch_enabled = 0; + goto out; + } + + if (param_match(name, "enable")) { + printk(KERN_INFO "App prefetching module enabled\n"); + enabled = 1; + goto out; + } + + if (param_match(name, "disable")) { + printk(KERN_INFO "App prefetching module disabled\n"); + enabled = 0; + goto out; + } + + if (param_match_prefix(name, "set tracing timeout")) { + r = sscanf(name, "set tracing timeout %d", &tmp); + if (r != 1) { + e = -EINVAL; + printk(KERN_WARNING + "Wrong parameter to set tracing timeout command, command was: %s\n", + name); + goto out; + } + if (tmp <= 0) { + e = -EINVAL; + printk(KERN_WARNING + "Wrong timeout specified, must be >0, timeout was: %d\n", + tmp); + goto out; + } + tracing_timeout = tmp; + printk(KERN_INFO "Set tracing timeout to %d seconds\n", + tracing_timeout); + goto out; + } + + if (param_match(name, "clear app-list")) { + clear_hashtable(prefetch_apps_list, filename_hashtable_size, + &prefetch_apps_list_mutex); + printk(KERN_INFO "List of traced applications cleared\n"); + goto out; + } + + if (param_match_prefix(name, "add app-list")) { + s = kzalloc(PREFETCH_PATH_MAX + 1, GFP_KERNEL); + if (s == NULL) { + printk(KERN_WARNING + "Cannot allocate memory for path\n"); + e = -ENOMEM; + goto out; + } + r = sscanf(name, "add app-list %" PREFETCH_PATH_MAX_S "s", s); + if (r != 1) { + e = -EINVAL; + printk(KERN_WARNING + "Wrong 
parameter to add app-list command, command was: %s\n",
+			       name);
+		} else {
+			e = add_filename_to_list_unique(s, prefetch_apps_list,
+							&prefetch_apps_list_mutex);
+			if (e < 0)
+				printk(KERN_WARNING
+				       "Failed to add application %s to prefetched applications list, error=%d\n",
+				       s, e);
+		}
+		kfree(s);
+		goto out;
+	}
+
+	if (param_match(name, "clear app-blacklist")) {
+		clear_hashtable(prefetch_apps_blacklist,
+				filename_hashtable_size,
+				&prefetch_apps_blacklist_mutex);
+		printk(KERN_INFO "Blacklist of traced applications cleared\n");
+		goto out;
+	}
+
+	if (param_match_prefix(name, "add app-blacklist")) {
+		s = kzalloc(PREFETCH_PATH_MAX + 1, GFP_KERNEL);
+		if (s == NULL) {
+			printk(KERN_WARNING
+			       "Cannot allocate memory for path\n");
+			e = -ENOMEM;
+			goto out;
+		}
+
+		/* bounded like the app-list case above, to avoid overflowing s */
+		r = sscanf(name, "add app-blacklist %" PREFETCH_PATH_MAX_S "s",
+			   s);
+		if (r != 1) {
+			e = -EINVAL;
+			printk(KERN_WARNING
+			       "Wrong parameter to add app-blacklist command, command was: %s\n",
+			       name);
+		} else {
+			e = add_filename_to_list_unique(s,
+							prefetch_apps_blacklist,
+							&prefetch_apps_blacklist_mutex);
+			if (e < 0)
+				printk(KERN_WARNING
+				       "Failed to add application %s to blacklisted applications list, error=%d\n",
+				       s, e);
+		}
+		kfree(s);
+		goto out;
+	}
+ out:
+	kfree(name);
+
+	return e ? e : count;
+}
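The handler above is the whole control protocol of the app module. A small illustrative userspace sketch of driving it follows; the /proc/prefetch/app path assumes the core registers prefetch_proc_dir as /proc/prefetch, and the blacklisted binary is just an example:

#include <stdio.h>

/* Illustrative only: one command per write(), since the handler
 * parses a single command per invocation. */
static int prefetch_app_cmd(const char *cmd)
{
	FILE *f = fopen("/proc/prefetch/app", "w");	/* assumed path */
	int ret = 0;

	if (f == NULL)
		return -1;
	if (fprintf(f, "%s\n", cmd) < 0)
		ret = -1;
	fclose(f);
	return ret;
}

int main(void)
{
	prefetch_app_cmd("set tracing timeout 15");
	prefetch_app_cmd("add app-blacklist /usr/bin/updatedb");
	return 0;
}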
+void seq_print_filename_list(struct seq_file *m, struct hlist_head *list)
+{
+	struct filename_entry *entry;
+	struct hlist_node *cursor;
+	int i;
+
+	for (i = 0; i < filename_hashtable_size; ++i) {
+		hlist_for_each_entry(entry, cursor, &list[i], entries_list) {
+			seq_printf(m, "%s\n", entry->filename);
+		}
+	}
+}
+
+static void *app_prefetch_proc_start(struct seq_file *m, loff_t * pos)
+{
+	if (*pos != 0)
+		return NULL;
+
+	return &tracing_ticks_threshold;	/* any pointer, must just not be NULL */
+}
+
+static void *app_prefetch_proc_next(struct seq_file *m, void *v, loff_t * pos)
+{
+	return NULL;
+}
+
+static int app_prefetch_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "### Traced applications: ###\n");
+	mutex_lock(&prefetch_apps_list_mutex);
+	seq_print_filename_list(m, prefetch_apps_list);
+	mutex_unlock(&prefetch_apps_list_mutex);
+
+	seq_printf(m, "### Blacklisted applications: ###\n");
+	mutex_lock(&prefetch_apps_blacklist_mutex);
+	seq_print_filename_list(m, prefetch_apps_blacklist);
+	mutex_unlock(&prefetch_apps_blacklist_mutex);
+
+	return 0;
+}
+
+static void app_prefetch_proc_stop(struct seq_file *m, void *v)
+{
+}
+
+struct seq_operations seq_app_prefetch_op = {
+	.start = app_prefetch_proc_start,
+	.next = app_prefetch_proc_next,
+	.stop = app_prefetch_proc_stop,
+	.show = app_prefetch_proc_show,
+};
+
+static int app_prefetch_proc_open(struct inode *inode, struct file *proc_file)
+{
+	return seq_open(proc_file, &seq_app_prefetch_op);
+}
+
+static int app_prefetch_proc_release(struct inode *inode,
+				     struct file *proc_file)
+{
+	return seq_release(inode, proc_file);
+}
+
+static struct file_operations proc_app_prefetch_fops = {
+	.owner = THIS_MODULE,
+	.open = app_prefetch_proc_open,
+	.release = app_prefetch_proc_release,
+	.write = app_prefetch_proc_write,
+	.read = seq_read,
+	.llseek = seq_lseek,
+};
+
+static int app_list_show(struct seq_file *m, void *v)
+{
+	mutex_lock(&prefetch_apps_list_mutex);
+	seq_print_filename_list(m, prefetch_apps_list);
+	mutex_unlock(&prefetch_apps_list_mutex);
+
+	return 0;
+}
+
+static int app_list_open(struct inode *inode, struct file *proc_file)
+{
+	return single_open(proc_file, app_list_show, NULL);
+}
+
+static int app_list_release_generic(struct inode *inode,
+				    struct file *proc_file)
+{
+	return single_release(inode, proc_file);
+}
+
+static struct file_operations proc_app_list_fops = {
+	.owner = THIS_MODULE,
+	.open = app_list_open,
+	.release = app_list_release_generic,
+	.read = seq_read,
+	.llseek = seq_lseek,
+};
+
+static int app_blacklist_show(struct seq_file *m, void *v)
+{
+	mutex_lock(&prefetch_apps_blacklist_mutex);
+	seq_print_filename_list(m, prefetch_apps_blacklist);
+	mutex_unlock(&prefetch_apps_blacklist_mutex);
+
+	return 0;
+}
+
+static int app_blacklist_open(struct inode *inode, struct file *proc_file)
+{
+	return single_open(proc_file, app_blacklist_show, NULL);
+}
+
+static struct file_operations proc_app_blacklist_fops = {
+	.owner = THIS_MODULE,
+	.open = app_blacklist_open,
+	.release = app_list_release_generic,
+	.read = seq_read,
+	.llseek = seq_lseek,
+};
+
+static __init int app_prefetch_init(void)
+{
+	struct proc_dir_entry *entry;
+	int ret;
+
+	/* Initialize hashtables */
+	ret = initialize_hashtable(&prefetch_apps_blacklist,
+				   filename_hashtable_size);
+	if (ret < 0) {
+		printk(KERN_WARNING
+		       "Cannot initialize app blacklist hashtable, error=%d\n",
+		       ret);
+		goto out_error;
+	}
+
+	ret = initialize_hashtable(&prefetch_apps_list,
+				   filename_hashtable_size);
+	if (ret < 0) {
+		printk(KERN_WARNING
+		       "Cannot initialize app hashtable, error=%d\n", ret);
+		goto out_error;
+	}
+
+	ret = initialize_hashtable(&traced_pids, TRACED_HASH_SIZE);
+	if (ret < 0) {
+		printk(KERN_WARNING
+		       "Cannot initialize traced pids hashtable, error=%d\n",
+		       ret);
+		goto out_error;
+	}
+
+	if (prefetch_proc_dir == NULL) {
+		printk(KERN_WARNING
+		       "Prefetch proc directory not present, proc interface for app prefetching will not be available\n");
+	} else {
+		entry = create_proc_entry("app", 0600, prefetch_proc_dir);
+		if (entry)
+			entry->proc_fops = &proc_app_prefetch_fops;
+		entry = create_proc_entry("app-list", 0600, prefetch_proc_dir);
+		if (entry)
+			entry->proc_fops = &proc_app_list_fops;
+		entry = create_proc_entry("app-blacklist", 0600,
+					  prefetch_proc_dir);
+		if (entry)
+			entry->proc_fops = &proc_app_blacklist_fops;
+	}
+
+	printk(KERN_INFO
+	       "App prefetching module started, enabled=%d, prefetching=%d\n",
+	       enabled, prefetch_enabled);
+
+	initialized = 1;
+
+	return 0;
+
+ out_error:
+	return ret;
+}
+
+static void app_prefetch_exit(void)
+{
+	/* remove all proc entries registered in app_prefetch_init() */
+	if (prefetch_proc_dir != NULL) {
+		remove_proc_entry("app-blacklist", prefetch_proc_dir);
+		remove_proc_entry("app-list", prefetch_proc_dir);
+		remove_proc_entry("app", prefetch_proc_dir);
+	}
+}
+
+MODULE_AUTHOR("Krzysztof Lichota ");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Application tracing and prefetching during startup");
+
+module_init(app_prefetch_init);
+module_exit(app_prefetch_exit);
diff --git a/mm/prefetch_boot.c b/mm/prefetch_boot.c
new file mode 100644
index 0000000..da7f89b
--- /dev/null
+++ b/mm/prefetch_boot.c
@@ -0,0 +1,396 @@
+/*
+ * linux/mm/prefetch_boot.c
+ *
+ * Copyright (C) 2007 Krzysztof Lichota
+ *
+ * This is the boot prefetch support implementation.
+ * It consists mainly of the proc interface; the rest is done by init
+ * scripts using that interface.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */ + +#include +#include +#include +#include +#include +#include +#include + +/*************** Boot tracing **************/ +#define DEFAULT_BOOT_TRACE_FILENAME_TEMPLATE "/.prefetch-boot-trace.%s" +static char *filename_template = DEFAULT_BOOT_TRACE_FILENAME_TEMPLATE; +module_param(filename_template, charp, 0); +MODULE_PARM_DESC(filename_template, + "Template for boot trace name, where trace will be saved and read from. %s will be replaced with name of phase. The default is: " + DEFAULT_BOOT_TRACE_FILENAME_TEMPLATE); + +/*maximum size of phase name, not including trailing NULL*/ +#define PHASE_NAME_MAX 10 +/*maximum size as string, keep in sync with PHASE_NAME_MAX*/ +#define PHASE_NAME_MAX_S "10" + +/*maximum size of command name, not including trailing NULL*/ +#define CMD_NAME_MAX 10 +/*maximum size as string, keep in sync with CMD_NAME_MAX*/ +#define CMD_NAME_MAX_S "10" + +/*Enables/disables whole functionality of the module*/ +static int enabled = 1; +module_param(enabled, bool, 0); +MODULE_PARM_DESC(enabled, + "Enables or disables whole boot prefetching module functionality (tracing and prefetching)"); + +/*Controls whether prefetching should be done along with tracing.*/ +static int prefetch_enabled = 1; +module_param(prefetch_enabled, bool, 0); +MODULE_PARM_DESC(prefetch_enabled, + "Enables or disables prefetching during boot. If disabled, only tracing will be done"); + +static struct mutex boot_prefetch_mutex; +/** + * Phase start marker, protected by boot_prefetch_mutex. +*/ +static struct trace_marker boot_start_marker; +static char boot_tracing_phase[PHASE_NAME_MAX + 1] = "init"; +static int boot_tracing_running = 0; + +/** + Saves boot trace fragment for phase @phase_name which + starts at boot_start_marker and ends at @end_phase_marker. + + boot_prefetch_mutex must be held while calling this function. +*/ +static int prefetch_save_boot_trace(char *phase_name, + struct trace_marker end_phase_marker) +{ + char *boot_trace_filename = NULL; + int ret = 0; + + boot_trace_filename = kasprintf(GFP_KERNEL, filename_template, + phase_name); + + if (boot_trace_filename == NULL) { + printk(KERN_WARNING + "Cannot allocate memory for trace filename in phase %s\n", + phase_name); + ret = -ENOMEM; + goto out; + } + ret = prefetch_save_trace_between_markers(boot_trace_filename, + boot_start_marker, + end_phase_marker); + out: + if (boot_trace_filename != NULL) + kfree(boot_trace_filename); + return ret; +} + +/** + Starts tracing for given boot phase. + boot_prefetch_mutex is taken by this function. 
+*/
+int prefetch_start_boot_tracing_phase(char *phase_name)
+{
+	int r;
+	int ret = 0;
+	struct trace_marker marker;
+
+	mutex_lock(&boot_prefetch_mutex);
+
+	if (boot_tracing_running) {
+		/* boot tracing was already running */
+		ret = prefetch_continue_trace(&marker);
+		if (ret < 0) {
+			printk(KERN_WARNING
+			       "Cannot continue tracing, error=%d\n", ret);
+			goto out_unlock;
+		}
+
+		r = prefetch_save_boot_trace(boot_tracing_phase, marker);
+		if (r < 0)
+			/* NOTE: just warn and continue, prefetching might still succeed and the phase has been started */
+			printk(KERN_WARNING
+			       "Saving boot trace failed, phase %s, error=%d\n",
+			       boot_tracing_phase, r);
+
+		boot_start_marker = marker;
+	} else {
+		/* first phase of tracing */
+		ret = prefetch_start_trace(&boot_start_marker);
+		if (ret < 0) {
+			printk(KERN_WARNING "Cannot start tracing, error=%d\n",
+			       ret);
+			goto out_unlock;
+		}
+	}
+
+	strncpy(boot_tracing_phase, phase_name, PHASE_NAME_MAX);
+	boot_tracing_phase[PHASE_NAME_MAX] = 0;
+
+	boot_tracing_running = 1;
+
+#ifdef CONFIG_PREFETCH_DEBUG
+	/* cmd_name is not in scope here, so log the phase only */
+	printk(KERN_INFO "Boot tracing phase %s started\n", phase_name);
+	print_marker("Marker: ", boot_start_marker);
+#endif
+ out_unlock:
+	mutex_unlock(&boot_prefetch_mutex);
+	return ret;
+}
+
+int prefetch_start_boot_prefetching_phase(char *phase_name)
+{
+	char *boot_trace_filename = NULL;
+	int ret = 0;
+
+	if (!prefetch_enabled) {
+		printk(KERN_INFO
+		       "Prefetching disabled, not starting prefetching for boot phase: %s\n",
+		       phase_name);
+		return 0;
+	}
+
+	boot_trace_filename = kasprintf(GFP_KERNEL, filename_template,
+					phase_name);
+
+	if (boot_trace_filename == NULL) {
+		printk(KERN_WARNING
+		       "Cannot allocate memory for trace filename\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	printk(KERN_INFO "Starting prefetching for boot phase: %s\n",
+	       phase_name);
+	ret = do_prefetch_from_file(boot_trace_filename);
+
+	if (ret < 0)
+		printk(KERN_WARNING
+		       "Failed to prefetch trace from file %s, error=%d\n",
+		       boot_trace_filename, ret);
+
+ out:
+	if (boot_trace_filename != NULL)
+		kfree(boot_trace_filename);
+
+	return ret;
+}
+
+/**
+  Starts the next phase of boot.
+  Starts tracing.  Then, if a trace is available, loads it and starts
+  prefetching.
+  @cmd_name is the name of the action; if you want to keep its contents,
+  copy it somewhere, as it will be deallocated.
+  @phase_name is the name of the new phase; if you want to keep its contents,
+  copy it somewhere, as it will be deallocated.
+*/
+static int prefetch_start_boot_phase(char *cmd_name, char *phase_name)
+{
+	int ret = 0;
+	int start_prefetching = 0;
+	int start_tracing = 0;
+
+	if (strcmp(cmd_name, "prefetch") == 0)
+		start_prefetching = 1;
+	else if (strcmp(cmd_name, "trace") == 0)
+		start_tracing = 1;
+	else if (strcmp(cmd_name, "both") == 0) {
+		start_prefetching = 1;
+		start_tracing = 1;
+	} else {
+		printk(KERN_WARNING
+		       "Boot prefetch: unknown command: %s for phase %s\n",
+		       cmd_name, phase_name);
+		return -EINVAL;
+	}
+
+	if (start_tracing)
+		prefetch_start_boot_tracing_phase(phase_name);
+
+	if (start_prefetching)
+		ret = prefetch_start_boot_prefetching_phase(phase_name);
+
+	return ret;
+}
+
+static int prefetch_stop_boot_tracing(void)
+{
+	struct trace_marker marker;
+	int ret = 0;
+
+	printk(KERN_INFO "Stopping boot tracing and prefetching\n");
+
+	mutex_lock(&boot_prefetch_mutex);
+
+	if (!boot_tracing_running) {
+		printk(KERN_WARNING
+		       "Trying to stop boot tracing although tracing is not running\n");
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	ret = prefetch_stop_trace(&marker);
+	if (ret < 0)
+		printk(KERN_WARNING
+		       "Stopping tracing for boot tracing returned error, error=%d\n",
+		       ret);
+
+	boot_tracing_running = 0;
+
+#ifdef CONFIG_PREFETCH_DEBUG
+	print_marker("Boot stop marker: ", marker);
+#endif
+
+	ret = prefetch_save_boot_trace(boot_tracing_phase, marker);
+	if (ret < 0) {
+		printk(KERN_WARNING
+		       "Saving final boot trace failed, phase %s, error=%d\n",
+		       boot_tracing_phase, ret);
+		goto out_unlock_release;
+	}
+
+ out_unlock_release:
+	ret = prefetch_release_trace(marker);
+	if (ret < 0)
+		printk(KERN_WARNING
+		       "Releasing trace for boot tracing returned error, error=%d\n",
+		       ret);
+
+ out_unlock:
+	mutex_unlock(&boot_prefetch_mutex);
+	return ret;
+}
+
+ssize_t boot_prefetch_proc_write(struct file * proc_file,
+				 const char __user * buffer, size_t count,
+				 loff_t * ppos)
+{
+	char *name;
+	int e = 0;
+	int r;
+	char *phase_name;
+	char *cmd_name;
+
+	if (count >= PATH_MAX)
+		return -ENAMETOOLONG;
+
+	name = kmalloc(count + 1, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	if (copy_from_user(name, buffer, count)) {
+		e = -EFAULT;
+		goto out;
+	}
+
+	/* strip the optional newline */
+	if (count && name[count - 1] == '\n')
+		name[count - 1] = '\0';
+	else
+		name[count] = '\0';
+
+	if (param_match(name, "prefetch enable")) {
+		printk(KERN_INFO "Prefetching enabled\n");
+		prefetch_enabled = 1;
+		goto out;
+	}
+
+	if (param_match(name, "prefetch disable")) {
+		printk(KERN_INFO "Prefetching disabled\n");
+		prefetch_enabled = 0;
+		goto out;
+	}
+
+	if (param_match_prefix(name, "start ")) {
+		phase_name = kzalloc(PHASE_NAME_MAX + 1, GFP_KERNEL);	/* 1 for terminating NUL */
+		if (phase_name == NULL) {
+			printk(KERN_WARNING
+			       "Cannot allocate memory for phase name\n");
+			e = -ENOMEM;
+			goto out;
+		}
+		cmd_name = kzalloc(CMD_NAME_MAX + 1, GFP_KERNEL);	/* 1 for terminating NUL */
+		if (cmd_name == NULL) {
+			printk(KERN_WARNING
+			       "Cannot allocate memory for command name\n");
+			kfree(phase_name);
+			e = -ENOMEM;
+			goto out;
+		}
+		r = sscanf(name,
+			   "start %" CMD_NAME_MAX_S "s phase %" PHASE_NAME_MAX_S
+			   "s", cmd_name, phase_name);
+		if (r != 2) {
+			e = -EINVAL;
+			printk(KERN_WARNING
+			       "Wrong parameter to start command, command was: %s\n",
+			       name);
+			kfree(phase_name);
+			kfree(cmd_name);
+			goto out;
+		}
+		e = prefetch_start_boot_phase(cmd_name, phase_name);
+		kfree(phase_name);
+		kfree(cmd_name);
+		goto out;
+	}
+
+	if (param_match(name, "boot tracing stop")) {
+		e = prefetch_stop_boot_tracing();
+		goto out;
+	}
+ out:
+	kfree(name);
+
+	return e ? e : count;
+}
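The handler above is the whole boot control protocol. An illustrative userspace sketch of what the init scripts effectively do; the /proc/prefetch/boot path assumes the core registers prefetch_proc_dir as /proc/prefetch, and phase names are capped at PHASE_NAME_MAX characters:

#include <stdio.h>

/* Illustrative only: one command per write(). */
static void prefetch_boot_cmd(const char *cmd)
{
	FILE *f = fopen("/proc/prefetch/boot", "w");	/* assumed path */

	if (f != NULL) {
		fprintf(f, "%s\n", cmd);
		fclose(f);
	}
}

int main(void)
{
	/* trace and prefetch the "init" phase, then later end tracing */
	prefetch_boot_cmd("start both phase init");
	prefetch_boot_cmd("boot tracing stop");
	return 0;
}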
+
+static int boot_prefetch_proc_open(struct inode *inode, struct file *proc_file)
+{
+	return 0;
+}
+
+static int boot_prefetch_proc_release(struct inode *inode,
+				      struct file *proc_file)
+{
+	return 0;
+}
+
+static struct file_operations proc_boot_prefetch_fops = {
+	.owner = THIS_MODULE,
+	.open = boot_prefetch_proc_open,
+	.release = boot_prefetch_proc_release,
+	.write = boot_prefetch_proc_write,
+};
+
+static __init int boot_prefetch_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	mutex_init(&boot_prefetch_mutex);
+
+	if (prefetch_proc_dir == NULL) {
+		printk(KERN_WARNING
+		       "Prefetch proc directory not present, proc interface for boot prefetching will not be available\n");
+	} else {
+		entry = create_proc_entry("boot", 0600, prefetch_proc_dir);
+		if (entry)
+			entry->proc_fops = &proc_boot_prefetch_fops;
+	}
+	printk(KERN_INFO
+	       "Boot prefetching module started, enabled=%d, prefetching=%d\n",
+	       enabled, prefetch_enabled);
+
+	return 0;
+}
+
+static void boot_prefetch_exit(void)
+{
+	remove_proc_entry("boot", prefetch_proc_dir);
+}
+
+MODULE_AUTHOR("Krzysztof Lichota ");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION
+    ("Boot prefetching - support for tracing and prefetching during system boot");
+
+module_init(boot_prefetch_init);
+module_exit(boot_prefetch_exit);
diff --git a/mm/prefetch_core.c b/mm/prefetch_core.c
new file mode 100644
index 0000000..001470b
--- /dev/null
+++ b/mm/prefetch_core.c
@@ -0,0 +1,1527 @@
+/*
+ * linux/mm/prefetch_core.c
+ *
+ * Copyright (C) 2006 Fengguang Wu
+ * Copyright (C) 2007 Krzysztof Lichota
+ *
+ * This is prefetch core - common code used for tracing and saving trace files.
+ * It is used by prefetching modules, such as boot and app.
+ *
+ * Based on filecache code by Fengguang Wu.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +char trace_file_magic[4] = { 'P', 'F', 'C', 'H' }; + +/*Inode walk session*/ +struct inode_walk_session { + int private_session; + pgoff_t next_offset; + struct { + unsigned long cursor; + unsigned long origin; + unsigned long size; + struct inode **inodes; + } ivec; + struct { + unsigned long pos; + unsigned long i_state; + struct inode *inode; + struct inode *pinned_inode; + } icur; + int inodes_walked; + int pages_walked; + int pages_referenced; + int page_blocks; +}; + +/*Disables/enables the whole module functionality*/ +static int enabled = 1; +module_param(enabled, bool, 0); +MODULE_PARM_DESC(enabled, + "Enables or disables whole prefetching module functionality (tracing and prefetching)"); + +#define DEFAULT_TRACE_SIZE_KB 256 + +/*NOTE: changing trace size in runtime is not supported - do not do it.*/ +unsigned trace_size_kb = DEFAULT_TRACE_SIZE_KB; /*in kilobytes */ +module_param(trace_size_kb, uint, 0); +MODULE_PARM_DESC(trace_size_kb, + "Size of memory allocated for trace (in KB), set to 0 to use default"); + +static inline unsigned prefetch_trace_size(void) +{ + if (likely(trace_size_kb > 0)) + return trace_size_kb << 10; + + /*if set to 0, then use default */ + return DEFAULT_TRACE_SIZE_KB * 1024; +} + +enum tracing_command { + START_TRACING, + STOP_TRACING, + CONTINUE_TRACING +}; + +/*Structure holding all information needed for trace*/ +struct prefetch_trace_t { + spinlock_t prefetch_trace_lock; + unsigned int buffer_used; + unsigned int buffer_size; + void *buffer; + int generation; + int overflow; + int overflow_reported; + /*fields above protected by prefetch_trace_lock */ + int page_release_traced; + /** + * Number of traces started and not finished. + * Used to check if it is necessary to add entries to trace. + */ + atomic_t tracers_count; + int trace_users; /*number of trace users, protected by prefetch_trace_mutex */ + struct mutex prefetch_trace_mutex; +}; + +struct prefetch_trace_t prefetch_trace = { + SPIN_LOCK_UNLOCKED, /*prefetch_trace_lock */ + 0, /*buffer_used */ + 0, /*buffer_size */ + NULL, /*buffer */ + 0, /*generation */ + 0, /*overflow */ + 0, /*overflow_reported */ + 0, /*page_release_traced */ + ATOMIC_INIT(0), /*tracers_count */ + 0, /*trace_users */ + __MUTEX_INITIALIZER(prefetch_trace.prefetch_trace_mutex) /*prefetch_trace_mutex */ +}; + +/** + Set if walk_pages() decided that it is the start of tracing + and bits should be cleared, not recorded. + Using it is protected by inode_lock. + If lock breaking is enabled, this variable makes sure that + second caller of walk_pages(START_TRACING) will not + race with first caller and will not start recording changes. +*/ +static int clearing_in_progress = 0; + +/** + * Timer used for measuring tracing and prefetching time. +*/ +struct prefetch_timer { + struct timespec ts_start; + struct timespec ts_end; + char *name; +}; + +static void clear_trace(void); + +/** + * Starts timer. +*/ +void prefetch_start_timing(struct prefetch_timer *timer, char *name) +{ + timer->name = name; + do_posix_clock_monotonic_gettime(&timer->ts_start); +} + +/** + * Stops timer. +*/ +void prefetch_end_timing(struct prefetch_timer *timer) +{ + do_posix_clock_monotonic_gettime(&timer->ts_end); +} + +/** + * Prints timer name and time duration into kernel log. 
+*/
+void prefetch_print_timing(struct prefetch_timer *timer)
+{
+	struct timespec ts = timespec_sub(timer->ts_end, timer->ts_start);
+	s64 ns = timespec_to_ns(&ts);
+
+	printk(KERN_INFO "Prefetch timing (%s): %lld ns, %ld.%.9ld\n",
+	       timer->name, ns, ts.tv_sec, ts.tv_nsec);
+}
+
+struct async_prefetch_params {
+	void *trace;
+	int trace_size;
+};
+
+static int prefetch_do_prefetch(void *trace, int trace_size);
+
+static int async_prefetch_thread(void *p)
+{
+	int ret;
+	struct async_prefetch_params *params =
+	    (struct async_prefetch_params *)p;
+#ifdef CONFIG_PREFETCH_DEBUG
+	printk(KERN_INFO "Started async prefetch thread\n");
+#endif
+	ret = prefetch_do_prefetch(params->trace, params->trace_size);
+	kfree(params);
+	return ret;
+}
+
+static int prefetch_start_prefetch_async(void *trace, int trace_size)
+{
+	struct async_prefetch_params *params =
+	    kmalloc(sizeof(struct async_prefetch_params), GFP_KERNEL);
+	if (params == NULL)
+		return -ENOMEM;
+	params->trace = trace;
+	params->trace_size = trace_size;
+
+	if (kernel_thread(async_prefetch_thread, params, 0) < 0) {
+		printk(KERN_WARNING "Cannot start async prefetch thread\n");
+		kfree(params);	/* the thread was not created, free params here */
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int prefetch_start_prefetch_sync(void *trace, int trace_size)
+{
+	return prefetch_do_prefetch(trace, trace_size);
+}
+
+/**
+ * Starts prefetch based on given @trace, whose length (in bytes) is @trace_size.
+ * If @async is false, the function returns only after prefetching has finished.
+ * Otherwise, prefetching is started in a separate thread and the function
+ * returns immediately.
+*/
+int prefetch_start_prefetch(void *trace, int trace_size, int async)
+{
+	if (async)
+		return prefetch_start_prefetch_async(trace, trace_size);
+	else
+		return prefetch_start_prefetch_sync(trace, trace_size);
+}
+
+EXPORT_SYMBOL(prefetch_start_prefetch);
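prefetch_start_prefetch() is the entry point the boot and app modules ultimately reach (via do_prefetch_from_file()). A minimal illustrative sketch of a direct caller using the core's own helpers; the path is made up, and prefetching synchronously sidesteps the question of buffer lifetime in the async case:

/* Illustrative sketch: load a saved trace and prefetch it synchronously. */
static int example_prefetch_file(char *path)
{
	void *trace;
	int trace_size;
	int ret;

	ret = prefetch_load_trace_fragment(path, &trace, &trace_size);
	if (ret < 0)
		return ret;

	/* async == 0: returns after the readaheads have been issued */
	ret = prefetch_start_prefetch(trace, trace_size, 0);

	free_trace_buffer(trace, trace_size);
	return ret;
}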
+static int prefetch_do_prefetch(void *trace, int trace_size)
+{
+	struct prefetch_trace_record *record = trace;
+	struct prefetch_trace_record *prev_record = NULL;
+#ifdef CONFIG_PREFETCH_DEBUG
+	struct prefetch_timer timer;
+#endif
+	struct super_block *sb = NULL;
+	struct file *file = NULL;
+	struct inode *inode = NULL;
+	int ret = 0;
+	int readaheads_failed = 0;
+	int readahead_ret;
+
+	if (!enabled)
+		return -ENODEV;	/* module disabled */
+
+#ifdef CONFIG_PREFETCH_DEBUG
+	printk(KERN_INFO "Delay io ticks before prefetching: %d\n",
+	       (int)delayacct_blkio_ticks(current));
+	prefetch_start_timing(&timer, "Prefetching");
+#endif
+
+	/* record + 1: the whole record must fit within the trace buffer */
+	for (; (void *)(record + 1) <= trace + trace_size;
+	     prev_record = record, ++record) {
+		if (prev_record == NULL
+		    || prev_record->device != record->device) {
+			/* open next device */
+			if (sb)
+				drop_super(sb);
+			sb = user_get_super(record->device);
+		}
+		if (sb == NULL)
+			continue;	/* no such device or error getting device */
+
+		if (prev_record == NULL || prev_record->device != record->device
+		    || prev_record->inode_no != record->inode_no) {
+			/* open next file */
+			if (inode)
+				iput(inode);
+
+			inode = iget(sb, record->inode_no);
+			if (IS_ERR(inode)) {
+				/* no such inode or other error */
+				inode = NULL;
+				continue;
+			}
+
+			if (file)
+				put_filp(file);
+
+			file = get_empty_filp();
+			if (file == NULL) {
+				ret = -ENFILE;
+				goto out;
+			}
+			/* only the most important file fields are filled; ext3_readpages doesn't use the rest anyway */
+			file->f_op = inode->i_fop;
+			file->f_mapping = inode->i_mapping;
+			file->f_mode = FMODE_READ;
+			file->f_flags = O_RDONLY;
+		}
+		if (inode == NULL)
+			continue;
+
+		readahead_ret =
+		    force_page_cache_readahead(inode->i_mapping, file,
+					       record->range_start,
+					       record->range_length);
+		if (readahead_ret < 0) {
+			readaheads_failed++;
+#ifdef CONFIG_PREFETCH_DEBUG
+			if (readaheads_failed < 10) {
+				printk(KERN_WARNING
+				       "Readahead failed, device=%d:%d, inode=%ld, start=%ld, length=%ld, error=%d\n",
+				       MAJOR(record->device),
+				       MINOR(record->device), record->inode_no,
+				       record->range_start,
+				       record->range_length, readahead_ret);
+			}
+			if (readaheads_failed == 10)
+				printk(KERN_WARNING
+				       "Readahead failure limit reached, not printing further failures\n");
+#endif
+		}
+	}
+
+ out:
+	if (readaheads_failed > 0)
+		printk(KERN_INFO "Readaheads not performed: %d\n",
+		       readaheads_failed);
+
+	if (sb)
+		drop_super(sb);
+	if (inode)
+		iput(inode);
+	if (file)
+		put_filp(file);
+
+#ifdef CONFIG_PREFETCH_DEBUG
+	printk(KERN_INFO "Delay io ticks after prefetching: %d\n",
+	       (int)delayacct_blkio_ticks(current));
+	prefetch_end_timing(&timer);
+	prefetch_print_timing(&timer);
+#endif
+	return ret;
+}
+
+/**
+ * Adds a trace record.  Does not sleep.
+*/
+void prefetch_trace_add(dev_t device,
+			unsigned long inode_no,
+			pgoff_t range_start, pgoff_t range_length)
+{
+	struct prefetch_trace_record *record;
+
+	spin_lock(&prefetch_trace.prefetch_trace_lock);
+
+	if (prefetch_trace.buffer_used + sizeof(struct prefetch_trace_record) >=
+	    prefetch_trace.buffer_size) {
+		prefetch_trace.overflow = 1;
+		spin_unlock(&prefetch_trace.prefetch_trace_lock);
+		return;
+	}
+
+	record =
+	    (struct prefetch_trace_record *)(prefetch_trace.buffer +
+					     prefetch_trace.buffer_used);
+	prefetch_trace.buffer_used += sizeof(struct prefetch_trace_record);
+
+	record->device = device;
+	record->inode_no = inode_no;
+	record->range_start = range_start;
+	record->range_length = range_length;
+	spin_unlock(&prefetch_trace.prefetch_trace_lock);
+}
+
+#define IVEC_SIZE (PAGE_SIZE / sizeof(struct inode *))
+
+/*
+ * Full: there is more data following.
+ */
+static int ivec_full(struct inode_walk_session *s)
+{
+	return !s->ivec.cursor ||
+	    s->ivec.cursor > s->ivec.origin + s->ivec.size;
+}
+
+static int ivec_push(struct inode_walk_session *s, struct inode *inode)
+{
+	if (!atomic_read(&inode->i_count))
+		return 0;
+	if (!inode->i_mapping)
+		return 0;
+
+	s->ivec.cursor++;
+
+	if (s->ivec.size >= IVEC_SIZE)
+		return 1;
+
+	if (s->ivec.cursor > s->ivec.origin)
+		s->ivec.inodes[s->ivec.size++] = inode;
+	return 0;
+}
+
+/*
+ * Traverse the inode lists in order - newest first.
+ * And fill @s->ivec.inodes with inodes positioned in [@pos, @pos+IVEC_SIZE).
+ */
+static int ivec_fill(struct inode_walk_session *s, unsigned long pos)
+{
+	struct inode *inode;
+	struct super_block *sb;
+
+	s->ivec.origin = pos;
+	s->ivec.cursor = 0;
+	s->ivec.size = 0;
+
+	/*
+	 * We have a cursor inode, clean and expected to be unchanged.
+ */ + if (s->icur.inode && pos >= s->icur.pos && + !(s->icur.i_state & I_DIRTY) && + s->icur.i_state == s->icur.inode->i_state) { + inode = s->icur.inode; + s->ivec.cursor = s->icur.pos; + goto continue_from_saved; + } + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + list_for_each_entry(inode, &sb->s_dirty, i_list) { + if (ivec_push(s, inode)) + goto out_full_unlock; + } + list_for_each_entry(inode, &sb->s_io, i_list) { + if (ivec_push(s, inode)) + goto out_full_unlock; + } + } + spin_unlock(&sb_lock); + + list_for_each_entry(inode, &inode_in_use, i_list) { + if (ivec_push(s, inode)) + goto out_full; + continue_from_saved: + ; + } + + list_for_each_entry(inode, &inode_unused, i_list) { + if (ivec_push(s, inode)) + goto out_full; + } + + return 0; + + out_full_unlock: + spin_unlock(&sb_lock); + out_full: + return 1; +} + +static struct inode *ivec_inode(struct inode_walk_session *s, unsigned long pos) +{ + if ((ivec_full(s) && pos >= s->ivec.origin + s->ivec.size) + || pos < s->ivec.origin) + ivec_fill(s, pos); + + if (pos >= s->ivec.cursor) + return NULL; + + s->icur.pos = pos; + s->icur.inode = s->ivec.inodes[pos - s->ivec.origin]; + return s->icur.inode; +} + +static void add_referenced_page_range(struct inode_walk_session *s, + struct address_space *mapping, + pgoff_t start, pgoff_t len) +{ + struct inode *inode; + + s->pages_referenced += len; + s->page_blocks++; + if (!clearing_in_progress) { + inode = mapping->host; + if (inode && inode->i_sb && inode->i_sb->s_bdev) + prefetch_trace_add(inode->i_sb->s_bdev->bd_dev, + inode->i_ino, start, len); + } +} + +/** + Add page to trace if it was referenced. + + NOTE: spinlock might be held while this function is called. +*/ +void prefetch_add_page_to_trace(struct page *page) +{ + struct address_space *mapping; + struct inode *inode; + + /*if not tracing, nothing to be done */ + if (atomic_read(&prefetch_trace.tracers_count) <= 0) + return; + + /*if page was not touched */ + if (!PageReferenced(page)) + return; + + /*swap pages are not interesting */ + if (PageSwapCache(page)) + return; + + /*no locking, just stats */ + prefetch_trace.page_release_traced++; + + mapping = page_mapping(page); + + inode = mapping->host; + if (inode && inode->i_sb && inode->i_sb->s_bdev) + prefetch_trace_add(inode->i_sb->s_bdev->bd_dev, inode->i_ino, + page_index(page), 1); +} + +/** + Hook called when page is about to be freed, so we have to check + if it was referenced, as inode walk will not notice it. + + NOTE: spinlock is held while this function is called. 
+*/
+void prefetch_page_release_hook(struct page *page)
+{
+	prefetch_add_page_to_trace(page);
+}
+
+static void walk_file_cache(struct inode_walk_session *s,
+			    struct address_space *mapping)
+{
+	int i;
+	pgoff_t len = 0;
+	struct pagevec pvec;
+	struct page *page;
+	struct page *page0 = NULL;
+	int current_page_referenced = 0;
+	int previous_page_referenced = 0;
+	pgoff_t start = 0;
+
+	for (;;) {
+		pagevec_init(&pvec, 0);
+		pvec.nr = radix_tree_gang_lookup(&mapping->page_tree,
+						 (void **)pvec.pages,
+						 start + len, PAGEVEC_SIZE);
+
+		if (pvec.nr == 0) {
+			/*no more pages present, add the last range, if any */
+			if (previous_page_referenced)
+				add_referenced_page_range(s, mapping, start,
+							  len);
+			goto out;
+		}
+
+		if (!page0) {
+			page0 = pvec.pages[0];
+			previous_page_referenced = PageReferenced(page0);
+		}
+
+		for (i = 0; i < pvec.nr; i++) {
+			page = pvec.pages[i];
+			current_page_referenced = TestClearPageReferenced(page);
+
+			s->pages_walked++;
+
+			if (page->index == start + len
+			    && previous_page_referenced ==
+			    current_page_referenced)
+				len++;
+			else {
+				if (previous_page_referenced)
+					add_referenced_page_range(s, mapping,
+								  start, len);
+
+				page0 = page;
+				start = page->index;
+				len = 1;
+			}
+			previous_page_referenced = current_page_referenced;
+		}
+	}
+
+ out:
+	return;
+}
+
+static void show_inode(struct inode_walk_session *s, struct inode *inode)
+{
+	++s->inodes_walked;	/*just for stats, so not using atomic_inc() */
+
+	if (inode->i_mapping)
+		walk_file_cache(s, inode->i_mapping);
+}
+
+/**
+ Allocates memory for a trace buffer.
+ This memory should be freed using free_trace_buffer().
+*/
+void *alloc_trace_buffer(int len)
+{
+	return (void *)__get_free_pages(GFP_KERNEL, get_order(len));
+}
+
+EXPORT_SYMBOL(alloc_trace_buffer);
+
+/**
+ Frees memory allocated using alloc_trace_buffer().
+*/
+void free_trace_buffer(void *buffer, int len)
+{
+	free_pages((unsigned long)buffer, get_order(len));
+}
+
+EXPORT_SYMBOL(free_trace_buffer);
+
+/*NOTE: this function is called with inode_lock spinlock held */
+static int inode_walk_show(struct inode_walk_session *s, loff_t pos)
+{
+	unsigned long index = pos;
+	struct inode *inode;
+
+	inode = ivec_inode(s, index);
+	BUG_ON(!inode);
+	show_inode(s, inode);
+
+	return 0;
+}
+
+static void *inode_walk_start(struct inode_walk_session *s, loff_t *pos)
+{
+	s->ivec.inodes = (struct inode **)__get_free_page(GFP_KERNEL);
+	if (!s->ivec.inodes)
+		return NULL;
+	s->ivec.size = 0;
+
+	spin_lock(&inode_lock);
+
+	BUG_ON(s->icur.pinned_inode);
+	s->icur.pinned_inode = s->icur.inode;
+	return ivec_inode(s, *pos) ? pos : NULL;
+}
+
+static void inode_walk_stop(struct inode_walk_session *s)
+{
+	if (s->icur.inode) {
+		__iget(s->icur.inode);
+		s->icur.i_state = s->icur.inode->i_state;
+	}
+
+	spin_unlock(&inode_lock);
+	free_page((unsigned long)s->ivec.inodes);
+
+	if (s->icur.pinned_inode) {
+		iput(s->icur.pinned_inode);
+		s->icur.pinned_inode = NULL;
+	}
+}
+
+/*NOTE: this function is called with inode_lock spinlock held */
+static void *inode_walk_next(struct inode_walk_session *s, loff_t *pos)
+{
+	(*pos)++;
+
+	return ivec_inode(s, *pos) ? pos : NULL;
+}
+
+static struct inode_walk_session *inode_walk_session_create(void)
+{
+	struct inode_walk_session *s;
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return ERR_PTR(-ENOMEM);
+
+	return s;
+}
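Editorial sketch (not part of the patch): the range-coalescing logic of
walk_file_cache() above, reduced to a standalone userspace program so the
algorithm is easy to follow. Consecutive page indices whose "referenced"
state matches are merged, and only referenced runs are emitted, mirroring
the calls to add_referenced_page_range().

	#include <stdio.h>

	static void add_range(unsigned long start, unsigned long len)
	{
		printf("range: start=%lu len=%lu\n", start, len);
	}

	int main(void)
	{
		/* referenced flags for page indices 0..9, as
		 * TestClearPageReferenced() would report them */
		int referenced[10] = { 1, 1, 0, 1, 1, 1, 0, 0, 1, 0 };
		unsigned long start = 0, len = 0;
		int prev = 0, i;

		for (i = 0; i < 10; i++) {
			if (i == start + len && prev == referenced[i]) {
				len++;	/* same run continues */
			} else {
				if (prev)
					add_range(start, len);
				start = i;	/* new run begins */
				len = 1;
			}
			prev = referenced[i];
		}
		if (prev)
			add_range(start, len);
		/* prints ranges 0/2, 3/3 and 8/1 */
		return 0;
	}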
+
+static void inode_walk_session_release(struct inode_walk_session *s)
+{
+	if (s->icur.inode)
+		iput(s->icur.inode);
+	kfree(s);
+}
+
+/**
+ * Prints message followed by marker.
+*/
+void print_marker(char *msg, struct trace_marker marker)
+{
+	printk(KERN_INFO "%s %u.%u\n", msg, marker.generation,
+	       marker.position);
+}
+
+EXPORT_SYMBOL(print_marker);
+
+/**
+ Returns current trace marker.
+ Note: marker ranges are open on the right side, i.e.
+ [start_marker, end_marker)
+*/
+static struct trace_marker get_trace_marker(void)
+{
+	struct trace_marker marker;
+
+	spin_lock(&prefetch_trace.prefetch_trace_lock);
+	marker.position = prefetch_trace.buffer_used;
+	marker.generation = prefetch_trace.generation;
+	spin_unlock(&prefetch_trace.prefetch_trace_lock);
+
+	return marker;
+}
+
+/**
+ Returns size of the prefetch trace between start and end marker.
+ Returns <0 if an error occurs.
+*/
+int prefetch_trace_fragment_size(struct trace_marker start_marker,
+				 struct trace_marker end_marker)
+{
+	if (start_marker.generation != end_marker.generation)
+		return -EINVAL;	/*trace wrapped around, no longer available */
+	if (end_marker.position < start_marker.position)
+		return -ERANGE;	/*invalid markers */
+
+	return end_marker.position - start_marker.position;
+}
+
+EXPORT_SYMBOL(prefetch_trace_fragment_size);
+
+/**
+ Returns position in trace buffer for the given marker.
+ prefetch_trace_lock spinlock must be held when calling this function.
+ Returns <0 in case of error.
+ Returns -ENOSPC if this marker is not in the buffer.
+ Note: marker ranges are open on the right side, so this position
+ might point to the first byte after the buffer for end markers.
+*/
+static int trace_marker_position_in_buffer(struct trace_marker marker)
+{
+	if (marker.generation != prefetch_trace.generation)
+		return -EINVAL;	/*trace wrapped around, no longer available */
+
+	if (prefetch_trace.buffer_used < marker.position)
+		return -ENOSPC;
+
+	/*for now simple, not a circular buffer */
+	return marker.position;
+}
+
+/**
+ Fetches the fragment of trace between @start_marker and @end_marker.
+ On success returns the fragment (allocated using alloc_trace_buffer())
+ on @fragment_result and its size on @fragment_size_result.
+ This memory should be freed using free_trace_buffer().
+ If the fragment size is 0, the returned fragment is NULL.
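+ The copy is made while holding prefetch_trace_lock, so the returned
+ fragment is a consistent snapshot even if tracing continues.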
+*/
+int get_prefetch_trace_fragment(struct trace_marker start_marker,
+				struct trace_marker end_marker,
+				void **fragment_result,
+				int *fragment_size_result)
+{
+	int start_position;
+	int end_position;
+	int len;
+	int ret;
+	void *fragment;
+	int fragment_size;
+
+	fragment_size = prefetch_trace_fragment_size(start_marker, end_marker);
+	if (fragment_size < 0)
+		return fragment_size;
+	if (fragment_size == 0) {
+		*fragment_size_result = 0;
+		*fragment_result = NULL;
+		return 0;
+	}
+
+	fragment = alloc_trace_buffer(fragment_size);
+	if (fragment == NULL)
+		return -ENOMEM;
+
+	spin_lock(&prefetch_trace.prefetch_trace_lock);
+
+	start_position = trace_marker_position_in_buffer(start_marker);
+	end_position = trace_marker_position_in_buffer(end_marker);
+
+	if (start_position < 0) {
+		ret = -ESRCH;
+		goto out_free;
+	}
+	if (end_position < 0) {
+		ret = -ESRCH;
+		goto out_free;
+	}
+
+	len = end_position - start_position;
+	BUG_ON(len <= 0 || len != fragment_size);
+
+	memcpy(fragment, prefetch_trace.buffer + start_position, len);
+
+	spin_unlock(&prefetch_trace.prefetch_trace_lock);
+
+	*fragment_result = fragment;
+	*fragment_size_result = fragment_size;
+	return 0;
+
+ out_free:
+	spin_unlock(&prefetch_trace.prefetch_trace_lock);
+	free_trace_buffer(fragment, fragment_size);
+	return ret;
+}
+
+EXPORT_SYMBOL(get_prefetch_trace_fragment);
+
+struct file *kernel_open(char const *file_name, int flags, int mode)
+{
+	int orig_fsuid = current->fsuid;
+	int orig_fsgid = current->fsgid;
+	struct file *file = NULL;
+#if BITS_PER_LONG != 32
+	flags |= O_LARGEFILE;
+#endif
+	current->fsuid = 0;
+	current->fsgid = 0;
+
+	file = filp_open(file_name, flags, mode);
+	current->fsuid = orig_fsuid;
+	current->fsgid = orig_fsgid;
+	return file;
+}
+
+int kernel_close(struct file *file)
+{
+	if (file->f_op && file->f_op->flush)
+		file->f_op->flush(file, current->files);
+	fput(file);
+
+	return 0;	/*no errors known for now */
+}
+
+int kernel_write(struct file *file, unsigned long offset, const char *addr,
+		 unsigned long count)
+{
+	mm_segment_t old_fs;
+	loff_t pos = offset;
+	int result = -ENOSYS;
+
+	if (!file->f_op || !file->f_op->write)
+		goto fail;
+	old_fs = get_fs();
+	set_fs(get_ds());
+	result = file->f_op->write(file, addr, count, &pos);
+	set_fs(old_fs);
+ fail:
+	return result;
+}
+
+/**
+ * Compares two trace records, returning -1, 0 or 1.
+ * The order is lexicographical on device, inode, range_start and
+ * range_length (range_length descending).
+ */
+static int trace_cmp(const void *p1, const void *p2)
+{
+	struct prefetch_trace_record *r1 = (struct prefetch_trace_record *)p1;
+	struct prefetch_trace_record *r2 = (struct prefetch_trace_record *)p2;
+
+	if (r1->device < r2->device)
+		return -1;
+	if (r1->device > r2->device)
+		return 1;
+
+	if (r1->inode_no < r2->inode_no)
+		return -1;
+	if (r1->inode_no > r2->inode_no)
+		return 1;
+
+	if (r1->range_start < r2->range_start)
+		return -1;
+	if (r1->range_start > r2->range_start)
+		return 1;
+
+	/*longer range_length is preferred as we want to fetch large fragments first */
+	if (r1->range_length < r2->range_length)
+		return 1;
+	if (r1->range_length > r2->range_length)
+		return -1;
+	return 0;
+}
+
+/**
+ * Sorts trace fragment by device, inode and start.
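+ * Ranges that share a start sort longest first, so large fragments are
+ * prefetched first (see trace_cmp() above).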
+*/
+void sort_trace_fragment(void *trace, int trace_size)
+{
+	sort(trace, trace_size / sizeof(struct prefetch_trace_record),
+	     sizeof(struct prefetch_trace_record), trace_cmp, NULL);
+}
+
+EXPORT_SYMBOL(sort_trace_fragment);
+
+/**
+ * Saves trace fragment from buffer @fragment of size @fragment_size
+ * into file @filename.
+ * Returns 0 on success, <0 on error (with error code).
+*/
+int prefetch_save_trace_fragment(char *filename,
+				 void *fragment, int fragment_size)
+{
+	int ret = 0;
+	int written = 0;
+	struct file *file;
+	struct prefetch_trace_header header;
+	int data_start = 0;
+
+	file = kernel_open(filename, O_CREAT | O_TRUNC | O_RDWR, 0600);
+
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		printk(KERN_WARNING
+		       "Cannot open file %s for writing to save trace, error=%d\n",
+		       filename, ret);
+		goto out;
+	}
+
+	data_start = sizeof(header);
+	/*copy magic signature */
+	memcpy(&header.magic[0], trace_file_magic, sizeof(header.magic));
+	header.version_major = PREFETCH_FORMAT_VERSION_MAJOR;
+	header.version_minor = PREFETCH_FORMAT_VERSION_MINOR;
+	header.data_start = data_start;
+
+	ret = kernel_write(file, 0, (char *)&header, sizeof(header));
+	if (ret < 0 || ret != sizeof(header)) {
+		printk(KERN_WARNING
+		       "Error while writing header to file %s, error=%d\n",
+		       filename, ret);
+		if (ret >= 0)
+			ret = -EIO;	/*short write */
+		goto out_close;
+	}
+
+	while (written < fragment_size) {
+		ret = kernel_write(file, data_start + written,
+				   fragment + written,
+				   fragment_size - written);
+
+		if (ret < 0) {
+			printk(KERN_WARNING
+			       "Error while writing to file %s, error=%d\n",
+			       filename, ret);
+			goto out_close;
+		}
+		written += ret;
+	}
+	ret = 0;	/*return 0 on success, not the last write's byte count */
+ out_close:
+	kernel_close(file);
+ out:
+	return ret;
+}
+
+EXPORT_SYMBOL(prefetch_save_trace_fragment);
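Editorial sketch (not part of the patch): a minimal userspace reader for the
on-disk format written above - a prefetch_trace_header followed by raw
prefetch_trace_record entries starting at offset header.data_start. The
struct layouts below are assumptions (they must match the kernel structs
built for the same architecture), and the magic comparison is left as a
comment since trace_file_magic's value is defined elsewhere in the patch.

	#include <stdio.h>
	#include <stdint.h>

	struct trace_header {		/* assumed mirror of prefetch_trace_header */
		char magic[4];
		uint16_t version_major;
		uint16_t version_minor;
		uint16_t data_start;
	};

	struct trace_record {		/* assumed mirror of prefetch_trace_record */
		uint32_t device;	/* dev_t */
		unsigned long inode_no;
		unsigned long range_start;	/* pgoff_t */
		unsigned long range_length;	/* pgoff_t */
	};

	int dump_trace(const char *filename)
	{
		struct trace_header hdr;
		struct trace_record rec;
		FILE *f = fopen(filename, "rb");

		if (!f)
			return -1;
		if (fread(&hdr, sizeof(hdr), 1, f) != 1)
			goto err;
		/* same sanity checks as the kernel loader: compare hdr.magic
		 * with trace_file_magic, then check the major version
		 * (PREFETCH_FORMAT_VERSION_MAJOR == 1) and data_start */
		if (hdr.version_major != 1 || hdr.data_start < sizeof(hdr))
			goto err;
		if (fseek(f, hdr.data_start, SEEK_SET) != 0)
			goto err;
		while (fread(&rec, sizeof(rec), 1, f) == 1)
			printf("dev=%u inode=%lu start=%lu len=%lu\n",
			       rec.device, rec.inode_no,
			       rec.range_start, rec.range_length);
		fclose(f);
		return 0;
	err:
		fclose(f);
		return -1;
	}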
+
+/**
+ * Saves the trace fragment between @start_marker and @end_marker into
+ * file @filename.
+ * Returns 0 on success, <0 on error (with error code).
+*/
+int prefetch_save_trace_between_markers(char *filename,
+					struct trace_marker start_marker,
+					struct trace_marker end_marker)
+{
+	void *fragment = NULL;
+	int fragment_size = 0;
+	int ret = 0;
+
+	ret = get_prefetch_trace_fragment(start_marker,
+					  end_marker,
+					  &fragment, &fragment_size);
+
+	if (ret < 0) {
+		printk(KERN_WARNING
+		       "Cannot save trace fragment - cannot get trace fragment, error=%d\n",
+		       ret);
+		goto out;
+	}
+
+	ret = prefetch_save_trace_fragment(filename, fragment, fragment_size);
+	if (ret < 0) {
+		printk(KERN_WARNING
+		       "Cannot save trace fragment - error saving file, error=%d\n",
+		       ret);
+		goto out_free;
+	}
+
+ out_free:
+	if (fragment_size > 0)
+		free_trace_buffer(fragment, fragment_size);
+ out:
+	return ret;
+}
+
+EXPORT_SYMBOL(prefetch_save_trace_between_markers);
+
+static int walk_pages(enum tracing_command command, struct trace_marker *marker)
+{
+	void *retptr;
+	loff_t pos = 0;
+	int ret;
+	loff_t next;
+	struct inode_walk_session *s;
+	int clearing = 0;
+	int invalid_trace_counter = 0;
+	int report_overflow = 0;
+#ifdef PREFETCH_DEBUG
+	struct prefetch_timer walk_pages_timer;
+#endif
+
+	spin_lock(&prefetch_trace.prefetch_trace_lock);
+	if (prefetch_trace.overflow && !prefetch_trace.overflow_reported) {
+		prefetch_trace.overflow_reported = 1;
+		report_overflow = 1;
+	}
+	spin_unlock(&prefetch_trace.prefetch_trace_lock);
+
+	if (report_overflow) {
+		if (command == STOP_TRACING) {
+			if (atomic_dec_return(&prefetch_trace.tracers_count) < 0)
+				printk(KERN_WARNING
+				       "Trace counter is invalid\n");
+		}
+		printk(KERN_WARNING "Prefetch buffer overflow\n");
+		return -ENOSPC;
+	}
+
+	s = inode_walk_session_create();
+	if (IS_ERR(s)) {
+		retptr = s;
+		goto out;
+	}
+
+	retptr = inode_walk_start(s, &pos);
+
+	if (IS_ERR(retptr))
+		goto out_error_session_release;
+
+	/*inode_lock spinlock held from here */
+	if (command == START_TRACING) {
+		if (atomic_inc_return(&prefetch_trace.tracers_count) == 1) {
+			/*prefetch_trace.tracers_count was 0, this is the
+			   first tracer, so just clear the referenced bits */
+			clearing = 1;
+			clearing_in_progress = 1;
+			*marker = get_trace_marker();
+		}
+	}
+#ifdef PREFETCH_DEBUG
+	if (!clearing)
+		prefetch_start_timing(&walk_pages_timer, "walk pages");
+	else
+		prefetch_start_timing(&walk_pages_timer, "clearing pages");
+#endif
+
+	while (retptr != NULL) {
+		/*FIXME: add lock breaking */
+		ret = inode_walk_show(s, pos);
+		if (ret < 0) {
+			retptr = ERR_PTR(ret);
+			goto out_error;
+		}
+
+		next = pos;
+		retptr = inode_walk_next(s, &next);
+		if (IS_ERR(retptr))
+			goto out_error;
+		pos = next;
+	}
+
+	if (command == STOP_TRACING) {
+		if (atomic_dec_return(&prefetch_trace.tracers_count) < 0)
+			invalid_trace_counter = 1;
+		*marker = get_trace_marker();
+	} else if (command == CONTINUE_TRACING) {
+		*marker = get_trace_marker();
+	}
+
+ out_error:
+	if (clearing)
+		clearing_in_progress = 0;
+
+	inode_walk_stop(s);
+	/*inode_lock spinlock released */
+#ifdef PREFETCH_DEBUG
+	if (clearing)
+		printk(KERN_INFO "Clearing run finished\n");
+#endif
+	if (invalid_trace_counter)
+		printk(KERN_WARNING "Trace counter is invalid\n");
+
+#ifdef PREFETCH_DEBUG
+	if (!IS_ERR(retptr)) {
+		prefetch_end_timing(&walk_pages_timer);
+		prefetch_print_timing(&walk_pages_timer);
+		printk(KERN_INFO
+		       "Inodes walked: %d, pages walked: %d, referenced: %d"
+		       " blocks: %d\n", s->inodes_walked, s->pages_walked,
+		       s->pages_referenced, s->page_blocks);
+	}
+#endif
+
+ out_error_session_release:
+	inode_walk_session_release(s);
+ out:
+	return PTR_ERR(retptr);	/*retptr is NULL on success, so this returns 0 */
+}
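Editorial sketch (not part of the patch): the expected call sequence for the
tracing API exported below, as a prefetching module might use it. Error
handling is trimmed and the trace file path is made up.

	#include <linux/prefetch_core.h>

	static int demo_trace_and_save(void)
	{
		struct trace_marker start, end = { 0, 0 };
		char trace_path[] = "/prefetch-demo.trace";
		int ret;

		/* first tracer triggers a clearing walk of the referenced bits */
		ret = prefetch_start_trace(&start);
		if (ret < 0)
			return ret;

		/* ... traced workload runs here ... */

		/* final walk records the referenced ranges */
		ret = prefetch_stop_trace(&end);
		if (ret < 0)
			goto release;

		/* markers span [start, end) within a single trace generation */
		ret = prefetch_save_trace_between_markers(trace_path, start, end);
	release:
		/* must match prefetch_start_trace(); @end is currently unused */
		prefetch_release_trace(end);
		return ret;
	}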
+
+/**
+ Starts tracing. On success returns on @marker a marker pointing to the
+ start of the trace.
+*/
+int prefetch_start_trace(struct trace_marker *marker)
+{
+	int ret;
+
+	if (!enabled)
+		return -ENODEV;	/*module disabled */
+
+	ret = walk_pages(START_TRACING, marker);
+
+	if (ret >= 0) {
+		mutex_lock(&prefetch_trace.prefetch_trace_mutex);
+		prefetch_trace.trace_users++;
+		mutex_unlock(&prefetch_trace.prefetch_trace_mutex);
+	}
+	return ret;
+}
+
+EXPORT_SYMBOL(prefetch_start_trace);
+
+/**
+ Performs an interim tracing run; returns on @marker the current
+ position in the trace.
+*/
+int prefetch_continue_trace(struct trace_marker *marker)
+{
+	if (!enabled)
+		return -ENODEV;	/*module disabled */
+
+	return walk_pages(CONTINUE_TRACING, marker);
+}
+
+EXPORT_SYMBOL(prefetch_continue_trace);
+
+/**
+ Stops tracing; returns on @marker a marker pointing to the end of the trace.
+*/
+int prefetch_stop_trace(struct trace_marker *marker)
+{
+	if (!enabled) {
+		/*trace might have been started while the module was enabled */
+		if (atomic_dec_return(&prefetch_trace.tracers_count) < 0)
+			printk(KERN_WARNING
+			       "Trace counter is invalid after decrementing it in disabled module\n");
+
+		return -ENODEV;	/*module disabled */
+	}
+#ifdef PREFETCH_DEBUG
+	printk(KERN_INFO "Released pages traced: %d\n",
+	       prefetch_trace.page_release_traced);
+#endif
+	return walk_pages(STOP_TRACING, marker);
+}
+
+EXPORT_SYMBOL(prefetch_stop_trace);
+
+/**
+ Releases the trace up to @end_marker.
+ Each successful call to prefetch_start_trace() should
+ be matched with exactly one call to prefetch_release_trace().
+ NOTE: @end_marker is currently not used, but might
+ be used in the future to release only part of the trace.
+*/
+int prefetch_release_trace(struct trace_marker end_marker)
+{
+	mutex_lock(&prefetch_trace.prefetch_trace_mutex);
+
+	prefetch_trace.trace_users--;
+	if (prefetch_trace.trace_users == 0)
+		clear_trace();
+	if (prefetch_trace.trace_users < 0)
+		printk(KERN_WARNING "Trace users count is invalid, count=%d\n",
+		       prefetch_trace.trace_users);
+
+	mutex_unlock(&prefetch_trace.prefetch_trace_mutex);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(prefetch_release_trace);
+
+/**
+ * Loads trace fragment from @filename.
+ * Returns <0 in case of errors.
+ * If successful, returns a pointer to the trace data on @trace_buffer and
+ * its size on @trace_size; in that case the caller is responsible for
+ * freeing the buffer using free_trace_buffer().
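+ * The file is validated against the trace signature and major format
+ * version before any data is read.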
+*/
+int prefetch_load_trace_fragment(char *filename, void **trace_buffer,
+				 int *trace_size)
+{
+	struct file *file;
+	void *buffer;
+	int data_start;
+	int data_read = 0;
+	int raw_data_size;
+	int file_size;
+	int ret = 0;
+	struct prefetch_trace_header header;
+
+	file = kernel_open(filename, O_RDONLY, 0600);
+
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		printk(KERN_WARNING
+		       "Cannot open file %s for reading, error=%d\n",
+		       filename, ret);
+		return ret;
+	}
+
+	file_size = file->f_mapping->host->i_size;
+
+	ret = kernel_read(file, 0, (char *)&header, sizeof(header));
+
+	if (ret < 0 || ret != sizeof(header)) {
+		printk(KERN_WARNING
+		       "Cannot read trace header for trace file %s, error=%d\n",
+		       filename, ret);
+		ret = -EINVAL;
+		goto out_close;
+	}
+
+	if (strncmp(&header.magic[0], &trace_file_magic[0],
+		    sizeof(header.magic)) != 0) {
+		printk(KERN_WARNING
+		       "Trace file %s does not have valid trace file signature\n",
+		       filename);
+		ret = -EINVAL;
+		goto out_close;
+	}
+
+	if (header.version_major != PREFETCH_FORMAT_VERSION_MAJOR) {
+		printk(KERN_WARNING
+		       "Trace file %s has unsupported major version %d\n",
+		       filename, header.version_major);
+		ret = -EINVAL;
+		goto out_close;
+	}
+
+	data_start = header.data_start;
+	if (data_start < sizeof(header)) {
+		/*NOTE: exceeding the file size is checked implicitly below by the raw_data_size check */
+		printk(KERN_WARNING
+		       "Trace file %s contains invalid data start: %d\n",
+		       filename, data_start);
+		ret = -EINVAL;
+		goto out_close;
+	}
+
+	raw_data_size = file_size - data_start;
+	if (raw_data_size < 0) {
+		ret = -EINVAL;
+		printk(KERN_WARNING "Invalid trace file %s, not loading\n",
+		       filename);
+		goto out_close;
+	}
+
+	if (raw_data_size == 0) {
+		ret = -EINVAL;
+		printk(KERN_INFO "Empty trace file %s, not loading\n",
+		       filename);
+		goto out_close;
+	}
+
+	buffer = alloc_trace_buffer(raw_data_size);
+	if (buffer == NULL) {
+		printk(KERN_INFO "Cannot allocate memory for trace %s\n",
+		       filename);
+		ret = -ENOMEM;
+		goto out_close;
+	}
+
+	while (data_read < raw_data_size) {
+		ret = kernel_read(file, data_start + data_read,
+				  buffer + data_read,
+				  raw_data_size - data_read);
+
+		if (ret < 0) {
+			printk(KERN_WARNING
+			       "Error while reading from file %s, error=%d\n",
+			       filename, ret);
+			goto out_close_free;
+		}
+		if (ret == 0) {
+			printk(KERN_WARNING
+			       "File too short, data read=%d, expected size=%d\n",
+			       data_read, raw_data_size);
+			break;
+		}
+
+		data_read += ret;
+	}
+
+	if (data_read == raw_data_size) {
+		*trace_size = raw_data_size;
+		*trace_buffer = buffer;
+	} else {
+		printk(KERN_WARNING
+		       "Trace file size changed beneath us, cancelling read\n");
+		ret = -ETXTBSY;
+		goto out_close_free;
+	}
+
+	/*everything OK, caller will free the buffer */
+	kernel_close(file);
+	return 0;
+
+ out_close_free:
+	free_trace_buffer(buffer, raw_data_size);	/*buffer was sized from raw_data_size, not file_size */
+ out_close:
+	kernel_close(file);
+	return ret;
+}
+
+/**
+ * Prefetches files based on the trace read from @filename.
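+ * The whole trace is loaded into memory, prefetching is started with
+ * the async flag cleared, and the buffer is freed afterwards.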
+*/
+int do_prefetch_from_file(char *filename)
+{
+	int ret = 0;
+	void *buffer = NULL;
+	int buffer_size;
+
+	ret = prefetch_load_trace_fragment(filename, &buffer, &buffer_size);
+	if (ret < 0) {
+		printk(KERN_WARNING "Reading trace file %s failed, error=%d\n",
+		       filename, ret);
+		goto out;
+	}
+
+	ret = prefetch_start_prefetch(buffer, buffer_size, 0);
+	if (ret < 0) {
+		printk(KERN_WARNING
+		       "Prefetching for trace file %s failed, error=%d\n",
+		       filename, ret);
+		goto out_free;
+	}
+#ifdef CONFIG_PREFETCH_DEBUG
+	printk(KERN_INFO "Prefetch from file %s successful\n", filename);
+#endif
+
+ out_free:
+	free_trace_buffer(buffer, buffer_size);
+ out:
+	return ret;
+}
+
+EXPORT_SYMBOL(do_prefetch_from_file);
+
+static void clear_trace(void)
+{
+	void *new_buffer = NULL;
+
+#ifdef PREFETCH_DEBUG
+	printk(KERN_INFO "Clearing prefetch trace buffer\n");
+#endif
+
+	spin_lock(&prefetch_trace.prefetch_trace_lock);
+
+	if (prefetch_trace.buffer == NULL) {
+		/*drop the lock for the allocation, then recheck */
+		spin_unlock(&prefetch_trace.prefetch_trace_lock);
+
+		new_buffer = alloc_trace_buffer(prefetch_trace_size());
+
+		if (new_buffer == NULL) {
+			printk(KERN_WARNING
+			       "Cannot allocate memory for trace buffer\n");
+			goto out;
+		}
+
+		spin_lock(&prefetch_trace.prefetch_trace_lock);
+
+		if (prefetch_trace.buffer != NULL) {
+			/*someone already allocated it */
+			free_trace_buffer(new_buffer, prefetch_trace_size());
+		} else {
+			prefetch_trace.buffer = new_buffer;
+			prefetch_trace.buffer_size = prefetch_trace_size();
+		}
+	}
+
+	/*reset used buffer counter */
+	prefetch_trace.buffer_used = 0;
+	prefetch_trace.overflow = 0;
+	prefetch_trace.overflow_reported = 0;
+	prefetch_trace.page_release_traced = 0;
+	prefetch_trace.generation++;	/*next generation, old markers are no longer comparable */
+
+	spin_unlock(&prefetch_trace.prefetch_trace_lock);
+ out:
+	return;
+}
+
+/**
+ * Checks if @line is exactly the same as @param_name.
+ */
+int param_match(char *line, char *param_name)
+{
+	if (strcmp(line, param_name) == 0)
+		return 1;
+
+	return 0;
+}
+
+EXPORT_SYMBOL(param_match);
+
+/**
+ * Checks if @line starts with @param_name, not exceeding the length of
+ * @param_name for safety.
+ */
+int param_match_prefix(char *line, char *param_name)
+{
+	unsigned param_len = strlen(param_name);
+
+	if (strncmp(line, param_name, param_len) == 0)
+		return 1;
+
+	return 0;
+}
+
+EXPORT_SYMBOL(param_match_prefix);
+
+ssize_t prefetch_proc_write(struct file *proc_file, const char __user *buffer,
+			    size_t count, loff_t *ppos)
+{
+	char *name;
+	int e = 0;
+
+	if (count >= PATH_MAX)
+		return -ENAMETOOLONG;
+
+	name = kmalloc(count + 1, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	if (copy_from_user(name, buffer, count)) {
+		e = -EFAULT;
+		goto out;
+	}
+
+	/* strip the optional newline */
+	if (count && name[count - 1] == '\n')
+		name[count - 1] = '\0';
+	else
+		name[count] = '\0';
+
+	if (param_match(name, "enable")) {
+		printk(KERN_INFO "Prefetch module enabled\n");
+		enabled = 1;
+		goto out;
+	}
+
+	if (param_match(name, "disable")) {
+		printk(KERN_INFO "Prefetch module disabled\n");
+		enabled = 0;
+		goto out;
+	}
+ out:
+	kfree(name);
+
+	return e ? e : count;
+}
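Editorial sketch (not part of the patch): exercising the control file from
userspace. Equivalent to "echo enable > /proc/prefetch/control"; the path
follows from proc_mkdir("prefetch", NULL) and create_proc_entry("control",
...) below, and the helper name is made up.

	#include <stdio.h>
	#include <string.h>

	int prefetch_set_enabled(int enable)
	{
		const char *cmd = enable ? "enable" : "disable";
		FILE *f = fopen("/proc/prefetch/control", "w");

		if (!f)
			return -1;
		/* prefetch_proc_write() also strips one trailing newline,
		 * so echo(1) works the same way */
		if (fwrite(cmd, strlen(cmd), 1, f) != 1) {
			fclose(f);
			return -1;
		}
		return fclose(f);
	}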
+
+static int prefetch_proc_open(struct inode *inode, struct file *proc_file)
+{
+	return 0;
+}
+
+static int prefetch_proc_release(struct inode *inode, struct file *proc_file)
+{
+	return 0;
+}
+
+static struct file_operations proc_prefetch_fops = {
+	.owner = THIS_MODULE,
+	.open = prefetch_proc_open,
+	.release = prefetch_proc_release,
+	.write = prefetch_proc_write
+};
+
+struct proc_dir_entry *prefetch_proc_dir = NULL;
+EXPORT_SYMBOL(prefetch_proc_dir);
+
+static __init int prefetch_core_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	mutex_lock(&prefetch_trace.prefetch_trace_mutex);
+	clear_trace();
+	mutex_unlock(&prefetch_trace.prefetch_trace_mutex);
+
+	prefetch_proc_dir = proc_mkdir("prefetch", NULL);
+
+	if (prefetch_proc_dir == NULL) {
+		printk(KERN_WARNING
+		       "Creating prefetch proc directory failed, proc interface will not be available\n");
+	} else {
+		entry = create_proc_entry("control", 0600, prefetch_proc_dir);
+		if (entry)
+			entry->proc_fops = &proc_prefetch_fops;
+	}
+
+	printk(KERN_INFO "Prefetching core module started, enabled=%d\n",
+	       enabled);
+
+	return 0;
+}
+
+static void __exit prefetch_core_exit(void)
+{
+	/*proc entries exist only if proc_mkdir() succeeded in init */
+	if (prefetch_proc_dir) {
+		remove_proc_entry("control", prefetch_proc_dir);
+		remove_proc_entry("prefetch", NULL);	/*remove directory */
+	}
+}
+
+MODULE_AUTHOR("Krzysztof Lichota ");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION
+    ("Prefetching core - functions used for tracing and prefetching by prefetching modules");
+
+module_init(prefetch_core_init);
+module_exit(prefetch_core_exit);