# name : innodb_io_patches.patch # introduced : 11 or before # maintainer : Yasufumi # #!!! notice !!! # Any small change to this file in the main branch # should be done or reviewed by the maintainer! diff -ruN a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c --- a/storage/innobase/buf/buf0buf.c 2010-12-03 15:09:51.273986410 +0900 +++ b/storage/innobase/buf/buf0buf.c 2010-12-03 15:10:08.934990091 +0900 @@ -320,6 +320,7 @@ /* When we traverse all the flush lists we don't want another thread to add a dirty page to any flush list. */ + if (srv_buf_pool_instances > 1) log_flush_order_mutex_enter(); for (i = 0; i < srv_buf_pool_instances; i++) { @@ -343,6 +344,7 @@ } } + if (srv_buf_pool_instances > 1) log_flush_order_mutex_exit(); /* The returned answer may be out of date: the flush_list can diff -ruN a/storage/innobase/buf/buf0flu.c b/storage/innobase/buf/buf0flu.c --- a/storage/innobase/buf/buf0flu.c 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/buf/buf0flu.c 2010-12-03 15:10:08.934990091 +0900 @@ -1376,7 +1376,7 @@ ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); - if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) { + if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN || !srv_flush_neighbor_pages) { /* If there is little space, it is better not to flush any block except from the end of the LRU list */ diff -ruN a/storage/innobase/buf/buf0rea.c b/storage/innobase/buf/buf0rea.c --- a/storage/innobase/buf/buf0rea.c 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/buf/buf0rea.c 2010-12-03 15:10:08.937050537 +0900 @@ -260,6 +260,10 @@ = BUF_READ_AHEAD_LINEAR_AREA(buf_pool); ulint threshold; + if (!(srv_read_ahead & 2)) { + return(0); + } + if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) { /* No read-ahead to avoid thread deadlocks */ return(0); diff -ruN a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc --- a/storage/innobase/handler/ha_innodb.cc 
2010-12-03 15:09:51.283956391 +0900 +++ b/storage/innobase/handler/ha_innodb.cc 2010-12-03 15:10:08.963980444 +0900 @@ -426,6 +426,12 @@ "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.", NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0); +static MYSQL_THDVAR_ULONG(flush_log_at_trx_commit, PLUGIN_VAR_OPCMDARG, + "Set to 0 (write and flush once per second)," + " 1 (write and flush at each commit)" + " or 2 (write at commit, flush once per second).", + NULL, NULL, 1, 0, 2, 0); + static handler *innobase_create_handler(handlerton *hton, TABLE_SHARE *table, @@ -820,6 +826,17 @@ } } +/******************************************************************//** +*/ +extern "C" UNIV_INTERN +ulong +thd_flush_log_at_trx_commit( +/*================================*/ + void* thd) +{ + return(THDVAR((THD*) thd, flush_log_at_trx_commit)); +} + /********************************************************************//** Obtain the InnoDB transaction of a MySQL thread. @return reference to transaction pointer */ @@ -2391,6 +2408,9 @@ srv_n_read_io_threads = (ulint) innobase_read_io_threads; srv_n_write_io_threads = (ulint) innobase_write_io_threads; + srv_read_ahead &= 3; + srv_adaptive_flushing_method %= 3; + srv_force_recovery = (ulint) innobase_force_recovery; srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; @@ -10919,9 +10939,9 @@ static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, - "Purge threads can be either 0 or 1. Default is 0.", + "Purge threads can be either 0 or 1. 
Default is 1.", NULL, NULL, - 0, /* Default setting */ + 1, /* Default setting */ 0, /* Minimum value */ 1, 0); /* Maximum value */ @@ -10963,12 +10983,18 @@ innodb_file_format_max_validate, innodb_file_format_max_update, "Antelope"); -static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit, - PLUGIN_VAR_OPCMDARG, - "Set to 0 (write and flush once per second)," - " 1 (write and flush at each commit)" - " or 2 (write at commit, flush once per second).", - NULL, NULL, 1, 0, 2, 0); +/* Changed to the THDVAR */ +//static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit, +// PLUGIN_VAR_OPCMDARG, +// "Set to 0 (write and flush once per second)," +// " 1 (write and flush at each commit)" +// " or 2 (write at commit, flush once per second).", +// NULL, NULL, 1, 0, 2, 0); + +static MYSQL_SYSVAR_BOOL(use_global_flush_log_at_trx_commit, srv_use_global_flush_log_at_trx_commit, + PLUGIN_VAR_NOCMDARG, + "Use global innodb_flush_log_at_trx_commit value. (default: ON).", + NULL, NULL, TRUE); static MYSQL_SYSVAR_STR(flush_method, innobase_file_flush_method, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -11063,7 +11089,7 @@ static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.", - NULL, NULL, 128*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L); + NULL, NULL, 128*1024*1024L, 32*1024*1024L, LONGLONG_MAX, 1024*1024L); static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -11208,6 +11234,95 @@ "trigger a readahead.", NULL, NULL, 56, 0, 64, 0); +static MYSQL_SYSVAR_LONGLONG(ibuf_max_size, srv_ibuf_max_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "The maximum size of the insert buffer. 
(in bytes)", + NULL, NULL, LONGLONG_MAX, 0, LONGLONG_MAX, 0); + +static MYSQL_SYSVAR_ULONG(ibuf_active_contract, srv_ibuf_active_contract, + PLUGIN_VAR_RQCMDARG, + "Enable/Disable active_contract of insert buffer. 0:disable 1:enable", + NULL, NULL, 1, 0, 1, 0); + +static MYSQL_SYSVAR_ULONG(ibuf_accel_rate, srv_ibuf_accel_rate, + PLUGIN_VAR_RQCMDARG, + "Tunes amount of insert buffer processing of background, in addition to innodb_io_capacity. (in percentage)", + NULL, NULL, 100, 100, 999999999, 0); + +static MYSQL_SYSVAR_ULONG(checkpoint_age_target, srv_checkpoint_age_target, + PLUGIN_VAR_RQCMDARG, + "Control soft limit of checkpoint age. (0 : not control)", + NULL, NULL, 0, 0, ~0UL, 0); + +static MYSQL_SYSVAR_ULONG(flush_neighbor_pages, srv_flush_neighbor_pages, + PLUGIN_VAR_RQCMDARG, + "Enable/Disable flushing also neighbor pages. 0:disable 1:enable", + NULL, NULL, 1, 0, 1, 0); + +static +void +innodb_read_ahead_update( + THD* thd, + struct st_mysql_sys_var* var, + void* var_ptr, + const void* save) +{ + *(long *)var_ptr= (*(long *)save) & 3; +} +const char *read_ahead_names[]= +{ + "none", /* 0 */ + "random", + "linear", + "both", /* 3 */ + /* For compatibility of the older patch */ + "0", /* 4 ("none" + 4) */ + "1", + "2", + "3", /* 7 ("both" + 4) */ + NullS +}; +TYPELIB read_ahead_typelib= +{ + array_elements(read_ahead_names) - 1, "read_ahead_typelib", + read_ahead_names, NULL +}; +static MYSQL_SYSVAR_ENUM(read_ahead, srv_read_ahead, + PLUGIN_VAR_RQCMDARG, + "Control read ahead activity (none, random, [linear], both). 
[from 1.0.5: random read ahead is ignored]", + NULL, innodb_read_ahead_update, 2, &read_ahead_typelib); + +static +void +innodb_adaptive_flushing_method_update( + THD* thd, + struct st_mysql_sys_var* var, + void* var_ptr, + const void* save) +{ + *(long *)var_ptr= (*(long *)save) % 3; /* 3 canonical methods; legacy aliases "0".."2" (indexes 3..5) fold onto 0..2, matching the "%= 3" done at startup */ +} +const char *adaptive_flushing_method_names[]= +{ + "native", /* 0 */ + "estimate", /* 1 */ + "keep_average", /* 2 */ + /* For compatibility of the older patch */ + "0", /* 3 ("native" + 3) */ + "1", /* 4 ("estimate" + 3) */ + "2", /* 5 ("keep_average" + 3) */ + NullS +}; +TYPELIB adaptive_flushing_method_typelib= +{ + array_elements(adaptive_flushing_method_names) - 1, "adaptive_flushing_method_typelib", + adaptive_flushing_method_names, NULL +}; +static MYSQL_SYSVAR_ENUM(adaptive_flushing_method, srv_adaptive_flushing_method, + PLUGIN_VAR_RQCMDARG, + "Choose method of innodb_adaptive_flushing. (native, [estimate], keep_average)", + NULL, innodb_adaptive_flushing_method_update, 1, &adaptive_flushing_method_typelib); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(autoextend_increment), @@ -11228,6 +11343,7 @@ MYSQL_SYSVAR(file_format_check), MYSQL_SYSVAR(file_format_max), MYSQL_SYSVAR(flush_log_at_trx_commit), + MYSQL_SYSVAR(use_global_flush_log_at_trx_commit), MYSQL_SYSVAR(flush_method), MYSQL_SYSVAR(force_recovery), MYSQL_SYSVAR(locks_unsafe_for_binlog), @@ -11264,6 +11380,13 @@ MYSQL_SYSVAR(show_verbose_locks), MYSQL_SYSVAR(show_locks_held), MYSQL_SYSVAR(version), + MYSQL_SYSVAR(ibuf_max_size), + MYSQL_SYSVAR(ibuf_active_contract), + MYSQL_SYSVAR(ibuf_accel_rate), + MYSQL_SYSVAR(checkpoint_age_target), + MYSQL_SYSVAR(flush_neighbor_pages), + MYSQL_SYSVAR(read_ahead), + MYSQL_SYSVAR(adaptive_flushing_method), MYSQL_SYSVAR(use_sys_malloc), MYSQL_SYSVAR(use_native_aio), MYSQL_SYSVAR(change_buffering), diff -ruN a/storage/innobase/ibuf/ibuf0ibuf.c b/storage/innobase/ibuf/ibuf0ibuf.c --- 
a/storage/innobase/ibuf/ibuf0ibuf.c 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/ibuf/ibuf0ibuf.c 2010-12-03 15:10:09.073984282 +0900 @@ -524,8 +524,10 @@ grow in size, as the references on the upper levels of the tree can change */ - ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE - / IBUF_POOL_SIZE_PER_MAX_SIZE; + ibuf->max_size = ut_min( buf_pool_get_curr_size() / UNIV_PAGE_SIZE + / IBUF_POOL_SIZE_PER_MAX_SIZE, (ulint) srv_ibuf_max_size / UNIV_PAGE_SIZE); + + srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE; mutex_create(ibuf_pessimistic_insert_mutex_key, &ibuf_pessimistic_insert_mutex, @@ -2729,9 +2731,11 @@ size = ibuf->size; max_size = ibuf->max_size; + if (!srv_ibuf_active_contract) { if (size < max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) { return; } + } sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC); diff -ruN a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h --- a/storage/innobase/include/buf0rea.h 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/include/buf0rea.h 2010-12-03 15:10:09.076066335 +0900 @@ -124,8 +124,7 @@ /** The size in pages of the area which the read-ahead algorithms read if invoked */ -#define BUF_READ_AHEAD_AREA(b) \ - ut_min(64, ut_2_power_up((b)->curr_size / 32)) +#define BUF_READ_AHEAD_AREA(b) 64 /** @name Modes used in read-ahead @{ */ /** read only pages belonging to the insert buffer tree */ diff -ruN a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h --- a/storage/innobase/include/ha_prototypes.h 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/include/ha_prototypes.h 2010-12-03 15:10:09.078026360 +0900 @@ -275,5 +275,12 @@ /*===================*/ void* thd, /*!< in: thread handle (THD*) */ ulint value); /*!< in: time waited for the lock */ +/******************************************************************//** +*/ + +ulong +thd_flush_log_at_trx_commit( +/*================================*/ + void* 
thd); #endif diff -ruN a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h --- a/storage/innobase/include/srv0srv.h 2010-12-03 15:09:51.291955835 +0900 +++ b/storage/innobase/include/srv0srv.h 2010-12-03 15:10:09.079029047 +0900 @@ -141,7 +141,8 @@ extern ulint srv_n_log_files; extern ulint srv_log_file_size; extern ulint srv_log_buffer_size; -extern ulong srv_flush_log_at_trx_commit; +//extern ulong srv_flush_log_at_trx_commit; +extern char srv_use_global_flush_log_at_trx_commit; extern char srv_adaptive_flushing; @@ -214,6 +215,16 @@ extern ulong srv_max_purge_lag; extern ulong srv_replication_delay; + +extern long long srv_ibuf_max_size; +extern ulint srv_ibuf_active_contract; +extern ulint srv_ibuf_accel_rate; +extern ulint srv_checkpoint_age_target; +extern ulint srv_flush_neighbor_pages; +extern ulint srv_enable_unsafe_group_commit; +extern ulint srv_read_ahead; +extern ulint srv_adaptive_flushing_method; + /*-------------------------------------------*/ extern ulint srv_n_rows_inserted; @@ -389,8 +400,9 @@ when writing data files, but do flush after writing to log files */ SRV_UNIX_NOSYNC, /*!< do not flush after writing */ - SRV_UNIX_O_DIRECT /*!< invoke os_file_set_nocache() on + SRV_UNIX_O_DIRECT, /*!< invoke os_file_set_nocache() on data files */ + SRV_UNIX_ALL_O_DIRECT /* new method for examination: logfile also open O_DIRECT */ }; /** Alternatives for file i/o in Windows */ diff -ruN a/storage/innobase/log/log0log.c b/storage/innobase/log/log0log.c --- a/storage/innobase/log/log0log.c 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/log/log0log.c 2010-12-03 15:10:09.084023562 +0900 @@ -48,6 +48,7 @@ #include "srv0start.h" #include "trx0sys.h" #include "trx0trx.h" +#include "ha_prototypes.h" /* General philosophy of InnoDB redo-logs: @@ -359,6 +360,33 @@ } /************************************************************//** +*/ +UNIV_INLINE +ulint +log_max_modified_age_async() +{ + if (srv_checkpoint_age_target) { + 
return(ut_min(log_sys->max_modified_age_async, + srv_checkpoint_age_target + - srv_checkpoint_age_target / 8)); + } else { + return(log_sys->max_modified_age_async); + } +} + +UNIV_INLINE +ulint +log_max_checkpoint_age_async() +{ + if (srv_checkpoint_age_target) { + return(ut_min(log_sys->max_checkpoint_age_async, + srv_checkpoint_age_target)); + } else { + return(log_sys->max_checkpoint_age_async); + } +} + +/************************************************************//** Closes the log. @return lsn */ UNIV_INTERN @@ -427,7 +455,7 @@ } } - if (checkpoint_age <= log->max_modified_age_async) { + if (checkpoint_age <= log_max_modified_age_async()) { goto function_exit; } @@ -435,8 +463,8 @@ oldest_lsn = buf_pool_get_oldest_modification(); if (!oldest_lsn - || lsn - oldest_lsn > log->max_modified_age_async - || checkpoint_age > log->max_checkpoint_age_async) { + || lsn - oldest_lsn > log_max_modified_age_async() + || checkpoint_age > log_max_checkpoint_age_async()) { log->check_flush_or_checkpoint = TRUE; } @@ -1100,6 +1128,7 @@ group = (log_group_t*)((ulint)group - 1); if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC + && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { fil_flush(group->space_id); @@ -1121,8 +1150,9 @@ logs and cannot end up here! 
*/ if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC + && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT && srv_unix_file_flush_method != SRV_UNIX_NOSYNC - && srv_flush_log_at_trx_commit != 2) { + && thd_flush_log_at_trx_commit(NULL) != 2) { fil_flush(group->space_id); } @@ -1501,7 +1531,8 @@ mutex_exit(&(log_sys->mutex)); - if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC + || srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) { /* O_DSYNC means the OS did not buffer the log file at all: so we have also flushed to disk what we have written */ @@ -2120,10 +2151,10 @@ sync = TRUE; advance = 2 * (age - log->max_modified_age_sync); - } else if (age > log->max_modified_age_async) { + } else if (age > log_max_modified_age_async()) { /* A flush is not urgent: we do an asynchronous preflush */ - advance = age - log->max_modified_age_async; + advance = age - log_max_modified_age_async(); } else { advance = 0; } @@ -2137,7 +2168,7 @@ do_checkpoint = TRUE; - } else if (checkpoint_age > log->max_checkpoint_age_async) { + } else if (checkpoint_age > log_max_checkpoint_age_async()) { /* A checkpoint is not urgent: do it asynchronously */ do_checkpoint = TRUE; @@ -3349,6 +3380,17 @@ log_sys->flushed_to_disk_lsn, log_sys->last_checkpoint_lsn); + fprintf(file, + "Max checkpoint age %lu\n" + "Checkpoint age target %lu\n" + "Modified age %lu\n" + "Checkpoint age %lu\n", + (ulong) log_sys->max_checkpoint_age, + (ulong) log_max_checkpoint_age_async(), + (ulong) (log_sys->lsn - + log_buf_pool_get_oldest_modification()), + (ulong) (log_sys->lsn - log_sys->last_checkpoint_lsn)); + current_time = time(NULL); time_elapsed = 0.001 + difftime(current_time, diff -ruN a/storage/innobase/log/log0recv.c b/storage/innobase/log/log0recv.c --- a/storage/innobase/log/log0recv.c 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/log/log0recv.c 2010-12-03 15:10:09.089024191 +0900 @@ -2906,9 +2906,12 @@ ib_uint64_t archived_lsn; 
#endif /* UNIV_LOG_ARCHIVE */ byte* buf; - byte log_hdr_buf[LOG_FILE_HDR_SIZE]; + byte* log_hdr_buf; + byte log_hdr_buf_base[LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE]; ulint err; + log_hdr_buf = ut_align(log_hdr_buf_base, OS_FILE_LOG_BLOCK_SIZE); + #ifdef UNIV_LOG_ARCHIVE ut_ad(type != LOG_CHECKPOINT || limit_lsn == IB_ULONGLONG_MAX); /** TRUE when recovering from a checkpoint */ diff -ruN a/storage/innobase/os/os0file.c b/storage/innobase/os/os0file.c --- a/storage/innobase/os/os0file.c 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/os/os0file.c 2010-12-03 15:10:09.093023540 +0900 @@ -1424,7 +1424,7 @@ #endif #ifdef UNIV_NON_BUFFERED_IO # ifndef UNIV_HOTBACKUP - if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { + if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) { /* Do not use unbuffered i/o to log files because value 2 denotes that we do not flush the log at every commit, but only once per second */ @@ -1440,7 +1440,7 @@ attributes = 0; #ifdef UNIV_NON_BUFFERED_IO # ifndef UNIV_HOTBACKUP - if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { + if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) { /* Do not use unbuffered i/o to log files because value 2 denotes that we do not flush the log at every commit, but only once per second */ @@ -1585,6 +1585,11 @@ os_file_set_nocache(file, name, mode_str); } + /* ALL_O_DIRECT: O_DIRECT also for transaction log file */ + if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) { + os_file_set_nocache(file, name, mode_str); + } + #ifdef USE_FILE_LOCK if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) { diff -ruN a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c --- a/storage/innobase/srv/srv0srv.c 2010-12-03 15:09:51.301987792 +0900 +++ b/storage/innobase/srv/srv0srv.c 2010-12-03 15:13:29.369986988 +0900 @@ -190,7 +190,8 @@ UNIV_INTERN ulint srv_log_file_size = ULINT_MAX; /* size in database pages */ UNIV_INTERN ulint 
srv_log_buffer_size = ULINT_MAX; -UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1; +//UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1; +UNIV_INTERN char srv_use_global_flush_log_at_trx_commit = TRUE; /* Try to flush dirty pages so as to avoid IO bursts at the checkpoints. */ @@ -401,6 +402,17 @@ UNIV_INTERN ulong srv_replication_delay = 0; +UNIV_INTERN long long srv_ibuf_max_size = 0; +UNIV_INTERN ulint srv_ibuf_active_contract = 0; /* 0:disable 1:enable */ +UNIV_INTERN ulint srv_ibuf_accel_rate = 100; +#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) pct / 10000.0))) + +UNIV_INTERN ulint srv_checkpoint_age_target = 0; +UNIV_INTERN ulint srv_flush_neighbor_pages = 1; /* 0:disable 1:enable */ + +UNIV_INTERN ulint srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */ +UNIV_INTERN ulint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */ +UNIV_INTERN ulint srv_adaptive_flushing_method = 0; /* 0: native 1: estimate 2: keep_average */ /*-------------------------------------------*/ UNIV_INTERN ulong srv_n_spin_wait_rounds = 30; UNIV_INTERN ulong srv_n_free_tickets_to_enter = 500; @@ -2737,6 +2749,7 @@ ulint n_pages_purged = 0; ulint n_bytes_merged; ulint n_pages_flushed; + ulint n_pages_flushed_prev = 0; ulint n_bytes_archived; ulint n_tables_to_drop; ulint n_ios; @@ -2744,7 +2757,20 @@ ulint n_ios_very_old; ulint n_pend_ios; ulint next_itr_time; + ulint prev_adaptive_flushing_method = ULINT_UNDEFINED; + ulint inner_loop = 0; + ibool skip_sleep = FALSE; ulint i; + struct t_prev_flush_info_struct { + ulint count; + unsigned space:32; + unsigned offset:32; + ib_uint64_t oldest_modification; + } prev_flush_info[MAX_BUFFER_POOLS]; + + ib_uint64_t lsn_old; + + ib_uint64_t oldest_lsn; #ifdef UNIV_DEBUG_THREAD_CREATION fprintf(stderr, "Master thread starts, id %lu\n", @@ -2766,6 +2792,9 @@ mutex_exit(&kernel_mutex); + mutex_enter(&(log_sys->mutex)); + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); loop: 
/*****************************************************************/ /* ---- When there is database activity by users, we cycle in this @@ -2796,9 +2825,13 @@ /* Sleep for 1 second on entrying the for loop below the first time. */ next_itr_time = ut_time_ms() + 1000; + skip_sleep = FALSE; + for (i = 0; i < 10; i++) { ulint cur_time = ut_time_ms(); + n_pages_flushed = 0; /* initialize */ + /* ALTER TABLE in MySQL requires on Unix that the table handler can drop tables lazily after there no longer are SELECT queries to them. */ @@ -2822,6 +2855,7 @@ srv_main_thread_op_info = "sleeping"; srv_main_1_second_loops++; + if (!skip_sleep) { if (next_itr_time > cur_time && srv_shutdown_state == SRV_SHUTDOWN_NONE) { @@ -2832,10 +2866,26 @@ (next_itr_time - cur_time) * 1000)); srv_main_sleeps++; + + /* + mutex_enter(&(log_sys->mutex)); + oldest_lsn = buf_pool_get_oldest_modification(); + ib_uint64_t lsn = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + + if(oldest_lsn) + fprintf(stderr, + "InnoDB flush: age pct: %lu, lsn progress: %lu\n", + (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age, + lsn - lsn_old); + */ } /* Each iteration should happen at 1 second interval. 
*/ next_itr_time = ut_time_ms() + 1000; + } /* if (!skip_sleep) */ + + skip_sleep = FALSE; /* Flush logs if needed */ srv_sync_log_buffer_in_background(); @@ -2855,7 +2905,7 @@ if (n_pend_ios < SRV_PEND_IO_THRESHOLD && (n_ios - n_ios_old < SRV_RECENT_IO_ACTIVITY)) { srv_main_thread_op_info = "doing insert buffer merge"; - ibuf_contract_for_n_pages(FALSE, PCT_IO(5)); + ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5)); /* Flush logs if needed */ srv_sync_log_buffer_in_background(); @@ -2872,7 +2922,11 @@ n_pages_flushed = buf_flush_list( PCT_IO(100), IB_ULONGLONG_MAX); - } else if (srv_adaptive_flushing) { + mutex_enter(&(log_sys->mutex)); + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + prev_adaptive_flushing_method = ULINT_UNDEFINED; + } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 0) { /* Try to keep the rate of flushing of dirty pages such that redo log generation does not @@ -2888,6 +2942,224 @@ n_flush, IB_ULONGLONG_MAX); } + + mutex_enter(&(log_sys->mutex)); + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + prev_adaptive_flushing_method = ULINT_UNDEFINED; + } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 1) { + + /* Try to keep modified age not to exceed + max_checkpoint_age * 7/8 line */ + + mutex_enter(&(log_sys->mutex)); + + oldest_lsn = buf_pool_get_oldest_modification(); + if (oldest_lsn == 0) { + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + + } else { + if ((log_sys->lsn - oldest_lsn) + > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) { + /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */ + /* We should not flush from here. 
*/ + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + } else if ((log_sys->lsn - oldest_lsn) + > (log_sys->max_checkpoint_age)/4 ) { + + /* defence line (max_checkpoint_age * 1/4) */ + ib_uint64_t lsn = log_sys->lsn; + + ib_uint64_t level, bpl; + buf_page_t* bpage; + ulint j; + + mutex_exit(&(log_sys->mutex)); + + bpl = 0; + + for (j = 0; j < srv_buf_pool_instances; j++) { + buf_pool_t* buf_pool; + ulint n_blocks; + + buf_pool = buf_pool_from_array(j); + + /* The scanning flush_list is optimistic here */ + + level = 0; + n_blocks = 0; + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + + while (bpage != NULL) { + ib_uint64_t oldest_modification = bpage->oldest_modification; + if (oldest_modification != 0) { + level += log_sys->max_checkpoint_age + - (lsn - oldest_modification); + } + bpage = UT_LIST_GET_NEXT(list, bpage); + n_blocks++; + } + + if (level) { + bpl += ((ib_uint64_t) n_blocks * n_blocks + * (lsn - lsn_old)) / level; + } + + } + + if (!srv_use_doublewrite_buf) { + /* flush is faster than when doublewrite */ + bpl = (bpl * 7) / 8; + } + + if (bpl) { +retry_flush_batch: + n_pages_flushed = buf_flush_list(bpl, + oldest_lsn + (lsn - lsn_old)); + if (n_pages_flushed == ULINT_UNDEFINED) { + os_thread_sleep(5000); + goto retry_flush_batch; + } + } + + lsn_old = lsn; + /* + fprintf(stderr, + "InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n", + (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age, + lsn - lsn_old, bpl); + */ + } else { + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + } + } + prev_adaptive_flushing_method = 1; + } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 2) { + buf_pool_t* buf_pool; + buf_page_t* bpage; + ib_uint64_t lsn; + ulint j; + + mutex_enter(&(log_sys->mutex)); + oldest_lsn = buf_pool_get_oldest_modification(); + lsn = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + + /* upper loop/sec. 
(x10) */ + next_itr_time -= 900; /* 1000 - 900 == 100 */ + inner_loop++; + if (inner_loop < 10) { + i--; + } else { + inner_loop = 0; + } + + if (prev_adaptive_flushing_method == 2) { + lint n_flush; + lint blocks_sum; + ulint new_blocks_sum, flushed_blocks_sum; + + blocks_sum = new_blocks_sum = flushed_blocks_sum = 0; + + /* prev_flush_info[j] should be the previous loop's */ + for (j = 0; j < srv_buf_pool_instances; j++) { + lint blocks_num, new_blocks_num, flushed_blocks_num; + ibool found; + + buf_pool = buf_pool_from_array(j); + + blocks_num = UT_LIST_GET_LEN(buf_pool->flush_list); + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + new_blocks_num = 0; + + found = FALSE; + while (bpage != NULL) { + if (prev_flush_info[j].space == bpage->space + && prev_flush_info[j].offset == bpage->offset + && prev_flush_info[j].oldest_modification + == bpage->oldest_modification) { + found = TRUE; + break; + } + bpage = UT_LIST_GET_NEXT(list, bpage); + new_blocks_num++; + } + if (!found) { + new_blocks_num = blocks_num; + } + + flushed_blocks_num = new_blocks_num + prev_flush_info[j].count + - blocks_num; + if (flushed_blocks_num < 0) { + flushed_blocks_num = 0; + } + + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + + prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list); + if (bpage) { + prev_flush_info[j].space = bpage->space; + prev_flush_info[j].offset = bpage->offset; + prev_flush_info[j].oldest_modification = bpage->oldest_modification; + } else { + prev_flush_info[j].space = 0; + prev_flush_info[j].offset = 0; + prev_flush_info[j].oldest_modification = 0; + } + + new_blocks_sum += new_blocks_num; + flushed_blocks_sum += flushed_blocks_num; + blocks_sum += blocks_num; + } + + n_flush = blocks_sum * (lsn - lsn_old) / log_sys->max_modified_age_async; + if (flushed_blocks_sum > n_pages_flushed_prev) { + n_flush -= (flushed_blocks_sum - n_pages_flushed_prev); + } + + if (n_flush > 0) { + n_flush++; + n_pages_flushed = buf_flush_list(n_flush, oldest_lsn + 
(lsn - lsn_old)); + } else { + n_pages_flushed = 0; + } + } else { + /* store previous first pages of the flush_list */ + for (j = 0; j < srv_buf_pool_instances; j++) { + buf_pool = buf_pool_from_array(j); + + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + + prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list); + if (bpage) { + prev_flush_info[j].space = bpage->space; + prev_flush_info[j].offset = bpage->offset; + prev_flush_info[j].oldest_modification = bpage->oldest_modification; + } else { + prev_flush_info[j].space = 0; + prev_flush_info[j].offset = 0; + prev_flush_info[j].oldest_modification = 0; + } + } + n_pages_flushed = 0; + } + + lsn_old = lsn; + prev_adaptive_flushing_method = 2; + } else { + mutex_enter(&(log_sys->mutex)); + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + prev_adaptive_flushing_method = ULINT_UNDEFINED; + } + + if (n_pages_flushed == ULINT_UNDEFINED) { + n_pages_flushed_prev = 0; + } else { + n_pages_flushed_prev = n_pages_flushed; } if (srv_activity_count == old_activity_count) { @@ -2936,7 +3208,7 @@ even if the server were active */ srv_main_thread_op_info = "doing insert buffer merge"; - ibuf_contract_for_n_pages(FALSE, PCT_IO(5)); + ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5)); /* Flush logs if needed */ srv_sync_log_buffer_in_background(); @@ -3044,7 +3316,7 @@ buf_flush_list below. Otherwise, the system favors clean pages over cleanup throughput. 
*/ n_bytes_merged = ibuf_contract_for_n_pages(FALSE, - PCT_IO(100)); + PCT_IBUF_IO(100)); } srv_main_thread_op_info = "reserving kernel mutex"; @@ -3190,6 +3462,7 @@ srv_slot_t* slot; ulint slot_no = ULINT_UNDEFINED; ulint n_total_purged = ULINT_UNDEFINED; + ulint next_itr_time; ut_a(srv_n_purge_threads == 1); @@ -3212,9 +3485,12 @@ mutex_exit(&kernel_mutex); + next_itr_time = ut_time_ms(); + while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { ulint n_pages_purged; + ulint cur_time; /* If there are very few records to purge or the last purge didn't purge any records then wait for activity. @@ -3255,6 +3531,16 @@ } while (n_pages_purged > 0 && !srv_fast_shutdown); srv_sync_log_buffer_in_background(); + + cur_time = ut_time_ms(); + if (next_itr_time > cur_time) { + os_thread_sleep(ut_min(1000000, + (next_itr_time - cur_time) + * 1000)); + next_itr_time = ut_time_ms() + 1000; + } else { + next_itr_time = cur_time + 1000; + } } mutex_enter(&kernel_mutex); diff -ruN a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.c --- a/storage/innobase/srv/srv0start.c 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/srv/srv0start.c 2010-12-03 15:10:09.103023543 +0900 @@ -1212,6 +1212,9 @@ } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) { srv_unix_file_flush_method = SRV_UNIX_O_DIRECT; + } else if (0 == ut_strcmp(srv_file_flush_method_str, "ALL_O_DIRECT")) { + srv_unix_file_flush_method = SRV_UNIX_ALL_O_DIRECT; + } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) { srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC; diff -ruN a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c --- a/storage/innobase/trx/trx0trx.c 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/trx/trx0trx.c 2010-12-03 15:10:09.106023937 +0900 @@ -865,6 +865,7 @@ trx->read_view = NULL; if (lsn) { + ulint flush_log_at_trx_commit; mutex_exit(&kernel_mutex); @@ -873,6 +874,12 @@ trx_undo_insert_cleanup(trx); } + if 
(srv_use_global_flush_log_at_trx_commit) { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); + } else { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); + } + /* NOTE that we could possibly make a group commit more efficient here: call os_thread_yield here to allow also other trxs to come to commit! */ @@ -904,9 +911,9 @@ if (trx->flush_log_later) { /* Do nothing yet */ trx->must_flush_log_later = TRUE; - } else if (srv_flush_log_at_trx_commit == 0) { + } else if (flush_log_at_trx_commit == 0) { /* Do nothing */ - } else if (srv_flush_log_at_trx_commit == 1) { + } else if (flush_log_at_trx_commit == 1) { if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { /* Write the log but do not flush it to disk */ @@ -918,7 +925,7 @@ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); } - } else if (srv_flush_log_at_trx_commit == 2) { + } else if (flush_log_at_trx_commit == 2) { /* Write the log but do not flush it to disk */ @@ -1582,16 +1589,23 @@ trx_t* trx) /*!< in: trx handle */ { ib_uint64_t lsn = trx->commit_lsn; + ulint flush_log_at_trx_commit; ut_a(trx); trx->op_info = "flushing log"; + if (srv_use_global_flush_log_at_trx_commit) { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); + } else { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); + } + if (!trx->must_flush_log_later) { /* Do nothing */ - } else if (srv_flush_log_at_trx_commit == 0) { + } else if (flush_log_at_trx_commit == 0) { /* Do nothing */ - } else if (srv_flush_log_at_trx_commit == 1) { + } else if (flush_log_at_trx_commit == 1) { if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { /* Write the log but do not flush it to disk */ @@ -1602,7 +1616,7 @@ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); } - } else if (srv_flush_log_at_trx_commit == 2) { + } else if (flush_log_at_trx_commit == 2) { /* Write the log but do not flush it to disk */ @@ -1855,6 +1869,8 @@ /*--------------------------------------*/ if (lsn) { + ulint 
flush_log_at_trx_commit; + /* Depending on the my.cnf options, we may now write the log buffer to the log files, making the prepared state of the transaction durable if the OS does not crash. We may also @@ -1874,9 +1890,15 @@ mutex_exit(&kernel_mutex); - if (srv_flush_log_at_trx_commit == 0) { + if (srv_use_global_flush_log_at_trx_commit) { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); + } else { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); + } + + if (flush_log_at_trx_commit == 0) { /* Do nothing */ - } else if (srv_flush_log_at_trx_commit == 1) { + } else if (flush_log_at_trx_commit == 1) { if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { /* Write the log but do not flush it to disk */ @@ -1888,7 +1910,7 @@ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); } - } else if (srv_flush_log_at_trx_commit == 2) { + } else if (flush_log_at_trx_commit == 2) { /* Write the log but do not flush it to disk */