# name : innodb_io_patches.patch # introduced : 11 or before # maintainer : Yasufumi # #!!! notice !!! # Any small change to this file in the main branch # should be done or reviewed by the maintainer! --- a/storage/innobase/buf/buf0buf.c +++ b/storage/innobase/buf/buf0buf.c @@ -320,6 +320,7 @@ /* When we traverse all the flush lists we don't want another thread to add a dirty page to any flush list. */ + if (srv_buf_pool_instances > 1) log_flush_order_mutex_enter(); for (i = 0; i < srv_buf_pool_instances; i++) { @@ -343,6 +344,7 @@ } } + if (srv_buf_pool_instances > 1) log_flush_order_mutex_exit(); /* The returned answer may be out of date: the flush_list can --- a/storage/innobase/buf/buf0flu.c +++ b/storage/innobase/buf/buf0flu.c @@ -857,7 +857,7 @@ flush: /* Now flush the doublewrite buffer data to disk */ - fil_flush(TRX_SYS_SPACE); + fil_flush(TRX_SYS_SPACE, FALSE); /* We know that the writes have been flushed to disk now and in recovery we will find them in the doublewrite buffer @@ -1375,10 +1375,11 @@ ulint high; ulint count = 0; buf_pool_t* buf_pool = buf_pool_get(space, offset); + ibool is_forward_scan; ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); - if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) { + if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN || !srv_flush_neighbor_pages) { /* If there is little space, it is better not to flush any block except from the end of the LRU list */ @@ -1405,7 +1406,32 @@ high = fil_space_get_size(space); } - for (i = low; i < high; i++) { + if (srv_flush_neighbor_pages == 2) { + + /* In the case of contiguous flush where the requested page + does not fall at the start of flush area, first scan backward + from the page and later forward from it. 
*/ + is_forward_scan = (offset == low); + } + else { + is_forward_scan = TRUE; + } + +scan: + if (srv_flush_neighbor_pages == 2) { + if (is_forward_scan) { + i = offset; + } + else { + i = offset - 1; + } + } + else { + i = low; + } + + for (; is_forward_scan ? (i < high) : (i >= low); + is_forward_scan ? i++ : i--) { buf_page_t* bpage; @@ -1434,6 +1460,12 @@ if (!bpage) { buf_pool_mutex_exit(buf_pool); + if (srv_flush_neighbor_pages == 2) { + + /* This is contiguous neighbor page flush and + the pages here are not contiguous. */ + break; + } continue; } @@ -1470,6 +1502,22 @@ } } buf_pool_mutex_exit(buf_pool); + + if (srv_flush_neighbor_pages == 2) { + + /* We are trying to do the contiguous neighbor page + flush, but the last page we checked was unflushable, + making a "hole" in the flush, so stop this attempt. */ + break; + } + } + + if (!is_forward_scan) { + + /* Backward scan done, now do the forward scan */ + ut_a (srv_flush_neighbor_pages == 2); + is_forward_scan = TRUE; + goto scan; } return(count); @@ -1940,6 +1988,22 @@ buf_pool = buf_pool_from_array(i); + if (lsn_limit != IB_ULONGLONG_MAX) { + buf_page_t* bpage; + + buf_flush_list_mutex_enter(buf_pool); + bpage = UT_LIST_GET_LAST(buf_pool->flush_list); + if (!bpage + || bpage->oldest_modification >= lsn_limit) { + + buf_flush_list_mutex_exit(buf_pool); + continue; + } else { + + buf_flush_list_mutex_exit(buf_pool); + } + } + if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) { /* We have two choices here. 
If lsn_limit was specified then skipping an instance of buffer --- a/storage/innobase/buf/buf0rea.c +++ b/storage/innobase/buf/buf0rea.c @@ -427,6 +427,10 @@ = BUF_READ_AHEAD_AREA(buf_pool); ulint threshold; + if (!(srv_read_ahead & 2)) { + return(0); + } + if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) { /* No read-ahead to avoid thread deadlocks */ return(0); --- a/storage/innobase/fil/fil0fil.c +++ b/storage/innobase/fil/fil0fil.c @@ -2609,7 +2609,7 @@ os_thread_sleep(20000); - fil_flush(id); + fil_flush(id, TRUE); goto retry; @@ -2823,7 +2823,7 @@ goto error_exit; } - ret = os_file_flush(file); + ret = os_file_flush(file, TRUE); if (!ret) { fputs("InnoDB: Error: file flush of tablespace ", stderr); @@ -3009,7 +3009,7 @@ } } - success = os_file_flush(file); + success = os_file_flush(file, TRUE); if (!success) { goto func_exit; @@ -3031,7 +3031,7 @@ goto func_exit; } - success = os_file_flush(file); + success = os_file_flush(file, TRUE); func_exit: os_file_close(file); ut_free(buf2); @@ -4014,7 +4014,7 @@ size_after_extend, *actual_size); */ mutex_exit(&fil_system->mutex); - fil_flush(space_id); + fil_flush(space_id, TRUE); return(success); } @@ -4585,8 +4585,9 @@ void fil_flush( /*======*/ - ulint space_id) /*!< in: file space id (this can be a group of + ulint space_id, /*!< in: file space id (this can be a group of log files or a tablespace of the database) */ + ibool metadata) { fil_space_t* space; fil_node_t* node; @@ -4657,7 +4658,7 @@ /* fprintf(stderr, "Flushing to file %s\n", node->name); */ - os_file_flush(file); + os_file_flush(file, metadata); mutex_enter(&fil_system->mutex); @@ -4740,7 +4741,7 @@ a non-existing space id. */ for (i = 0; i < n_space_ids; i++) { - fil_flush(space_ids[i]); + fil_flush(space_ids[i], TRUE); } mem_free(space_ids); --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -445,6 +445,12 @@ "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. 
Values above 100000000 disable the timeout.", NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0); +static MYSQL_THDVAR_ULONG(flush_log_at_trx_commit, PLUGIN_VAR_OPCMDARG, + "Set to 0 (write and flush once per second)," + " 1 (write and flush at each commit)" + " or 2 (write at commit, flush once per second).", + NULL, NULL, 1, 0, 2, 0); + static handler *innobase_create_handler(handlerton *hton, TABLE_SHARE *table, @@ -841,6 +847,17 @@ } } +/******************************************************************//** +*/ +extern "C" UNIV_INTERN +ulong +thd_flush_log_at_trx_commit( +/*================================*/ + void* thd) +{ + return(THDVAR((THD*) thd, flush_log_at_trx_commit)); +} + /********************************************************************//** Obtain the InnoDB transaction of a MySQL thread. @return reference to transaction pointer */ @@ -2471,6 +2488,9 @@ srv_n_read_io_threads = (ulint) innobase_read_io_threads; srv_n_write_io_threads = (ulint) innobase_write_io_threads; + srv_read_ahead &= 3; + srv_adaptive_flushing_method %= 3; + srv_force_recovery = (ulint) innobase_force_recovery; srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; @@ -11141,7 +11161,7 @@ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, "Purge threads can be either 0 or 1.", NULL, NULL, - 0, /* Default setting */ + 1, /* Default setting */ 0, /* Minimum value */ 1, 0); /* Maximum value */ @@ -11183,12 +11203,18 @@ innodb_file_format_max_validate, innodb_file_format_max_update, "Antelope"); -static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit, - PLUGIN_VAR_OPCMDARG, - "Set to 0 (write and flush once per second)," - " 1 (write and flush at each commit)" - " or 2 (write at commit, flush once per second).", - NULL, NULL, 1, 0, 2, 0); +/* Changed to the THDVAR */ +//static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit, +// PLUGIN_VAR_OPCMDARG, +// "Set to 0 (write and flush once per second)," +// " 1 (write and flush at each commit)" +// 
" or 2 (write at commit, flush once per second).", +// NULL, NULL, 1, 0, 2, 0); + +static MYSQL_SYSVAR_BOOL(use_global_flush_log_at_trx_commit, srv_use_global_flush_log_at_trx_commit, + PLUGIN_VAR_NOCMDARG, + "Use global innodb_flush_log_at_trx_commit value. (default: ON).", + NULL, NULL, TRUE); static MYSQL_SYSVAR_STR(flush_method, innobase_file_flush_method, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -11293,7 +11319,7 @@ static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.", - NULL, NULL, 128*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L); + NULL, NULL, 128*1024*1024L, 32*1024*1024L, LONGLONG_MAX, 1024*1024L); static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -11450,6 +11476,135 @@ "trigger a readahead.", NULL, NULL, 56, 0, 64, 0); +static MYSQL_SYSVAR_LONGLONG(ibuf_max_size, srv_ibuf_max_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "The maximum size of the insert buffer. (in bytes)", + NULL, NULL, LONGLONG_MAX, 0, LONGLONG_MAX, 0); + +static MYSQL_SYSVAR_ULONG(ibuf_active_contract, srv_ibuf_active_contract, + PLUGIN_VAR_RQCMDARG, + "Enable/Disable active_contract of insert buffer. 0:disable 1:enable", + NULL, NULL, 1, 0, 1, 0); + +static MYSQL_SYSVAR_ULONG(ibuf_accel_rate, srv_ibuf_accel_rate, + PLUGIN_VAR_RQCMDARG, + "Tunes amount of insert buffer processing of background, in addition to innodb_io_capacity. (in percentage)", + NULL, NULL, 100, 100, 999999999, 0); + +static MYSQL_SYSVAR_ULONG(checkpoint_age_target, srv_checkpoint_age_target, + PLUGIN_VAR_RQCMDARG, + "Control soft limit of checkpoint age. 
(0 : not control)", + NULL, NULL, 0, 0, ~0UL, 0); + +static +void +innodb_flush_neighbor_pages_update( + THD* thd, + struct st_mysql_sys_var* var, + void* var_ptr, + const void* save) +{ + *(long *)var_ptr = (*(long *)save) % 3; +} + +const char *flush_neighbor_pages_names[]= +{ + "none", /* 0 */ + "area", + "cont", /* 2 */ + /* For compatibility with the older patch */ + "0", /* "none" + 3 */ + "1", /* "area" + 3 */ + "2", /* "cont" + 3 */ + NullS +}; + +TYPELIB flush_neighbor_pages_typelib= +{ + array_elements(flush_neighbor_pages_names) - 1, + "flush_neighbor_pages_typelib", + flush_neighbor_pages_names, + NULL +}; + +static MYSQL_SYSVAR_ENUM(flush_neighbor_pages, srv_flush_neighbor_pages, + PLUGIN_VAR_RQCMDARG, "Neighbor page flushing behaviour: none: do not flush, " + "[area]: flush selected pages one-by-one, " + "cont: flush a contiguous block of pages", NULL, + innodb_flush_neighbor_pages_update, 1, &flush_neighbor_pages_typelib); + +static +void +innodb_read_ahead_update( + THD* thd, + struct st_mysql_sys_var* var, + void* var_ptr, + const void* save) +{ + *(long *)var_ptr= (*(long *)save) & 3; +} +const char *read_ahead_names[]= +{ + "none", /* 0 */ + "random", + "linear", + "both", /* 3 */ + /* For compatibility of the older patch */ + "0", /* 4 ("none" + 4) */ + "1", + "2", + "3", /* 7 ("both" + 4) */ + NullS +}; +TYPELIB read_ahead_typelib= +{ + array_elements(read_ahead_names) - 1, "read_ahead_typelib", + read_ahead_names, NULL +}; +static MYSQL_SYSVAR_ENUM(read_ahead, srv_read_ahead, + PLUGIN_VAR_RQCMDARG, + "Control read ahead activity (none, random, [linear], both). 
[from 1.0.5: random read ahead is ignored]", + NULL, innodb_read_ahead_update, 2, &read_ahead_typelib); + +static +void +innodb_adaptive_flushing_method_update( + THD* thd, + struct st_mysql_sys_var* var, + void* var_ptr, + const void* save) +{ + *(long *)var_ptr= (*(long *)save) % 3; /* fold aliases "0".."2" (indexes 3..5) onto 0..2 */ +} +const char *adaptive_flushing_method_names[]= +{ + "native", /* 0 */ + "estimate", /* 1 */ + "keep_average", /* 2 */ + /* For compatibility of the older patch */ + "0", /* 3 ("native" + 3) */ + "1", /* 4 ("estimate" + 3) */ + "2", /* 5 ("keep_average" + 3) */ + NullS +}; +TYPELIB adaptive_flushing_method_typelib= +{ + array_elements(adaptive_flushing_method_names) - 1, "adaptive_flushing_method_typelib", + adaptive_flushing_method_names, NULL +}; +static MYSQL_SYSVAR_ENUM(adaptive_flushing_method, srv_adaptive_flushing_method, + PLUGIN_VAR_RQCMDARG, + "Choose method of innodb_adaptive_flushing. (native, [estimate], keep_average)", + NULL, innodb_adaptive_flushing_method_update, 1, &adaptive_flushing_method_typelib); + +#ifdef UNIV_DEBUG +static MYSQL_SYSVAR_ULONG(flush_checkpoint_debug, srv_flush_checkpoint_debug, + PLUGIN_VAR_RQCMDARG, + "Debug flags for InnoDB flushing and checkpointing (0=none," + "1=stop preflush and checkpointing)", + NULL, NULL, 0, 0, 1, 0); +#endif + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(autoextend_increment), @@ -11470,6 +11625,7 @@ MYSQL_SYSVAR(file_format_check), MYSQL_SYSVAR(file_format_max), MYSQL_SYSVAR(flush_log_at_trx_commit), + MYSQL_SYSVAR(use_global_flush_log_at_trx_commit), MYSQL_SYSVAR(flush_method), MYSQL_SYSVAR(force_recovery), MYSQL_SYSVAR(large_prefix), @@ -11509,6 +11665,13 @@ MYSQL_SYSVAR(show_verbose_locks), MYSQL_SYSVAR(show_locks_held), MYSQL_SYSVAR(version), + MYSQL_SYSVAR(ibuf_max_size), + MYSQL_SYSVAR(ibuf_active_contract), + MYSQL_SYSVAR(ibuf_accel_rate), + MYSQL_SYSVAR(checkpoint_age_target), + MYSQL_SYSVAR(flush_neighbor_pages), + 
MYSQL_SYSVAR(read_ahead), + MYSQL_SYSVAR(adaptive_flushing_method), MYSQL_SYSVAR(use_sys_malloc), MYSQL_SYSVAR(use_native_aio), MYSQL_SYSVAR(change_buffering), @@ -11521,6 +11684,9 @@ MYSQL_SYSVAR(purge_threads), MYSQL_SYSVAR(purge_batch_size), MYSQL_SYSVAR(rollback_segments), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(flush_checkpoint_debug), +#endif NULL }; --- a/storage/innobase/ibuf/ibuf0ibuf.c +++ b/storage/innobase/ibuf/ibuf0ibuf.c @@ -523,8 +523,10 @@ grow in size, as the references on the upper levels of the tree can change */ - ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE - / IBUF_POOL_SIZE_PER_MAX_SIZE; + ibuf->max_size = ut_min( buf_pool_get_curr_size() / UNIV_PAGE_SIZE + / IBUF_POOL_SIZE_PER_MAX_SIZE, (ulint) srv_ibuf_max_size / UNIV_PAGE_SIZE); + + srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE; mutex_create(ibuf_pessimistic_insert_mutex_key, &ibuf_pessimistic_insert_mutex, @@ -2763,9 +2765,11 @@ size = ibuf->size; max_size = ibuf->max_size; + if (!srv_ibuf_active_contract) { if (size < max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) { return; } + } sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC); --- a/storage/innobase/include/buf0rea.h +++ b/storage/innobase/include/buf0rea.h @@ -149,8 +149,7 @@ /** The size in pages of the area which the read-ahead algorithms read if invoked */ -#define BUF_READ_AHEAD_AREA(b) \ - ut_min(64, ut_2_power_up((b)->curr_size / 32)) +#define BUF_READ_AHEAD_AREA(b) 64 /** @name Modes used in read-ahead @{ */ /** read only pages belonging to the insert buffer tree */ --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -663,8 +663,9 @@ void fil_flush( /*======*/ - ulint space_id); /*!< in: file space id (this can be a group of + ulint space_id, /*!< in: file space id (this can be a group of log files or a tablespace of the database) */ + ibool metadata); /**********************************************************************//** Flushes to disk writes in file spaces 
of the given type possibly cached by the OS. */ --- a/storage/innobase/include/ha_prototypes.h +++ b/storage/innobase/include/ha_prototypes.h @@ -284,6 +284,13 @@ /*===================*/ void* thd, /*!< in: thread handle (THD*) */ ulint value); /*!< in: time waited for the lock */ +/******************************************************************//** +*/ + +ulong +thd_flush_log_at_trx_commit( +/*================================*/ + void* thd); /**********************************************************************//** Get the current setting of the lower_case_table_names global parameter from --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -296,8 +296,8 @@ pfs_os_file_write_func(name, file, buf, offset, offset_high, \ n, __FILE__, __LINE__) -# define os_file_flush(file) \ - pfs_os_file_flush_func(file, __FILE__, __LINE__) +# define os_file_flush(file, metadata) \ + pfs_os_file_flush_func(file, metadata, __FILE__, __LINE__) # define os_file_rename(key, oldpath, newpath) \ pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__) @@ -333,7 +333,7 @@ # define os_file_write(name, file, buf, offset, offset_high, n) \ os_file_write_func(name, file, buf, offset, offset_high, n) -# define os_file_flush(file) os_file_flush_func(file) +# define os_file_flush(file, metadata) os_file_flush_func(file, metadata) # define os_file_rename(key, oldpath, newpath) \ os_file_rename_func(oldpath, newpath) @@ -781,6 +781,7 @@ pfs_os_file_flush_func( /*===================*/ os_file_t file, /*!< in, own: handle to a file */ + ibool metadata, const char* src_file,/*!< in: file name where func invoked */ ulint src_line);/*!< in: line where the func invoked */ @@ -860,7 +861,8 @@ ibool os_file_flush_func( /*===============*/ - os_file_t file); /*!< in, own: handle to a file */ + os_file_t file, /*!< in, own: handle to a file */ + ibool metadata); /***********************************************************************//** Retrieves the last 
error number if an error occurs in a file io function. The number should be retrieved before any other OS calls (because they may --- a/storage/innobase/include/os0file.ic +++ b/storage/innobase/include/os0file.ic @@ -369,6 +369,7 @@ pfs_os_file_flush_func( /*===================*/ os_file_t file, /*!< in, own: handle to a file */ + ibool metadata, const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ { @@ -378,7 +379,7 @@ register_pfs_file_io_begin(&state, locker, file, 0, PSI_FILE_SYNC, src_file, src_line); - result = os_file_flush_func(file); + result = os_file_flush_func(file, metadata); register_pfs_file_io_end(locker, 0); --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -138,7 +138,8 @@ extern ulint srv_n_log_files; extern ulint srv_log_file_size; extern ulint srv_log_buffer_size; -extern ulong srv_flush_log_at_trx_commit; +//extern ulong srv_flush_log_at_trx_commit; +extern char srv_use_global_flush_log_at_trx_commit; extern char srv_adaptive_flushing; /* If this flag is TRUE, then we will load the indexes' (and tables') metadata @@ -221,6 +222,16 @@ extern ulong srv_max_purge_lag; extern ulong srv_replication_delay; + +extern long long srv_ibuf_max_size; +extern ulint srv_ibuf_active_contract; +extern ulint srv_ibuf_accel_rate; +extern ulint srv_checkpoint_age_target; +extern ulint srv_flush_neighbor_pages; +extern ulint srv_enable_unsafe_group_commit; +extern ulint srv_read_ahead; +extern ulint srv_adaptive_flushing_method; + /*-------------------------------------------*/ extern ulint srv_n_rows_inserted; @@ -255,6 +266,9 @@ extern ibool srv_print_buf_io; extern ibool srv_print_log_io; extern ibool srv_print_latch_waits; + +extern ulong srv_flush_checkpoint_debug; + #else /* UNIV_DEBUG */ # define srv_print_thread_releases FALSE # define srv_print_lock_waits FALSE @@ -399,8 +413,9 @@ when writing data files, but do flush after writing to log files */ 
SRV_UNIX_NOSYNC, /*!< do not flush after writing */ - SRV_UNIX_O_DIRECT /*!< invoke os_file_set_nocache() on + SRV_UNIX_O_DIRECT, /*!< invoke os_file_set_nocache() on data files */ + SRV_UNIX_ALL_O_DIRECT /* new method for examination: logfile also open O_DIRECT */ }; /** Alternatives for file i/o in Windows */ --- a/storage/innobase/log/log0log.c +++ b/storage/innobase/log/log0log.c @@ -48,6 +48,7 @@ #include "srv0start.h" #include "trx0sys.h" #include "trx0trx.h" +#include "ha_prototypes.h" /* General philosophy of InnoDB redo-logs: @@ -359,6 +360,33 @@ } /************************************************************//** +Async modified-age limit, capped near 7/8 of srv_checkpoint_age_target if set. */ +UNIV_INLINE +ulint +log_max_modified_age_async(void) +{ + if (srv_checkpoint_age_target) { + return(ut_min(log_sys->max_modified_age_async, + srv_checkpoint_age_target + - srv_checkpoint_age_target / 8)); + } else { + return(log_sys->max_modified_age_async); + } +} + +UNIV_INLINE +ulint +log_max_checkpoint_age_async(void) +{ + if (srv_checkpoint_age_target) { + return(ut_min(log_sys->max_checkpoint_age_async, + srv_checkpoint_age_target)); + } else { + return(log_sys->max_checkpoint_age_async); + } +} + +/************************************************************//** Closes the log. 
@return lsn */ UNIV_INTERN @@ -427,7 +455,7 @@ } } - if (checkpoint_age <= log->max_modified_age_async) { + if (checkpoint_age <= log_max_modified_age_async()) { goto function_exit; } @@ -435,8 +463,8 @@ oldest_lsn = buf_pool_get_oldest_modification(); if (!oldest_lsn - || lsn - oldest_lsn > log->max_modified_age_async - || checkpoint_age > log->max_checkpoint_age_async) { + || lsn - oldest_lsn > log_max_modified_age_async() + || checkpoint_age > log_max_checkpoint_age_async()) { log->check_flush_or_checkpoint = TRUE; } @@ -1100,9 +1128,10 @@ group = (log_group_t*)((ulint)group - 1); if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC + && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { - fil_flush(group->space_id); + fil_flush(group->space_id, FALSE); } #ifdef UNIV_DEBUG @@ -1121,10 +1150,11 @@ logs and cannot end up here! */ if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC + && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT && srv_unix_file_flush_method != SRV_UNIX_NOSYNC - && srv_flush_log_at_trx_commit != 2) { + && thd_flush_log_at_trx_commit(NULL) != 2) { - fil_flush(group->space_id); + fil_flush(group->space_id, FALSE); } mutex_enter(&(log_sys->mutex)); @@ -1501,7 +1531,8 @@ mutex_exit(&(log_sys->mutex)); - if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC + || srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) { /* O_DSYNC means the OS did not buffer the log file at all: so we have also flushed to disk what we have written */ @@ -1511,7 +1542,7 @@ group = UT_LIST_GET_FIRST(log_sys->log_groups); - fil_flush(group->space_id); + fil_flush(group->space_id, FALSE); log_sys->flushed_to_disk_lsn = log_sys->write_lsn; } @@ -1655,10 +1686,13 @@ recv_apply_hashed_log_recs(TRUE); } + retry: n_pages = buf_flush_list(ULINT_MAX, new_oldest); - if (sync) { - buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + if (sync && n_pages != 0) { + 
//buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + os_thread_sleep(100000); + goto retry; } if (n_pages == ULINT_UNDEFINED) { @@ -1979,6 +2013,13 @@ { ib_uint64_t oldest_lsn; +#ifdef UNIV_DEBUG + if (srv_flush_checkpoint_debug == 1) { + + return TRUE; + } +#endif + if (recv_recovery_is_on()) { recv_apply_hashed_log_recs(TRUE); } @@ -2070,7 +2111,11 @@ physical write will always be made to log files */ { - /* Preflush pages synchronously */ +#ifdef UNIV_DEBUG + if (srv_flush_checkpoint_debug == 1) + return; +#endif +/* Preflush pages synchronously */ while (!log_preflush_pool_modified_pages(lsn, TRUE)); @@ -2096,7 +2141,13 @@ ibool checkpoint_sync; ibool do_checkpoint; ibool success; -loop: + +#ifdef UNIV_DEBUG + if (srv_flush_checkpoint_debug == 1) + return; +#endif + + loop: sync = FALSE; checkpoint_sync = FALSE; do_checkpoint = FALSE; @@ -2119,13 +2170,15 @@ /* A flush is urgent: we have to do a synchronous preflush */ sync = TRUE; - advance = 2 * (age - log->max_modified_age_sync); - } else if (age > log->max_modified_age_async) { + advance = age - log->max_modified_age_sync; + } else if (age > log_max_modified_age_async()) { /* A flush is not urgent: we do an asynchronous preflush */ - advance = age - log->max_modified_age_async; + advance = age - log_max_modified_age_async(); + log->check_flush_or_checkpoint = FALSE; } else { advance = 0; + log->check_flush_or_checkpoint = FALSE; } checkpoint_age = log->lsn - log->last_checkpoint_lsn; @@ -2137,14 +2190,14 @@ do_checkpoint = TRUE; - } else if (checkpoint_age > log->max_checkpoint_age_async) { + } else if (checkpoint_age > log_max_checkpoint_age_async()) { /* A checkpoint is not urgent: do it asynchronously */ do_checkpoint = TRUE; - log->check_flush_or_checkpoint = FALSE; + //log->check_flush_or_checkpoint = FALSE; } else { - log->check_flush_or_checkpoint = FALSE; + //log->check_flush_or_checkpoint = FALSE; } mutex_exit(&(log->mutex)); @@ -2152,6 +2205,7 @@ if (advance) { ib_uint64_t new_oldest = oldest_lsn + 
advance; +retry: success = log_preflush_pool_modified_pages(new_oldest, sync); /* If the flush succeeded, this thread has done its part @@ -2166,7 +2220,7 @@ log->check_flush_or_checkpoint = TRUE; mutex_exit(&(log->mutex)); - goto loop; + goto retry; } } @@ -2607,7 +2661,7 @@ mutex_exit(&(log_sys->mutex)); - fil_flush(group->archive_space_id); + fil_flush(group->archive_space_id, TRUE); mutex_enter(&(log_sys->mutex)); @@ -3044,7 +3098,11 @@ log_check_margins(void) /*===================*/ { -loop: +#ifdef UNIV_DEBUG + if (srv_flush_checkpoint_debug == 1) + return; +#endif + loop: log_flush_margin(); log_checkpoint_margin(); @@ -3349,6 +3407,17 @@ log_sys->flushed_to_disk_lsn, log_sys->last_checkpoint_lsn); + fprintf(file, + "Max checkpoint age %lu\n" + "Checkpoint age target %lu\n" + "Modified age %lu\n" + "Checkpoint age %lu\n", + (ulong) log_sys->max_checkpoint_age, + (ulong) log_max_checkpoint_age_async(), + (ulong) (log_sys->lsn - + log_buf_pool_get_oldest_modification()), + (ulong) (log_sys->lsn - log_sys->last_checkpoint_lsn)); + current_time = time(NULL); time_elapsed = 0.001 + difftime(current_time, --- a/storage/innobase/log/log0recv.c +++ b/storage/innobase/log/log0recv.c @@ -2906,9 +2906,12 @@ ib_uint64_t archived_lsn; #endif /* UNIV_LOG_ARCHIVE */ byte* buf; - byte log_hdr_buf[LOG_FILE_HDR_SIZE]; + byte* log_hdr_buf; + byte log_hdr_buf_base[LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE]; ulint err; + log_hdr_buf = ut_align(log_hdr_buf_base, OS_FILE_LOG_BLOCK_SIZE); + #ifdef UNIV_LOG_ARCHIVE ut_ad(type != LOG_CHECKPOINT || limit_lsn == IB_ULONGLONG_MAX); /** TRUE when recovering from a checkpoint */ @@ -3468,7 +3471,7 @@ exit(1); } - os_file_flush(log_file); + os_file_flush(log_file, TRUE); os_file_close(log_file); } @@ -3492,7 +3495,7 @@ os_file_write(name, log_file, buf, 0, 0, LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE); - os_file_flush(log_file); + os_file_flush(log_file, TRUE); os_file_close(log_file); ut_free(buf); --- a/storage/innobase/os/os0file.c 
+++ b/storage/innobase/os/os0file.c @@ -1424,7 +1424,7 @@ #endif #ifdef UNIV_NON_BUFFERED_IO # ifndef UNIV_HOTBACKUP - if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { + if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) { /* Do not use unbuffered i/o to log files because value 2 denotes that we do not flush the log at every commit, but only once per second */ @@ -1440,7 +1440,7 @@ attributes = 0; #ifdef UNIV_NON_BUFFERED_IO # ifndef UNIV_HOTBACKUP - if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { + if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) { /* Do not use unbuffered i/o to log files because value 2 denotes that we do not flush the log at every commit, but only once per second */ @@ -1585,6 +1585,11 @@ os_file_set_nocache(file, name, mode_str); } + /* ALL_O_DIRECT: O_DIRECT also for transaction log file */ + if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) { + os_file_set_nocache(file, name, mode_str); + } + #ifdef USE_FILE_LOCK if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) { @@ -2008,7 +2013,7 @@ ut_free(buf2); - ret = os_file_flush(file); + ret = os_file_flush(file, TRUE); if (ret) { return(TRUE); @@ -2046,7 +2051,8 @@ int os_file_fsync( /*==========*/ - os_file_t file) /*!< in: handle to a file */ + os_file_t file, /*!< in: handle to a file */ + ibool metadata) { int ret; int failures; @@ -2055,7 +2061,16 @@ failures = 0; do { +#if defined(HAVE_FDATASYNC) && HAVE_DECL_FDATASYNC + if (metadata) { + ret = fsync(file); + } else { + ret = fdatasync(file); + } +#else + (void) metadata; ret = fsync(file); +#endif os_n_fsyncs++; @@ -2092,7 +2107,8 @@ ibool os_file_flush_func( /*===============*/ - os_file_t file) /*!< in, own: handle to a file */ + os_file_t file, /*!< in, own: handle to a file */ + ibool metadata) { #ifdef __WIN__ BOOL ret; @@ -2142,18 +2158,18 @@ /* If we are not on an operating system that supports this, then fall back to a plain fsync. 
*/ - ret = os_file_fsync(file); + ret = os_file_fsync(file, metadata); } else { ret = fcntl(file, F_FULLFSYNC, NULL); if (ret) { /* If we are not on a file system that supports this, then fall back to a plain fsync. */ - ret = os_file_fsync(file); + ret = os_file_fsync(file, metadata); } } #else - ret = os_file_fsync(file); + ret = os_file_fsync(file, metadata); #endif if (ret == 0) { @@ -2336,7 +2352,7 @@ the OS crashes, a database page is only partially physically written to disk. */ - ut_a(TRUE == os_file_flush(file)); + ut_a(TRUE == os_file_flush(file, TRUE)); } # endif /* UNIV_DO_FLUSH */ @@ -2378,7 +2394,7 @@ the OS crashes, a database page is only partially physically written to disk. */ - ut_a(TRUE == os_file_flush(file)); + ut_a(TRUE == os_file_flush(file, TRUE)); } # endif /* UNIV_DO_FLUSH */ @@ -2750,7 +2766,7 @@ # ifdef UNIV_DO_FLUSH if (!os_do_not_call_flush_at_each_write) { - ut_a(TRUE == os_file_flush(file)); + ut_a(TRUE == os_file_flush(file, TRUE)); } # endif /* UNIV_DO_FLUSH */ @@ -4296,7 +4312,7 @@ #ifdef UNIV_DO_FLUSH if (slot->type == OS_FILE_WRITE && !os_do_not_call_flush_at_each_write) { - if (!os_file_flush(slot->file)) { + if (!os_file_flush(slot->file, TRUE)) { ut_error; } } @@ -4597,7 +4613,7 @@ #ifdef UNIV_DO_FLUSH if (slot->type == OS_FILE_WRITE && !os_do_not_call_flush_at_each_write) - && !os_file_flush(slot->file) { + && !os_file_flush(slot->file, TRUE) { ut_error; } #endif /* UNIV_DO_FLUSH */ --- a/storage/innobase/srv/srv0srv.c +++ b/storage/innobase/srv/srv0srv.c @@ -183,7 +183,8 @@ UNIV_INTERN ulint srv_log_file_size = ULINT_MAX; /* size in database pages */ UNIV_INTERN ulint srv_log_buffer_size = ULINT_MAX; -UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1; +//UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1; +UNIV_INTERN char srv_use_global_flush_log_at_trx_commit = TRUE; /* Try to flush dirty pages so as to avoid IO bursts at the checkpoints. 
*/ @@ -404,6 +405,17 @@ UNIV_INTERN ulong srv_replication_delay = 0; +UNIV_INTERN long long srv_ibuf_max_size = 0; +UNIV_INTERN ulint srv_ibuf_active_contract = 0; /* 0:disable 1:enable */ +UNIV_INTERN ulint srv_ibuf_accel_rate = 100; +#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) pct / 10000.0))) + +UNIV_INTERN ulint srv_checkpoint_age_target = 0; +UNIV_INTERN ulint srv_flush_neighbor_pages = 1; /* 0:disable 1:area 2:contiguous */ + +UNIV_INTERN ulint srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */ +UNIV_INTERN ulint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */ +UNIV_INTERN ulint srv_adaptive_flushing_method = 0; /* 0: native 1: estimate 2: keep_average */ /*-------------------------------------------*/ UNIV_INTERN ulong srv_n_spin_wait_rounds = 30; UNIV_INTERN ulong srv_n_free_tickets_to_enter = 500; @@ -417,6 +429,9 @@ UNIV_INTERN ibool srv_print_buf_io = FALSE; UNIV_INTERN ibool srv_print_log_io = FALSE; UNIV_INTERN ibool srv_print_latch_waits = FALSE; + +UNIV_INTERN ulong srv_flush_checkpoint_debug = 0; + #endif /* UNIV_DEBUG */ UNIV_INTERN ulint srv_n_rows_inserted = 0; @@ -2713,7 +2728,7 @@ ut_ad(!mutex_own(&kernel_mutex)); - ut_a(srv_n_purge_threads == 0); + ut_a(srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0)); do { /* Check for shutdown and change in purge config. 
*/ @@ -2746,6 +2761,7 @@ ulint n_pages_purged = 0; ulint n_bytes_merged; ulint n_pages_flushed; + ulint n_pages_flushed_prev = 0; ulint n_bytes_archived; ulint n_tables_to_drop; ulint n_ios; @@ -2753,7 +2769,20 @@ ulint n_ios_very_old; ulint n_pend_ios; ulint next_itr_time; + ulint prev_adaptive_flushing_method = ULINT_UNDEFINED; + ulint inner_loop = 0; + ibool skip_sleep = FALSE; ulint i; + struct t_prev_flush_info_struct { + ulint count; + unsigned space:32; + unsigned offset:32; + ib_uint64_t oldest_modification; + } prev_flush_info[MAX_BUFFER_POOLS]; + + ib_uint64_t lsn_old; + + ib_uint64_t oldest_lsn; #ifdef UNIV_DEBUG_THREAD_CREATION fprintf(stderr, "Master thread starts, id %lu\n", @@ -2775,6 +2804,9 @@ mutex_exit(&kernel_mutex); + mutex_enter(&(log_sys->mutex)); + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); loop: /*****************************************************************/ /* ---- When there is database activity by users, we cycle in this @@ -2805,9 +2837,13 @@ /* Sleep for 1 second on entrying the for loop below the first time. */ next_itr_time = ut_time_ms() + 1000; + skip_sleep = FALSE; + for (i = 0; i < 10; i++) { ulint cur_time = ut_time_ms(); + n_pages_flushed = 0; /* initialize */ + /* ALTER TABLE in MySQL requires on Unix that the table handler can drop tables lazily after there no longer are SELECT queries to them. 
*/ @@ -2831,6 +2867,7 @@ srv_main_thread_op_info = "sleeping"; srv_main_1_second_loops++; + if (!skip_sleep) { if (next_itr_time > cur_time && srv_shutdown_state == SRV_SHUTDOWN_NONE) { @@ -2841,10 +2878,26 @@ (next_itr_time - cur_time) * 1000)); srv_main_sleeps++; + + /* + mutex_enter(&(log_sys->mutex)); + oldest_lsn = buf_pool_get_oldest_modification(); + ib_uint64_t lsn = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + + if(oldest_lsn) + fprintf(stderr, + "InnoDB flush: age pct: %lu, lsn progress: %lu\n", + (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age, + lsn - lsn_old); + */ } /* Each iteration should happen at 1 second interval. */ next_itr_time = ut_time_ms() + 1000; + } /* if (!skip_sleep) */ + + skip_sleep = FALSE; /* Flush logs if needed */ srv_sync_log_buffer_in_background(); @@ -2864,7 +2917,7 @@ if (n_pend_ios < SRV_PEND_IO_THRESHOLD && (n_ios - n_ios_old < SRV_RECENT_IO_ACTIVITY)) { srv_main_thread_op_info = "doing insert buffer merge"; - ibuf_contract_for_n_pages(FALSE, PCT_IO(5)); + ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5)); /* Flush logs if needed */ srv_sync_log_buffer_in_background(); @@ -2881,7 +2934,11 @@ n_pages_flushed = buf_flush_list( PCT_IO(100), IB_ULONGLONG_MAX); - } else if (srv_adaptive_flushing) { + mutex_enter(&(log_sys->mutex)); + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + prev_adaptive_flushing_method = ULINT_UNDEFINED; + } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 0) { /* Try to keep the rate of flushing of dirty pages such that redo log generation does not @@ -2897,6 +2954,224 @@ n_flush, IB_ULONGLONG_MAX); } + + mutex_enter(&(log_sys->mutex)); + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + prev_adaptive_flushing_method = ULINT_UNDEFINED; + } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 1) { + + /* Try to keep modified age not to exceed + max_checkpoint_age * 7/8 line */ + + mutex_enter(&(log_sys->mutex)); + + oldest_lsn = 
buf_pool_get_oldest_modification(); + if (oldest_lsn == 0) { + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + + } else { + if ((log_sys->lsn - oldest_lsn) + > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) { + /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */ + /* We should not flush from here. */ + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + } else if ((log_sys->lsn - oldest_lsn) + > (log_sys->max_checkpoint_age)/4 ) { + + /* defence line: modified age exceeds max_checkpoint_age * 1/4 */ + ib_uint64_t lsn = log_sys->lsn; + + ib_uint64_t level, bpl; + buf_page_t* bpage; + ulint j; + + mutex_exit(&(log_sys->mutex)); + + bpl = 0; + + for (j = 0; j < srv_buf_pool_instances; j++) { + buf_pool_t* buf_pool; + ulint n_blocks; + + buf_pool = buf_pool_from_array(j); + + /* Optimistic scan: the flush_list is walked without taking a mutex here */ + + level = 0; + n_blocks = 0; + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + + while (bpage != NULL) { + ib_uint64_t oldest_modification = bpage->oldest_modification; + if (oldest_modification != 0) { + level += log_sys->max_checkpoint_age + - (lsn - oldest_modification); + } + bpage = UT_LIST_GET_NEXT(list, bpage); + n_blocks++; + } + + if (level) { + bpl += ((ib_uint64_t) n_blocks * n_blocks + * (lsn - lsn_old)) / level; + } + + } + + if (!srv_use_doublewrite_buf) { + /* flushing is faster without the doublewrite buffer, so ask for 7/8 of the estimate */ + bpl = (bpl * 7) / 8; + } + + if (bpl) { +retry_flush_batch: + n_pages_flushed = buf_flush_list(bpl, + oldest_lsn + (lsn - lsn_old)); + if (n_pages_flushed == ULINT_UNDEFINED) { + os_thread_sleep(5000); + goto retry_flush_batch; + } + } + + lsn_old = lsn; + /* + fprintf(stderr, + "InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n", + (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age, + lsn - lsn_old, bpl); + */ + } else { + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + } + } + prev_adaptive_flushing_method = 1; + } else if (srv_adaptive_flushing && srv_adaptive_flushing_method 
== 2) { + buf_pool_t* buf_pool; + buf_page_t* bpage; + ib_uint64_t lsn; + ulint j; + + mutex_enter(&(log_sys->mutex)); + oldest_lsn = buf_pool_get_oldest_modification(); + lsn = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + + /* run this loop about 10 times per second instead of once: i only advances on every 10th pass */ + next_itr_time -= 900; /* 1000 - 900 == 100 ms until the next pass */ + inner_loop++; + if (inner_loop < 10) { + i--; + } else { + inner_loop = 0; + } + + if (prev_adaptive_flushing_method == 2) { + lint n_flush; + lint blocks_sum; + ulint new_blocks_sum, flushed_blocks_sum; + + blocks_sum = new_blocks_sum = flushed_blocks_sum = 0; + + /* prev_flush_info[j] still holds the values stored by the previous pass */ + for (j = 0; j < srv_buf_pool_instances; j++) { + lint blocks_num, new_blocks_num, flushed_blocks_num; + ibool found; + + buf_pool = buf_pool_from_array(j); + + blocks_num = UT_LIST_GET_LEN(buf_pool->flush_list); + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + new_blocks_num = 0; + + found = FALSE; + while (bpage != NULL) { + if (prev_flush_info[j].space == bpage->space + && prev_flush_info[j].offset == bpage->offset + && prev_flush_info[j].oldest_modification + == bpage->oldest_modification) { + found = TRUE; + break; + } + bpage = UT_LIST_GET_NEXT(list, bpage); + new_blocks_num++; + } + if (!found) { + new_blocks_num = blocks_num; + } + + flushed_blocks_num = new_blocks_num + prev_flush_info[j].count + - blocks_num; + if (flushed_blocks_num < 0) { + flushed_blocks_num = 0; + } + + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + + prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list); + if (bpage) { + prev_flush_info[j].space = bpage->space; + prev_flush_info[j].offset = bpage->offset; + prev_flush_info[j].oldest_modification = bpage->oldest_modification; + } else { + prev_flush_info[j].space = 0; + prev_flush_info[j].offset = 0; + prev_flush_info[j].oldest_modification = 0; + } + + new_blocks_sum += new_blocks_num; + flushed_blocks_sum += flushed_blocks_num; + blocks_sum += blocks_num; + } + + n_flush = blocks_sum * (lsn - 
lsn_old) / log_sys->max_modified_age_async; + if (flushed_blocks_sum > n_pages_flushed_prev) { + n_flush -= (flushed_blocks_sum - n_pages_flushed_prev); + } + + if (n_flush > 0) { + n_flush++; + n_pages_flushed = buf_flush_list(n_flush, oldest_lsn + (lsn - lsn_old)); + } else { + n_pages_flushed = 0; + } + } else { + /* store previous first pages of the flush_list */ + for (j = 0; j < srv_buf_pool_instances; j++) { + buf_pool = buf_pool_from_array(j); + + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + + prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list); + if (bpage) { + prev_flush_info[j].space = bpage->space; + prev_flush_info[j].offset = bpage->offset; + prev_flush_info[j].oldest_modification = bpage->oldest_modification; + } else { + prev_flush_info[j].space = 0; + prev_flush_info[j].offset = 0; + prev_flush_info[j].oldest_modification = 0; + } + } + n_pages_flushed = 0; + } + + lsn_old = lsn; + prev_adaptive_flushing_method = 2; + } else { + mutex_enter(&(log_sys->mutex)); + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + prev_adaptive_flushing_method = ULINT_UNDEFINED; + } + + if (n_pages_flushed == ULINT_UNDEFINED) { + n_pages_flushed_prev = 0; + } else { + n_pages_flushed_prev = n_pages_flushed; } if (srv_activity_count == old_activity_count) { @@ -2945,12 +3220,12 @@ even if the server were active */ srv_main_thread_op_info = "doing insert buffer merge"; - ibuf_contract_for_n_pages(FALSE, PCT_IO(5)); + ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5)); /* Flush logs if needed */ srv_sync_log_buffer_in_background(); - if (srv_n_purge_threads == 0) { + if (srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0)) { srv_main_thread_op_info = "master purging"; srv_master_do_purge(); @@ -2982,11 +3257,18 @@ PCT_IO(10), IB_ULONGLONG_MAX); } - srv_main_thread_op_info = "making checkpoint"; +#ifdef UNIV_DEBUG + if (srv_flush_checkpoint_debug != 1) { +#endif - /* Make a new checkpoint about once in 10 
seconds */ + srv_main_thread_op_info = "making checkpoint"; - log_checkpoint(TRUE, FALSE); + /* Make a new checkpoint about once in 10 seconds */ + + log_checkpoint(TRUE, FALSE); +#ifdef UNIV_DEBUG + } +#endif srv_main_thread_op_info = "reserving kernel mutex"; @@ -3028,7 +3310,7 @@ } } - if (srv_n_purge_threads == 0) { + if (srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0)) { srv_main_thread_op_info = "master purging"; srv_master_do_purge(); @@ -3053,7 +3335,7 @@ buf_flush_list below. Otherwise, the system favors clean pages over cleanup throughput. */ n_bytes_merged = ibuf_contract_for_n_pages(FALSE, - PCT_IO(100)); + PCT_IBUF_IO(100)); } srv_main_thread_op_info = "reserving kernel mutex"; @@ -3065,6 +3347,10 @@ } mutex_exit(&kernel_mutex); +#ifdef UNIV_DEBUG + if (srv_flush_checkpoint_debug == 1) + goto skip_flush; +#endif flush_loop: srv_main_thread_op_info = "flushing buffer pool pages"; srv_main_flush_loops++; @@ -3105,6 +3391,9 @@ goto flush_loop; } +#ifdef UNIV_DEBUG +skip_flush: +#endif srv_main_thread_op_info = "reserving kernel mutex"; mutex_enter(&kernel_mutex); @@ -3193,6 +3482,7 @@ srv_slot_t* slot; ulint retries = 0; ulint n_total_purged = ULINT_UNDEFINED; + ulint next_itr_time; ut_a(srv_n_purge_threads == 1); @@ -3213,9 +3503,12 @@ mutex_exit(&kernel_mutex); + next_itr_time = ut_time_ms(); + while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { ulint n_pages_purged = 0; + ulint cur_time; /* If there are very few records to purge or the last purge didn't purge any records then wait for activity. 
@@ -3262,6 +3555,16 @@ } while (n_pages_purged > 0 && !srv_fast_shutdown); srv_sync_log_buffer_in_background(); + + cur_time = ut_time_ms(); + if (next_itr_time > cur_time) { + os_thread_sleep(ut_min(1000000, + (next_itr_time - cur_time) + * 1000)); + next_itr_time = ut_time_ms() + 1000; + } else { + next_itr_time = cur_time + 1000; + } } mutex_enter(&kernel_mutex); --- a/storage/innobase/srv/srv0start.c +++ b/storage/innobase/srv/srv0start.c @@ -1237,6 +1237,9 @@ } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) { srv_unix_file_flush_method = SRV_UNIX_O_DIRECT; + } else if (0 == ut_strcmp(srv_file_flush_method_str, "ALL_O_DIRECT")) { + srv_unix_file_flush_method = SRV_UNIX_ALL_O_DIRECT; + } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) { srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC; --- a/storage/innobase/trx/trx0purge.c +++ b/storage/innobase/trx/trx0purge.c @@ -392,10 +392,10 @@ trx_sys->rseg_history_len++; mutex_exit(&kernel_mutex); - if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) { +// if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) { /*should wake up always*/ /* Inform the purge thread that there is work to do. */ srv_wake_purge_thread_if_not_active(); - } +// } } /**********************************************************************//** --- a/storage/innobase/trx/trx0trx.c +++ b/storage/innobase/trx/trx0trx.c @@ -984,6 +984,7 @@ trx->read_view = NULL; if (lsn) { + ulint flush_log_at_trx_commit; mutex_exit(&kernel_mutex); @@ -992,6 +993,12 @@ trx_undo_insert_cleanup(trx); } + if (srv_use_global_flush_log_at_trx_commit) { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); + } else { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); + } + /* NOTE that we could possibly make a group commit more efficient here: call os_thread_yield here to allow also other trxs to come to commit! 
*/ @@ -1023,9 +1030,9 @@ if (trx->flush_log_later) { /* Do nothing yet */ trx->must_flush_log_later = TRUE; - } else if (srv_flush_log_at_trx_commit == 0) { + } else if (flush_log_at_trx_commit == 0) { /* Do nothing */ - } else if (srv_flush_log_at_trx_commit == 1) { + } else if (flush_log_at_trx_commit == 1) { if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { /* Write the log but do not flush it to disk */ @@ -1037,7 +1044,7 @@ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); } - } else if (srv_flush_log_at_trx_commit == 2) { + } else if (flush_log_at_trx_commit == 2) { /* Write the log but do not flush it to disk */ @@ -1701,16 +1708,23 @@ trx_t* trx) /*!< in: trx handle */ { ib_uint64_t lsn = trx->commit_lsn; + ulint flush_log_at_trx_commit; ut_a(trx); trx->op_info = "flushing log"; + if (srv_use_global_flush_log_at_trx_commit) { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); + } else { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); + } + if (!trx->must_flush_log_later) { /* Do nothing */ - } else if (srv_flush_log_at_trx_commit == 0) { + } else if (flush_log_at_trx_commit == 0) { /* Do nothing */ - } else if (srv_flush_log_at_trx_commit == 1) { + } else if (flush_log_at_trx_commit == 1) { if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { /* Write the log but do not flush it to disk */ @@ -1721,7 +1735,7 @@ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); } - } else if (srv_flush_log_at_trx_commit == 2) { + } else if (flush_log_at_trx_commit == 2) { /* Write the log but do not flush it to disk */ @@ -1969,6 +1983,8 @@ /*--------------------------------------*/ if (lsn) { + ulint flush_log_at_trx_commit; + /* Depending on the my.cnf options, we may now write the log buffer to the log files, making the prepared state of the transaction durable if the OS does not crash. 
We may also @@ -1988,9 +2004,15 @@ mutex_exit(&kernel_mutex); - if (srv_flush_log_at_trx_commit == 0) { + if (srv_use_global_flush_log_at_trx_commit) { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); + } else { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); + } + + if (flush_log_at_trx_commit == 0) { /* Do nothing */ - } else if (srv_flush_log_at_trx_commit == 1) { + } else if (flush_log_at_trx_commit == 1) { if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { /* Write the log but do not flush it to disk */ @@ -2002,7 +2024,7 @@ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); } - } else if (srv_flush_log_at_trx_commit == 2) { + } else if (flush_log_at_trx_commit == 2) { /* Write the log but do not flush it to disk */ --- a/mysql-test/include/default_mysqld.cnf +++ b/mysql-test/include/default_mysqld.cnf @@ -29,7 +29,7 @@ max_heap_table_size= 1M loose-innodb_data_file_path= ibdata1:10M:autoextend -loose-innodb_buffer_pool_size= 8M +loose-innodb_buffer_pool_size= 32M loose-innodb_write_io_threads= 2 loose-innodb_read_io_threads= 2 loose-innodb_log_buffer_size= 1M --- a/mysql-test/suite/innodb/r/innodb.result +++ b/mysql-test/suite/innodb/r/innodb.result @@ -1678,7 +1678,7 @@ drop table t1; SELECT variable_value FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_buffer_pool_pages_total'; variable_value -511 +2047 SELECT variable_value FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_page_size'; variable_value 16384 --- /dev/null +++ b/mysql-test/suite/innodb/r/percona_flush_contiguous_neighbors.result @@ -0,0 +1,21 @@ +DROP TABLE IF EXISTS t1; +CREATE TABLE t1 (id INT AUTO_INCREMENT, foo CHAR(255), PRIMARY KEY (id)) ENGINE=InnoDB; +INSERT INTO t1(foo) VALUES ('a'), ('b'); +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; 
+INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +DROP TABLE t1; --- /dev/null +++ b/mysql-test/suite/innodb/t/percona_flush_contiguous_neighbors-master.opt @@ -0,0 +1 @@ +--innodb_flush_neighbor_pages=cont --- /dev/null +++ b/mysql-test/suite/innodb/t/percona_flush_contiguous_neighbors.test @@ -0,0 +1,36 @@ +# Test for innodb_flush_neighbor_pages=contiguous. +# The test is very crude: we simply overflow the buffer pool with such a number of +# new/modified pages that some flushing is bound to happen. + +--source include/have_innodb.inc + +--disable_warnings +DROP TABLE IF EXISTS t1; +--enable_warnings + +CREATE TABLE t1 (id INT AUTO_INCREMENT, foo CHAR(255), PRIMARY KEY (id)) ENGINE=InnoDB; + +INSERT INTO t1(foo) VALUES ('a'), ('b'); +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; +INSERT INTO t1(foo) SELECT foo FROM t1; + +# TODO: cannot record a stable value here. 
A check of > 0 should be enough, +# but the variable is not accessible through INFORMATION_SCHEMA currently. +# SHOW GLOBAL STATUS LIKE 'Innodb_buffer_pool_pages_flushed'; + +DROP TABLE t1; --- /dev/null +++ b/mysql-test/suite/innodb/r/percona_sync_flush.result @@ -0,0 +1,35 @@ +DROP TABLE IF EXISTS t1; +CREATE TABLE t1 (id INT AUTO_INCREMENT, foo CHAR(255), PRIMARY KEY (id)) ENGINE=InnoDB; +SET @@global.innodb_flush_checkpoint_debug=1; +INSERT INTO t1(foo) VALUES ('a'), ('b'); +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +INSERT INTO t1(foo) SELECT foo FROM t1; +UPDATE t1 SET foo='c'; +SET @@global.innodb_flush_checkpoint_debug=0; +UPDATE t1 SET foo='d' WHERE foo='c'; +DROP TABLE t1; --- /dev/null +++ b/mysql-test/suite/innodb/t/percona_sync_flush.test @@ -0,0 +1,33 @@ +# Test for InnoDB sync state flushing. + +--source include/have_innodb.inc +--source include/have_debug.inc + +--disable_warnings +DROP TABLE IF EXISTS t1; +--enable_warnings + +CREATE TABLE t1 (id INT AUTO_INCREMENT, foo CHAR(255), PRIMARY KEY (id)) ENGINE=InnoDB; + +# It is hard to get to InnoDB sync state flushing in MTR with regular workload. 
Perhaps +# it is possible with many parallel connections, but that would be brittle anyway. +# So, just disable preflushing and checkpointing and issue simple workload. +SET @@global.innodb_flush_checkpoint_debug=1; + +INSERT INTO t1(foo) VALUES ('a'), ('b'); + +let $rep=0; +while ($rep < 14) +{ + INSERT INTO t1(foo) SELECT foo FROM t1; + UPDATE t1 SET foo='c'; + inc $rep; +} + +# By now checkpoint age should be well past sync flush point. Allow +# preflushing/checkpointing again and do some work in order to do the sync flush. +SET @@global.innodb_flush_checkpoint_debug=0; + +UPDATE t1 SET foo='d' WHERE foo='c'; + +DROP TABLE t1; --- a/mysql-test/suite/sys_vars/r/all_vars.result +++ b/mysql-test/suite/sys_vars/r/all_vars.result @@ -4,6 +4,7 @@ insert into t2 select variable_name from information_schema.global_variables; insert into t2 select variable_name from information_schema.session_variables; delete from t2 where variable_name='innodb_change_buffering_debug'; +delete from t2 where variable_name='innodb_flush_checkpoint_debug'; update t2 set variable_name= replace(variable_name, "PERFORMANCE_SCHEMA_", "PFS_"); select variable_name as `There should be *no* long test name listed below:` from t2 where length(variable_name) > 50; --- a/mysql-test/suite/sys_vars/t/all_vars.test +++ b/mysql-test/suite/sys_vars/t/all_vars.test @@ -47,8 +47,9 @@ insert into t2 select variable_name from information_schema.global_variables; insert into t2 select variable_name from information_schema.session_variables; -# This is only present in debug builds. +# These are only present in debug builds. delete from t2 where variable_name='innodb_change_buffering_debug'; +delete from t2 where variable_name='innodb_flush_checkpoint_debug'; # Performance schema variables are too long for files named # 'mysql-test/suite/sys_vars/t/' ... 
--- a/mysql-test/suite/innodb/t/innodb_cmp_drop_table-master.opt +++ b/mysql-test/suite/innodb/t/innodb_cmp_drop_table-master.opt @@ -1 +1 @@ ---innodb-buffer-pool-size=8M +--innodb-buffer-pool-size=32M --- a/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test +++ b/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test @@ -36,13 +36,14 @@ -- disable_query_log --- let $i = 400 +-- let $i = 4000 +begin; while ($i) { insert into t2 values(repeat('abcdefghijklmnopqrstuvwxyz',1000)); dec $i; } - +commit; -- enable_query_log # now there should be no 8K pages in the buffer pool