#!!! notice !!!
# Any small change to this file in the main branch
# should be done or reviewed by the maintainer!
-diff -ruN a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c
---- a/storage/innobase/buf/buf0buf.c 2010-12-03 15:09:51.273986410 +0900
-+++ b/storage/innobase/buf/buf0buf.c 2010-12-03 15:10:08.934990091 +0900
+--- a/storage/innobase/buf/buf0buf.c
++++ b/storage/innobase/buf/buf0buf.c
@@ -320,6 +320,7 @@
/* When we traverse all the flush lists we don't want another
log_flush_order_mutex_exit();
/* The returned answer may be out of date: the flush_list can
-diff -ruN a/storage/innobase/buf/buf0flu.c b/storage/innobase/buf/buf0flu.c
---- a/storage/innobase/buf/buf0flu.c 2010-11-03 07:01:13.000000000 +0900
-+++ b/storage/innobase/buf/buf0flu.c 2010-12-03 15:10:08.934990091 +0900
-@@ -1376,7 +1376,7 @@
+--- a/storage/innobase/buf/buf0flu.c
++++ b/storage/innobase/buf/buf0flu.c
+@@ -857,7 +857,7 @@
+ flush:
+ /* Now flush the doublewrite buffer data to disk */
+
+- fil_flush(TRX_SYS_SPACE);
++ fil_flush(TRX_SYS_SPACE, FALSE);
+
+ /* We know that the writes have been flushed to disk now
+ and in recovery we will find them in the doublewrite buffer
+@@ -1375,10 +1375,11 @@
+ ulint high;
+ ulint count = 0;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
++ ibool is_forward_scan;
ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
/* If there is little space, it is better not to flush
any block except from the end of the LRU list */
-diff -ruN a/storage/innobase/buf/buf0rea.c b/storage/innobase/buf/buf0rea.c
---- a/storage/innobase/buf/buf0rea.c 2010-11-03 07:01:13.000000000 +0900
-+++ b/storage/innobase/buf/buf0rea.c 2010-12-03 15:10:08.937050537 +0900
-@@ -260,6 +260,10 @@
- = BUF_READ_AHEAD_LINEAR_AREA(buf_pool);
+@@ -1405,7 +1406,32 @@
+ high = fil_space_get_size(space);
+ }
+
+- for (i = low; i < high; i++) {
++ if (srv_flush_neighbor_pages == 2) {
++
++ /* In the case of contiguous flush where the requested page
++ does not fall at the start of flush area, first scan backward
++ from the page and later forward from it. */
++ is_forward_scan = (offset == low);
++ }
++ else {
++ is_forward_scan = TRUE;
++ }
++
++scan:
++ if (srv_flush_neighbor_pages == 2) {
++ if (is_forward_scan) {
++ i = offset;
++ }
++ else {
++ i = offset - 1;
++ }
++ }
++ else {
++ i = low;
++ }
++
++ for (; is_forward_scan ? (i < high) : (i >= low);
++ is_forward_scan ? i++ : i--) {
+
+ buf_page_t* bpage;
+
+@@ -1434,6 +1460,12 @@
+ if (!bpage) {
+
+ buf_pool_mutex_exit(buf_pool);
++ if (srv_flush_neighbor_pages == 2) {
++
++ /* This is contiguous neighbor page flush and
++ the pages here are not contiguous. */
++ break;
++ }
+ continue;
+ }
+
+@@ -1470,6 +1502,22 @@
+ }
+ }
+ buf_pool_mutex_exit(buf_pool);
++
++ if (srv_flush_neighbor_pages == 2) {
++
++ /* We are trying to do the contiguous neighbor page
++ flush, but the last page we checked was unflushable,
++ making a "hole" in the flush, so stop this attempt. */
++ break;
++ }
++ }
++
++ if (!is_forward_scan) {
++
++ /* Backward scan done, now do the forward scan */
++ ut_a (srv_flush_neighbor_pages == 2);
++ is_forward_scan = TRUE;
++ goto scan;
+ }
+
+ return(count);
+--- a/storage/innobase/buf/buf0rea.c
++++ b/storage/innobase/buf/buf0rea.c
+@@ -427,6 +427,10 @@
+ = BUF_READ_AHEAD_AREA(buf_pool);
ulint threshold;
+ if (!(srv_read_ahead & 2)) {
if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
/* No read-ahead to avoid thread deadlocks */
return(0);
-diff -ruN a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
---- a/storage/innobase/handler/ha_innodb.cc 2010-12-03 15:09:51.283956391 +0900
-+++ b/storage/innobase/handler/ha_innodb.cc 2010-12-03 15:10:08.963980444 +0900
+--- a/storage/innobase/fil/fil0fil.c
++++ b/storage/innobase/fil/fil0fil.c
+@@ -2609,7 +2609,7 @@
+
+ os_thread_sleep(20000);
+
+- fil_flush(id);
++ fil_flush(id, TRUE);
+
+ goto retry;
+
+@@ -2823,7 +2823,7 @@
+ goto error_exit;
+ }
+
+- ret = os_file_flush(file);
++ ret = os_file_flush(file, TRUE);
+
+ if (!ret) {
+ fputs("InnoDB: Error: file flush of tablespace ", stderr);
+@@ -3009,7 +3009,7 @@
+ }
+ }
+
+- success = os_file_flush(file);
++ success = os_file_flush(file, TRUE);
+ if (!success) {
+
+ goto func_exit;
+@@ -3031,7 +3031,7 @@
+
+ goto func_exit;
+ }
+- success = os_file_flush(file);
++ success = os_file_flush(file, TRUE);
+ func_exit:
+ os_file_close(file);
+ ut_free(buf2);
+@@ -4014,7 +4014,7 @@
+ size_after_extend, *actual_size); */
+ mutex_exit(&fil_system->mutex);
+
+- fil_flush(space_id);
++ fil_flush(space_id, TRUE);
+
+ return(success);
+ }
+@@ -4585,8 +4585,9 @@
+ void
+ fil_flush(
+ /*======*/
+- ulint space_id) /*!< in: file space id (this can be a group of
++ ulint space_id, /*!< in: file space id (this can be a group of
+ log files or a tablespace of the database) */
++ ibool metadata)
+ {
+ fil_space_t* space;
+ fil_node_t* node;
+@@ -4657,7 +4658,7 @@
+ /* fprintf(stderr, "Flushing to file %s\n",
+ node->name); */
+
+- os_file_flush(file);
++ os_file_flush(file, metadata);
+
+ mutex_enter(&fil_system->mutex);
+
+@@ -4740,7 +4741,7 @@
+ a non-existing space id. */
+ for (i = 0; i < n_space_ids; i++) {
+
+- fil_flush(space_ids[i]);
++ fil_flush(space_ids[i], TRUE);
+ }
+
+ mem_free(space_ids);
+--- a/storage/innobase/handler/ha_innodb.cc
++++ b/storage/innobase/handler/ha_innodb.cc
@@ -445,6 +445,12 @@
"Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0);
static handler *innobase_create_handler(handlerton *hton,
TABLE_SHARE *table,
-@@ -839,6 +845,17 @@
+@@ -841,6 +847,17 @@
}
}
/********************************************************************//**
Obtain the InnoDB transaction of a MySQL thread.
@return reference to transaction pointer */
-@@ -2410,6 +2427,9 @@
+@@ -2471,6 +2488,9 @@
srv_n_read_io_threads = (ulint) innobase_read_io_threads;
srv_n_write_io_threads = (ulint) innobase_write_io_threads;
srv_force_recovery = (ulint) innobase_force_recovery;
srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
-@@ -11001,7 +11021,7 @@
+@@ -11141,7 +11161,7 @@
PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
"Purge threads can be either 0 or 1.",
NULL, NULL,
0, /* Minimum value */
1, 0); /* Maximum value */
-@@ -11043,12 +11063,18 @@
+@@ -11183,12 +11203,18 @@
innodb_file_format_max_validate,
innodb_file_format_max_update, "Antelope");
static MYSQL_SYSVAR_STR(flush_method, innobase_file_flush_method,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-@@ -11143,7 +11169,7 @@
+@@ -11293,7 +11319,7 @@
static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-@@ -11295,6 +11321,95 @@
+@@ -11442,6 +11468,127 @@
"trigger a readahead.",
NULL, NULL, 56, 0, 64, 0);
+ "Control soft limit of checkpoint age. (0 : not control)",
+ NULL, NULL, 0, 0, ~0UL, 0);
+
-+static MYSQL_SYSVAR_ULONG(flush_neighbor_pages, srv_flush_neighbor_pages,
-+ PLUGIN_VAR_RQCMDARG,
-+ "Enable/Disable flushing also neighbor pages. 0:disable 1:enable",
-+ NULL, NULL, 1, 0, 1, 0);
++static
++void
++innodb_flush_neighbor_pages_update(
++ THD* thd,
++ struct st_mysql_sys_var* var,
++ void* var_ptr,
++ const void* save)
++{
++ *(long *)var_ptr = (*(long *)save) % 3;
++}
++
++const char *flush_neighbor_pages_names[]=
++{
++ "none", /* 0 */
++ "area",
++ "cont", /* 2 */
++ /* For compatibility with the older patch */
++ "0", /* "none" + 3 */
++ "1", /* "area" + 3 */
++ "2", /* "cont" + 3 */
++ NullS
++};
++
++TYPELIB flush_neighbor_pages_typelib=
++{
++ array_elements(flush_neighbor_pages_names) - 1,
++ "flush_neighbor_pages_typelib",
++ flush_neighbor_pages_names,
++ NULL
++};
++
++static MYSQL_SYSVAR_ENUM(flush_neighbor_pages, srv_flush_neighbor_pages,
++ PLUGIN_VAR_RQCMDARG, "Neighbor page flushing behaviour: none: do not flush, "
++ "[area]: flush selected pages one-by-one, "
++ "cont: flush a contiguous block of pages", NULL,
++ innodb_flush_neighbor_pages_update, 1, &flush_neighbor_pages_typelib);
+
+static
+void
static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(additional_mem_pool_size),
MYSQL_SYSVAR(autoextend_increment),
-@@ -11315,6 +11430,7 @@
+@@ -11462,6 +11609,7 @@
MYSQL_SYSVAR(file_format_check),
MYSQL_SYSVAR(file_format_max),
MYSQL_SYSVAR(flush_log_at_trx_commit),
+ MYSQL_SYSVAR(use_global_flush_log_at_trx_commit),
MYSQL_SYSVAR(flush_method),
MYSQL_SYSVAR(force_recovery),
- MYSQL_SYSVAR(locks_unsafe_for_binlog),
-@@ -11352,6 +11468,13 @@
+ MYSQL_SYSVAR(large_prefix),
+@@ -11501,6 +11649,13 @@
MYSQL_SYSVAR(show_verbose_locks),
MYSQL_SYSVAR(show_locks_held),
MYSQL_SYSVAR(version),
MYSQL_SYSVAR(use_sys_malloc),
MYSQL_SYSVAR(use_native_aio),
MYSQL_SYSVAR(change_buffering),
-diff -ruN a/storage/innobase/ibuf/ibuf0ibuf.c b/storage/innobase/ibuf/ibuf0ibuf.c
---- a/storage/innobase/ibuf/ibuf0ibuf.c 2010-11-03 07:01:13.000000000 +0900
-+++ b/storage/innobase/ibuf/ibuf0ibuf.c 2010-12-03 15:10:09.073984282 +0900
-@@ -524,8 +524,10 @@
+--- a/storage/innobase/ibuf/ibuf0ibuf.c
++++ b/storage/innobase/ibuf/ibuf0ibuf.c
+@@ -523,8 +523,10 @@
grow in size, as the references on the upper levels of the tree can
change */
mutex_create(ibuf_pessimistic_insert_mutex_key,
&ibuf_pessimistic_insert_mutex,
-@@ -2712,9 +2714,11 @@
+@@ -2763,9 +2765,11 @@
size = ibuf->size;
max_size = ibuf->max_size;
sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC);
-diff -ruN a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
---- a/storage/innobase/include/buf0rea.h 2010-11-03 07:01:13.000000000 +0900
-+++ b/storage/innobase/include/buf0rea.h 2010-12-03 15:10:09.076066335 +0900
-@@ -124,8 +124,7 @@
+--- a/storage/innobase/include/buf0rea.h
++++ b/storage/innobase/include/buf0rea.h
+@@ -149,8 +149,7 @@
/** The size in pages of the area which the read-ahead algorithms read if
invoked */
/** @name Modes used in read-ahead @{ */
/** read only pages belonging to the insert buffer tree */
-diff -ruN a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
---- a/storage/innobase/include/ha_prototypes.h 2010-11-03 07:01:13.000000000 +0900
-+++ b/storage/innobase/include/ha_prototypes.h 2010-12-03 15:10:09.078026360 +0900
-@@ -275,5 +275,12 @@
+--- a/storage/innobase/include/fil0fil.h
++++ b/storage/innobase/include/fil0fil.h
+@@ -663,8 +663,9 @@
+ void
+ fil_flush(
+ /*======*/
+- ulint space_id); /*!< in: file space id (this can be a group of
++ ulint space_id, /*!< in: file space id (this can be a group of
+ log files or a tablespace of the database) */
++ ibool metadata);
+ /**********************************************************************//**
+ Flushes to disk writes in file spaces of the given type possibly cached by
+ the OS. */
+--- a/storage/innobase/include/ha_prototypes.h
++++ b/storage/innobase/include/ha_prototypes.h
+@@ -284,6 +284,13 @@
/*===================*/
void* thd, /*!< in: thread handle (THD*) */
ulint value); /*!< in: time waited for the lock */
+/*================================*/
+ void* thd);
- #endif
-diff -ruN a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
---- a/storage/innobase/include/srv0srv.h 2010-12-03 15:09:51.291955835 +0900
-+++ b/storage/innobase/include/srv0srv.h 2010-12-03 15:10:09.079029047 +0900
-@@ -141,7 +141,8 @@
+ /**********************************************************************//**
+ Get the current setting of the lower_case_table_names global parameter from
+--- a/storage/innobase/include/os0file.h
++++ b/storage/innobase/include/os0file.h
+@@ -296,8 +296,8 @@
+ pfs_os_file_write_func(name, file, buf, offset, offset_high, \
+ n, __FILE__, __LINE__)
+
+-# define os_file_flush(file) \
+- pfs_os_file_flush_func(file, __FILE__, __LINE__)
++# define os_file_flush(file, metadata) \
++ pfs_os_file_flush_func(file, metadata, __FILE__, __LINE__)
+
+ # define os_file_rename(key, oldpath, newpath) \
+ pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__)
+@@ -333,7 +333,7 @@
+ # define os_file_write(name, file, buf, offset, offset_high, n) \
+ os_file_write_func(name, file, buf, offset, offset_high, n)
+
+-# define os_file_flush(file) os_file_flush_func(file)
++# define os_file_flush(file, metadata) os_file_flush_func(file, metadata)
+
+ # define os_file_rename(key, oldpath, newpath) \
+ os_file_rename_func(oldpath, newpath)
+@@ -781,6 +781,7 @@
+ pfs_os_file_flush_func(
+ /*===================*/
+ os_file_t file, /*!< in, own: handle to a file */
++ ibool metadata,
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+
+@@ -860,7 +861,8 @@
+ ibool
+ os_file_flush_func(
+ /*===============*/
+- os_file_t file); /*!< in, own: handle to a file */
++ os_file_t file, /*!< in, own: handle to a file */
++ ibool metadata);
+ /***********************************************************************//**
+ Retrieves the last error number if an error occurs in a file io function.
+ The number should be retrieved before any other OS calls (because they may
+--- a/storage/innobase/include/os0file.ic
++++ b/storage/innobase/include/os0file.ic
+@@ -369,6 +369,7 @@
+ pfs_os_file_flush_func(
+ /*===================*/
+ os_file_t file, /*!< in, own: handle to a file */
++ ibool metadata,
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+ {
+@@ -378,7 +379,7 @@
+
+ register_pfs_file_io_begin(&state, locker, file, 0, PSI_FILE_SYNC,
+ src_file, src_line);
+- result = os_file_flush_func(file);
++ result = os_file_flush_func(file, metadata);
+
+ register_pfs_file_io_end(locker, 0);
+
+--- a/storage/innobase/include/srv0srv.h
++++ b/storage/innobase/include/srv0srv.h
+@@ -138,7 +138,8 @@
extern ulint srv_n_log_files;
extern ulint srv_log_file_size;
extern ulint srv_log_buffer_size;
+extern char srv_use_global_flush_log_at_trx_commit;
extern char srv_adaptive_flushing;
-
-@@ -219,6 +220,16 @@
+ /* If this flag is TRUE, then we will load the indexes' (and tables') metadata
+@@ -221,6 +222,16 @@
extern ulong srv_max_purge_lag;
extern ulong srv_replication_delay;
/*-------------------------------------------*/
extern ulint srv_n_rows_inserted;
-@@ -397,8 +408,9 @@
+@@ -399,8 +410,9 @@
when writing data files, but do flush
after writing to log files */
SRV_UNIX_NOSYNC, /*!< do not flush after writing */
};
/** Alternatives for file i/o in Windows */
-diff -ruN a/storage/innobase/log/log0log.c b/storage/innobase/log/log0log.c
---- a/storage/innobase/log/log0log.c 2010-11-03 07:01:13.000000000 +0900
-+++ b/storage/innobase/log/log0log.c 2010-12-03 15:10:09.084023562 +0900
+--- a/storage/innobase/log/log0log.c
++++ b/storage/innobase/log/log0log.c
@@ -48,6 +48,7 @@
#include "srv0start.h"
#include "trx0sys.h"
log->check_flush_or_checkpoint = TRUE;
}
-@@ -1100,6 +1128,7 @@
+@@ -1100,9 +1128,10 @@
group = (log_group_t*)((ulint)group - 1);
if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
+ && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
- fil_flush(group->space_id);
-@@ -1121,8 +1150,9 @@
+- fil_flush(group->space_id);
++ fil_flush(group->space_id, FALSE);
+ }
+
+ #ifdef UNIV_DEBUG
+@@ -1121,10 +1150,11 @@
logs and cannot end up here! */
if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
- && srv_flush_log_at_trx_commit != 2) {
+ && thd_flush_log_at_trx_commit(NULL) != 2) {
- fil_flush(group->space_id);
+- fil_flush(group->space_id);
++ fil_flush(group->space_id, FALSE);
}
+
+ mutex_enter(&(log_sys->mutex));
@@ -1501,7 +1531,8 @@
mutex_exit(&(log_sys->mutex));
/* O_DSYNC means the OS did not buffer the log file at all:
so we have also flushed to disk what we have written */
+@@ -1511,7 +1542,7 @@
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+- fil_flush(group->space_id);
++ fil_flush(group->space_id, FALSE);
+ log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
+ }
+
@@ -2120,10 +2151,10 @@
sync = TRUE;
/* A checkpoint is not urgent: do it asynchronously */
do_checkpoint = TRUE;
+@@ -2607,7 +2638,7 @@
+
+ mutex_exit(&(log_sys->mutex));
+
+- fil_flush(group->archive_space_id);
++ fil_flush(group->archive_space_id, TRUE);
+
+ mutex_enter(&(log_sys->mutex));
+
@@ -3349,6 +3380,17 @@
log_sys->flushed_to_disk_lsn,
log_sys->last_checkpoint_lsn);
current_time = time(NULL);
time_elapsed = 0.001 + difftime(current_time,
-diff -ruN a/storage/innobase/log/log0recv.c b/storage/innobase/log/log0recv.c
---- a/storage/innobase/log/log0recv.c 2010-11-03 07:01:13.000000000 +0900
-+++ b/storage/innobase/log/log0recv.c 2010-12-03 15:10:09.089024191 +0900
+--- a/storage/innobase/log/log0recv.c
++++ b/storage/innobase/log/log0recv.c
@@ -2906,9 +2906,12 @@
ib_uint64_t archived_lsn;
#endif /* UNIV_LOG_ARCHIVE */
#ifdef UNIV_LOG_ARCHIVE
ut_ad(type != LOG_CHECKPOINT || limit_lsn == IB_ULONGLONG_MAX);
/** TRUE when recovering from a checkpoint */
-diff -ruN a/storage/innobase/os/os0file.c b/storage/innobase/os/os0file.c
---- a/storage/innobase/os/os0file.c 2010-11-03 07:01:13.000000000 +0900
-+++ b/storage/innobase/os/os0file.c 2010-12-03 15:10:09.093023540 +0900
+@@ -3468,7 +3471,7 @@
+ exit(1);
+ }
+
+- os_file_flush(log_file);
++ os_file_flush(log_file, TRUE);
+ os_file_close(log_file);
+ }
+
+@@ -3492,7 +3495,7 @@
+
+ os_file_write(name, log_file, buf, 0, 0,
+ LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+- os_file_flush(log_file);
++ os_file_flush(log_file, TRUE);
+ os_file_close(log_file);
+
+ ut_free(buf);
+--- a/storage/innobase/os/os0file.c
++++ b/storage/innobase/os/os0file.c
@@ -1424,7 +1424,7 @@
#endif
#ifdef UNIV_NON_BUFFERED_IO
#ifdef USE_FILE_LOCK
if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
-diff -ruN a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c
---- a/storage/innobase/srv/srv0srv.c 2010-12-03 15:09:51.301987792 +0900
-+++ b/storage/innobase/srv/srv0srv.c 2010-12-03 15:13:29.369986988 +0900
-@@ -190,7 +190,8 @@
+@@ -2008,7 +2013,7 @@
+
+ ut_free(buf2);
+
+- ret = os_file_flush(file);
++ ret = os_file_flush(file, TRUE);
+
+ if (ret) {
+ return(TRUE);
+@@ -2046,7 +2051,8 @@
+ int
+ os_file_fsync(
+ /*==========*/
+- os_file_t file) /*!< in: handle to a file */
++ os_file_t file, /*!< in: handle to a file */
++ ibool metadata)
+ {
+ int ret;
+ int failures;
+@@ -2055,7 +2061,16 @@
+ failures = 0;
+
+ do {
++#if defined(HAVE_FDATASYNC) && HAVE_DECL_FDATASYNC
++ if (metadata) {
++ ret = fsync(file);
++ } else {
++ ret = fdatasync(file);
++ }
++#else
++ (void) metadata;
+ ret = fsync(file);
++#endif
+
+ os_n_fsyncs++;
+
+@@ -2092,7 +2107,8 @@
+ ibool
+ os_file_flush_func(
+ /*===============*/
+- os_file_t file) /*!< in, own: handle to a file */
++ os_file_t file, /*!< in, own: handle to a file */
++ ibool metadata)
+ {
+ #ifdef __WIN__
+ BOOL ret;
+@@ -2142,18 +2158,18 @@
+ /* If we are not on an operating system that supports this,
+ then fall back to a plain fsync. */
+
+- ret = os_file_fsync(file);
++ ret = os_file_fsync(file, metadata);
+ } else {
+ ret = fcntl(file, F_FULLFSYNC, NULL);
+
+ if (ret) {
+ /* If we are not on a file system that supports this,
+ then fall back to a plain fsync. */
+- ret = os_file_fsync(file);
++ ret = os_file_fsync(file, metadata);
+ }
+ }
+ #else
+- ret = os_file_fsync(file);
++ ret = os_file_fsync(file, metadata);
+ #endif
+
+ if (ret == 0) {
+@@ -2336,7 +2352,7 @@
+ the OS crashes, a database page is only partially
+ physically written to disk. */
+
+- ut_a(TRUE == os_file_flush(file));
++ ut_a(TRUE == os_file_flush(file, TRUE));
+ }
+ # endif /* UNIV_DO_FLUSH */
+
+@@ -2378,7 +2394,7 @@
+ the OS crashes, a database page is only partially
+ physically written to disk. */
+
+- ut_a(TRUE == os_file_flush(file));
++ ut_a(TRUE == os_file_flush(file, TRUE));
+ }
+ # endif /* UNIV_DO_FLUSH */
+
+@@ -2750,7 +2766,7 @@
+
+ # ifdef UNIV_DO_FLUSH
+ if (!os_do_not_call_flush_at_each_write) {
+- ut_a(TRUE == os_file_flush(file));
++ ut_a(TRUE == os_file_flush(file, TRUE));
+ }
+ # endif /* UNIV_DO_FLUSH */
+
+@@ -4296,7 +4312,7 @@
+ #ifdef UNIV_DO_FLUSH
+ if (slot->type == OS_FILE_WRITE
+ && !os_do_not_call_flush_at_each_write) {
+- if (!os_file_flush(slot->file)) {
++ if (!os_file_flush(slot->file, TRUE)) {
+ ut_error;
+ }
+ }
+@@ -4597,7 +4613,7 @@
+ #ifdef UNIV_DO_FLUSH
+ if (slot->type == OS_FILE_WRITE
+ && !os_do_not_call_flush_at_each_write)
+- && !os_file_flush(slot->file) {
++ && !os_file_flush(slot->file, TRUE) {
+ ut_error;
+ }
+ #endif /* UNIV_DO_FLUSH */
+--- a/storage/innobase/srv/srv0srv.c
++++ b/storage/innobase/srv/srv0srv.c
+@@ -183,7 +183,8 @@
UNIV_INTERN ulint srv_log_file_size = ULINT_MAX;
/* size in database pages */
UNIV_INTERN ulint srv_log_buffer_size = ULINT_MAX;
/* Try to flush dirty pages so as to avoid IO bursts at
the checkpoints. */
-@@ -409,6 +410,17 @@
+@@ -404,6 +405,17 @@
UNIV_INTERN ulong srv_replication_delay = 0;
+#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) pct / 10000.0)))
+
+UNIV_INTERN ulint srv_checkpoint_age_target = 0;
-+UNIV_INTERN ulint srv_flush_neighbor_pages = 1; /* 0:disable 1:enable */
++UNIV_INTERN ulint srv_flush_neighbor_pages = 1; /* 0:disable 1:area 2:contiguous */
+
+UNIV_INTERN ulint srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */
+UNIV_INTERN ulint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */
/*-------------------------------------------*/
UNIV_INTERN ulong srv_n_spin_wait_rounds = 30;
UNIV_INTERN ulong srv_n_free_tickets_to_enter = 500;
-@@ -2745,6 +2757,7 @@
+@@ -2713,7 +2725,7 @@
+
+ ut_ad(!mutex_own(&kernel_mutex));
+
+- ut_a(srv_n_purge_threads == 0);
++ ut_a(srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0));
+
+ do {
+ /* Check for shutdown and change in purge config. */
+@@ -2746,6 +2758,7 @@
ulint n_pages_purged = 0;
ulint n_bytes_merged;
ulint n_pages_flushed;
ulint n_bytes_archived;
ulint n_tables_to_drop;
ulint n_ios;
-@@ -2752,7 +2765,20 @@
+@@ -2753,7 +2766,20 @@
ulint n_ios_very_old;
ulint n_pend_ios;
ulint next_itr_time;
#ifdef UNIV_DEBUG_THREAD_CREATION
fprintf(stderr, "Master thread starts, id %lu\n",
-@@ -2774,6 +2800,9 @@
+@@ -2775,6 +2801,9 @@
mutex_exit(&kernel_mutex);
loop:
/*****************************************************************/
/* ---- When there is database activity by users, we cycle in this
-@@ -2804,9 +2833,13 @@
+@@ -2805,9 +2834,13 @@
/* Sleep for 1 second on entrying the for loop below the first time. */
next_itr_time = ut_time_ms() + 1000;
/* ALTER TABLE in MySQL requires on Unix that the table handler
can drop tables lazily after there no longer are SELECT
queries to them. */
-@@ -2830,6 +2863,7 @@
+@@ -2831,6 +2864,7 @@
srv_main_thread_op_info = "sleeping";
srv_main_1_second_loops++;
if (next_itr_time > cur_time
&& srv_shutdown_state == SRV_SHUTDOWN_NONE) {
-@@ -2840,10 +2874,26 @@
+@@ -2841,10 +2875,26 @@
(next_itr_time - cur_time)
* 1000));
srv_main_sleeps++;
/* Flush logs if needed */
srv_sync_log_buffer_in_background();
-@@ -2863,7 +2913,7 @@
+@@ -2864,7 +2914,7 @@
if (n_pend_ios < SRV_PEND_IO_THRESHOLD
&& (n_ios - n_ios_old < SRV_RECENT_IO_ACTIVITY)) {
srv_main_thread_op_info = "doing insert buffer merge";
/* Flush logs if needed */
srv_sync_log_buffer_in_background();
-@@ -2880,7 +2930,11 @@
+@@ -2881,7 +2931,11 @@
n_pages_flushed = buf_flush_list(
PCT_IO(100), IB_ULONGLONG_MAX);
/* Try to keep the rate of flushing of dirty
pages such that redo log generation does not
-@@ -2896,6 +2950,224 @@
+@@ -2897,6 +2951,224 @@
n_flush,
IB_ULONGLONG_MAX);
}
}
if (srv_activity_count == old_activity_count) {
-@@ -2944,7 +3216,7 @@
+@@ -2945,12 +3217,12 @@
even if the server were active */
srv_main_thread_op_info = "doing insert buffer merge";
/* Flush logs if needed */
srv_sync_log_buffer_in_background();
-@@ -3052,7 +3324,7 @@
+
+- if (srv_n_purge_threads == 0) {
++ if (srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0)) {
+ srv_main_thread_op_info = "master purging";
+
+ srv_master_do_purge();
+@@ -3028,7 +3300,7 @@
+ }
+ }
+
+- if (srv_n_purge_threads == 0) {
++ if (srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0)) {
+ srv_main_thread_op_info = "master purging";
+
+ srv_master_do_purge();
+@@ -3053,7 +3325,7 @@
buf_flush_list below. Otherwise, the system favors
clean pages over cleanup throughput. */
n_bytes_merged = ibuf_contract_for_n_pages(FALSE,
}
srv_main_thread_op_info = "reserving kernel mutex";
-@@ -3199,6 +3471,7 @@
+@@ -3193,6 +3465,7 @@
+ srv_slot_t* slot;
ulint retries = 0;
- ulint slot_no = ULINT_UNDEFINED;
ulint n_total_purged = ULINT_UNDEFINED;
+ ulint next_itr_time;
ut_a(srv_n_purge_threads == 1);
-@@ -3221,9 +3494,12 @@
+@@ -3213,9 +3486,12 @@
mutex_exit(&kernel_mutex);
/* If there are very few records to purge or the last
purge didn't purge any records then wait for activity.
-@@ -3272,6 +3548,16 @@
+@@ -3262,6 +3538,16 @@
} while (n_pages_purged > 0 && !srv_fast_shutdown);
srv_sync_log_buffer_in_background();
}
mutex_enter(&kernel_mutex);
-diff -ruN a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.c
---- a/storage/innobase/srv/srv0start.c 2010-11-03 07:01:13.000000000 +0900
-+++ b/storage/innobase/srv/srv0start.c 2010-12-03 15:10:09.103023543 +0900
-@@ -1218,6 +1218,9 @@
+--- a/storage/innobase/srv/srv0start.c
++++ b/storage/innobase/srv/srv0start.c
+@@ -1237,6 +1237,9 @@
} else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) {
srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
} else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) {
srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
-diff -ruN a/storage/innobase/trx/trx0purge.c b/storage/innobase/trx/trx0purge.c
---- a/storage/innobase/trx/trx0purge.c 2011-04-12 14:14:14.000000000 +0900
-+++ b/storage/innobase/trx/trx0purge.c 2011-04-12 14:15:44.000000000 +0900
+--- a/storage/innobase/trx/trx0purge.c
++++ b/storage/innobase/trx/trx0purge.c
@@ -392,10 +392,10 @@
trx_sys->rseg_history_len++;
mutex_exit(&kernel_mutex);
}
/**********************************************************************//**
-diff -ruN a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c
---- a/storage/innobase/trx/trx0trx.c 2010-11-03 07:01:13.000000000 +0900
-+++ b/storage/innobase/trx/trx0trx.c 2010-12-03 15:10:09.106023937 +0900
-@@ -925,6 +925,7 @@
+--- a/storage/innobase/trx/trx0trx.c
++++ b/storage/innobase/trx/trx0trx.c
+@@ -984,6 +984,7 @@
trx->read_view = NULL;
if (lsn) {
mutex_exit(&kernel_mutex);
-@@ -933,6 +934,12 @@
+@@ -992,6 +993,12 @@
trx_undo_insert_cleanup(trx);
}
/* NOTE that we could possibly make a group commit more
efficient here: call os_thread_yield here to allow also other
trxs to come to commit! */
-@@ -964,9 +971,9 @@
+@@ -1023,9 +1030,9 @@
if (trx->flush_log_later) {
/* Do nothing yet */
trx->must_flush_log_later = TRUE;
if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
/* Write the log but do not flush it to disk */
-@@ -978,7 +985,7 @@
+@@ -1037,7 +1044,7 @@
log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
}
/* Write the log but do not flush it to disk */
-@@ -1642,16 +1649,23 @@
+@@ -1701,16 +1708,23 @@
trx_t* trx) /*!< in: trx handle */
{
ib_uint64_t lsn = trx->commit_lsn;
if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
/* Write the log but do not flush it to disk */
-@@ -1662,7 +1676,7 @@
+@@ -1721,7 +1735,7 @@
log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
}
/* Write the log but do not flush it to disk */
-@@ -1915,6 +1929,8 @@
+@@ -1969,6 +1983,8 @@
/*--------------------------------------*/
if (lsn) {
/* Depending on the my.cnf options, we may now write the log
buffer to the log files, making the prepared state of the
transaction durable if the OS does not crash. We may also
-@@ -1934,9 +1950,15 @@
+@@ -1988,9 +2004,15 @@
mutex_exit(&kernel_mutex);
if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
/* Write the log but do not flush it to disk */
-@@ -1948,7 +1970,7 @@
+@@ -2002,7 +2024,7 @@
log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
}
/* Write the log but do not flush it to disk */
+--- a/mysql-test/include/default_mysqld.cnf
++++ b/mysql-test/include/default_mysqld.cnf
+@@ -29,7 +29,7 @@
+ max_heap_table_size= 1M
+
+ loose-innodb_data_file_path= ibdata1:10M:autoextend
+-loose-innodb_buffer_pool_size= 8M
++loose-innodb_buffer_pool_size= 32M
+ loose-innodb_write_io_threads= 2
+ loose-innodb_read_io_threads= 2
+ loose-innodb_log_buffer_size= 1M
+--- a/mysql-test/suite/innodb/r/innodb.result
++++ b/mysql-test/suite/innodb/r/innodb.result
+@@ -1678,7 +1678,7 @@
+ drop table t1;
+ SELECT variable_value FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_buffer_pool_pages_total';
+ variable_value
+-511
++2047
+ SELECT variable_value FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_page_size';
+ variable_value
+ 16384
+--- /dev/null
++++ b/mysql-test/suite/innodb/r/percona_flush_contiguous_neighbors.result
+@@ -0,0 +1,21 @@
++DROP TABLE IF EXISTS t1;
++CREATE TABLE t1 (id INT AUTO_INCREMENT, foo CHAR(255), PRIMARY KEY (id)) ENGINE=InnoDB;
++INSERT INTO t1(foo) VALUES ('a'), ('b');
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++DROP TABLE t1;
+--- /dev/null
++++ b/mysql-test/suite/innodb/t/percona_flush_contiguous_neighbors-master.opt
+@@ -0,0 +1 @@
++--innodb_flush_neighbor_pages=cont
+--- /dev/null
++++ b/mysql-test/suite/innodb/t/percona_flush_contiguous_neighbors.test
+@@ -0,0 +1,36 @@
++# Test for innodb_flush_neighbor_pages=cont (contiguous neighbor page flushing).
++# The test is very crude: we simply overflow the buffer pool with such a number of
++# new/modified pages that some flushing is bound to happen.
++
++--source include/have_innodb.inc
++
++--disable_warnings
++DROP TABLE IF EXISTS t1;
++--enable_warnings
++
++CREATE TABLE t1 (id INT AUTO_INCREMENT, foo CHAR(255), PRIMARY KEY (id)) ENGINE=InnoDB;
++
++INSERT INTO t1(foo) VALUES ('a'), ('b');
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++INSERT INTO t1(foo) SELECT foo FROM t1;
++
++# TODO: cannot record a stable value here. A check of > 0 should be enough,
++# but the variable is not accessible through INFORMATION_SCHEMA currently.
++# SHOW GLOBAL STATUS LIKE 'Innodb_buffer_pool_pages_flushed';
++
++DROP TABLE t1;
+--- a/mysql-test/suite/innodb/t/innodb_cmp_drop_table-master.opt
++++ b/mysql-test/suite/innodb/t/innodb_cmp_drop_table-master.opt
+@@ -1 +1 @@
+---innodb-buffer-pool-size=8M
++--innodb-buffer-pool-size=32M
+--- a/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test
++++ b/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test
+@@ -36,13 +36,14 @@
+
+ -- disable_query_log
+
+--- let $i = 400
++-- let $i = 4000
++begin;
+ while ($i)
+ {
+ insert into t2 values(repeat('abcdefghijklmnopqrstuvwxyz',1000));
+ dec $i;
+ }
+-
++commit;
+ -- enable_query_log
+
+ # now there should be no 8K pages in the buffer pool