1 # name : innodb_io_patches.patch
2 # introduced : 11 or before
3 # maintainer : Yasufumi
6 # Any small change to this file in the main branch
7 # should be done or reviewed by the maintainer!
8 diff -ruN a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c
9 --- a/storage/innobase/buf/buf0buf.c 2010-12-03 15:09:51.273986410 +0900
10 +++ b/storage/innobase/buf/buf0buf.c 2010-12-03 15:10:08.934990091 +0900
13 /* When we traverse all the flush lists we don't want another
14 thread to add a dirty page to any flush list. */
15 + if (srv_buf_pool_instances > 1)
16 log_flush_order_mutex_enter();
18 for (i = 0; i < srv_buf_pool_instances; i++) {
23 + if (srv_buf_pool_instances > 1)
24 log_flush_order_mutex_exit();
26 /* The returned answer may be out of date: the flush_list can
27 diff -ruN a/storage/innobase/buf/buf0flu.c b/storage/innobase/buf/buf0flu.c
28 --- a/storage/innobase/buf/buf0flu.c 2010-11-03 07:01:13.000000000 +0900
29 +++ b/storage/innobase/buf/buf0flu.c 2010-12-03 15:10:08.934990091 +0900
32 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
34 - if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
35 + if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN || !srv_flush_neighbor_pages) {
36 /* If there is little space, it is better not to flush
37 any block except from the end of the LRU list */
39 diff -ruN a/storage/innobase/buf/buf0rea.c b/storage/innobase/buf/buf0rea.c
40 --- a/storage/innobase/buf/buf0rea.c 2010-11-03 07:01:13.000000000 +0900
41 +++ b/storage/innobase/buf/buf0rea.c 2010-12-03 15:10:08.937050537 +0900
43 = BUF_READ_AHEAD_LINEAR_AREA(buf_pool);
46 + if (!(srv_read_ahead & 2)) {
50 if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
51 /* No read-ahead to avoid thread deadlocks */
53 diff -ruN a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
54 --- a/storage/innobase/handler/ha_innodb.cc 2010-12-03 15:09:51.283956391 +0900
55 +++ b/storage/innobase/handler/ha_innodb.cc 2010-12-03 15:10:08.963980444 +0900
57 "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
58 NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0);
60 +static MYSQL_THDVAR_ULONG(flush_log_at_trx_commit, PLUGIN_VAR_OPCMDARG,
61 + "Set to 0 (write and flush once per second),"
62 + " 1 (write and flush at each commit)"
63 + " or 2 (write at commit, flush once per second).",
64 + NULL, NULL, 1, 0, 2, 0);
67 static handler *innobase_create_handler(handlerton *hton,
73 +/******************************************************************//**
75 +extern "C" UNIV_INTERN
77 +thd_flush_log_at_trx_commit(
78 +/*================================*/
81 + return(THDVAR((THD*) thd, flush_log_at_trx_commit));
84 /********************************************************************//**
85 Obtain the InnoDB transaction of a MySQL thread.
86 @return reference to transaction pointer */
88 srv_n_read_io_threads = (ulint) innobase_read_io_threads;
89 srv_n_write_io_threads = (ulint) innobase_write_io_threads;
91 + srv_read_ahead &= 3;
92 + srv_adaptive_flushing_method %= 3;
94 srv_force_recovery = (ulint) innobase_force_recovery;
96 srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
97 @@ -10107,6 +10127,10 @@
100 thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
101 + if (srv_enable_unsafe_group_commit && !THDVAR(thd, support_xa)) {
102 + /* choose group commit rather than binlog order */
106 /* For ibbackup to work the order of transactions in binlog
107 and InnoDB must be the same. Consider the situation
108 @@ -10917,9 +10941,9 @@
110 static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads,
111 PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
112 - "Purge threads can be either 0 or 1. Default is 0.",
113 + "Purge threads can be either 0 or 1. Default is 1.",
115 - 0, /* Default setting */
116 + 1, /* Default setting */
117 0, /* Minimum value */
118 1, 0); /* Maximum value */
120 @@ -10961,12 +10985,18 @@
121 innodb_file_format_max_validate,
122 innodb_file_format_max_update, "Antelope");
124 -static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
125 - PLUGIN_VAR_OPCMDARG,
126 - "Set to 0 (write and flush once per second),"
127 - " 1 (write and flush at each commit)"
128 - " or 2 (write at commit, flush once per second).",
129 - NULL, NULL, 1, 0, 2, 0);
130 +/* Changed to the THDVAR */
131 +//static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
132 +// PLUGIN_VAR_OPCMDARG,
133 +// "Set to 0 (write and flush once per second),"
134 +// " 1 (write and flush at each commit)"
135 +// " or 2 (write at commit, flush once per second).",
136 +// NULL, NULL, 1, 0, 2, 0);
138 +static MYSQL_SYSVAR_BOOL(use_global_flush_log_at_trx_commit, srv_use_global_flush_log_at_trx_commit,
139 + PLUGIN_VAR_NOCMDARG,
140 + "Use global innodb_flush_log_at_trx_commit value. (default: ON).",
143 static MYSQL_SYSVAR_STR(flush_method, innobase_file_flush_method,
144 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
145 @@ -11061,7 +11091,7 @@
146 static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size,
147 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
148 "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
149 - NULL, NULL, 128*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L);
150 + NULL, NULL, 128*1024*1024L, 32*1024*1024L, LONGLONG_MAX, 1024*1024L);
152 static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
153 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
154 @@ -11206,6 +11236,100 @@
155 "trigger a readahead.",
156 NULL, NULL, 56, 0, 64, 0);
158 +static MYSQL_SYSVAR_LONGLONG(ibuf_max_size, srv_ibuf_max_size,
159 + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
160 + "The maximum size of the insert buffer. (in bytes)",
161 + NULL, NULL, LONGLONG_MAX, 0, LONGLONG_MAX, 0);
163 +static MYSQL_SYSVAR_ULONG(ibuf_active_contract, srv_ibuf_active_contract,
164 + PLUGIN_VAR_RQCMDARG,
165 + "Enable/Disable active_contract of insert buffer. 0:disable 1:enable",
166 + NULL, NULL, 1, 0, 1, 0);
168 +static MYSQL_SYSVAR_ULONG(ibuf_accel_rate, srv_ibuf_accel_rate,
169 + PLUGIN_VAR_RQCMDARG,
170 + "Tunes amount of insert buffer processing of background, in addition to innodb_io_capacity. (in percentage)",
171 + NULL, NULL, 100, 100, 999999999, 0);
173 +static MYSQL_SYSVAR_ULONG(checkpoint_age_target, srv_checkpoint_age_target,
174 + PLUGIN_VAR_RQCMDARG,
175 + "Control soft limit of checkpoint age. (0 : not control)",
176 + NULL, NULL, 0, 0, ~0UL, 0);
178 +static MYSQL_SYSVAR_ULONG(flush_neighbor_pages, srv_flush_neighbor_pages,
179 + PLUGIN_VAR_RQCMDARG,
180 + "Enable/Disable flushing also neighbor pages. 0:disable 1:enable",
181 + NULL, NULL, 1, 0, 1, 0);
185 +innodb_read_ahead_update(
187 + struct st_mysql_sys_var* var,
191 + *(long *)var_ptr= (*(long *)save) & 3;
193 +const char *read_ahead_names[]=
199 + /* For compatibility of the older patch */
200 + "0", /* 4 ("none" + 4) */
203 + "3", /* 7 ("both" + 4) */
206 +TYPELIB read_ahead_typelib=
208 + array_elements(read_ahead_names) - 1, "read_ahead_typelib",
209 + read_ahead_names, NULL
211 +static MYSQL_SYSVAR_ENUM(read_ahead, srv_read_ahead,
212 + PLUGIN_VAR_RQCMDARG,
213 + "Control read ahead activity (none, random, [linear], both). [from 1.0.5: random read ahead is ignored]",
214 + NULL, innodb_read_ahead_update, 2, &read_ahead_typelib);
218 +innodb_adaptive_flushing_method_update(
220 + struct st_mysql_sys_var* var,
224 + *(long *)var_ptr= (*(long *)save) % 3; /* fold compat aliases "0".."2" (typelib indexes 3..5) onto 0..2, as done at startup */
226 +const char *adaptive_flushing_method_names[]=
229 + "estimate", /* 1 */
230 + "keep_average", /* 2 */
231 + /* For compatibility of the older patch */
232 + "0", /* 3 ("none" + 3) */
233 + "1", /* 4 ("estimate" + 3) */
234 + "2", /* 5 ("keep_average" + 3) */
237 +TYPELIB adaptive_flushing_method_typelib=
239 + array_elements(adaptive_flushing_method_names) - 1, "adaptive_flushing_method_typelib",
240 + adaptive_flushing_method_names, NULL
242 +static MYSQL_SYSVAR_ENUM(adaptive_flushing_method, srv_adaptive_flushing_method,
243 + PLUGIN_VAR_RQCMDARG,
244 + "Choose method of innodb_adaptive_flushing. (native, [estimate], keep_average)",
245 + NULL, innodb_adaptive_flushing_method_update, 1, &adaptive_flushing_method_typelib);
247 +static MYSQL_SYSVAR_ULONG(enable_unsafe_group_commit, srv_enable_unsafe_group_commit,
248 + PLUGIN_VAR_RQCMDARG,
249 + "Enable/Disable unsafe group commit when support_xa=OFF and use with binlog or other XA storage engine.",
250 + NULL, NULL, 0, 0, 1, 0);
252 static struct st_mysql_sys_var* innobase_system_variables[]= {
253 MYSQL_SYSVAR(additional_mem_pool_size),
254 MYSQL_SYSVAR(autoextend_increment),
255 @@ -11226,6 +11350,7 @@
256 MYSQL_SYSVAR(file_format_check),
257 MYSQL_SYSVAR(file_format_max),
258 MYSQL_SYSVAR(flush_log_at_trx_commit),
259 + MYSQL_SYSVAR(use_global_flush_log_at_trx_commit),
260 MYSQL_SYSVAR(flush_method),
261 MYSQL_SYSVAR(force_recovery),
262 MYSQL_SYSVAR(locks_unsafe_for_binlog),
263 @@ -11262,6 +11387,14 @@
264 MYSQL_SYSVAR(show_verbose_locks),
265 MYSQL_SYSVAR(show_locks_held),
266 MYSQL_SYSVAR(version),
267 + MYSQL_SYSVAR(ibuf_max_size),
268 + MYSQL_SYSVAR(ibuf_active_contract),
269 + MYSQL_SYSVAR(ibuf_accel_rate),
270 + MYSQL_SYSVAR(checkpoint_age_target),
271 + MYSQL_SYSVAR(flush_neighbor_pages),
272 + MYSQL_SYSVAR(read_ahead),
273 + MYSQL_SYSVAR(adaptive_flushing_method),
274 + MYSQL_SYSVAR(enable_unsafe_group_commit),
275 MYSQL_SYSVAR(use_sys_malloc),
276 MYSQL_SYSVAR(use_native_aio),
277 MYSQL_SYSVAR(change_buffering),
278 diff -ruN a/storage/innobase/ibuf/ibuf0ibuf.c b/storage/innobase/ibuf/ibuf0ibuf.c
279 --- a/storage/innobase/ibuf/ibuf0ibuf.c 2010-11-03 07:01:13.000000000 +0900
280 +++ b/storage/innobase/ibuf/ibuf0ibuf.c 2010-12-03 15:10:09.073984282 +0900
282 grow in size, as the references on the upper levels of the tree can
285 - ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE
286 - / IBUF_POOL_SIZE_PER_MAX_SIZE;
287 + ibuf->max_size = ut_min( buf_pool_get_curr_size() / UNIV_PAGE_SIZE
288 + / IBUF_POOL_SIZE_PER_MAX_SIZE, (ulint) (srv_ibuf_max_size / UNIV_PAGE_SIZE)); /* convert bytes to pages in 64 bits first, then narrow to ulint */
290 + srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE;
292 mutex_create(ibuf_pessimistic_insert_mutex_key,
293 &ibuf_pessimistic_insert_mutex,
294 @@ -2651,9 +2653,11 @@
296 max_size = ibuf->max_size;
298 + if (!srv_ibuf_active_contract) {
299 if (size < max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
304 sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC);
306 diff -ruN a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
307 --- a/storage/innobase/include/buf0rea.h 2010-11-03 07:01:13.000000000 +0900
308 +++ b/storage/innobase/include/buf0rea.h 2010-12-03 15:10:09.076066335 +0900
311 /** The size in pages of the area which the read-ahead algorithms read if
313 -#define BUF_READ_AHEAD_AREA(b) \
314 - ut_min(64, ut_2_power_up((b)->curr_size / 32))
315 +#define BUF_READ_AHEAD_AREA(b) 64
317 /** @name Modes used in read-ahead @{ */
318 /** read only pages belonging to the insert buffer tree */
319 diff -ruN a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
320 --- a/storage/innobase/include/ha_prototypes.h 2010-11-03 07:01:13.000000000 +0900
321 +++ b/storage/innobase/include/ha_prototypes.h 2010-12-03 15:10:09.078026360 +0900
323 /*===================*/
324 void* thd, /*!< in: thread handle (THD*) */
325 ulint value); /*!< in: time waited for the lock */
326 +/******************************************************************//**
330 +thd_flush_log_at_trx_commit(
331 +/*================================*/
335 diff -ruN a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
336 --- a/storage/innobase/include/srv0srv.h 2010-12-03 15:09:51.291955835 +0900
337 +++ b/storage/innobase/include/srv0srv.h 2010-12-03 15:10:09.079029047 +0900
339 extern ulint srv_n_log_files;
340 extern ulint srv_log_file_size;
341 extern ulint srv_log_buffer_size;
342 -extern ulong srv_flush_log_at_trx_commit;
343 +//extern ulong srv_flush_log_at_trx_commit;
344 +extern char srv_use_global_flush_log_at_trx_commit;
345 extern char srv_adaptive_flushing;
349 extern ulong srv_max_purge_lag;
351 extern ulong srv_replication_delay;
353 +extern long long srv_ibuf_max_size;
354 +extern ulint srv_ibuf_active_contract;
355 +extern ulint srv_ibuf_accel_rate;
356 +extern ulint srv_checkpoint_age_target;
357 +extern ulint srv_flush_neighbor_pages;
358 +extern ulint srv_enable_unsafe_group_commit;
359 +extern ulint srv_read_ahead;
360 +extern ulint srv_adaptive_flushing_method;
362 /*-------------------------------------------*/
364 extern ulint srv_n_rows_inserted;
366 when writing data files, but do flush
367 after writing to log files */
368 SRV_UNIX_NOSYNC, /*!< do not flush after writing */
369 - SRV_UNIX_O_DIRECT /*!< invoke os_file_set_nocache() on
370 + SRV_UNIX_O_DIRECT, /*!< invoke os_file_set_nocache() on
372 + SRV_UNIX_ALL_O_DIRECT /*!< experimental method: the log files are also opened with O_DIRECT */
375 /** Alternatives for file i/o in Windows */
376 diff -ruN a/storage/innobase/log/log0log.c b/storage/innobase/log/log0log.c
377 --- a/storage/innobase/log/log0log.c 2010-11-03 07:01:13.000000000 +0900
378 +++ b/storage/innobase/log/log0log.c 2010-12-03 15:10:09.084023562 +0900
382 /************************************************************//**
386 +log_max_modified_age_async()
388 + if (srv_checkpoint_age_target) {
389 + return(ut_min(log_sys->max_modified_age_async,
390 + srv_checkpoint_age_target
391 + - srv_checkpoint_age_target / 8));
393 + return(log_sys->max_modified_age_async);
399 +log_max_checkpoint_age_async()
401 + if (srv_checkpoint_age_target) {
402 + return(ut_min(log_sys->max_checkpoint_age_async,
403 + srv_checkpoint_age_target));
405 + return(log_sys->max_checkpoint_age_async);
409 +/************************************************************//**
417 - if (checkpoint_age <= log->max_modified_age_async) {
418 + if (checkpoint_age <= log_max_modified_age_async()) {
423 oldest_lsn = buf_pool_get_oldest_modification();
426 - || lsn - oldest_lsn > log->max_modified_age_async
427 - || checkpoint_age > log->max_checkpoint_age_async) {
428 + || lsn - oldest_lsn > log_max_modified_age_async()
429 + || checkpoint_age > log_max_checkpoint_age_async()) {
431 log->check_flush_or_checkpoint = TRUE;
433 @@ -1100,6 +1127,7 @@
434 group = (log_group_t*)((ulint)group - 1);
436 if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
437 + && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
438 && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
440 fil_flush(group->space_id);
441 @@ -1121,8 +1149,9 @@
442 logs and cannot end up here! */
444 if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
445 + && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
446 && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
447 - && srv_flush_log_at_trx_commit != 2) {
448 + && thd_flush_log_at_trx_commit(NULL) != 2) {
450 fil_flush(group->space_id);
452 @@ -1501,7 +1530,8 @@
454 mutex_exit(&(log_sys->mutex));
456 - if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
457 + if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC
458 + || srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
459 /* O_DSYNC means the OS did not buffer the log file at all:
460 so we have also flushed to disk what we have written */
462 @@ -2120,10 +2150,10 @@
465 advance = 2 * (age - log->max_modified_age_sync);
466 - } else if (age > log->max_modified_age_async) {
467 + } else if (age > log_max_modified_age_async()) {
469 /* A flush is not urgent: we do an asynchronous preflush */
470 - advance = age - log->max_modified_age_async;
471 + advance = age - log_max_modified_age_async();
475 @@ -2137,7 +2167,7 @@
477 do_checkpoint = TRUE;
479 - } else if (checkpoint_age > log->max_checkpoint_age_async) {
480 + } else if (checkpoint_age > log_max_checkpoint_age_async()) {
481 /* A checkpoint is not urgent: do it asynchronously */
483 do_checkpoint = TRUE;
484 @@ -3349,6 +3379,17 @@
485 log_sys->flushed_to_disk_lsn,
486 log_sys->last_checkpoint_lsn);
489 + "Max checkpoint age %lu\n"
490 + "Checkpoint age target %lu\n"
491 + "Modified age %lu\n"
492 + "Checkpoint age %lu\n",
493 + (ulong) log_sys->max_checkpoint_age,
494 + (ulong) log_max_checkpoint_age_async(),
495 + (ulong) (log_sys->lsn -
496 + log_buf_pool_get_oldest_modification()),
497 + (ulong) (log_sys->lsn - log_sys->last_checkpoint_lsn));
499 current_time = time(NULL);
501 time_elapsed = 0.001 + difftime(current_time,
502 diff -ruN a/storage/innobase/log/log0recv.c b/storage/innobase/log/log0recv.c
503 --- a/storage/innobase/log/log0recv.c 2010-11-03 07:01:13.000000000 +0900
504 +++ b/storage/innobase/log/log0recv.c 2010-12-03 15:10:09.089024191 +0900
505 @@ -2906,9 +2906,12 @@
506 ib_uint64_t archived_lsn;
507 #endif /* UNIV_LOG_ARCHIVE */
509 - byte log_hdr_buf[LOG_FILE_HDR_SIZE];
511 + byte log_hdr_buf_base[LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE];
514 + log_hdr_buf = ut_align(log_hdr_buf_base, OS_FILE_LOG_BLOCK_SIZE);
516 #ifdef UNIV_LOG_ARCHIVE
517 ut_ad(type != LOG_CHECKPOINT || limit_lsn == IB_ULONGLONG_MAX);
518 /** TRUE when recovering from a checkpoint */
519 diff -ruN a/storage/innobase/os/os0file.c b/storage/innobase/os/os0file.c
520 --- a/storage/innobase/os/os0file.c 2010-11-03 07:01:13.000000000 +0900
521 +++ b/storage/innobase/os/os0file.c 2010-12-03 15:10:09.093023540 +0900
522 @@ -1399,7 +1399,7 @@
524 #ifdef UNIV_NON_BUFFERED_IO
525 # ifndef UNIV_HOTBACKUP
526 - if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
527 + if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) {
528 /* Do not use unbuffered i/o to log files because
529 value 2 denotes that we do not flush the log at every
530 commit, but only once per second */
531 @@ -1415,7 +1415,7 @@
533 #ifdef UNIV_NON_BUFFERED_IO
534 # ifndef UNIV_HOTBACKUP
535 - if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
536 + if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) {
537 /* Do not use unbuffered i/o to log files because
538 value 2 denotes that we do not flush the log at every
539 commit, but only once per second */
540 @@ -1560,6 +1560,11 @@
541 os_file_set_nocache(file, name, mode_str);
544 + /* ALL_O_DIRECT: O_DIRECT also for transaction log file */
545 + if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
546 + os_file_set_nocache(file, name, mode_str);
550 if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
552 diff -ruN a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c
553 --- a/storage/innobase/srv/srv0srv.c 2010-12-03 15:09:51.301987792 +0900
554 +++ b/storage/innobase/srv/srv0srv.c 2010-12-03 15:13:29.369986988 +0900
556 UNIV_INTERN ulint srv_log_file_size = ULINT_MAX;
557 /* size in database pages */
558 UNIV_INTERN ulint srv_log_buffer_size = ULINT_MAX;
559 -UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1;
560 +//UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1;
561 +UNIV_INTERN char srv_use_global_flush_log_at_trx_commit = TRUE;
563 /* Try to flush dirty pages so as to avoid IO bursts at
567 UNIV_INTERN ulong srv_replication_delay = 0;
569 +UNIV_INTERN long long srv_ibuf_max_size = 0;
570 +UNIV_INTERN ulint srv_ibuf_active_contract = 0; /* 0:disable 1:enable */
571 +UNIV_INTERN ulint srv_ibuf_accel_rate = 100;
572 +#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) (pct) / 10000.0))) /* pct percent of srv_io_capacity, scaled by srv_ibuf_accel_rate (itself a percentage) */
574 +UNIV_INTERN ulint srv_checkpoint_age_target = 0;
575 +UNIV_INTERN ulint srv_flush_neighbor_pages = 1; /* 0:disable 1:enable */
577 +UNIV_INTERN ulint srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */
578 +UNIV_INTERN ulint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */
579 +UNIV_INTERN ulint srv_adaptive_flushing_method = 0; /* 0: native 1: estimate 2: keep_average */
580 /*-------------------------------------------*/
581 UNIV_INTERN ulong srv_n_spin_wait_rounds = 30;
582 UNIV_INTERN ulong srv_n_free_tickets_to_enter = 500;
583 @@ -2703,6 +2715,7 @@
584 ulint n_pages_purged = 0;
585 ulint n_bytes_merged;
586 ulint n_pages_flushed;
587 + ulint n_pages_flushed_prev = 0;
588 ulint n_bytes_archived;
589 ulint n_tables_to_drop;
591 @@ -2710,7 +2723,20 @@
592 ulint n_ios_very_old;
595 + ulint prev_adaptive_flushing_method = ULINT_UNDEFINED;
596 + ulint inner_loop = 0;
597 + ibool skip_sleep = FALSE;
599 + struct t_prev_flush_info_struct {
602 + unsigned offset:32;
603 + ib_uint64_t oldest_modification;
604 + } prev_flush_info[MAX_BUFFER_POOLS];
606 + ib_uint64_t lsn_old;
608 + ib_uint64_t oldest_lsn;
610 #ifdef UNIV_DEBUG_THREAD_CREATION
611 fprintf(stderr, "Master thread starts, id %lu\n",
612 @@ -2732,6 +2758,9 @@
614 mutex_exit(&kernel_mutex);
616 + mutex_enter(&(log_sys->mutex));
617 + lsn_old = log_sys->lsn;
618 + mutex_exit(&(log_sys->mutex));
620 /*****************************************************************/
621 /* ---- When there is database activity by users, we cycle in this
622 @@ -2762,9 +2791,13 @@
623 /* Sleep for 1 second on entrying the for loop below the first time. */
624 next_itr_time = ut_time_ms() + 1000;
626 + skip_sleep = FALSE;
628 for (i = 0; i < 10; i++) {
629 ulint cur_time = ut_time_ms();
631 + n_pages_flushed = 0; /* initialize */
633 /* ALTER TABLE in MySQL requires on Unix that the table handler
634 can drop tables lazily after there no longer are SELECT
636 @@ -2788,6 +2821,7 @@
637 srv_main_thread_op_info = "sleeping";
638 srv_main_1_second_loops++;
641 if (next_itr_time > cur_time
642 && srv_shutdown_state == SRV_SHUTDOWN_NONE) {
644 @@ -2798,10 +2832,26 @@
645 (next_itr_time - cur_time)
650 + mutex_enter(&(log_sys->mutex));
651 + oldest_lsn = buf_pool_get_oldest_modification();
652 + ib_uint64_t lsn = log_sys->lsn;
653 + mutex_exit(&(log_sys->mutex));
657 + "InnoDB flush: age pct: %lu, lsn progress: %lu\n",
658 + (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
663 /* Each iteration should happen at 1 second interval. */
664 next_itr_time = ut_time_ms() + 1000;
665 + } /* if (!skip_sleep) */
667 + skip_sleep = FALSE;
669 /* Flush logs if needed */
670 srv_sync_log_buffer_in_background();
671 @@ -2821,7 +2871,7 @@
672 if (n_pend_ios < SRV_PEND_IO_THRESHOLD
673 && (n_ios - n_ios_old < SRV_RECENT_IO_ACTIVITY)) {
674 srv_main_thread_op_info = "doing insert buffer merge";
675 - ibuf_contract_for_n_pages(FALSE, PCT_IO(5));
676 + ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
678 /* Flush logs if needed */
679 srv_sync_log_buffer_in_background();
680 @@ -2838,7 +2888,11 @@
681 n_pages_flushed = buf_flush_list(
682 PCT_IO(100), IB_ULONGLONG_MAX);
684 - } else if (srv_adaptive_flushing) {
685 + mutex_enter(&(log_sys->mutex));
686 + lsn_old = log_sys->lsn;
687 + mutex_exit(&(log_sys->mutex));
688 + prev_adaptive_flushing_method = ULINT_UNDEFINED;
689 + } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 0) {
691 /* Try to keep the rate of flushing of dirty
692 pages such that redo log generation does not
693 @@ -2854,6 +2908,223 @@
698 + mutex_enter(&(log_sys->mutex));
699 + lsn_old = log_sys->lsn;
700 + mutex_exit(&(log_sys->mutex));
701 + prev_adaptive_flushing_method = ULINT_UNDEFINED;
702 + } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 1) {
704 + /* Try to keep modified age not to exceed
705 + max_checkpoint_age * 7/8 line */
707 + mutex_enter(&(log_sys->mutex));
709 + oldest_lsn = buf_pool_get_oldest_modification();
710 + if (oldest_lsn == 0) {
711 + lsn_old = log_sys->lsn;
712 + mutex_exit(&(log_sys->mutex));
715 + if ((log_sys->lsn - oldest_lsn)
716 + > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
717 + /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
718 + /* We should not flush from here. */
719 + lsn_old = log_sys->lsn;
720 + mutex_exit(&(log_sys->mutex));
721 + } else if ((log_sys->lsn - oldest_lsn)
722 + > (log_sys->max_checkpoint_age)/4 ) {
724 + /* defence line (max_checkpoint_age * 1/4) */
725 + ib_uint64_t lsn = log_sys->lsn;
727 + ib_uint64_t level, bpl;
731 + mutex_exit(&(log_sys->mutex));
735 + for (j = 0; j < srv_buf_pool_instances; j++) {
736 + buf_pool_t* buf_pool;
739 + buf_pool = buf_pool_from_array(j);
741 + /* The scanning flush_list is optimistic here */
745 + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
747 + while (bpage != NULL) {
748 + ib_uint64_t oldest_modification = bpage->oldest_modification;
749 + if (oldest_modification != 0) {
750 + level += log_sys->max_checkpoint_age
751 + - (lsn - oldest_modification);
753 + bpage = UT_LIST_GET_NEXT(list, bpage);
758 + bpl += ((ib_uint64_t) n_blocks * n_blocks
759 + * (lsn - lsn_old)) / level;
764 + if (!srv_use_doublewrite_buf) {
765 + /* flush is faster than when doublewrite */
766 + bpl = (bpl * 7) / 8;
771 + n_pages_flushed = buf_flush_list(bpl,
772 + oldest_lsn + (lsn - lsn_old));
773 + if (n_pages_flushed == ULINT_UNDEFINED) {
774 + os_thread_sleep(5000);
775 + goto retry_flush_batch;
782 + "InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n",
783 + (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
784 + lsn - lsn_old, bpl);
787 + lsn_old = log_sys->lsn;
788 + mutex_exit(&(log_sys->mutex));
791 + prev_adaptive_flushing_method = 1;
792 + } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 2) {
793 + buf_pool_t* buf_pool;
798 + mutex_enter(&(log_sys->mutex));
799 + oldest_lsn = buf_pool_get_oldest_modification();
800 + lsn = log_sys->lsn;
801 + mutex_exit(&(log_sys->mutex));
803 + /* upper loop/sec. (x10) */
804 + next_itr_time -= 900; /* 1000 - 900 == 100 */
806 + if (inner_loop < 10) {
812 + if (prev_adaptive_flushing_method == 2) {
814 + lint blocks_sum, new_blocks_sum, flushed_blocks_sum;
816 + blocks_sum = new_blocks_sum = flushed_blocks_sum = 0;
818 + /* prev_flush_info[j] should be the previous loop's */
819 + for (j = 0; j < srv_buf_pool_instances; j++) {
820 + lint blocks_num, new_blocks_num, flushed_blocks_num;
823 + buf_pool = buf_pool_from_array(j);
825 + blocks_num = UT_LIST_GET_LEN(buf_pool->flush_list);
826 + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
827 + new_blocks_num = 0;
830 + while (bpage != NULL) {
831 + if (prev_flush_info[j].space == bpage->space
832 + && prev_flush_info[j].offset == bpage->offset
833 + && prev_flush_info[j].oldest_modification
834 + == bpage->oldest_modification) {
838 + bpage = UT_LIST_GET_NEXT(list, bpage);
842 + new_blocks_num = blocks_num;
845 + flushed_blocks_num = new_blocks_num + prev_flush_info[j].count
847 + if (flushed_blocks_num < 0) {
848 + flushed_blocks_num = 0;
851 + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
853 + prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list);
855 + prev_flush_info[j].space = bpage->space;
856 + prev_flush_info[j].offset = bpage->offset;
857 + prev_flush_info[j].oldest_modification = bpage->oldest_modification;
859 + prev_flush_info[j].space = 0;
860 + prev_flush_info[j].offset = 0;
861 + prev_flush_info[j].oldest_modification = 0;
864 + new_blocks_sum += new_blocks_num;
865 + flushed_blocks_sum += flushed_blocks_num;
866 + blocks_sum += blocks_num;
869 + n_flush = blocks_sum * (lsn - lsn_old) / log_sys->max_modified_age_async;
870 + if (flushed_blocks_sum > n_pages_flushed_prev) {
871 + n_flush -= (flushed_blocks_sum - n_pages_flushed_prev);
876 + n_pages_flushed = buf_flush_list(n_flush, oldest_lsn + (lsn - lsn_old));
878 + n_pages_flushed = 0;
881 + /* store previous first pages of the flush_list */
882 + for (j = 0; j < srv_buf_pool_instances; j++) {
883 + buf_pool = buf_pool_from_array(j);
885 + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
887 + prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list);
889 + prev_flush_info[j].space = bpage->space;
890 + prev_flush_info[j].offset = bpage->offset;
891 + prev_flush_info[j].oldest_modification = bpage->oldest_modification;
893 + prev_flush_info[j].space = 0;
894 + prev_flush_info[j].offset = 0;
895 + prev_flush_info[j].oldest_modification = 0;
898 + n_pages_flushed = 0;
902 + prev_adaptive_flushing_method = 2;
904 + mutex_enter(&(log_sys->mutex));
905 + lsn_old = log_sys->lsn;
906 + mutex_exit(&(log_sys->mutex));
907 + prev_adaptive_flushing_method = ULINT_UNDEFINED;
910 + if (n_pages_flushed == ULINT_UNDEFINED) {
911 + n_pages_flushed_prev = 0;
913 + n_pages_flushed_prev = n_pages_flushed;
916 if (srv_activity_count == old_activity_count) {
917 @@ -2902,7 +3173,7 @@
918 even if the server were active */
920 srv_main_thread_op_info = "doing insert buffer merge";
921 - ibuf_contract_for_n_pages(FALSE, PCT_IO(5));
922 + ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
924 /* Flush logs if needed */
925 srv_sync_log_buffer_in_background();
926 @@ -3010,7 +3281,7 @@
927 buf_flush_list below. Otherwise, the system favors
928 clean pages over cleanup throughput. */
929 n_bytes_merged = ibuf_contract_for_n_pages(FALSE,
934 srv_main_thread_op_info = "reserving kernel mutex";
935 @@ -3156,6 +3427,7 @@
937 ulint slot_no = ULINT_UNDEFINED;
938 ulint n_total_purged = ULINT_UNDEFINED;
939 + ulint next_itr_time;
941 ut_a(srv_n_purge_threads == 1);
943 @@ -3178,9 +3450,12 @@
945 mutex_exit(&kernel_mutex);
947 + next_itr_time = ut_time_ms();
949 while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) {
951 ulint n_pages_purged;
954 /* If there are very few records to purge or the last
955 purge didn't purge any records then wait for activity.
956 @@ -3221,6 +3496,16 @@
957 } while (n_pages_purged > 0 && !srv_fast_shutdown);
959 srv_sync_log_buffer_in_background();
961 + cur_time = ut_time_ms();
962 + if (next_itr_time > cur_time) {
963 + os_thread_sleep(ut_min(1000000,
964 + (next_itr_time - cur_time)
966 + next_itr_time = ut_time_ms() + 1000;
968 + next_itr_time = cur_time + 1000;
972 mutex_enter(&kernel_mutex);
973 diff -ruN a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.c
974 --- a/storage/innobase/srv/srv0start.c 2010-11-03 07:01:13.000000000 +0900
975 +++ b/storage/innobase/srv/srv0start.c 2010-12-03 15:10:09.103023543 +0900
976 @@ -1184,6 +1184,9 @@
977 } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) {
978 srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
980 + } else if (0 == ut_strcmp(srv_file_flush_method_str, "ALL_O_DIRECT")) {
981 + srv_unix_file_flush_method = SRV_UNIX_ALL_O_DIRECT;
983 } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) {
984 srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
986 diff -ruN a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c
987 --- a/storage/innobase/trx/trx0trx.c 2010-11-03 07:01:13.000000000 +0900
988 +++ b/storage/innobase/trx/trx0trx.c 2010-12-03 15:10:09.106023937 +0900
990 trx->read_view = NULL;
993 + ulint flush_log_at_trx_commit;
995 mutex_exit(&kernel_mutex);
998 trx_undo_insert_cleanup(trx);
1001 + if (srv_use_global_flush_log_at_trx_commit) {
1002 + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
1004 + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
1007 /* NOTE that we could possibly make a group commit more
1008 efficient here: call os_thread_yield here to allow also other
1009 trxs to come to commit! */
1011 if (trx->flush_log_later) {
1012 /* Do nothing yet */
1013 trx->must_flush_log_later = TRUE;
1014 - } else if (srv_flush_log_at_trx_commit == 0) {
1015 + } else if (flush_log_at_trx_commit == 0) {
1017 - } else if (srv_flush_log_at_trx_commit == 1) {
1018 + } else if (flush_log_at_trx_commit == 1) {
1019 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1020 /* Write the log but do not flush it to disk */
1024 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1026 - } else if (srv_flush_log_at_trx_commit == 2) {
1027 + } else if (flush_log_at_trx_commit == 2) {
1029 /* Write the log but do not flush it to disk */
1031 @@ -1582,16 +1589,23 @@
1032 trx_t* trx) /*!< in: trx handle */
1034 ib_uint64_t lsn = trx->commit_lsn;
1035 + ulint flush_log_at_trx_commit;
1039 trx->op_info = "flushing log";
1041 + if (srv_use_global_flush_log_at_trx_commit) {
1042 + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
1044 + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
1047 if (!trx->must_flush_log_later) {
1049 - } else if (srv_flush_log_at_trx_commit == 0) {
1050 + } else if (flush_log_at_trx_commit == 0) {
1052 - } else if (srv_flush_log_at_trx_commit == 1) {
1053 + } else if (flush_log_at_trx_commit == 1) {
1054 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1055 /* Write the log but do not flush it to disk */
1057 @@ -1602,7 +1616,7 @@
1059 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1061 - } else if (srv_flush_log_at_trx_commit == 2) {
1062 + } else if (flush_log_at_trx_commit == 2) {
1064 /* Write the log but do not flush it to disk */
1066 @@ -1855,6 +1869,8 @@
1067 /*--------------------------------------*/
1070 + ulint flush_log_at_trx_commit;
1072 /* Depending on the my.cnf options, we may now write the log
1073 buffer to the log files, making the prepared state of the
1074 transaction durable if the OS does not crash. We may also
1075 @@ -1874,9 +1890,15 @@
1077 mutex_exit(&kernel_mutex);
1079 - if (srv_flush_log_at_trx_commit == 0) {
1080 + if (srv_use_global_flush_log_at_trx_commit) {
1081 + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
1083 + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
1086 + if (flush_log_at_trx_commit == 0) {
1088 - } else if (srv_flush_log_at_trx_commit == 1) {
1089 + } else if (flush_log_at_trx_commit == 1) {
1090 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1091 /* Write the log but do not flush it to disk */
1093 @@ -1888,7 +1910,7 @@
1095 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1097 - } else if (srv_flush_log_at_trx_commit == 2) {
1098 + } else if (flush_log_at_trx_commit == 2) {
1100 /* Write the log but do not flush it to disk */