]> git.pld-linux.org Git - packages/mysql.git/blame - innodb_io_patches.patch
localize vars
[packages/mysql.git] / innodb_io_patches.patch
CommitLineData
b4e1fa2c
AM
1# name : innodb_io_patches.patch
2# introduced : 11 or before
3# maintainer : Yasufumi
4#
5#!!! notice !!!
6# Any small change to this file in the main branch
7# should be done or reviewed by the maintainer!
db82db79
AM
8--- a/storage/innobase/buf/buf0buf.c
9+++ b/storage/innobase/buf/buf0buf.c
b4e1fa2c
AM
10@@ -320,6 +320,7 @@
11
12 /* When we traverse all the flush lists we don't want another
13 thread to add a dirty page to any flush list. */
14+ if (srv_buf_pool_instances > 1)
15 log_flush_order_mutex_enter();
16
17 for (i = 0; i < srv_buf_pool_instances; i++) {
18@@ -343,6 +344,7 @@
19 }
20 }
21
22+ if (srv_buf_pool_instances > 1)
23 log_flush_order_mutex_exit();
24
25 /* The returned answer may be out of date: the flush_list can
db82db79
AM
26--- a/storage/innobase/buf/buf0flu.c
27+++ b/storage/innobase/buf/buf0flu.c
28@@ -857,7 +857,7 @@
413cadc7
AM
29 flush:
30 /* Now flush the doublewrite buffer data to disk */
31
32- fil_flush(TRX_SYS_SPACE);
33+ fil_flush(TRX_SYS_SPACE, FALSE);
34
35 /* We know that the writes have been flushed to disk now
36 and in recovery we will find them in the doublewrite buffer
1bfc1981
AM
37@@ -1375,10 +1375,11 @@
38 ulint high;
39 ulint count = 0;
40 buf_pool_t* buf_pool = buf_pool_get(space, offset);
41+ ibool is_forward_scan;
b4e1fa2c
AM
42
43 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
44
45- if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
46+ if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN || !srv_flush_neighbor_pages) {
47 /* If there is little space, it is better not to flush
48 any block except from the end of the LRU list */
49
1bfc1981
AM
50@@ -1405,7 +1406,32 @@
51 high = fil_space_get_size(space);
52 }
53
54- for (i = low; i < high; i++) {
55+ if (srv_flush_neighbor_pages == 2) {
56+
57+ /* In the case of contiguous flush where the requested page
58+ does not fall at the start of flush area, first scan backward
59+ from the page and later forward from it. */
60+ is_forward_scan = (offset == low);
61+ }
62+ else {
63+ is_forward_scan = TRUE;
64+ }
65+
66+scan:
67+ if (srv_flush_neighbor_pages == 2) {
68+ if (is_forward_scan) {
69+ i = offset;
70+ }
71+ else {
72+ i = offset - 1;
73+ }
74+ }
75+ else {
76+ i = low;
77+ }
78+
79+ for (; is_forward_scan ? (i < high) : (i >= low);
80+ is_forward_scan ? i++ : i--) {
81
82 buf_page_t* bpage;
83
84@@ -1434,6 +1460,12 @@
85 if (!bpage) {
86
87 buf_pool_mutex_exit(buf_pool);
88+ if (srv_flush_neighbor_pages == 2) {
89+
90+ /* This is contiguous neighbor page flush and
91+ the pages here are not contiguous. */
92+ break;
93+ }
94 continue;
95 }
96
97@@ -1470,6 +1502,22 @@
98 }
99 }
100 buf_pool_mutex_exit(buf_pool);
101+
102+ if (srv_flush_neighbor_pages == 2) {
103+
104+ /* We are trying to do the contiguous neighbor page
105+ flush, but the last page we checked was unflushable,
106+ making a "hole" in the flush, so stop this attempt. */
107+ break;
108+ }
109+ }
110+
111+ if (!is_forward_scan) {
112+
113+ /* Backward scan done, now do the forward scan */
114+ ut_a (srv_flush_neighbor_pages == 2);
115+ is_forward_scan = TRUE;
116+ goto scan;
117 }
118
119 return(count);
db82db79
AM
120--- a/storage/innobase/buf/buf0rea.c
121+++ b/storage/innobase/buf/buf0rea.c
734d6226
AM
122@@ -427,6 +427,10 @@
123 = BUF_READ_AHEAD_AREA(buf_pool);
b4e1fa2c
AM
124 ulint threshold;
125
126+ if (!(srv_read_ahead & 2)) {
127+ return(0);
128+ }
129+
130 if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
131 /* No read-ahead to avoid thread deadlocks */
132 return(0);
db82db79
AM
133--- a/storage/innobase/fil/fil0fil.c
134+++ b/storage/innobase/fil/fil0fil.c
29ffd636 135@@ -2609,7 +2609,7 @@
413cadc7
AM
136
137 os_thread_sleep(20000);
138
139- fil_flush(id);
140+ fil_flush(id, TRUE);
141
142 goto retry;
143
29ffd636 144@@ -2823,7 +2823,7 @@
413cadc7
AM
145 goto error_exit;
146 }
147
148- ret = os_file_flush(file);
149+ ret = os_file_flush(file, TRUE);
150
151 if (!ret) {
152 fputs("InnoDB: Error: file flush of tablespace ", stderr);
29ffd636 153@@ -3009,7 +3009,7 @@
413cadc7
AM
154 }
155 }
156
157- success = os_file_flush(file);
158+ success = os_file_flush(file, TRUE);
159 if (!success) {
160
161 goto func_exit;
29ffd636 162@@ -3031,7 +3031,7 @@
413cadc7
AM
163
164 goto func_exit;
165 }
166- success = os_file_flush(file);
167+ success = os_file_flush(file, TRUE);
168 func_exit:
169 os_file_close(file);
170 ut_free(buf2);
29ffd636 171@@ -4014,7 +4014,7 @@
413cadc7
AM
172 size_after_extend, *actual_size); */
173 mutex_exit(&fil_system->mutex);
174
175- fil_flush(space_id);
176+ fil_flush(space_id, TRUE);
177
178 return(success);
179 }
29ffd636 180@@ -4585,8 +4585,9 @@
413cadc7
AM
181 void
182 fil_flush(
183 /*======*/
184- ulint space_id) /*!< in: file space id (this can be a group of
185+ ulint space_id, /*!< in: file space id (this can be a group of
186 log files or a tablespace of the database) */
187+ ibool metadata)
188 {
189 fil_space_t* space;
190 fil_node_t* node;
29ffd636 191@@ -4657,7 +4658,7 @@
413cadc7
AM
192 /* fprintf(stderr, "Flushing to file %s\n",
193 node->name); */
194
195- os_file_flush(file);
196+ os_file_flush(file, metadata);
197
198 mutex_enter(&fil_system->mutex);
199
29ffd636 200@@ -4740,7 +4741,7 @@
413cadc7
AM
201 a non-existing space id. */
202 for (i = 0; i < n_space_ids; i++) {
203
204- fil_flush(space_ids[i]);
205+ fil_flush(space_ids[i], TRUE);
206 }
207
208 mem_free(space_ids);
db82db79
AM
209--- a/storage/innobase/handler/ha_innodb.cc
210+++ b/storage/innobase/handler/ha_innodb.cc
211@@ -445,6 +445,12 @@
b4e1fa2c
AM
212 "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
213 NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0);
214
215+static MYSQL_THDVAR_ULONG(flush_log_at_trx_commit, PLUGIN_VAR_OPCMDARG,
216+ "Set to 0 (write and flush once per second),"
217+ " 1 (write and flush at each commit)"
218+ " or 2 (write at commit, flush once per second).",
219+ NULL, NULL, 1, 0, 2, 0);
220+
221
222 static handler *innobase_create_handler(handlerton *hton,
223 TABLE_SHARE *table,
734d6226 224@@ -841,6 +847,17 @@
b4e1fa2c
AM
225 }
226 }
227
228+/******************************************************************//**
229+Reads the flush_log_at_trx_commit THDVAR for the given thread handle. @return the THDVAR value */
230+extern "C" UNIV_INTERN
231+ulong
232+thd_flush_log_at_trx_commit(
233+/*================================*/
234+ void* thd) /*!< in: thread handle, cast to THD* */
235+{
236+ return(THDVAR((THD*) thd, flush_log_at_trx_commit));
237+}
238+
239 /********************************************************************//**
240 Obtain the InnoDB transaction of a MySQL thread.
241 @return reference to transaction pointer */
734d6226 242@@ -2471,6 +2488,9 @@
b4e1fa2c
AM
243 srv_n_read_io_threads = (ulint) innobase_read_io_threads;
244 srv_n_write_io_threads = (ulint) innobase_write_io_threads;
245
246+ srv_read_ahead &= 3;
247+ srv_adaptive_flushing_method %= 3;
248+
249 srv_force_recovery = (ulint) innobase_force_recovery;
250
251 srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
29ffd636 252@@ -11141,7 +11161,7 @@
b4e1fa2c 253 PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
11822e22 254 "Purge threads can be either 0 or 1.",
b4e1fa2c
AM
255 NULL, NULL,
256- 0, /* Default setting */
257+ 1, /* Default setting */
258 0, /* Minimum value */
259 1, 0); /* Maximum value */
260
29ffd636 261@@ -11183,12 +11203,18 @@
b4e1fa2c
AM
262 innodb_file_format_max_validate,
263 innodb_file_format_max_update, "Antelope");
264
265-static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
266- PLUGIN_VAR_OPCMDARG,
267- "Set to 0 (write and flush once per second),"
268- " 1 (write and flush at each commit)"
269- " or 2 (write at commit, flush once per second).",
270- NULL, NULL, 1, 0, 2, 0);
271+/* Changed to the THDVAR */
272+//static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
273+// PLUGIN_VAR_OPCMDARG,
274+// "Set to 0 (write and flush once per second),"
275+// " 1 (write and flush at each commit)"
276+// " or 2 (write at commit, flush once per second).",
277+// NULL, NULL, 1, 0, 2, 0);
278+
279+static MYSQL_SYSVAR_BOOL(use_global_flush_log_at_trx_commit, srv_use_global_flush_log_at_trx_commit,
280+ PLUGIN_VAR_NOCMDARG,
281+ "Use global innodb_flush_log_at_trx_commit value. (default: ON).",
282+ NULL, NULL, TRUE);
283
284 static MYSQL_SYSVAR_STR(flush_method, innobase_file_flush_method,
285 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
29ffd636 286@@ -11293,7 +11319,7 @@
b4e1fa2c
AM
287 static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size,
288 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
289 "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
290- NULL, NULL, 128*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L);
291+ NULL, NULL, 128*1024*1024L, 32*1024*1024L, LONGLONG_MAX, 1024*1024L);
292
293 static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
294 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
543222d2 295@@ -11442,6 +11468,127 @@
b4e1fa2c
AM
296 "trigger a readahead.",
297 NULL, NULL, 56, 0, 64, 0);
298
299+static MYSQL_SYSVAR_LONGLONG(ibuf_max_size, srv_ibuf_max_size,
300+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
301+ "The maximum size of the insert buffer. (in bytes)",
302+ NULL, NULL, LONGLONG_MAX, 0, LONGLONG_MAX, 0);
303+
304+static MYSQL_SYSVAR_ULONG(ibuf_active_contract, srv_ibuf_active_contract,
305+ PLUGIN_VAR_RQCMDARG,
306+ "Enable/Disable active_contract of insert buffer. 0:disable 1:enable",
307+ NULL, NULL, 1, 0, 1, 0);
308+
309+static MYSQL_SYSVAR_ULONG(ibuf_accel_rate, srv_ibuf_accel_rate,
310+ PLUGIN_VAR_RQCMDARG,
311+ "Tunes amount of insert buffer processing of background, in addition to innodb_io_capacity. (in percentage)",
312+ NULL, NULL, 100, 100, 999999999, 0);
313+
314+static MYSQL_SYSVAR_ULONG(checkpoint_age_target, srv_checkpoint_age_target,
315+ PLUGIN_VAR_RQCMDARG,
316+ "Control soft limit of checkpoint age. (0 : not control)",
317+ NULL, NULL, 0, 0, ~0UL, 0);
318+
1bfc1981
AM
319+static
320+void
321+innodb_flush_neighbor_pages_update(
322+ THD* thd,
323+ struct st_mysql_sys_var* var,
324+ void* var_ptr,
325+ const void* save)
326+{
327+ *(long *)var_ptr = (*(long *)save) % 3; /* compat aliases "0".."2" (indexes 3..5) fold onto 0..2 */
328+}
329+
330+const char *flush_neighbor_pages_names[]=
331+{
332+ "none", /* 0 */
333+ "area",
334+ "cont", /* 2 */
335+ /* For compatibility with the older patch */
336+ "0", /* "none" + 3 */
337+ "1", /* "area" + 3 */
338+ "2", /* "cont" + 3 */
339+ NullS
340+};
341+
342+TYPELIB flush_neighbor_pages_typelib=
343+{
344+ array_elements(flush_neighbor_pages_names) - 1,
345+ "flush_neighbor_pages_typelib",
346+ flush_neighbor_pages_names,
347+ NULL
348+};
349+
350+static MYSQL_SYSVAR_ENUM(flush_neighbor_pages, srv_flush_neighbor_pages,
351+ PLUGIN_VAR_RQCMDARG, "Neighbor page flushing behaviour: none: do not flush, "
352+ "[area]: flush selected pages one-by-one, "
353+ "cont: flush a contiguous block of pages", NULL,
354+ innodb_flush_neighbor_pages_update, 1, &flush_neighbor_pages_typelib);
b4e1fa2c
AM
355+
356+static
357+void
358+innodb_read_ahead_update(
359+ THD* thd,
360+ struct st_mysql_sys_var* var,
361+ void* var_ptr,
362+ const void* save)
363+{
364+ *(long *)var_ptr= (*(long *)save) & 3; /* keep the low two bits: compat aliases "0".."3" (indexes 4..7) map onto 0..3 */
365+}
366+const char *read_ahead_names[]=
367+{
368+ "none", /* 0 */
369+ "random",
370+ "linear",
371+ "both", /* 3 */
372+ /* For compatibility of the older patch */
373+ "0", /* 4 ("none" + 4) */
374+ "1",
375+ "2",
376+ "3", /* 7 ("both" + 4) */
377+ NullS
378+};
379+TYPELIB read_ahead_typelib=
380+{
381+ array_elements(read_ahead_names) - 1, "read_ahead_typelib",
382+ read_ahead_names, NULL
383+};
384+static MYSQL_SYSVAR_ENUM(read_ahead, srv_read_ahead,
385+ PLUGIN_VAR_RQCMDARG,
386+ "Control read ahead activity (none, random, [linear], both). [from 1.0.5: random read ahead is ignored]",
387+ NULL, innodb_read_ahead_update, 2, &read_ahead_typelib);
388+
389+static /* update hook for innodb_adaptive_flushing_method: stores the chosen method index */
390+void
391+innodb_adaptive_flushing_method_update(
392+ THD* thd, /*!< in: not used by this hook */
393+ struct st_mysql_sys_var* var, /*!< in: not used by this hook */
394+ void* var_ptr, /*!< out: where the normalised index is stored */
395+ const void* save) /*!< in: index selected from the typelib */
396+{
397+ *(long *)var_ptr= (*(long *)save) % 3; /* 3 methods; compat aliases "0".."2" (indexes 3..5) fold onto 0..2 */
398+}
399+const char *adaptive_flushing_method_names[]=
400+{
401+ "native", /* 0 */
402+ "estimate", /* 1 */
403+ "keep_average", /* 2 */
404+ /* For compatibility with the older patch */
405+ "0", /* 3 ("native" + 3) */
406+ "1", /* 4 ("estimate" + 3) */
407+ "2", /* 5 ("keep_average" + 3) */
408+ NullS
409+};
410+TYPELIB adaptive_flushing_method_typelib=
411+{
412+ array_elements(adaptive_flushing_method_names) - 1, "adaptive_flushing_method_typelib",
413+ adaptive_flushing_method_names, NULL
414+};
415+static MYSQL_SYSVAR_ENUM(adaptive_flushing_method, srv_adaptive_flushing_method,
416+ PLUGIN_VAR_RQCMDARG,
417+ "Choose method of innodb_adaptive_flushing. (native, [estimate], keep_average)",
418+ NULL, innodb_adaptive_flushing_method_update, 1, &adaptive_flushing_method_typelib);
b4e1fa2c
AM
419+
420 static struct st_mysql_sys_var* innobase_system_variables[]= {
421 MYSQL_SYSVAR(additional_mem_pool_size),
422 MYSQL_SYSVAR(autoextend_increment),
543222d2 423@@ -11462,6 +11609,7 @@
b4e1fa2c
AM
424 MYSQL_SYSVAR(file_format_check),
425 MYSQL_SYSVAR(file_format_max),
426 MYSQL_SYSVAR(flush_log_at_trx_commit),
427+ MYSQL_SYSVAR(use_global_flush_log_at_trx_commit),
428 MYSQL_SYSVAR(flush_method),
429 MYSQL_SYSVAR(force_recovery),
db82db79 430 MYSQL_SYSVAR(large_prefix),
543222d2 431@@ -11501,6 +11649,13 @@
b4e1fa2c
AM
432 MYSQL_SYSVAR(show_verbose_locks),
433 MYSQL_SYSVAR(show_locks_held),
434 MYSQL_SYSVAR(version),
435+ MYSQL_SYSVAR(ibuf_max_size),
436+ MYSQL_SYSVAR(ibuf_active_contract),
437+ MYSQL_SYSVAR(ibuf_accel_rate),
438+ MYSQL_SYSVAR(checkpoint_age_target),
439+ MYSQL_SYSVAR(flush_neighbor_pages),
440+ MYSQL_SYSVAR(read_ahead),
441+ MYSQL_SYSVAR(adaptive_flushing_method),
b4e1fa2c
AM
442 MYSQL_SYSVAR(use_sys_malloc),
443 MYSQL_SYSVAR(use_native_aio),
444 MYSQL_SYSVAR(change_buffering),
db82db79
AM
445--- a/storage/innobase/ibuf/ibuf0ibuf.c
446+++ b/storage/innobase/ibuf/ibuf0ibuf.c
1bfc1981 447@@ -523,8 +523,10 @@
b4e1fa2c
AM
448 grow in size, as the references on the upper levels of the tree can
449 change */
450
451- ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE
452- / IBUF_POOL_SIZE_PER_MAX_SIZE;
453+ ibuf->max_size = ut_min( buf_pool_get_curr_size() / UNIV_PAGE_SIZE
454+ / IBUF_POOL_SIZE_PER_MAX_SIZE, (ulint) srv_ibuf_max_size / UNIV_PAGE_SIZE);
455+
456+ srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE;
457
458 mutex_create(ibuf_pessimistic_insert_mutex_key,
459 &ibuf_pessimistic_insert_mutex,
1bfc1981 460@@ -2763,9 +2765,11 @@
b4e1fa2c
AM
461 size = ibuf->size;
462 max_size = ibuf->max_size;
463
464+ if (!srv_ibuf_active_contract) {
465 if (size < max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
466 return;
467 }
468+ }
469
470 sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC);
471
db82db79
AM
472--- a/storage/innobase/include/buf0rea.h
473+++ b/storage/innobase/include/buf0rea.h
734d6226 474@@ -149,8 +149,7 @@
b4e1fa2c
AM
475
476 /** The size in pages of the area which the read-ahead algorithms read if
477 invoked */
478-#define BUF_READ_AHEAD_AREA(b) \
479- ut_min(64, ut_2_power_up((b)->curr_size / 32))
480+#define BUF_READ_AHEAD_AREA(b) 64
481
482 /** @name Modes used in read-ahead @{ */
483 /** read only pages belonging to the insert buffer tree */
db82db79
AM
484--- a/storage/innobase/include/fil0fil.h
485+++ b/storage/innobase/include/fil0fil.h
29ffd636 486@@ -663,8 +663,9 @@
413cadc7
AM
487 void
488 fil_flush(
489 /*======*/
490- ulint space_id); /*!< in: file space id (this can be a group of
491+ ulint space_id, /*!< in: file space id (this can be a group of
492 log files or a tablespace of the database) */
493+ ibool metadata);
494 /**********************************************************************//**
495 Flushes to disk writes in file spaces of the given type possibly cached by
496 the OS. */
db82db79
AM
497--- a/storage/innobase/include/ha_prototypes.h
498+++ b/storage/innobase/include/ha_prototypes.h
adf0fb13 499@@ -284,6 +284,13 @@
b4e1fa2c
AM
500 /*===================*/
501 void* thd, /*!< in: thread handle (THD*) */
502 ulint value); /*!< in: time waited for the lock */
503+/******************************************************************//**
504+Reads the flush_log_at_trx_commit setting for the given thread handle. @return the setting's value */
505+
506+ulong
507+thd_flush_log_at_trx_commit(
508+/*================================*/
509+ void* thd); /*!< in: thread handle */
510
adf0fb13
AM
511 /**********************************************************************//**
512 Get the current setting of the lower_case_table_names global parameter from
db82db79
AM
513--- a/storage/innobase/include/os0file.h
514+++ b/storage/innobase/include/os0file.h
413cadc7
AM
515@@ -296,8 +296,8 @@
516 pfs_os_file_write_func(name, file, buf, offset, offset_high, \
517 n, __FILE__, __LINE__)
518
519-# define os_file_flush(file) \
520- pfs_os_file_flush_func(file, __FILE__, __LINE__)
521+# define os_file_flush(file, metadata) \
522+ pfs_os_file_flush_func(file, metadata, __FILE__, __LINE__)
523
524 # define os_file_rename(key, oldpath, newpath) \
525 pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__)
526@@ -333,7 +333,7 @@
527 # define os_file_write(name, file, buf, offset, offset_high, n) \
528 os_file_write_func(name, file, buf, offset, offset_high, n)
529
530-# define os_file_flush(file) os_file_flush_func(file)
531+# define os_file_flush(file, metadata) os_file_flush_func(file, metadata)
532
533 # define os_file_rename(key, oldpath, newpath) \
534 os_file_rename_func(oldpath, newpath)
535@@ -781,6 +781,7 @@
536 pfs_os_file_flush_func(
537 /*===================*/
538 os_file_t file, /*!< in, own: handle to a file */
539+ ibool metadata,
540 const char* src_file,/*!< in: file name where func invoked */
541 ulint src_line);/*!< in: line where the func invoked */
542
543@@ -860,7 +861,8 @@
544 ibool
545 os_file_flush_func(
546 /*===============*/
547- os_file_t file); /*!< in, own: handle to a file */
548+ os_file_t file, /*!< in, own: handle to a file */
549+ ibool metadata);
550 /***********************************************************************//**
551 Retrieves the last error number if an error occurs in a file io function.
552 The number should be retrieved before any other OS calls (because they may
db82db79
AM
553--- a/storage/innobase/include/os0file.ic
554+++ b/storage/innobase/include/os0file.ic
413cadc7
AM
555@@ -369,6 +369,7 @@
556 pfs_os_file_flush_func(
557 /*===================*/
558 os_file_t file, /*!< in, own: handle to a file */
559+ ibool metadata,
560 const char* src_file,/*!< in: file name where func invoked */
561 ulint src_line)/*!< in: line where the func invoked */
562 {
563@@ -378,7 +379,7 @@
564
565 register_pfs_file_io_begin(&state, locker, file, 0, PSI_FILE_SYNC,
566 src_file, src_line);
567- result = os_file_flush_func(file);
568+ result = os_file_flush_func(file, metadata);
569
570 register_pfs_file_io_end(locker, 0);
571
db82db79
AM
572--- a/storage/innobase/include/srv0srv.h
573+++ b/storage/innobase/include/srv0srv.h
adf0fb13 574@@ -138,7 +138,8 @@
b4e1fa2c
AM
575 extern ulint srv_n_log_files;
576 extern ulint srv_log_file_size;
577 extern ulint srv_log_buffer_size;
578-extern ulong srv_flush_log_at_trx_commit;
579+//extern ulong srv_flush_log_at_trx_commit;
580+extern char srv_use_global_flush_log_at_trx_commit;
581 extern char srv_adaptive_flushing;
582
734d6226
AM
583 /* If this flag is TRUE, then we will load the indexes' (and tables') metadata
584@@ -221,6 +222,16 @@
b4e1fa2c
AM
585 extern ulong srv_max_purge_lag;
586
587 extern ulong srv_replication_delay;
588+
589+extern long long srv_ibuf_max_size;
590+extern ulint srv_ibuf_active_contract;
591+extern ulint srv_ibuf_accel_rate;
592+extern ulint srv_checkpoint_age_target;
593+extern ulint srv_flush_neighbor_pages;
594+extern ulint srv_enable_unsafe_group_commit;
595+extern ulint srv_read_ahead;
596+extern ulint srv_adaptive_flushing_method;
597+
598 /*-------------------------------------------*/
599
600 extern ulint srv_n_rows_inserted;
543222d2 601@@ -399,8 +410,9 @@
b4e1fa2c
AM
602 when writing data files, but do flush
603 after writing to log files */
604 SRV_UNIX_NOSYNC, /*!< do not flush after writing */
605- SRV_UNIX_O_DIRECT /*!< invoke os_file_set_nocache() on
606+ SRV_UNIX_O_DIRECT, /*!< invoke os_file_set_nocache() on
607 data files */
608+ SRV_UNIX_ALL_O_DIRECT /* new method for examination: logfile also open O_DIRECT */
609 };
610
611 /** Alternatives for file i/o in Windows */
db82db79
AM
612--- a/storage/innobase/log/log0log.c
613+++ b/storage/innobase/log/log0log.c
d8778560
AM
614@@ -48,6 +48,7 @@
615 #include "srv0start.h"
616 #include "trx0sys.h"
617 #include "trx0trx.h"
618+#include "ha_prototypes.h"
619
620 /*
621 General philosophy of InnoDB redo-logs:
622@@ -359,6 +360,33 @@
b4e1fa2c
AM
623 }
624
625 /************************************************************//**
626+Async modified-age limit, capped by srv_checkpoint_age_target less one eighth of it when that is non-zero. @return limit */
627+UNIV_INLINE
628+ulint
629+log_max_modified_age_async()
630+{
631+ if (srv_checkpoint_age_target) {
632+ return(ut_min(log_sys->max_modified_age_async,
633+ srv_checkpoint_age_target
634+ - srv_checkpoint_age_target / 8));
635+ } else {
636+ return(log_sys->max_modified_age_async);
637+ }
638+}
639+
640+UNIV_INLINE
641+ulint
642+log_max_checkpoint_age_async()
643+{
644+ if (srv_checkpoint_age_target) { /* non-zero: cap the async checkpoint-age limit by the target */
645+ return(ut_min(log_sys->max_checkpoint_age_async,
646+ srv_checkpoint_age_target));
647+ } else { /* zero: use the log system's own limit unchanged */
648+ return(log_sys->max_checkpoint_age_async);
649+ }
650+}
651+
652+/************************************************************//**
653 Closes the log.
654 @return lsn */
655 UNIV_INTERN
d8778560 656@@ -427,7 +455,7 @@
b4e1fa2c
AM
657 }
658 }
659
660- if (checkpoint_age <= log->max_modified_age_async) {
661+ if (checkpoint_age <= log_max_modified_age_async()) {
662
663 goto function_exit;
664 }
d8778560 665@@ -435,8 +463,8 @@
b4e1fa2c
AM
666 oldest_lsn = buf_pool_get_oldest_modification();
667
668 if (!oldest_lsn
669- || lsn - oldest_lsn > log->max_modified_age_async
670- || checkpoint_age > log->max_checkpoint_age_async) {
671+ || lsn - oldest_lsn > log_max_modified_age_async()
672+ || checkpoint_age > log_max_checkpoint_age_async()) {
673
674 log->check_flush_or_checkpoint = TRUE;
675 }
413cadc7 676@@ -1100,9 +1128,10 @@
b4e1fa2c
AM
677 group = (log_group_t*)((ulint)group - 1);
678
679 if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
680+ && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
681 && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
682
413cadc7
AM
683- fil_flush(group->space_id);
684+ fil_flush(group->space_id, FALSE);
685 }
686
687 #ifdef UNIV_DEBUG
688@@ -1121,10 +1150,11 @@
b4e1fa2c
AM
689 logs and cannot end up here! */
690
691 if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
692+ && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
693 && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
694- && srv_flush_log_at_trx_commit != 2) {
695+ && thd_flush_log_at_trx_commit(NULL) != 2) {
696
413cadc7
AM
697- fil_flush(group->space_id);
698+ fil_flush(group->space_id, FALSE);
b4e1fa2c 699 }
413cadc7
AM
700
701 mutex_enter(&(log_sys->mutex));
d8778560 702@@ -1501,7 +1531,8 @@
b4e1fa2c
AM
703
704 mutex_exit(&(log_sys->mutex));
705
706- if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
707+ if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC
708+ || srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
709 /* O_DSYNC means the OS did not buffer the log file at all:
710 so we have also flushed to disk what we have written */
711
413cadc7
AM
712@@ -1511,7 +1542,7 @@
713
714 group = UT_LIST_GET_FIRST(log_sys->log_groups);
715
716- fil_flush(group->space_id);
717+ fil_flush(group->space_id, FALSE);
718 log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
719 }
720
543222d2 721@@ -2120,10 +2151,10 @@
b4e1fa2c
AM
722
723 sync = TRUE;
543222d2 724 advance = 2 * (age - log->max_modified_age_sync);
b4e1fa2c
AM
725- } else if (age > log->max_modified_age_async) {
726+ } else if (age > log_max_modified_age_async()) {
727
728 /* A flush is not urgent: we do an asynchronous preflush */
729- advance = age - log->max_modified_age_async;
730+ advance = age - log_max_modified_age_async();
731 } else {
732 advance = 0;
733 }
543222d2 734@@ -2137,7 +2168,7 @@
b4e1fa2c
AM
735
736 do_checkpoint = TRUE;
737
738- } else if (checkpoint_age > log->max_checkpoint_age_async) {
739+ } else if (checkpoint_age > log_max_checkpoint_age_async()) {
740 /* A checkpoint is not urgent: do it asynchronously */
741
742 do_checkpoint = TRUE;
543222d2 743@@ -2607,7 +2638,7 @@
413cadc7
AM
744
745 mutex_exit(&(log_sys->mutex));
746
747- fil_flush(group->archive_space_id);
748+ fil_flush(group->archive_space_id, TRUE);
749
750 mutex_enter(&(log_sys->mutex));
751
543222d2 752@@ -3349,6 +3380,17 @@
b4e1fa2c
AM
753 log_sys->flushed_to_disk_lsn,
754 log_sys->last_checkpoint_lsn);
755
756+ fprintf(file,
757+ "Max checkpoint age %lu\n"
758+ "Checkpoint age target %lu\n"
759+ "Modified age %lu\n"
760+ "Checkpoint age %lu\n",
761+ (ulong) log_sys->max_checkpoint_age,
762+ (ulong) log_max_checkpoint_age_async(),
763+ (ulong) (log_sys->lsn -
764+ log_buf_pool_get_oldest_modification()),
765+ (ulong) (log_sys->lsn - log_sys->last_checkpoint_lsn));
766+
767 current_time = time(NULL);
768
769 time_elapsed = 0.001 + difftime(current_time,
db82db79
AM
770--- a/storage/innobase/log/log0recv.c
771+++ b/storage/innobase/log/log0recv.c
b4e1fa2c
AM
772@@ -2906,9 +2906,12 @@
773 ib_uint64_t archived_lsn;
774 #endif /* UNIV_LOG_ARCHIVE */
775 byte* buf;
776- byte log_hdr_buf[LOG_FILE_HDR_SIZE];
777+ byte* log_hdr_buf;
778+ byte log_hdr_buf_base[LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE];
779 ulint err;
780
781+ log_hdr_buf = ut_align(log_hdr_buf_base, OS_FILE_LOG_BLOCK_SIZE);
782+
783 #ifdef UNIV_LOG_ARCHIVE
784 ut_ad(type != LOG_CHECKPOINT || limit_lsn == IB_ULONGLONG_MAX);
785 /** TRUE when recovering from a checkpoint */
413cadc7
AM
786@@ -3468,7 +3471,7 @@
787 exit(1);
788 }
789
790- os_file_flush(log_file);
791+ os_file_flush(log_file, TRUE);
792 os_file_close(log_file);
793 }
794
795@@ -3492,7 +3495,7 @@
796
797 os_file_write(name, log_file, buf, 0, 0,
798 LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
799- os_file_flush(log_file);
800+ os_file_flush(log_file, TRUE);
801 os_file_close(log_file);
802
803 ut_free(buf);
db82db79
AM
804--- a/storage/innobase/os/os0file.c
805+++ b/storage/innobase/os/os0file.c
d8778560 806@@ -1424,7 +1424,7 @@
b4e1fa2c
AM
807 #endif
808 #ifdef UNIV_NON_BUFFERED_IO
809 # ifndef UNIV_HOTBACKUP
810- if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
811+ if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) {
812 /* Do not use unbuffered i/o to log files because
813 value 2 denotes that we do not flush the log at every
814 commit, but only once per second */
d8778560 815@@ -1440,7 +1440,7 @@
b4e1fa2c
AM
816 attributes = 0;
817 #ifdef UNIV_NON_BUFFERED_IO
818 # ifndef UNIV_HOTBACKUP
819- if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
820+ if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) {
821 /* Do not use unbuffered i/o to log files because
822 value 2 denotes that we do not flush the log at every
823 commit, but only once per second */
d8778560 824@@ -1585,6 +1585,11 @@
b4e1fa2c
AM
825 os_file_set_nocache(file, name, mode_str);
826 }
827
828+ /* ALL_O_DIRECT: O_DIRECT also for transaction log file */
829+ if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
830+ os_file_set_nocache(file, name, mode_str);
831+ }
832+
833 #ifdef USE_FILE_LOCK
834 if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
835
413cadc7
AM
836@@ -2008,7 +2013,7 @@
837
838 ut_free(buf2);
839
840- ret = os_file_flush(file);
841+ ret = os_file_flush(file, TRUE);
842
843 if (ret) {
844 return(TRUE);
845@@ -2046,7 +2051,8 @@
846 int
847 os_file_fsync(
848 /*==========*/
849- os_file_t file) /*!< in: handle to a file */
850+ os_file_t file, /*!< in: handle to a file */
851+ ibool metadata)
852 {
853 int ret;
854 int failures;
db82db79 855@@ -2055,7 +2061,16 @@
413cadc7
AM
856 failures = 0;
857
858 do {
db82db79 859+#if defined(HAVE_FDATASYNC) && HAVE_DECL_FDATASYNC
413cadc7
AM
860+ if (metadata) {
861+ ret = fsync(file);
862+ } else {
863+ ret = fdatasync(file);
864+ }
865+#else
db82db79 866+ (void) metadata;
413cadc7
AM
867 ret = fsync(file);
868+#endif
869
870 os_n_fsyncs++;
871
db82db79 872@@ -2092,7 +2107,8 @@
413cadc7
AM
873 ibool
874 os_file_flush_func(
875 /*===============*/
876- os_file_t file) /*!< in, own: handle to a file */
877+ os_file_t file, /*!< in, own: handle to a file */
878+ ibool metadata)
879 {
880 #ifdef __WIN__
881 BOOL ret;
db82db79 882@@ -2142,18 +2158,18 @@
413cadc7
AM
883 /* If we are not on an operating system that supports this,
884 then fall back to a plain fsync. */
885
886- ret = os_file_fsync(file);
887+ ret = os_file_fsync(file, metadata);
888 } else {
889 ret = fcntl(file, F_FULLFSYNC, NULL);
890
891 if (ret) {
892 /* If we are not on a file system that supports this,
893 then fall back to a plain fsync. */
894- ret = os_file_fsync(file);
895+ ret = os_file_fsync(file, metadata);
896 }
897 }
898 #else
899- ret = os_file_fsync(file);
900+ ret = os_file_fsync(file, metadata);
901 #endif
902
903 if (ret == 0) {
db82db79 904@@ -2336,7 +2352,7 @@
413cadc7
AM
905 the OS crashes, a database page is only partially
906 physically written to disk. */
907
908- ut_a(TRUE == os_file_flush(file));
909+ ut_a(TRUE == os_file_flush(file, TRUE));
910 }
911 # endif /* UNIV_DO_FLUSH */
912
db82db79 913@@ -2378,7 +2394,7 @@
413cadc7
AM
914 the OS crashes, a database page is only partially
915 physically written to disk. */
916
917- ut_a(TRUE == os_file_flush(file));
918+ ut_a(TRUE == os_file_flush(file, TRUE));
919 }
920 # endif /* UNIV_DO_FLUSH */
921
db82db79 922@@ -2750,7 +2766,7 @@
413cadc7
AM
923
924 # ifdef UNIV_DO_FLUSH
925 if (!os_do_not_call_flush_at_each_write) {
926- ut_a(TRUE == os_file_flush(file));
927+ ut_a(TRUE == os_file_flush(file, TRUE));
928 }
929 # endif /* UNIV_DO_FLUSH */
930
db82db79 931@@ -4296,7 +4312,7 @@
413cadc7
AM
932 #ifdef UNIV_DO_FLUSH
933 if (slot->type == OS_FILE_WRITE
934 && !os_do_not_call_flush_at_each_write) {
935- if (!os_file_flush(slot->file)) {
936+ if (!os_file_flush(slot->file, TRUE)) {
937 ut_error;
938 }
939 }
db82db79 940@@ -4597,7 +4613,7 @@
413cadc7
AM
941 #ifdef UNIV_DO_FLUSH
942 if (slot->type == OS_FILE_WRITE
943 && !os_do_not_call_flush_at_each_write)
944- && !os_file_flush(slot->file) {
945+ && !os_file_flush(slot->file, TRUE) {
946 ut_error;
947 }
948 #endif /* UNIV_DO_FLUSH */
db82db79
AM
949--- a/storage/innobase/srv/srv0srv.c
950+++ b/storage/innobase/srv/srv0srv.c
adf0fb13 951@@ -183,7 +183,8 @@
b4e1fa2c
AM
952 UNIV_INTERN ulint srv_log_file_size = ULINT_MAX;
953 /* size in database pages */
954 UNIV_INTERN ulint srv_log_buffer_size = ULINT_MAX;
955-UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1;
956+//UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1;
957+UNIV_INTERN char srv_use_global_flush_log_at_trx_commit = TRUE;
958
959 /* Try to flush dirty pages so as to avoid IO bursts at
960 the checkpoints. */
734d6226 961@@ -404,6 +405,17 @@
b4e1fa2c
AM
962
963 UNIV_INTERN ulong srv_replication_delay = 0;
964
965+UNIV_INTERN long long srv_ibuf_max_size = 0;
966+UNIV_INTERN ulint srv_ibuf_active_contract = 0; /* 0:disable 1:enable */
967+UNIV_INTERN ulint srv_ibuf_accel_rate = 100;
968+#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) pct / 10000.0)))
969+
970+UNIV_INTERN ulint srv_checkpoint_age_target = 0;
1bfc1981 971+UNIV_INTERN ulint srv_flush_neighbor_pages = 1; /* 0:disable 1:area 2:contiguous */
b4e1fa2c
AM
972+
973+UNIV_INTERN ulint srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */
974+UNIV_INTERN ulint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */
975+UNIV_INTERN ulint srv_adaptive_flushing_method = 0; /* 0: native 1: estimate 2: keep_average */
976 /*-------------------------------------------*/
977 UNIV_INTERN ulong srv_n_spin_wait_rounds = 30;
978 UNIV_INTERN ulong srv_n_free_tickets_to_enter = 500;
543222d2 979@@ -2713,7 +2725,7 @@
db82db79
AM
980
981 ut_ad(!mutex_own(&kernel_mutex));
982
983- ut_a(srv_n_purge_threads == 0);
984+ ut_a(srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0));
985
986 do {
987 /* Check for shutdown and change in purge config. */
543222d2 988@@ -2746,6 +2758,7 @@
b4e1fa2c
AM
989 ulint n_pages_purged = 0;
990 ulint n_bytes_merged;
991 ulint n_pages_flushed;
992+ ulint n_pages_flushed_prev = 0;
993 ulint n_bytes_archived;
994 ulint n_tables_to_drop;
995 ulint n_ios;
543222d2 996@@ -2753,7 +2766,20 @@
b4e1fa2c
AM
997 ulint n_ios_very_old;
998 ulint n_pend_ios;
999 ulint next_itr_time;
1000+ ulint prev_adaptive_flushing_method = ULINT_UNDEFINED;
1001+ ulint inner_loop = 0;
1002+ ibool skip_sleep = FALSE;
1003 ulint i;
1004+ struct t_prev_flush_info_struct {
1005+ ulint count;
1006+ unsigned space:32;
1007+ unsigned offset:32;
1008+ ib_uint64_t oldest_modification;
1009+ } prev_flush_info[MAX_BUFFER_POOLS];
1010+
1011+ ib_uint64_t lsn_old;
1012+
1013+ ib_uint64_t oldest_lsn;
1014
1015 #ifdef UNIV_DEBUG_THREAD_CREATION
1016 fprintf(stderr, "Master thread starts, id %lu\n",
543222d2 1017@@ -2775,6 +2801,9 @@
b4e1fa2c
AM
1018
1019 mutex_exit(&kernel_mutex);
1020
1021+ mutex_enter(&(log_sys->mutex));
1022+ lsn_old = log_sys->lsn;
1023+ mutex_exit(&(log_sys->mutex));
1024 loop:
1025 /*****************************************************************/
1026 /* ---- When there is database activity by users, we cycle in this
543222d2 1027@@ -2805,9 +2834,13 @@
b4e1fa2c
AM
1028 /* Sleep for 1 second on entrying the for loop below the first time. */
1029 next_itr_time = ut_time_ms() + 1000;
1030
1031+ skip_sleep = FALSE;
1032+
1033 for (i = 0; i < 10; i++) {
1034 ulint cur_time = ut_time_ms();
1035
1036+ n_pages_flushed = 0; /* initialize */
1037+
1038 /* ALTER TABLE in MySQL requires on Unix that the table handler
1039 can drop tables lazily after there no longer are SELECT
1040 queries to them. */
543222d2 1041@@ -2831,6 +2864,7 @@
b4e1fa2c
AM
1042 srv_main_thread_op_info = "sleeping";
1043 srv_main_1_second_loops++;
1044
1045+ if (!skip_sleep) {
1046 if (next_itr_time > cur_time
1047 && srv_shutdown_state == SRV_SHUTDOWN_NONE) {
1048
543222d2 1049@@ -2841,10 +2875,26 @@
b4e1fa2c
AM
1050 (next_itr_time - cur_time)
1051 * 1000));
1052 srv_main_sleeps++;
1053+
1054+ /*
1055+ mutex_enter(&(log_sys->mutex));
1056+ oldest_lsn = buf_pool_get_oldest_modification();
1057+ ib_uint64_t lsn = log_sys->lsn;
1058+ mutex_exit(&(log_sys->mutex));
1059+
1060+ if(oldest_lsn)
1061+ fprintf(stderr,
1062+ "InnoDB flush: age pct: %lu, lsn progress: %lu\n",
1063+ (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
1064+ lsn - lsn_old);
1065+ */
1066 }
1067
1068 /* Each iteration should happen at 1 second interval. */
1069 next_itr_time = ut_time_ms() + 1000;
1070+ } /* if (!skip_sleep) */
1071+
1072+ skip_sleep = FALSE;
1073
1074 /* Flush logs if needed */
1075 srv_sync_log_buffer_in_background();
543222d2 1076@@ -2864,7 +2914,7 @@
b4e1fa2c
AM
1077 if (n_pend_ios < SRV_PEND_IO_THRESHOLD
1078 && (n_ios - n_ios_old < SRV_RECENT_IO_ACTIVITY)) {
1079 srv_main_thread_op_info = "doing insert buffer merge";
1080- ibuf_contract_for_n_pages(FALSE, PCT_IO(5));
1081+ ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
1082
1083 /* Flush logs if needed */
1084 srv_sync_log_buffer_in_background();
543222d2 1085@@ -2881,7 +2931,11 @@
b4e1fa2c
AM
1086 n_pages_flushed = buf_flush_list(
1087 PCT_IO(100), IB_ULONGLONG_MAX);
1088
1089- } else if (srv_adaptive_flushing) {
1090+ mutex_enter(&(log_sys->mutex));
1091+ lsn_old = log_sys->lsn;
1092+ mutex_exit(&(log_sys->mutex));
1093+ prev_adaptive_flushing_method = ULINT_UNDEFINED;
1094+ } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 0) {
1095
1096 /* Try to keep the rate of flushing of dirty
1097 pages such that redo log generation does not
543222d2 1098@@ -2897,6 +2951,224 @@
b4e1fa2c
AM
1099 n_flush,
1100 IB_ULONGLONG_MAX);
1101 }
1102+
1103+ mutex_enter(&(log_sys->mutex));
1104+ lsn_old = log_sys->lsn;
1105+ mutex_exit(&(log_sys->mutex));
1106+ prev_adaptive_flushing_method = ULINT_UNDEFINED;
1107+ } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 1) {
1108+
1109+ /* Try to keep the modified age from exceeding
1110+ the max_checkpoint_age * 7/8 line */
1111+
1112+ mutex_enter(&(log_sys->mutex));
1113+
1114+ oldest_lsn = buf_pool_get_oldest_modification();
1115+ if (oldest_lsn == 0) {
1116+ lsn_old = log_sys->lsn;
1117+ mutex_exit(&(log_sys->mutex));
1118+
1119+ } else {
1120+ if ((log_sys->lsn - oldest_lsn)
1121+ > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
1122+ /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
1123+ /* We should not flush from here. */
1124+ lsn_old = log_sys->lsn;
1125+ mutex_exit(&(log_sys->mutex));
1126+ } else if ((log_sys->lsn - oldest_lsn)
1127+ > (log_sys->max_checkpoint_age)/4 ) {
1128+
1129+ /* defence line (max_checkpoint_age * 1/4) */
1130+ ib_uint64_t lsn = log_sys->lsn;
1131+
1132+ ib_uint64_t level, bpl;
1133+ buf_page_t* bpage;
1134+ ulint j;
1135+
1136+ mutex_exit(&(log_sys->mutex));
1137+
1138+ bpl = 0;
1139+
1140+ for (j = 0; j < srv_buf_pool_instances; j++) {
1141+ buf_pool_t* buf_pool;
1142+ ulint n_blocks;
1143+
1144+ buf_pool = buf_pool_from_array(j);
1145+
1146+ /* Scanning the flush_list is optimistic here */
1147+
1148+ level = 0;
1149+ n_blocks = 0;
1150+ bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
1151+
1152+ while (bpage != NULL) {
1153+ ib_uint64_t oldest_modification = bpage->oldest_modification;
1154+ if (oldest_modification != 0) {
1155+ level += log_sys->max_checkpoint_age
1156+ - (lsn - oldest_modification);
1157+ }
1158+ bpage = UT_LIST_GET_NEXT(list, bpage);
1159+ n_blocks++;
1160+ }
1161+
1162+ if (level) {
1163+ bpl += ((ib_uint64_t) n_blocks * n_blocks
1164+ * (lsn - lsn_old)) / level;
1165+ }
1166+
1167+ }
1168+
1169+ if (!srv_use_doublewrite_buf) {
1170+ /* flushing is faster when the doublewrite buffer is not used */
1171+ bpl = (bpl * 7) / 8;
1172+ }
1173+
1174+ if (bpl) {
1175+retry_flush_batch:
1176+ n_pages_flushed = buf_flush_list(bpl,
1177+ oldest_lsn + (lsn - lsn_old));
1178+ if (n_pages_flushed == ULINT_UNDEFINED) {
1179+ os_thread_sleep(5000);
1180+ goto retry_flush_batch;
1181+ }
1182+ }
1183+
1184+ lsn_old = lsn;
1185+ /*
1186+ fprintf(stderr,
1187+ "InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n",
1188+ (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
1189+ lsn - lsn_old, bpl);
1190+ */
1191+ } else {
1192+ lsn_old = log_sys->lsn;
1193+ mutex_exit(&(log_sys->mutex));
1194+ }
1195+ }
1196+ prev_adaptive_flushing_method = 1;
1197+ } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 2) {
1198+ buf_pool_t* buf_pool;
1199+ buf_page_t* bpage;
1200+ ib_uint64_t lsn;
1201+ ulint j;
1202+
1203+ mutex_enter(&(log_sys->mutex));
1204+ oldest_lsn = buf_pool_get_oldest_modification();
1205+ lsn = log_sys->lsn;
1206+ mutex_exit(&(log_sys->mutex));
1207+
1208+ /* raise the loop rate x10: iterate every 100 ms instead of every second */
1209+ next_itr_time -= 900; /* 1000 - 900 == 100 */
1210+ inner_loop++;
1211+ if (inner_loop < 10) {
1212+ i--;
1213+ } else {
1214+ inner_loop = 0;
1215+ }
1216+
1217+ if (prev_adaptive_flushing_method == 2) {
1218+ lint n_flush;
d8778560
AM
1219+ lint blocks_sum;
1220+ ulint new_blocks_sum, flushed_blocks_sum;
b4e1fa2c
AM
1221+
1222+ blocks_sum = new_blocks_sum = flushed_blocks_sum = 0;
1223+
1224+ /* prev_flush_info[j] should hold the values from the previous loop */
1225+ for (j = 0; j < srv_buf_pool_instances; j++) {
1226+ lint blocks_num, new_blocks_num, flushed_blocks_num;
1227+ ibool found;
1228+
1229+ buf_pool = buf_pool_from_array(j);
1230+
1231+ blocks_num = UT_LIST_GET_LEN(buf_pool->flush_list);
1232+ bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
1233+ new_blocks_num = 0;
1234+
1235+ found = FALSE;
1236+ while (bpage != NULL) {
1237+ if (prev_flush_info[j].space == bpage->space
1238+ && prev_flush_info[j].offset == bpage->offset
1239+ && prev_flush_info[j].oldest_modification
1240+ == bpage->oldest_modification) {
1241+ found = TRUE;
1242+ break;
1243+ }
1244+ bpage = UT_LIST_GET_NEXT(list, bpage);
1245+ new_blocks_num++;
1246+ }
1247+ if (!found) {
1248+ new_blocks_num = blocks_num;
1249+ }
1250+
1251+ flushed_blocks_num = new_blocks_num + prev_flush_info[j].count
1252+ - blocks_num;
1253+ if (flushed_blocks_num < 0) {
1254+ flushed_blocks_num = 0;
1255+ }
1256+
1257+ bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
1258+
1259+ prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list);
1260+ if (bpage) {
1261+ prev_flush_info[j].space = bpage->space;
1262+ prev_flush_info[j].offset = bpage->offset;
1263+ prev_flush_info[j].oldest_modification = bpage->oldest_modification;
1264+ } else {
1265+ prev_flush_info[j].space = 0;
1266+ prev_flush_info[j].offset = 0;
1267+ prev_flush_info[j].oldest_modification = 0;
1268+ }
1269+
1270+ new_blocks_sum += new_blocks_num;
1271+ flushed_blocks_sum += flushed_blocks_num;
1272+ blocks_sum += blocks_num;
1273+ }
1274+
1275+ n_flush = blocks_sum * (lsn - lsn_old) / log_sys->max_modified_age_async;
1276+ if (flushed_blocks_sum > n_pages_flushed_prev) {
1277+ n_flush -= (flushed_blocks_sum - n_pages_flushed_prev);
1278+ }
1279+
1280+ if (n_flush > 0) {
1281+ n_flush++;
1282+ n_pages_flushed = buf_flush_list(n_flush, oldest_lsn + (lsn - lsn_old));
1283+ } else {
1284+ n_pages_flushed = 0;
1285+ }
1286+ } else {
1287+ /* store previous first pages of the flush_list */
1288+ for (j = 0; j < srv_buf_pool_instances; j++) {
1289+ buf_pool = buf_pool_from_array(j);
1290+
1291+ bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
1292+
1293+ prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list);
1294+ if (bpage) {
1295+ prev_flush_info[j].space = bpage->space;
1296+ prev_flush_info[j].offset = bpage->offset;
1297+ prev_flush_info[j].oldest_modification = bpage->oldest_modification;
1298+ } else {
1299+ prev_flush_info[j].space = 0;
1300+ prev_flush_info[j].offset = 0;
1301+ prev_flush_info[j].oldest_modification = 0;
1302+ }
1303+ }
1304+ n_pages_flushed = 0;
1305+ }
1306+
1307+ lsn_old = lsn;
1308+ prev_adaptive_flushing_method = 2;
1309+ } else {
1310+ mutex_enter(&(log_sys->mutex));
1311+ lsn_old = log_sys->lsn;
1312+ mutex_exit(&(log_sys->mutex));
1313+ prev_adaptive_flushing_method = ULINT_UNDEFINED;
1314+ }
1315+
1316+ if (n_pages_flushed == ULINT_UNDEFINED) {
1317+ n_pages_flushed_prev = 0;
1318+ } else {
1319+ n_pages_flushed_prev = n_pages_flushed;
1320 }
1321
1322 if (srv_activity_count == old_activity_count) {
543222d2 1323@@ -2945,12 +3217,12 @@
b4e1fa2c
AM
1324 even if the server were active */
1325
1326 srv_main_thread_op_info = "doing insert buffer merge";
1327- ibuf_contract_for_n_pages(FALSE, PCT_IO(5));
1328+ ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
1329
1330 /* Flush logs if needed */
1331 srv_sync_log_buffer_in_background();
db82db79
AM
1332
1333- if (srv_n_purge_threads == 0) {
1334+ if (srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0)) {
1335 srv_main_thread_op_info = "master purging";
1336
1337 srv_master_do_purge();
543222d2 1338@@ -3028,7 +3300,7 @@
db82db79
AM
1339 }
1340 }
1341
1342- if (srv_n_purge_threads == 0) {
1343+ if (srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0)) {
1344 srv_main_thread_op_info = "master purging";
1345
1346 srv_master_do_purge();
543222d2 1347@@ -3053,7 +3325,7 @@
b4e1fa2c
AM
1348 buf_flush_list below. Otherwise, the system favors
1349 clean pages over cleanup throughput. */
1350 n_bytes_merged = ibuf_contract_for_n_pages(FALSE,
1351- PCT_IO(100));
1352+ PCT_IBUF_IO(100));
1353 }
1354
1355 srv_main_thread_op_info = "reserving kernel mutex";
543222d2 1356@@ -3193,6 +3465,7 @@
adf0fb13 1357 srv_slot_t* slot;
11822e22 1358 ulint retries = 0;
b4e1fa2c
AM
1359 ulint n_total_purged = ULINT_UNDEFINED;
1360+ ulint next_itr_time;
1361
1362 ut_a(srv_n_purge_threads == 1);
1363
543222d2 1364@@ -3213,9 +3486,12 @@
b4e1fa2c
AM
1365
1366 mutex_exit(&kernel_mutex);
1367
1368+ next_itr_time = ut_time_ms();
1369+
1370 while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) {
1371
11822e22 1372 ulint n_pages_purged = 0;
b4e1fa2c
AM
1373+ ulint cur_time;
1374
1375 /* If there are very few records to purge or the last
1376 purge didn't purge any records then wait for activity.
543222d2 1377@@ -3262,6 +3538,16 @@
b4e1fa2c
AM
1378 } while (n_pages_purged > 0 && !srv_fast_shutdown);
1379
1380 srv_sync_log_buffer_in_background();
1381+
1382+ cur_time = ut_time_ms();
1383+ if (next_itr_time > cur_time) {
1384+ os_thread_sleep(ut_min(1000000,
1385+ (next_itr_time - cur_time)
1386+ * 1000));
1387+ next_itr_time = ut_time_ms() + 1000;
1388+ } else {
1389+ next_itr_time = cur_time + 1000;
1390+ }
1391 }
1392
1393 mutex_enter(&kernel_mutex);
db82db79
AM
1394--- a/storage/innobase/srv/srv0start.c
1395+++ b/storage/innobase/srv/srv0start.c
29ffd636 1396@@ -1237,6 +1237,9 @@
b4e1fa2c
AM
1397 } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) {
1398 srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
1399
1400+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "ALL_O_DIRECT")) {
1401+ srv_unix_file_flush_method = SRV_UNIX_ALL_O_DIRECT;
1402+
1403 } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) {
1404 srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
1405
db82db79
AM
1406--- a/storage/innobase/trx/trx0purge.c
1407+++ b/storage/innobase/trx/trx0purge.c
11822e22
AM
1408@@ -392,10 +392,10 @@
1409 trx_sys->rseg_history_len++;
1410 mutex_exit(&kernel_mutex);
1411
1412- if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) {
1413+// if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) { /*should wake up always*/
1414 /* Inform the purge thread that there is work to do. */
1415 srv_wake_purge_thread_if_not_active();
1416- }
1417+// }
1418 }
1419
1420 /**********************************************************************//**
db82db79
AM
1421--- a/storage/innobase/trx/trx0trx.c
1422+++ b/storage/innobase/trx/trx0trx.c
adf0fb13 1423@@ -984,6 +984,7 @@
b4e1fa2c
AM
1424 trx->read_view = NULL;
1425
1426 if (lsn) {
1427+ ulint flush_log_at_trx_commit;
1428
1429 mutex_exit(&kernel_mutex);
1430
adf0fb13 1431@@ -992,6 +993,12 @@
b4e1fa2c
AM
1432 trx_undo_insert_cleanup(trx);
1433 }
1434
1435+ if (srv_use_global_flush_log_at_trx_commit) {
1436+ flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
1437+ } else {
1438+ flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
1439+ }
1440+
1441 /* NOTE that we could possibly make a group commit more
1442 efficient here: call os_thread_yield here to allow also other
1443 trxs to come to commit! */
adf0fb13 1444@@ -1023,9 +1030,9 @@
b4e1fa2c
AM
1445 if (trx->flush_log_later) {
1446 /* Do nothing yet */
1447 trx->must_flush_log_later = TRUE;
1448- } else if (srv_flush_log_at_trx_commit == 0) {
1449+ } else if (flush_log_at_trx_commit == 0) {
1450 /* Do nothing */
1451- } else if (srv_flush_log_at_trx_commit == 1) {
1452+ } else if (flush_log_at_trx_commit == 1) {
1453 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1454 /* Write the log but do not flush it to disk */
1455
adf0fb13 1456@@ -1037,7 +1044,7 @@
b4e1fa2c
AM
1457
1458 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1459 }
1460- } else if (srv_flush_log_at_trx_commit == 2) {
1461+ } else if (flush_log_at_trx_commit == 2) {
1462
1463 /* Write the log but do not flush it to disk */
1464
adf0fb13 1465@@ -1701,16 +1708,23 @@
b4e1fa2c
AM
1466 trx_t* trx) /*!< in: trx handle */
1467 {
1468 ib_uint64_t lsn = trx->commit_lsn;
1469+ ulint flush_log_at_trx_commit;
1470
1471 ut_a(trx);
1472
1473 trx->op_info = "flushing log";
1474
1475+ if (srv_use_global_flush_log_at_trx_commit) {
1476+ flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
1477+ } else {
1478+ flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
1479+ }
1480+
1481 if (!trx->must_flush_log_later) {
1482 /* Do nothing */
1483- } else if (srv_flush_log_at_trx_commit == 0) {
1484+ } else if (flush_log_at_trx_commit == 0) {
1485 /* Do nothing */
1486- } else if (srv_flush_log_at_trx_commit == 1) {
1487+ } else if (flush_log_at_trx_commit == 1) {
1488 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1489 /* Write the log but do not flush it to disk */
1490
adf0fb13 1491@@ -1721,7 +1735,7 @@
b4e1fa2c
AM
1492
1493 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1494 }
1495- } else if (srv_flush_log_at_trx_commit == 2) {
1496+ } else if (flush_log_at_trx_commit == 2) {
1497
1498 /* Write the log but do not flush it to disk */
1499
adf0fb13 1500@@ -1969,6 +1983,8 @@
b4e1fa2c
AM
1501 /*--------------------------------------*/
1502
1503 if (lsn) {
1504+ ulint flush_log_at_trx_commit;
1505+
1506 /* Depending on the my.cnf options, we may now write the log
1507 buffer to the log files, making the prepared state of the
1508 transaction durable if the OS does not crash. We may also
adf0fb13 1509@@ -1988,9 +2004,15 @@
b4e1fa2c
AM
1510
1511 mutex_exit(&kernel_mutex);
1512
1513- if (srv_flush_log_at_trx_commit == 0) {
1514+ if (srv_use_global_flush_log_at_trx_commit) {
1515+ flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
1516+ } else {
1517+ flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
1518+ }
1519+
1520+ if (flush_log_at_trx_commit == 0) {
1521 /* Do nothing */
1522- } else if (srv_flush_log_at_trx_commit == 1) {
1523+ } else if (flush_log_at_trx_commit == 1) {
1524 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1525 /* Write the log but do not flush it to disk */
1526
adf0fb13 1527@@ -2002,7 +2024,7 @@
b4e1fa2c
AM
1528
1529 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1530 }
1531- } else if (srv_flush_log_at_trx_commit == 2) {
1532+ } else if (flush_log_at_trx_commit == 2) {
1533
1534 /* Write the log but do not flush it to disk */
1535
13ceb006
AM
1536--- a/mysql-test/include/default_mysqld.cnf
1537+++ b/mysql-test/include/default_mysqld.cnf
1538@@ -29,7 +29,7 @@
1539 max_heap_table_size= 1M
1540
1541 loose-innodb_data_file_path= ibdata1:10M:autoextend
1542-loose-innodb_buffer_pool_size= 8M
1543+loose-innodb_buffer_pool_size= 32M
1544 loose-innodb_write_io_threads= 2
1545 loose-innodb_read_io_threads= 2
1546 loose-innodb_log_buffer_size= 1M
1547--- a/mysql-test/suite/innodb/r/innodb.result
1548+++ b/mysql-test/suite/innodb/r/innodb.result
1549@@ -1678,7 +1678,7 @@
1550 drop table t1;
1551 SELECT variable_value FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_buffer_pool_pages_total';
1552 variable_value
1553-511
1554+2047
1555 SELECT variable_value FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_page_size';
1556 variable_value
1557 16384
1bfc1981
AM
1558--- /dev/null
1559+++ b/mysql-test/suite/innodb/r/percona_flush_contiguous_neighbors.result
1560@@ -0,0 +1,21 @@
1561+DROP TABLE IF EXISTS t1;
1562+CREATE TABLE t1 (id INT AUTO_INCREMENT, foo CHAR(255), PRIMARY KEY (id)) ENGINE=InnoDB;
1563+INSERT INTO t1(foo) VALUES ('a'), ('b');
1564+INSERT INTO t1(foo) SELECT foo FROM t1;
1565+INSERT INTO t1(foo) SELECT foo FROM t1;
1566+INSERT INTO t1(foo) SELECT foo FROM t1;
1567+INSERT INTO t1(foo) SELECT foo FROM t1;
1568+INSERT INTO t1(foo) SELECT foo FROM t1;
1569+INSERT INTO t1(foo) SELECT foo FROM t1;
1570+INSERT INTO t1(foo) SELECT foo FROM t1;
1571+INSERT INTO t1(foo) SELECT foo FROM t1;
1572+INSERT INTO t1(foo) SELECT foo FROM t1;
1573+INSERT INTO t1(foo) SELECT foo FROM t1;
1574+INSERT INTO t1(foo) SELECT foo FROM t1;
1575+INSERT INTO t1(foo) SELECT foo FROM t1;
1576+INSERT INTO t1(foo) SELECT foo FROM t1;
1577+INSERT INTO t1(foo) SELECT foo FROM t1;
1578+INSERT INTO t1(foo) SELECT foo FROM t1;
1579+INSERT INTO t1(foo) SELECT foo FROM t1;
1580+INSERT INTO t1(foo) SELECT foo FROM t1;
1581+DROP TABLE t1;
1582--- /dev/null
1583+++ b/mysql-test/suite/innodb/t/percona_flush_contiguous_neighbors-master.opt
1584@@ -0,0 +1 @@
1585+--innodb_flush_neighbor_pages=cont
1586--- /dev/null
1587+++ b/mysql-test/suite/innodb/t/percona_flush_contiguous_neighbors.test
1588@@ -0,0 +1,36 @@
1589+# Test for innodb_flush_neighbor_pages=contiguous.
1590+# The test is very crude: we simply overflow the buffer pool with such a number of
1591+# new/modified pages that some flushing is bound to happen.
1592+
1593+--source include/have_innodb.inc
1594+
1595+--disable_warnings
1596+DROP TABLE IF EXISTS t1;
1597+--enable_warnings
1598+
1599+CREATE TABLE t1 (id INT AUTO_INCREMENT, foo CHAR(255), PRIMARY KEY (id)) ENGINE=InnoDB;
1600+
1601+INSERT INTO t1(foo) VALUES ('a'), ('b');
1602+INSERT INTO t1(foo) SELECT foo FROM t1;
1603+INSERT INTO t1(foo) SELECT foo FROM t1;
1604+INSERT INTO t1(foo) SELECT foo FROM t1;
1605+INSERT INTO t1(foo) SELECT foo FROM t1;
1606+INSERT INTO t1(foo) SELECT foo FROM t1;
1607+INSERT INTO t1(foo) SELECT foo FROM t1;
1608+INSERT INTO t1(foo) SELECT foo FROM t1;
1609+INSERT INTO t1(foo) SELECT foo FROM t1;
1610+INSERT INTO t1(foo) SELECT foo FROM t1;
1611+INSERT INTO t1(foo) SELECT foo FROM t1;
1612+INSERT INTO t1(foo) SELECT foo FROM t1;
1613+INSERT INTO t1(foo) SELECT foo FROM t1;
1614+INSERT INTO t1(foo) SELECT foo FROM t1;
1615+INSERT INTO t1(foo) SELECT foo FROM t1;
1616+INSERT INTO t1(foo) SELECT foo FROM t1;
1617+INSERT INTO t1(foo) SELECT foo FROM t1;
1618+INSERT INTO t1(foo) SELECT foo FROM t1;
1619+
1620+# TODO: cannot record a stable value here. A check of > 0 should be enough,
1621+# but the variable is not accessible through INFORMATION_SCHEMA currently.
1622+# SHOW GLOBAL STATUS LIKE 'Innodb_buffer_pool_pages_flushed';
1623+
1624+DROP TABLE t1;
29ffd636
AM
1625--- a/mysql-test/suite/innodb/t/innodb_cmp_drop_table-master.opt
1626+++ b/mysql-test/suite/innodb/t/innodb_cmp_drop_table-master.opt
1627@@ -1 +1 @@
1628---innodb-buffer-pool-size=8M
1629+--innodb-buffer-pool-size=32M
1630--- a/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test
1631+++ b/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test
1632@@ -36,13 +36,14 @@
1633
1634 -- disable_query_log
1635
1636--- let $i = 400
1637+-- let $i = 4000
1638+begin;
1639 while ($i)
1640 {
1641 insert into t2 values(repeat('abcdefghijklmnopqrstuvwxyz',1000));
1642 dec $i;
1643 }
1644-
1645+commit;
1646 -- enable_query_log
1647
1648 # now there should be no 8K pages in the buffer pool
This page took 0.275175 seconds and 4 git commands to generate.