git.pld-linux.org Git - packages/mysql.git/blob - innodb_io_patches.patch
- updated to 5.5.32
[packages/mysql.git] / innodb_io_patches.patch
1 # name       : innodb_io_patches.patch
2 # introduced : 11 or before
3 # maintainer : Yasufumi
4 #
5 #!!! notice !!!
6 # Any small change to this file in the main branch
7 # should be done or reviewed by the maintainer!
8 --- a/storage/innobase/buf/buf0buf.c
9 +++ b/storage/innobase/buf/buf0buf.c
10 @@ -320,6 +320,7 @@
11  
12         /* When we traverse all the flush lists we don't want another
13         thread to add a dirty page to any flush list. */
14 +       if (srv_buf_pool_instances > 1)
15         log_flush_order_mutex_enter();
16  
17         for (i = 0; i < srv_buf_pool_instances; i++) {
18 @@ -343,6 +344,7 @@
19                 }
20         }
21  
22 +       if (srv_buf_pool_instances > 1)
23         log_flush_order_mutex_exit();
24  
25         /* The returned answer may be out of date: the flush_list can
26 --- a/storage/innobase/buf/buf0flu.c
27 +++ b/storage/innobase/buf/buf0flu.c
28 @@ -857,7 +857,7 @@
29  flush:
30         /* Now flush the doublewrite buffer data to disk */
31  
32 -       fil_flush(TRX_SYS_SPACE);
33 +       fil_flush(TRX_SYS_SPACE, FALSE);
34  
35         /* We know that the writes have been flushed to disk now
36         and in recovery we will find them in the doublewrite buffer
37 @@ -1375,10 +1375,11 @@
38         ulint           high;
39         ulint           count = 0;
40         buf_pool_t*     buf_pool = buf_pool_get(space, offset);
41 +       ibool           is_forward_scan;
42  
43         ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
44  
45 -       if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
46 +       if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN || !srv_flush_neighbor_pages) {
47                 /* If there is little space, it is better not to flush
48                 any block except from the end of the LRU list */
49  
50 @@ -1405,7 +1406,32 @@
51                 high = fil_space_get_size(space);
52         }
53  
54 -       for (i = low; i < high; i++) {
55 +       if (srv_flush_neighbor_pages == 2) {
56 +
57 +               /* In the case of contiguous flush where the requested page
58 +               does not fall at the start of flush area, first scan backward
59 +               from the page and later forward from it. */
60 +               is_forward_scan = (offset == low);
61 +       }
62 +       else {
63 +               is_forward_scan = TRUE;
64 +       }
65 +
66 +scan:
67 +       if (srv_flush_neighbor_pages == 2) {
68 +               if (is_forward_scan) {
69 +                       i = offset;
70 +               }
71 +               else {
72 +                       i = offset - 1;
73 +               }
74 +       }
75 +       else {
76 +               i = low;
77 +       }
78 +
79 +       for (; is_forward_scan ? (i < high) : (i >= low);
80 +            is_forward_scan ? i++ : i--) {
81  
82                 buf_page_t*     bpage;
83  
84 @@ -1434,6 +1460,12 @@
85                 if (!bpage) {
86  
87                         buf_pool_mutex_exit(buf_pool);
88 +                       if (srv_flush_neighbor_pages == 2) {
89 +
90 +                               /* This is contiguous neighbor page flush and
91 +                               the pages here are not contiguous. */
92 +                               break;
93 +                       }
94                         continue;
95                 }
96  
97 @@ -1470,6 +1502,22 @@
98                         }
99                 }
100                 buf_pool_mutex_exit(buf_pool);
101 +
102 +               if (srv_flush_neighbor_pages == 2) {
103 +
104 +                       /* We are trying to do the contiguous neighbor page
105 +                       flush, but the last page we checked was unflushable,
106 +                       making a "hole" in the flush, so stop this attempt. */
107 +                       break;
108 +               }
109 +       }
110 +
111 +       if (!is_forward_scan) {
112 +
113 +               /* Backward scan done, now do the forward scan */
114 +               ut_a (srv_flush_neighbor_pages == 2);
115 +               is_forward_scan = TRUE;
116 +               goto scan;
117         }
118  
119         return(count);
120 --- a/storage/innobase/buf/buf0rea.c
121 +++ b/storage/innobase/buf/buf0rea.c
122 @@ -427,6 +427,10 @@
123                 = BUF_READ_AHEAD_AREA(buf_pool);
124         ulint           threshold;
125  
126 +       if (!(srv_read_ahead & 2)) {
127 +               return(0);
128 +       }
129 +
130         if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
131                 /* No read-ahead to avoid thread deadlocks */
132                 return(0);
133 --- a/storage/innobase/fil/fil0fil.c
134 +++ b/storage/innobase/fil/fil0fil.c
135 @@ -2609,7 +2609,7 @@
136  
137                 os_thread_sleep(20000);
138  
139 -               fil_flush(id);
140 +               fil_flush(id, TRUE);
141  
142                 goto retry;
143  
144 @@ -2823,7 +2823,7 @@
145                 goto error_exit;
146         }
147  
148 -       ret = os_file_flush(file);
149 +       ret = os_file_flush(file, TRUE);
150  
151         if (!ret) {
152                 fputs("InnoDB: Error: file flush of tablespace ", stderr);
153 @@ -3009,7 +3009,7 @@
154                 }
155         }
156  
157 -       success = os_file_flush(file);
158 +       success = os_file_flush(file, TRUE);
159         if (!success) {
160  
161                 goto func_exit;
162 @@ -3031,7 +3031,7 @@
163  
164                 goto func_exit;
165         }
166 -       success = os_file_flush(file);
167 +       success = os_file_flush(file, TRUE);
168  func_exit:
169         os_file_close(file);
170         ut_free(buf2);
171 @@ -4014,7 +4014,7 @@
172         size_after_extend, *actual_size); */
173         mutex_exit(&fil_system->mutex);
174  
175 -       fil_flush(space_id);
176 +       fil_flush(space_id, TRUE);
177  
178         return(success);
179  }
180 @@ -4585,8 +4585,9 @@
181  void
182  fil_flush(
183  /*======*/
184 -       ulint   space_id)       /*!< in: file space id (this can be a group of
185 +       ulint   space_id,       /*!< in: file space id (this can be a group of
186                                 log files or a tablespace of the database) */
187 +       ibool   metadata)
188  {
189         fil_space_t*    space;
190         fil_node_t*     node;
191 @@ -4657,7 +4658,7 @@
192                         /* fprintf(stderr, "Flushing to file %s\n",
193                         node->name); */
194  
195 -                       os_file_flush(file);
196 +                       os_file_flush(file, metadata);
197  
198                         mutex_enter(&fil_system->mutex);
199  
200 @@ -4740,7 +4741,7 @@
201         a non-existing space id. */
202         for (i = 0; i < n_space_ids; i++) {
203  
204 -               fil_flush(space_ids[i]);
205 +               fil_flush(space_ids[i], TRUE);
206         }
207  
208         mem_free(space_ids);
209 --- a/storage/innobase/handler/ha_innodb.cc
210 +++ b/storage/innobase/handler/ha_innodb.cc
211 @@ -445,6 +445,12 @@
212    "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
213    NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0);
214  
215 +static MYSQL_THDVAR_ULONG(flush_log_at_trx_commit, PLUGIN_VAR_OPCMDARG,
216 +  "Set to 0 (write and flush once per second),"
217 +  " 1 (write and flush at each commit)"
218 +  " or 2 (write at commit, flush once per second).",
219 +  NULL, NULL, 1, 0, 2, 0);
220 +
221  
222  static handler *innobase_create_handler(handlerton *hton,
223                                          TABLE_SHARE *table,
224 @@ -841,6 +847,17 @@
225         }
226  }
227  
228 +/******************************************************************//**
229 +Returns the flush_log_at_trx_commit THDVAR read through the given thread handle. Callers in this patch (log0log.c, os0file.c) pass NULL; presumably that selects the global value - TODO confirm against THDVAR. */
230 +extern "C" UNIV_INTERN
231 +ulong
232 +thd_flush_log_at_trx_commit(
233 +/*================================*/
234 +       void*   thd)    /*!< in: thread handle, cast to THD* below */
235 +{
236 +       return(THDVAR((THD*) thd, flush_log_at_trx_commit));
237 +}
238 +
239  /********************************************************************//**
240  Obtain the InnoDB transaction of a MySQL thread.
241  @return        reference to transaction pointer */
242 @@ -2471,6 +2488,9 @@
243         srv_n_read_io_threads = (ulint) innobase_read_io_threads;
244         srv_n_write_io_threads = (ulint) innobase_write_io_threads;
245  
246 +       srv_read_ahead &= 3;
247 +       srv_adaptive_flushing_method %= 3;
248 +
249         srv_force_recovery = (ulint) innobase_force_recovery;
250  
251         srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
252 @@ -11141,7 +11161,7 @@
253    PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
254    "Purge threads can be either 0 or 1.",
255    NULL, NULL,
256 -  0,                   /* Default setting */
257 +  1,                   /* Default setting */
258    0,                   /* Minimum value */
259    1, 0);               /* Maximum value */
260  
261 @@ -11183,12 +11203,18 @@
262    innodb_file_format_max_validate,
263    innodb_file_format_max_update, "Antelope");
264  
265 -static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
266 -  PLUGIN_VAR_OPCMDARG,
267 -  "Set to 0 (write and flush once per second),"
268 -  " 1 (write and flush at each commit)"
269 -  " or 2 (write at commit, flush once per second).",
270 -  NULL, NULL, 1, 0, 2, 0);
271 +/* Changed to the THDVAR */
272 +//static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
273 +//  PLUGIN_VAR_OPCMDARG,
274 +//  "Set to 0 (write and flush once per second),"
275 +//  " 1 (write and flush at each commit)"
276 +//  " or 2 (write at commit, flush once per second).",
277 +//  NULL, NULL, 1, 0, 2, 0);
278 +
279 +static MYSQL_SYSVAR_BOOL(use_global_flush_log_at_trx_commit, srv_use_global_flush_log_at_trx_commit,
280 +  PLUGIN_VAR_NOCMDARG,
281 +  "Use global innodb_flush_log_at_trx_commit value. (default: ON).",
282 +  NULL, NULL, TRUE);
283  
284  static MYSQL_SYSVAR_STR(flush_method, innobase_file_flush_method,
285    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
286 @@ -11293,7 +11319,7 @@
287  static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size,
288    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
289    "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
290 -  NULL, NULL, 128*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L);
291 +  NULL, NULL, 128*1024*1024L, 32*1024*1024L, LONGLONG_MAX, 1024*1024L);
292  
293  static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
294    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
295 @@ -11442,6 +11468,127 @@
296    "trigger a readahead.",
297    NULL, NULL, 56, 0, 64, 0);
298  
299 +static MYSQL_SYSVAR_LONGLONG(ibuf_max_size, srv_ibuf_max_size,
300 +  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
301 +  "The maximum size of the insert buffer. (in bytes)",
302 +  NULL, NULL, LONGLONG_MAX, 0, LONGLONG_MAX, 0);
303 +
304 +static MYSQL_SYSVAR_ULONG(ibuf_active_contract, srv_ibuf_active_contract,
305 +  PLUGIN_VAR_RQCMDARG,
306 +  "Enable/Disable active_contract of insert buffer. 0:disable 1:enable",
307 +  NULL, NULL, 1, 0, 1, 0);
308 +
309 +static MYSQL_SYSVAR_ULONG(ibuf_accel_rate, srv_ibuf_accel_rate,
310 +  PLUGIN_VAR_RQCMDARG,
311 +  "Tunes amount of insert buffer processing of background, in addition to innodb_io_capacity. (in percentage)",
312 +  NULL, NULL, 100, 100, 999999999, 0);
313 +
314 +static MYSQL_SYSVAR_ULONG(checkpoint_age_target, srv_checkpoint_age_target,
315 +  PLUGIN_VAR_RQCMDARG,
316 +  "Control soft limit of checkpoint age. (0 : not control)",
317 +  NULL, NULL, 0, 0, ~0UL, 0);
318 +
319 +static
320 +void
321 +innodb_flush_neighbor_pages_update( /* update callback passed to MYSQL_SYSVAR_ENUM(flush_neighbor_pages) */
322 +  THD* thd, /* not used in the body */
323 +  struct st_mysql_sys_var* var, /* not used in the body */
324 +  void* var_ptr, /* out: destination, written through a long* cast */
325 +  const void* save) /* in: new value, read through a long* cast (presumably the typelib index - confirm) */
326 +{
327 +  *(long *)var_ptr = (*(long *)save) % 3; /* fold the legacy slots 3..5 ("0","1","2") onto 0..2 */
328 +}
329 +
330 +const char *flush_neighbor_pages_names[]= /* names accepted for flush_neighbor_pages; slot number is given on each entry */
331 +{
332 +  "none", /* 0 */
333 +  "area", /* 1 */
334 +  "cont", /* 2 */
335 +  /* For compatibility with the older patch, which took numeric values */
336 +  "0", /* 3 ("none" + 3) */
337 +  "1", /* 4 ("area" + 3) */
338 +  "2", /* 5 ("cont" + 3) */
339 +  NullS /* terminator; excluded from the typelib count below */
340 +};
341 +
342 +TYPELIB flush_neighbor_pages_typelib=
343 +{
344 +  array_elements(flush_neighbor_pages_names) - 1, /* number of names, without the NullS terminator */
345 +  "flush_neighbor_pages_typelib",
346 +  flush_neighbor_pages_names,
347 +  NULL
348 +};
349 +
350 +static MYSQL_SYSVAR_ENUM(flush_neighbor_pages, srv_flush_neighbor_pages,
351 +  PLUGIN_VAR_RQCMDARG, "Neighbor page flushing behaviour: none: do not flush, "
352 +                       "[area]: flush selected pages one-by-one, "
353 +                       "cont: flush a contiguous block of pages", NULL,
354 +  innodb_flush_neighbor_pages_update, 1, &flush_neighbor_pages_typelib); /* default slot 1 ("area"), bracketed in the help text */
355 +
356 +static
357 +void
358 +innodb_read_ahead_update( /* update callback passed to MYSQL_SYSVAR_ENUM(read_ahead) */
359 +  THD* thd, /* not used in the body */
360 +  struct st_mysql_sys_var*     var, /* not used in the body */
361 +  void*        var_ptr, /* out: destination, written through a long* cast */
362 +  const void*  save) /* in: new value, read through a long* cast (presumably the typelib index - confirm) */
363 +{
364 +  *(long *)var_ptr= (*(long *)save) & 3; /* keep the low two bits: legacy slots 4..7 ("0".."3") map onto 0..3 */
365 +}
366 +const char *read_ahead_names[]= /* names accepted for read_ahead; slot number is given on each entry */
367 +{
368 +  "none", /* 0 */
369 +  "random", /* 1 */
370 +  "linear", /* 2 */
371 +  "both", /* 3 */
372 +  /* For compatibility with the older patch, which took numeric values */
373 +  "0", /* 4 ("none" + 4) */
374 +  "1", /* 5 ("random" + 4) */
375 +  "2", /* 6 ("linear" + 4) */
376 +  "3", /* 7 ("both" + 4) */
377 +  NullS /* terminator; excluded from the typelib count below */
378 +};
379 +TYPELIB read_ahead_typelib=
380 +{
381 +  array_elements(read_ahead_names) - 1, "read_ahead_typelib", /* count without the NullS terminator */
382 +  read_ahead_names, NULL
383 +};
384 +static MYSQL_SYSVAR_ENUM(read_ahead, srv_read_ahead,
385 +  PLUGIN_VAR_RQCMDARG,
386 +  "Control read ahead activity (none, random, [linear], both). [from 1.0.5: random read ahead is ignored]",
387 +  NULL, innodb_read_ahead_update, 2, &read_ahead_typelib); /* default slot 2 ("linear"), bracketed in the help text */
388 +
389 +static
390 +void
391 +innodb_adaptive_flushing_method_update( /* update callback passed to MYSQL_SYSVAR_ENUM(adaptive_flushing_method) */
392 +  THD* thd, /* not used in the body */
393 +  struct st_mysql_sys_var*     var, /* not used in the body */
394 +  void*        var_ptr, /* out: destination, written through a long* cast */
395 +  const void*  save) /* in: new value, read through a long* cast (presumably the typelib index - confirm) */
396 +{
397 +  *(long *)var_ptr= (*(long *)save) % 3; /* three methods (0..2): fold legacy slots 3..5 ("0","1","2") onto them, matching the "%= 3" applied at startup */
398 +}
399 +const char *adaptive_flushing_method_names[]= /* names accepted for adaptive_flushing_method; slot number is given on each entry */
400 +{
401 +  "native", /* 0 */
402 +  "estimate", /* 1 */
403 +  "keep_average", /* 2 */
404 +  /* For compatibility with the older patch, which took numeric values */
405 +  "0", /* 3 ("native" + 3) */
406 +  "1", /* 4 ("estimate" + 3) */
407 +  "2", /* 5 ("keep_average" + 3) */
408 +  NullS /* terminator; excluded from the typelib count below */
409 +};
410 +TYPELIB adaptive_flushing_method_typelib=
411 +{
412 +  array_elements(adaptive_flushing_method_names) - 1, "adaptive_flushing_method_typelib", /* count without the NullS terminator */
413 +  adaptive_flushing_method_names, NULL
414 +};
415 +static MYSQL_SYSVAR_ENUM(adaptive_flushing_method, srv_adaptive_flushing_method,
416 +  PLUGIN_VAR_RQCMDARG,
417 +  "Choose method of innodb_adaptive_flushing. (native, [estimate], keep_average)",
418 +  NULL, innodb_adaptive_flushing_method_update, 1, &adaptive_flushing_method_typelib); /* default slot 1 ("estimate"), bracketed in the help text */
419 +
420  static struct st_mysql_sys_var* innobase_system_variables[]= {
421    MYSQL_SYSVAR(additional_mem_pool_size),
422    MYSQL_SYSVAR(autoextend_increment),
423 @@ -11462,6 +11609,7 @@
424    MYSQL_SYSVAR(file_format_check),
425    MYSQL_SYSVAR(file_format_max),
426    MYSQL_SYSVAR(flush_log_at_trx_commit),
427 +  MYSQL_SYSVAR(use_global_flush_log_at_trx_commit),
428    MYSQL_SYSVAR(flush_method),
429    MYSQL_SYSVAR(force_recovery),
430    MYSQL_SYSVAR(large_prefix),
431 @@ -11501,6 +11649,13 @@
432    MYSQL_SYSVAR(show_verbose_locks),
433    MYSQL_SYSVAR(show_locks_held),
434    MYSQL_SYSVAR(version),
435 +  MYSQL_SYSVAR(ibuf_max_size),
436 +  MYSQL_SYSVAR(ibuf_active_contract),
437 +  MYSQL_SYSVAR(ibuf_accel_rate),
438 +  MYSQL_SYSVAR(checkpoint_age_target),
439 +  MYSQL_SYSVAR(flush_neighbor_pages),
440 +  MYSQL_SYSVAR(read_ahead),
441 +  MYSQL_SYSVAR(adaptive_flushing_method),
442    MYSQL_SYSVAR(use_sys_malloc),
443    MYSQL_SYSVAR(use_native_aio),
444    MYSQL_SYSVAR(change_buffering),
445 --- a/storage/innobase/ibuf/ibuf0ibuf.c
446 +++ b/storage/innobase/ibuf/ibuf0ibuf.c
447 @@ -523,8 +523,10 @@
448         grow in size, as the references on the upper levels of the tree can
449         change */
450  
451 -       ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE
452 -               / IBUF_POOL_SIZE_PER_MAX_SIZE;
453 +       ibuf->max_size = ut_min( buf_pool_get_curr_size() / UNIV_PAGE_SIZE
454 +               / IBUF_POOL_SIZE_PER_MAX_SIZE, (ulint) srv_ibuf_max_size / UNIV_PAGE_SIZE);
455 +
456 +       srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE;
457  
458         mutex_create(ibuf_pessimistic_insert_mutex_key,
459                      &ibuf_pessimistic_insert_mutex,
460 @@ -2763,9 +2765,11 @@
461         size = ibuf->size;
462         max_size = ibuf->max_size;
463  
464 +       if (!srv_ibuf_active_contract) {
465         if (size < max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
466                 return;
467         }
468 +       }
469  
470         sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC);
471  
472 --- a/storage/innobase/include/buf0rea.h
473 +++ b/storage/innobase/include/buf0rea.h
474 @@ -149,8 +149,7 @@
475  
476  /** The size in pages of the area which the read-ahead algorithms read if
477  invoked */
478 -#define        BUF_READ_AHEAD_AREA(b)                                  \
479 -       ut_min(64, ut_2_power_up((b)->curr_size / 32))
480 +#define        BUF_READ_AHEAD_AREA(b)          64
481  
482  /** @name Modes used in read-ahead @{ */
483  /** read only pages belonging to the insert buffer tree */
484 --- a/storage/innobase/include/fil0fil.h
485 +++ b/storage/innobase/include/fil0fil.h
486 @@ -663,8 +663,9 @@
487  void
488  fil_flush(
489  /*======*/
490 -       ulint   space_id);      /*!< in: file space id (this can be a group of
491 +       ulint   space_id,       /*!< in: file space id (this can be a group of
492                                 log files or a tablespace of the database) */
493 +       ibool   metadata);
494  /**********************************************************************//**
495  Flushes to disk writes in file spaces of the given type possibly cached by
496  the OS. */
497 --- a/storage/innobase/include/ha_prototypes.h
498 +++ b/storage/innobase/include/ha_prototypes.h
499 @@ -284,6 +284,13 @@
500  /*===================*/
501          void*   thd,   /*!< in: thread handle (THD*) */
502          ulint   value);        /*!< in: time waited for the lock */
503 +/******************************************************************//**
504 +*/
505 +
506 +ulong
507 +thd_flush_log_at_trx_commit(
508 +/*================================*/
509 +       void*   thd);
510  
511  /**********************************************************************//**
512  Get the current setting of the lower_case_table_names global parameter from
513 --- a/storage/innobase/include/os0file.h
514 +++ b/storage/innobase/include/os0file.h
515 @@ -296,8 +296,8 @@
516         pfs_os_file_write_func(name, file, buf, offset, offset_high,    \
517                                n, __FILE__, __LINE__)
518  
519 -# define os_file_flush(file)                                           \
520 -       pfs_os_file_flush_func(file, __FILE__, __LINE__)
521 +# define os_file_flush(file, metadata)                                 \
522 +       pfs_os_file_flush_func(file, metadata, __FILE__, __LINE__)
523  
524  # define os_file_rename(key, oldpath, newpath)                         \
525         pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__)
526 @@ -333,7 +333,7 @@
527  # define os_file_write(name, file, buf, offset, offset_high, n)                \
528         os_file_write_func(name, file, buf, offset, offset_high, n)
529  
530 -# define os_file_flush(file)   os_file_flush_func(file)
531 +# define os_file_flush(file, metadata) os_file_flush_func(file, metadata)
532  
533  # define os_file_rename(key, oldpath, newpath)                         \
534         os_file_rename_func(oldpath, newpath)
535 @@ -781,6 +781,7 @@
536  pfs_os_file_flush_func(
537  /*===================*/
538         os_file_t       file,   /*!< in, own: handle to a file */
539 +       ibool           metadata,
540         const char*     src_file,/*!< in: file name where func invoked */
541         ulint           src_line);/*!< in: line where the func invoked */
542  
543 @@ -860,7 +861,8 @@
544  ibool
545  os_file_flush_func(
546  /*===============*/
547 -       os_file_t       file);  /*!< in, own: handle to a file */
548 +       os_file_t       file,   /*!< in, own: handle to a file */
549 +       ibool           metadata);
550  /***********************************************************************//**
551  Retrieves the last error number if an error occurs in a file io function.
552  The number should be retrieved before any other OS calls (because they may
553 --- a/storage/innobase/include/os0file.ic
554 +++ b/storage/innobase/include/os0file.ic
555 @@ -369,6 +369,7 @@
556  pfs_os_file_flush_func(
557  /*===================*/
558         os_file_t       file,   /*!< in, own: handle to a file */
559 +       ibool           metadata,
560         const char*     src_file,/*!< in: file name where func invoked */
561         ulint           src_line)/*!< in: line where the func invoked */
562  {
563 @@ -378,7 +379,7 @@
564  
565         register_pfs_file_io_begin(&state, locker, file, 0, PSI_FILE_SYNC,
566                                    src_file, src_line);
567 -       result = os_file_flush_func(file);
568 +       result = os_file_flush_func(file, metadata);
569  
570         register_pfs_file_io_end(locker, 0);
571  
572 --- a/storage/innobase/include/srv0srv.h
573 +++ b/storage/innobase/include/srv0srv.h
574 @@ -138,7 +138,8 @@
575  extern ulint   srv_n_log_files;
576  extern ulint   srv_log_file_size;
577  extern ulint   srv_log_buffer_size;
578 -extern ulong   srv_flush_log_at_trx_commit;
579 +//extern ulong srv_flush_log_at_trx_commit;
580 +extern char    srv_use_global_flush_log_at_trx_commit;
581  extern char    srv_adaptive_flushing;
582  
583  /* If this flag is TRUE, then we will load the indexes' (and tables') metadata
584 @@ -221,6 +222,16 @@
585  extern ulong   srv_max_purge_lag;
586  
587  extern ulong   srv_replication_delay;
588 +
589 +extern long long       srv_ibuf_max_size;
590 +extern ulint   srv_ibuf_active_contract;
591 +extern ulint   srv_ibuf_accel_rate;
592 +extern ulint   srv_checkpoint_age_target;
593 +extern ulint   srv_flush_neighbor_pages;
594 +extern ulint   srv_enable_unsafe_group_commit;
595 +extern ulint   srv_read_ahead;
596 +extern ulint   srv_adaptive_flushing_method;
597 +
598  /*-------------------------------------------*/
599  
600  extern ulint   srv_n_rows_inserted;
601 @@ -399,8 +410,9 @@
602                                 when writing data files, but do flush
603                                 after writing to log files */
604         SRV_UNIX_NOSYNC,        /*!< do not flush after writing */
605 -       SRV_UNIX_O_DIRECT       /*!< invoke os_file_set_nocache() on
606 +       SRV_UNIX_O_DIRECT,      /*!< invoke os_file_set_nocache() on
607                                 data files */
608 +       SRV_UNIX_ALL_O_DIRECT   /* new method for examination: logfile also open O_DIRECT */
609  };
610  
611  /** Alternatives for file i/o in Windows */
612 --- a/storage/innobase/log/log0log.c
613 +++ b/storage/innobase/log/log0log.c
614 @@ -48,6 +48,7 @@
615  #include "srv0start.h"
616  #include "trx0sys.h"
617  #include "trx0trx.h"
618 +#include "ha_prototypes.h"
619  
620  /*
621  General philosophy of InnoDB redo-logs:
622 @@ -359,6 +360,33 @@
623  }
624  
625  /************************************************************//**
626 +Returns log_sys->max_modified_age_async, capped at srv_checkpoint_age_target minus one eighth of it when the target is non-zero. */
627 +UNIV_INLINE
628 +ulint
629 +log_max_modified_age_async()
630 +{
631 +       if (srv_checkpoint_age_target) { /* non-zero target: apply the cap */
632 +               return(ut_min(log_sys->max_modified_age_async,
633 +                               srv_checkpoint_age_target
634 +                               - srv_checkpoint_age_target / 8));
635 +       } else { /* target of 0: use the log system's own limit unchanged */
636 +               return(log_sys->max_modified_age_async);
637 +       }
638 +}
639 +
640 +UNIV_INLINE /* Returns log_sys->max_checkpoint_age_async, capped at srv_checkpoint_age_target when the target is non-zero. */
641 +ulint
642 +log_max_checkpoint_age_async()
643 +{
644 +       if (srv_checkpoint_age_target) { /* non-zero target: apply the cap */
645 +               return(ut_min(log_sys->max_checkpoint_age_async,
646 +                               srv_checkpoint_age_target));
647 +       } else { /* target of 0: use the log system's own limit unchanged */
648 +               return(log_sys->max_checkpoint_age_async);
649 +       }
650 +}
651 +
652 +/************************************************************//**
653  Closes the log.
654  @return        lsn */
655  UNIV_INTERN
656 @@ -427,7 +455,7 @@
657                 }
658         }
659  
660 -       if (checkpoint_age <= log->max_modified_age_async) {
661 +       if (checkpoint_age <= log_max_modified_age_async()) {
662  
663                 goto function_exit;
664         }
665 @@ -435,8 +463,8 @@
666         oldest_lsn = buf_pool_get_oldest_modification();
667  
668         if (!oldest_lsn
669 -           || lsn - oldest_lsn > log->max_modified_age_async
670 -           || checkpoint_age > log->max_checkpoint_age_async) {
671 +           || lsn - oldest_lsn > log_max_modified_age_async()
672 +           || checkpoint_age > log_max_checkpoint_age_async()) {
673  
674                 log->check_flush_or_checkpoint = TRUE;
675         }
676 @@ -1100,9 +1128,10 @@
677                 group = (log_group_t*)((ulint)group - 1);
678  
679                 if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
680 +                   && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
681                     && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
682  
683 -                       fil_flush(group->space_id);
684 +                       fil_flush(group->space_id, FALSE);
685                 }
686  
687  #ifdef UNIV_DEBUG
688 @@ -1121,10 +1150,11 @@
689                         logs and cannot end up here! */
690  
691         if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
692 +           && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
693             && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
694 -           && srv_flush_log_at_trx_commit != 2) {
695 +           && thd_flush_log_at_trx_commit(NULL) != 2) {
696  
697 -               fil_flush(group->space_id);
698 +               fil_flush(group->space_id, FALSE);
699         }
700  
701         mutex_enter(&(log_sys->mutex));
702 @@ -1501,7 +1531,8 @@
703  
704         mutex_exit(&(log_sys->mutex));
705  
706 -       if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
707 +       if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC
708 +           || srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
709                 /* O_DSYNC means the OS did not buffer the log file at all:
710                 so we have also flushed to disk what we have written */
711  
712 @@ -1511,7 +1542,7 @@
713  
714                 group = UT_LIST_GET_FIRST(log_sys->log_groups);
715  
716 -               fil_flush(group->space_id);
717 +               fil_flush(group->space_id, FALSE);
718                 log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
719         }
720  
721 @@ -2120,10 +2151,10 @@
722  
723                 sync = TRUE;
724                 advance = 2 * (age - log->max_modified_age_sync);
725 -       } else if (age > log->max_modified_age_async) {
726 +       } else if (age > log_max_modified_age_async()) {
727  
728                 /* A flush is not urgent: we do an asynchronous preflush */
729 -               advance = age - log->max_modified_age_async;
730 +               advance = age - log_max_modified_age_async();
731         } else {
732                 advance = 0;
733         }
734 @@ -2137,7 +2168,7 @@
735  
736                 do_checkpoint = TRUE;
737  
738 -       } else if (checkpoint_age > log->max_checkpoint_age_async) {
739 +       } else if (checkpoint_age > log_max_checkpoint_age_async()) {
740                 /* A checkpoint is not urgent: do it asynchronously */
741  
742                 do_checkpoint = TRUE;
743 @@ -2607,7 +2638,7 @@
744  
745         mutex_exit(&(log_sys->mutex));
746  
747 -       fil_flush(group->archive_space_id);
748 +       fil_flush(group->archive_space_id, TRUE);
749  
750         mutex_enter(&(log_sys->mutex));
751  
752 @@ -3349,6 +3380,17 @@
753                 log_sys->flushed_to_disk_lsn,
754                 log_sys->last_checkpoint_lsn);
755  
756 +       fprintf(file,
757 +               "Max checkpoint age    %lu\n"
758 +               "Checkpoint age target %lu\n"
759 +               "Modified age          %lu\n"
760 +               "Checkpoint age        %lu\n",
761 +               (ulong) log_sys->max_checkpoint_age,
762 +               (ulong) log_max_checkpoint_age_async(),
763 +               (ulong) (log_sys->lsn -
764 +                               log_buf_pool_get_oldest_modification()),
765 +               (ulong) (log_sys->lsn - log_sys->last_checkpoint_lsn));
766 +
767         current_time = time(NULL);
768  
769         time_elapsed = 0.001 + difftime(current_time,
770 --- a/storage/innobase/log/log0recv.c
771 +++ b/storage/innobase/log/log0recv.c
772 @@ -2906,9 +2906,12 @@
773         ib_uint64_t     archived_lsn;
774  #endif /* UNIV_LOG_ARCHIVE */
775         byte*           buf;
776 -       byte            log_hdr_buf[LOG_FILE_HDR_SIZE];
777 +       byte*           log_hdr_buf;
778 +       byte            log_hdr_buf_base[LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE];
779         ulint           err;
780  
781 +       log_hdr_buf = ut_align(log_hdr_buf_base, OS_FILE_LOG_BLOCK_SIZE);
782 +
783  #ifdef UNIV_LOG_ARCHIVE
784         ut_ad(type != LOG_CHECKPOINT || limit_lsn == IB_ULONGLONG_MAX);
785  /** TRUE when recovering from a checkpoint */
786 @@ -3468,7 +3471,7 @@
787                         exit(1);
788                 }
789  
790 -               os_file_flush(log_file);
791 +               os_file_flush(log_file, TRUE);
792                 os_file_close(log_file);
793         }
794  
795 @@ -3492,7 +3495,7 @@
796  
797         os_file_write(name, log_file, buf, 0, 0,
798                       LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
799 -       os_file_flush(log_file);
800 +       os_file_flush(log_file, TRUE);
801         os_file_close(log_file);
802  
803         ut_free(buf);
804 --- a/storage/innobase/os/os0file.c
805 +++ b/storage/innobase/os/os0file.c
806 @@ -1424,7 +1424,7 @@
807  #endif
808  #ifdef UNIV_NON_BUFFERED_IO
809  # ifndef UNIV_HOTBACKUP
810 -               if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
811 +               if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) {
812                         /* Do not use unbuffered i/o to log files because
813                         value 2 denotes that we do not flush the log at every
814                         commit, but only once per second */
815 @@ -1440,7 +1440,7 @@
816                 attributes = 0;
817  #ifdef UNIV_NON_BUFFERED_IO
818  # ifndef UNIV_HOTBACKUP
819 -               if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
820 +               if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) {
821                         /* Do not use unbuffered i/o to log files because
822                         value 2 denotes that we do not flush the log at every
823                         commit, but only once per second */
824 @@ -1585,6 +1585,11 @@
825                 os_file_set_nocache(file, name, mode_str);
826         }
827  
828 +       /* ALL_O_DIRECT: O_DIRECT also for transaction log file */
829 +       if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
830 +               os_file_set_nocache(file, name, mode_str);
831 +       }
832 +
833  #ifdef USE_FILE_LOCK
834         if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
835  
836 @@ -2008,7 +2013,7 @@
837  
838         ut_free(buf2);
839  
840 -       ret = os_file_flush(file);
841 +       ret = os_file_flush(file, TRUE);
842  
843         if (ret) {
844                 return(TRUE);
845 @@ -2046,7 +2051,8 @@
846  int
847  os_file_fsync(
848  /*==========*/
849 -       os_file_t       file)   /*!< in: handle to a file */
850 +       os_file_t       file,   /*!< in: handle to a file */
851 +       ibool           metadata)
852  {
853         int     ret;
854         int     failures;
855 @@ -2055,7 +2061,16 @@
856         failures = 0;
857  
858         do {
859 +#if defined(HAVE_FDATASYNC) && HAVE_DECL_FDATASYNC
860 +               if (metadata) {
861 +                       ret = fsync(file);
862 +               } else {
863 +                       ret = fdatasync(file);
864 +               }
865 +#else
866 +               (void) metadata;
867                 ret = fsync(file);
868 +#endif
869  
870                 os_n_fsyncs++;
871  
872 @@ -2092,7 +2107,8 @@
873  ibool
874  os_file_flush_func(
875  /*===============*/
876 -       os_file_t       file)   /*!< in, own: handle to a file */
877 +       os_file_t       file,   /*!< in, own: handle to a file */
878 +       ibool           metadata)
879  {
880  #ifdef __WIN__
881         BOOL    ret;
882 @@ -2142,18 +2158,18 @@
883                 /* If we are not on an operating system that supports this,
884                 then fall back to a plain fsync. */
885  
886 -               ret = os_file_fsync(file);
887 +               ret = os_file_fsync(file, metadata);
888         } else {
889                 ret = fcntl(file, F_FULLFSYNC, NULL);
890  
891                 if (ret) {
892                         /* If we are not on a file system that supports this,
893                         then fall back to a plain fsync. */
894 -                       ret = os_file_fsync(file);
895 +                       ret = os_file_fsync(file, metadata);
896                 }
897         }
898  #else
899 -       ret = os_file_fsync(file);
900 +       ret = os_file_fsync(file, metadata);
901  #endif
902  
903         if (ret == 0) {
904 @@ -2336,7 +2352,7 @@
905                 the OS crashes, a database page is only partially
906                 physically written to disk. */
907  
908 -               ut_a(TRUE == os_file_flush(file));
909 +               ut_a(TRUE == os_file_flush(file, TRUE));
910         }
911  # endif /* UNIV_DO_FLUSH */
912  
913 @@ -2378,7 +2394,7 @@
914                         the OS crashes, a database page is only partially
915                         physically written to disk. */
916  
917 -                       ut_a(TRUE == os_file_flush(file));
918 +                       ut_a(TRUE == os_file_flush(file, TRUE));
919                 }
920  # endif /* UNIV_DO_FLUSH */
921  
922 @@ -2750,7 +2766,7 @@
923  
924  # ifdef UNIV_DO_FLUSH
925         if (!os_do_not_call_flush_at_each_write) {
926 -               ut_a(TRUE == os_file_flush(file));
927 +               ut_a(TRUE == os_file_flush(file, TRUE));
928         }
929  # endif /* UNIV_DO_FLUSH */
930  
931 @@ -4296,7 +4312,7 @@
932  #ifdef UNIV_DO_FLUSH
933                 if (slot->type == OS_FILE_WRITE
934                     && !os_do_not_call_flush_at_each_write) {
935 -                       if (!os_file_flush(slot->file)) {
936 +                       if (!os_file_flush(slot->file, TRUE)) {
937                                 ut_error;
938                         }
939                 }
940 @@ -4597,7 +4613,7 @@
941  #ifdef UNIV_DO_FLUSH
942                 if (slot->type == OS_FILE_WRITE
943                     && !os_do_not_call_flush_at_each_write)
944 -                   && !os_file_flush(slot->file) {
945 +                   && !os_file_flush(slot->file, TRUE) {
946                         ut_error;
947                 }
948  #endif /* UNIV_DO_FLUSH */
949 --- a/storage/innobase/srv/srv0srv.c
950 +++ b/storage/innobase/srv/srv0srv.c
951 @@ -183,7 +183,8 @@
952  UNIV_INTERN ulint      srv_log_file_size       = ULINT_MAX;
953  /* size in database pages */
954  UNIV_INTERN ulint      srv_log_buffer_size     = ULINT_MAX;
955 -UNIV_INTERN ulong      srv_flush_log_at_trx_commit = 1;
956 +//UNIV_INTERN ulong    srv_flush_log_at_trx_commit = 1;
957 +UNIV_INTERN char       srv_use_global_flush_log_at_trx_commit  = TRUE;
958  
959  /* Try to flush dirty pages so as to avoid IO bursts at
960  the checkpoints. */
961 @@ -404,6 +405,17 @@
962  
963  UNIV_INTERN ulong      srv_replication_delay           = 0;
964  
965 +UNIV_INTERN long long  srv_ibuf_max_size = 0;
966 +UNIV_INTERN ulint      srv_ibuf_active_contract = 0; /* 0:disable 1:enable */
967 +UNIV_INTERN ulint      srv_ibuf_accel_rate = 100;
968 +#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) pct / 10000.0)))
969 +
970 +UNIV_INTERN ulint      srv_checkpoint_age_target = 0;
971 +UNIV_INTERN ulint      srv_flush_neighbor_pages = 1; /* 0:disable 1:area 2:contiguous */
972 +
973 +UNIV_INTERN ulint      srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */
974 +UNIV_INTERN ulint      srv_read_ahead = 3; /* 1: random  2: linear  3: Both */
975 +UNIV_INTERN ulint      srv_adaptive_flushing_method = 0; /* 0: native  1: estimate  2: keep_average */
976  /*-------------------------------------------*/
977  UNIV_INTERN ulong      srv_n_spin_wait_rounds  = 30;
978  UNIV_INTERN ulong      srv_n_free_tickets_to_enter = 500;
979 @@ -2713,7 +2725,7 @@
980  
981         ut_ad(!mutex_own(&kernel_mutex));
982  
983 -       ut_a(srv_n_purge_threads == 0);
984 +       ut_a(srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0));
985  
986         do {
987                 /* Check for shutdown and change in purge config. */
988 @@ -2746,6 +2758,7 @@
989         ulint           n_pages_purged  = 0;
990         ulint           n_bytes_merged;
991         ulint           n_pages_flushed;
992 +       ulint           n_pages_flushed_prev = 0;
993         ulint           n_bytes_archived;
994         ulint           n_tables_to_drop;
995         ulint           n_ios;
996 @@ -2753,7 +2766,20 @@
997         ulint           n_ios_very_old;
998         ulint           n_pend_ios;
999         ulint           next_itr_time;
1000 +       ulint           prev_adaptive_flushing_method = ULINT_UNDEFINED;
1001 +       ulint           inner_loop = 0;
1002 +       ibool           skip_sleep      = FALSE;
1003         ulint           i;
1004 +       struct t_prev_flush_info_struct {
1005 +               ulint           count;
1006 +               unsigned        space:32;
1007 +               unsigned        offset:32;
1008 +               ib_uint64_t     oldest_modification;
1009 +       } prev_flush_info[MAX_BUFFER_POOLS];
1010 +
1011 +       ib_uint64_t     lsn_old;
1012 +
1013 +       ib_uint64_t     oldest_lsn;
1014  
1015  #ifdef UNIV_DEBUG_THREAD_CREATION
1016         fprintf(stderr, "Master thread starts, id %lu\n",
1017 @@ -2775,6 +2801,9 @@
1018  
1019         mutex_exit(&kernel_mutex);
1020  
1021 +       mutex_enter(&(log_sys->mutex));
1022 +       lsn_old = log_sys->lsn;
1023 +       mutex_exit(&(log_sys->mutex));
1024  loop:
1025         /*****************************************************************/
1026         /* ---- When there is database activity by users, we cycle in this
1027 @@ -2805,9 +2834,13 @@
1028         /* Sleep for 1 second on entrying the for loop below the first time. */
1029         next_itr_time = ut_time_ms() + 1000;
1030  
1031 +       skip_sleep = FALSE;
1032 +
1033         for (i = 0; i < 10; i++) {
1034                 ulint   cur_time = ut_time_ms();
1035  
1036 +               n_pages_flushed = 0; /* initialize */
1037 +
1038                 /* ALTER TABLE in MySQL requires on Unix that the table handler
1039                 can drop tables lazily after there no longer are SELECT
1040                 queries to them. */
1041 @@ -2831,6 +2864,7 @@
1042                 srv_main_thread_op_info = "sleeping";
1043                 srv_main_1_second_loops++;
1044  
1045 +               if (!skip_sleep) {
1046                 if (next_itr_time > cur_time
1047                     && srv_shutdown_state == SRV_SHUTDOWN_NONE) {
1048  
1049 @@ -2841,10 +2875,26 @@
1050                                         (next_itr_time - cur_time)
1051                                          * 1000));
1052                         srv_main_sleeps++;
1053 +
1054 +                       /*
1055 +                       mutex_enter(&(log_sys->mutex));
1056 +                       oldest_lsn = buf_pool_get_oldest_modification();
1057 +                       ib_uint64_t     lsn = log_sys->lsn;
1058 +                       mutex_exit(&(log_sys->mutex));
1059 +
1060 +                       if(oldest_lsn)
1061 +                       fprintf(stderr,
1062 +                               "InnoDB flush: age pct: %lu, lsn progress: %lu\n",
1063 +                               (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
1064 +                               lsn - lsn_old);
1065 +                       */
1066                 }
1067  
1068                 /* Each iteration should happen at 1 second interval. */
1069                 next_itr_time = ut_time_ms() + 1000;
1070 +               } /* if (!skip_sleep) */
1071 +
1072 +               skip_sleep = FALSE;
1073  
1074                 /* Flush logs if needed */
1075                 srv_sync_log_buffer_in_background();
1076 @@ -2864,7 +2914,7 @@
1077                 if (n_pend_ios < SRV_PEND_IO_THRESHOLD
1078                     && (n_ios - n_ios_old < SRV_RECENT_IO_ACTIVITY)) {
1079                         srv_main_thread_op_info = "doing insert buffer merge";
1080 -                       ibuf_contract_for_n_pages(FALSE, PCT_IO(5));
1081 +                       ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
1082  
1083                         /* Flush logs if needed */
1084                         srv_sync_log_buffer_in_background();
1085 @@ -2881,7 +2931,11 @@
1086                         n_pages_flushed = buf_flush_list(
1087                                 PCT_IO(100), IB_ULONGLONG_MAX);
1088  
1089 -               } else if (srv_adaptive_flushing) {
1090 +                       mutex_enter(&(log_sys->mutex));
1091 +                       lsn_old = log_sys->lsn;
1092 +                       mutex_exit(&(log_sys->mutex));
1093 +                       prev_adaptive_flushing_method = ULINT_UNDEFINED;
1094 +               } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 0) {
1095  
1096                         /* Try to keep the rate of flushing of dirty
1097                         pages such that redo log generation does not
1098 @@ -2897,6 +2951,224 @@
1099                                                 n_flush,
1100                                                 IB_ULONGLONG_MAX);
1101                         }
1102 +
1103 +                       mutex_enter(&(log_sys->mutex));
1104 +                       lsn_old = log_sys->lsn;
1105 +                       mutex_exit(&(log_sys->mutex));
1106 +                       prev_adaptive_flushing_method = ULINT_UNDEFINED;
1107 +               } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 1) {
1108 +
1109 +                       /* Try to keep the modified age from exceeding
1110 +                       the max_checkpoint_age * 7/8 line */
1111 +
1112 +                       mutex_enter(&(log_sys->mutex));
1113 +
1114 +                       oldest_lsn = buf_pool_get_oldest_modification();
1115 +                       if (oldest_lsn == 0) {
1116 +                               lsn_old = log_sys->lsn;
1117 +                               mutex_exit(&(log_sys->mutex));
1118 +
1119 +                       } else {
1120 +                               if ((log_sys->lsn - oldest_lsn)
1121 +                                   > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
1122 +                                       /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
1123 +                                       /* We should not flush from here. */
1124 +                                       lsn_old = log_sys->lsn;
1125 +                                       mutex_exit(&(log_sys->mutex));
1126 +                               } else if ((log_sys->lsn - oldest_lsn)
1127 +                                          > (log_sys->max_checkpoint_age)/4 ) {
1128 +
1129 +                                       /* defence line (max_checkpoint_age * 1/4) */
1130 +                                       ib_uint64_t     lsn = log_sys->lsn;
1131 +
1132 +                                       ib_uint64_t     level, bpl;
1133 +                                       buf_page_t*     bpage;
1134 +                                       ulint           j;
1135 +
1136 +                                       mutex_exit(&(log_sys->mutex));
1137 +
1138 +                                       bpl = 0;
1139 +
1140 +                                       for (j = 0; j < srv_buf_pool_instances; j++) {
1141 +                                               buf_pool_t*     buf_pool;
1142 +                                               ulint           n_blocks;
1143 +
1144 +                                               buf_pool = buf_pool_from_array(j);
1145 +
1146 +                                               /* The flush_list is scanned optimistically here */
1147 +
1148 +                                               level = 0;
1149 +                                               n_blocks = 0;
1150 +                                               bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
1151 +
1152 +                                               while (bpage != NULL) {
1153 +                                                       ib_uint64_t     oldest_modification = bpage->oldest_modification;
1154 +                                                       if (oldest_modification != 0) {
1155 +                                                               level += log_sys->max_checkpoint_age
1156 +                                                                        - (lsn - oldest_modification);
1157 +                                                       }
1158 +                                                       bpage = UT_LIST_GET_NEXT(list, bpage);
1159 +                                                       n_blocks++;
1160 +                                               }
1161 +
1162 +                                               if (level) {
1163 +                                                       bpl += ((ib_uint64_t) n_blocks * n_blocks
1164 +                                                               * (lsn - lsn_old)) / level;
1165 +                                               }
1166 +
1167 +                                       }
1168 +
1169 +                                       if (!srv_use_doublewrite_buf) {
1170 +                                               /* flushing is faster than when doublewrite is in use */
1171 +                                               bpl = (bpl * 7) / 8;
1172 +                                       }
1173 +
1174 +                                       if (bpl) {
1175 +retry_flush_batch:
1176 +                                               n_pages_flushed = buf_flush_list(bpl,
1177 +                                                                       oldest_lsn + (lsn - lsn_old));
1178 +                                               if (n_pages_flushed == ULINT_UNDEFINED) {
1179 +                                                       os_thread_sleep(5000);
1180 +                                                       goto retry_flush_batch;
1181 +                                               }
1182 +                                       }
1183 +
1184 +                                       lsn_old = lsn;
1185 +                                       /*
1186 +                                       fprintf(stderr,
1187 +                                               "InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n",
1188 +                                               (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
1189 +                                               lsn - lsn_old, bpl);
1190 +                                       */
1191 +                               } else {
1192 +                                       lsn_old = log_sys->lsn;
1193 +                                       mutex_exit(&(log_sys->mutex));
1194 +                               }
1195 +                       }
1196 +                       prev_adaptive_flushing_method = 1;
1197 +               } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 2) {
1198 +                       buf_pool_t*     buf_pool;
1199 +                       buf_page_t*     bpage;
1200 +                       ib_uint64_t     lsn;
1201 +                       ulint           j;
1202 +
1203 +                       mutex_enter(&(log_sys->mutex));
1204 +                       oldest_lsn = buf_pool_get_oldest_modification();
1205 +                       lsn = log_sys->lsn;
1206 +                       mutex_exit(&(log_sys->mutex));
1207 +
1208 +                       /* increase the number of loops per second (x10) */
1209 +                       next_itr_time -= 900; /* 1000 - 900 == 100 */
1210 +                       inner_loop++;
1211 +                       if (inner_loop < 10) {
1212 +                               i--;
1213 +                       } else {
1214 +                               inner_loop = 0;
1215 +                       }
1216 +
1217 +                       if (prev_adaptive_flushing_method == 2) {
1218 +                               lint    n_flush;
1219 +                               lint    blocks_sum;
1220 +                               ulint   new_blocks_sum, flushed_blocks_sum;
1221 +
1222 +                               blocks_sum = new_blocks_sum = flushed_blocks_sum = 0;
1223 +
1224 +                               /* prev_flush_info[j] should hold the previous loop's values */
1225 +                               for (j = 0; j < srv_buf_pool_instances; j++) {
1226 +                                       lint    blocks_num, new_blocks_num, flushed_blocks_num;
1227 +                                       ibool   found;
1228 +
1229 +                                       buf_pool = buf_pool_from_array(j);
1230 +
1231 +                                       blocks_num = UT_LIST_GET_LEN(buf_pool->flush_list);
1232 +                                       bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
1233 +                                       new_blocks_num = 0;
1234 +
1235 +                                       found = FALSE;
1236 +                                       while (bpage != NULL) {
1237 +                                               if (prev_flush_info[j].space == bpage->space
1238 +                                                   && prev_flush_info[j].offset == bpage->offset
1239 +                                                   && prev_flush_info[j].oldest_modification
1240 +                                                               == bpage->oldest_modification) {
1241 +                                                       found = TRUE;
1242 +                                                       break;
1243 +                                               }
1244 +                                               bpage = UT_LIST_GET_NEXT(list, bpage);
1245 +                                               new_blocks_num++;
1246 +                                       }
1247 +                                       if (!found) {
1248 +                                               new_blocks_num = blocks_num;
1249 +                                       }
1250 +
1251 +                                       flushed_blocks_num = new_blocks_num + prev_flush_info[j].count
1252 +                                                               - blocks_num;
1253 +                                       if (flushed_blocks_num < 0) {
1254 +                                               flushed_blocks_num = 0;
1255 +                                       }
1256 +
1257 +                                       bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
1258 +
1259 +                                       prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list);
1260 +                                       if (bpage) {
1261 +                                               prev_flush_info[j].space = bpage->space;
1262 +                                               prev_flush_info[j].offset = bpage->offset;
1263 +                                               prev_flush_info[j].oldest_modification = bpage->oldest_modification;
1264 +                                       } else {
1265 +                                               prev_flush_info[j].space = 0;
1266 +                                               prev_flush_info[j].offset = 0;
1267 +                                               prev_flush_info[j].oldest_modification = 0;
1268 +                                       }
1269 +
1270 +                                       new_blocks_sum += new_blocks_num;
1271 +                                       flushed_blocks_sum += flushed_blocks_num;
1272 +                                       blocks_sum += blocks_num;
1273 +                               }
1274 +
1275 +                               n_flush = blocks_sum * (lsn - lsn_old) / log_sys->max_modified_age_async;
1276 +                               if (flushed_blocks_sum > n_pages_flushed_prev) {
1277 +                                       n_flush -= (flushed_blocks_sum - n_pages_flushed_prev);
1278 +                               }
1279 +
1280 +                               if (n_flush > 0) {
1281 +                                       n_flush++;
1282 +                                       n_pages_flushed = buf_flush_list(n_flush, oldest_lsn + (lsn - lsn_old));
1283 +                               } else {
1284 +                                       n_pages_flushed = 0;
1285 +                               }                                       
1286 +                       } else {
1287 +                               /* remember the current first page of each flush_list for the next loop */
1288 +                               for (j = 0; j < srv_buf_pool_instances; j++) {
1289 +                                       buf_pool = buf_pool_from_array(j);
1290 +
1291 +                                       bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
1292 +
1293 +                                       prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list);
1294 +                                       if (bpage) {
1295 +                                               prev_flush_info[j].space = bpage->space;
1296 +                                               prev_flush_info[j].offset = bpage->offset;
1297 +                                               prev_flush_info[j].oldest_modification = bpage->oldest_modification;
1298 +                                       } else {
1299 +                                               prev_flush_info[j].space = 0;
1300 +                                               prev_flush_info[j].offset = 0;
1301 +                                               prev_flush_info[j].oldest_modification = 0;
1302 +                                       }
1303 +                               }
1304 +                               n_pages_flushed = 0;
1305 +                       }
1306 +
1307 +                       lsn_old = lsn;
1308 +                       prev_adaptive_flushing_method = 2;
1309 +               } else {
1310 +                       mutex_enter(&(log_sys->mutex));
1311 +                       lsn_old = log_sys->lsn;
1312 +                       mutex_exit(&(log_sys->mutex));
1313 +                       prev_adaptive_flushing_method = ULINT_UNDEFINED;
1314 +               }
1315 +
1316 +               if (n_pages_flushed == ULINT_UNDEFINED) {
1317 +                       n_pages_flushed_prev = 0;
1318 +               } else {
1319 +                       n_pages_flushed_prev = n_pages_flushed;
1320                 }
1321  
1322                 if (srv_activity_count == old_activity_count) {
1323 @@ -2945,12 +3217,12 @@
1324         even if the server were active */
1325  
1326         srv_main_thread_op_info = "doing insert buffer merge";
1327 -       ibuf_contract_for_n_pages(FALSE, PCT_IO(5));
1328 +       ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
1329  
1330         /* Flush logs if needed */
1331         srv_sync_log_buffer_in_background();
1332  
1333 -       if (srv_n_purge_threads == 0) {
1334 +       if (srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0)) {
1335                 srv_main_thread_op_info = "master purging";
1336  
1337                 srv_master_do_purge();
1338 @@ -3028,7 +3300,7 @@
1339                 }
1340         }
1341  
1342 -       if (srv_n_purge_threads == 0) {
1343 +       if (srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0)) {
1344                 srv_main_thread_op_info = "master purging";
1345  
1346                 srv_master_do_purge();
1347 @@ -3053,7 +3325,7 @@
1348                 buf_flush_list below. Otherwise, the system favors
1349                 clean pages over cleanup throughput. */
1350                 n_bytes_merged = ibuf_contract_for_n_pages(FALSE,
1351 -                                                          PCT_IO(100));
1352 +                                                          PCT_IBUF_IO(100));
1353         }
1354  
1355         srv_main_thread_op_info = "reserving kernel mutex";
1356 @@ -3193,6 +3465,7 @@
1357         srv_slot_t*     slot;
1358         ulint           retries = 0;
1359         ulint           n_total_purged = ULINT_UNDEFINED;
1360 +       ulint           next_itr_time;
1361  
1362         ut_a(srv_n_purge_threads == 1);
1363  
1364 @@ -3213,9 +3486,12 @@
1365  
1366         mutex_exit(&kernel_mutex);
1367  
1368 +       next_itr_time = ut_time_ms();
1369 +
1370         while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) {
1371  
1372                 ulint   n_pages_purged = 0;
1373 +               ulint   cur_time;
1374  
1375                 /* If there are very few records to purge or the last
1376                 purge didn't purge any records then wait for activity.
1377 @@ -3262,6 +3538,16 @@
1378                 } while (n_pages_purged > 0 && !srv_fast_shutdown);
1379  
1380                 srv_sync_log_buffer_in_background();
1381 +
1382 +               cur_time = ut_time_ms();
1383 +               if (next_itr_time > cur_time) {
1384 +                       os_thread_sleep(ut_min(1000000,
1385 +                                       (next_itr_time - cur_time)
1386 +                                        * 1000));
1387 +                       next_itr_time = ut_time_ms() + 1000;
1388 +               } else {
1389 +                       next_itr_time = cur_time + 1000;
1390 +               }
1391         }
1392  
1393         mutex_enter(&kernel_mutex);
1394 --- a/storage/innobase/srv/srv0start.c
1395 +++ b/storage/innobase/srv/srv0start.c
1396 @@ -1237,6 +1237,9 @@
1397         } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) {
1398                 srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
1399  
1400 +       } else if (0 == ut_strcmp(srv_file_flush_method_str, "ALL_O_DIRECT")) {
1401 +               srv_unix_file_flush_method = SRV_UNIX_ALL_O_DIRECT;
1402 +
1403         } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) {
1404                 srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
1405  
1406 --- a/storage/innobase/trx/trx0purge.c
1407 +++ b/storage/innobase/trx/trx0purge.c
1408 @@ -392,10 +392,10 @@
1409         trx_sys->rseg_history_len++;
1410         mutex_exit(&kernel_mutex);
1411  
1412 -       if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) {
1413 +//     if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) { /*should wake up always*/
1414                 /* Inform the purge thread that there is work to do. */
1415                 srv_wake_purge_thread_if_not_active();
1416 -       }
1417 +//     }
1418  }
1419  
1420  /**********************************************************************//**
1421 --- a/storage/innobase/trx/trx0trx.c
1422 +++ b/storage/innobase/trx/trx0trx.c
1423 @@ -984,6 +984,7 @@
1424         trx->read_view = NULL;
1425  
1426         if (lsn) {
1427 +               ulint   flush_log_at_trx_commit;
1428  
1429                 mutex_exit(&kernel_mutex);
1430  
1431 @@ -992,6 +993,12 @@
1432                         trx_undo_insert_cleanup(trx);
1433                 }
1434  
1435 +               if (srv_use_global_flush_log_at_trx_commit) {
1436 +                       flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
1437 +               } else {
1438 +                       flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
1439 +               }
1440 +
1441                 /* NOTE that we could possibly make a group commit more
1442                 efficient here: call os_thread_yield here to allow also other
1443                 trxs to come to commit! */
1444 @@ -1023,9 +1030,9 @@
1445                 if (trx->flush_log_later) {
1446                         /* Do nothing yet */
1447                         trx->must_flush_log_later = TRUE;
1448 -               } else if (srv_flush_log_at_trx_commit == 0) {
1449 +               } else if (flush_log_at_trx_commit == 0) {
1450                         /* Do nothing */
1451 -               } else if (srv_flush_log_at_trx_commit == 1) {
1452 +               } else if (flush_log_at_trx_commit == 1) {
1453                         if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1454                                 /* Write the log but do not flush it to disk */
1455  
1456 @@ -1037,7 +1044,7 @@
1457  
1458                                 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1459                         }
1460 -               } else if (srv_flush_log_at_trx_commit == 2) {
1461 +               } else if (flush_log_at_trx_commit == 2) {
1462  
1463                         /* Write the log but do not flush it to disk */
1464  
1465 @@ -1701,16 +1708,23 @@
1466         trx_t*  trx)    /*!< in: trx handle */
1467  {
1468         ib_uint64_t     lsn     = trx->commit_lsn;
1469 +       ulint           flush_log_at_trx_commit;
1470  
1471         ut_a(trx);
1472  
1473         trx->op_info = "flushing log";
1474  
1475 +       if (srv_use_global_flush_log_at_trx_commit) {
1476 +               flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
1477 +       } else {
1478 +               flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
1479 +       }
1480 +
1481         if (!trx->must_flush_log_later) {
1482                 /* Do nothing */
1483 -       } else if (srv_flush_log_at_trx_commit == 0) {
1484 +       } else if (flush_log_at_trx_commit == 0) {
1485                 /* Do nothing */
1486 -       } else if (srv_flush_log_at_trx_commit == 1) {
1487 +       } else if (flush_log_at_trx_commit == 1) {
1488                 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1489                         /* Write the log but do not flush it to disk */
1490  
1491 @@ -1721,7 +1735,7 @@
1492  
1493                         log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1494                 }
1495 -       } else if (srv_flush_log_at_trx_commit == 2) {
1496 +       } else if (flush_log_at_trx_commit == 2) {
1497  
1498                 /* Write the log but do not flush it to disk */
1499  
1500 @@ -1969,6 +1983,8 @@
1501         /*--------------------------------------*/
1502  
1503         if (lsn) {
1504 +               ulint   flush_log_at_trx_commit;
1505 +
1506                 /* Depending on the my.cnf options, we may now write the log
1507                 buffer to the log files, making the prepared state of the
1508                 transaction durable if the OS does not crash. We may also
1509 @@ -1988,9 +2004,15 @@
1510  
1511                 mutex_exit(&kernel_mutex);
1512  
1513 -               if (srv_flush_log_at_trx_commit == 0) {
1514 +               if (srv_use_global_flush_log_at_trx_commit) {
1515 +                       flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
1516 +               } else {
1517 +                       flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
1518 +               }
1519 +
1520 +               if (flush_log_at_trx_commit == 0) {
1521                         /* Do nothing */
1522 -               } else if (srv_flush_log_at_trx_commit == 1) {
1523 +               } else if (flush_log_at_trx_commit == 1) {
1524                         if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1525                                 /* Write the log but do not flush it to disk */
1526  
1527 @@ -2002,7 +2024,7 @@
1528  
1529                                 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1530                         }
1531 -               } else if (srv_flush_log_at_trx_commit == 2) {
1532 +               } else if (flush_log_at_trx_commit == 2) {
1533  
1534                         /* Write the log but do not flush it to disk */
1535  
1536 --- a/mysql-test/include/default_mysqld.cnf
1537 +++ b/mysql-test/include/default_mysqld.cnf
1538 @@ -29,7 +29,7 @@
1539  max_heap_table_size=        1M
1540  
1541  loose-innodb_data_file_path=      ibdata1:10M:autoextend
1542 -loose-innodb_buffer_pool_size=    8M
1543 +loose-innodb_buffer_pool_size=    32M
1544  loose-innodb_write_io_threads=    2
1545  loose-innodb_read_io_threads=     2
1546  loose-innodb_log_buffer_size=     1M
1547 --- a/mysql-test/suite/innodb/r/innodb.result
1548 +++ b/mysql-test/suite/innodb/r/innodb.result
1549 @@ -1678,7 +1678,7 @@
1550  drop table t1;
1551  SELECT variable_value FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_buffer_pool_pages_total';
1552  variable_value
1553 -511
1554 +2047
1555  SELECT variable_value FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_page_size';
1556  variable_value
1557  16384
1558 --- /dev/null
1559 +++ b/mysql-test/suite/innodb/r/percona_flush_contiguous_neighbors.result
1560 @@ -0,0 +1,21 @@
1561 +DROP TABLE IF EXISTS t1;
1562 +CREATE TABLE t1 (id INT AUTO_INCREMENT, foo CHAR(255), PRIMARY KEY (id)) ENGINE=InnoDB;
1563 +INSERT INTO t1(foo) VALUES ('a'), ('b');
1564 +INSERT INTO t1(foo) SELECT foo FROM t1;
1565 +INSERT INTO t1(foo) SELECT foo FROM t1;
1566 +INSERT INTO t1(foo) SELECT foo FROM t1;
1567 +INSERT INTO t1(foo) SELECT foo FROM t1;
1568 +INSERT INTO t1(foo) SELECT foo FROM t1;
1569 +INSERT INTO t1(foo) SELECT foo FROM t1;
1570 +INSERT INTO t1(foo) SELECT foo FROM t1;
1571 +INSERT INTO t1(foo) SELECT foo FROM t1;
1572 +INSERT INTO t1(foo) SELECT foo FROM t1;
1573 +INSERT INTO t1(foo) SELECT foo FROM t1;
1574 +INSERT INTO t1(foo) SELECT foo FROM t1;
1575 +INSERT INTO t1(foo) SELECT foo FROM t1;
1576 +INSERT INTO t1(foo) SELECT foo FROM t1;
1577 +INSERT INTO t1(foo) SELECT foo FROM t1;
1578 +INSERT INTO t1(foo) SELECT foo FROM t1;
1579 +INSERT INTO t1(foo) SELECT foo FROM t1;
1580 +INSERT INTO t1(foo) SELECT foo FROM t1;
1581 +DROP TABLE t1;
1582 --- /dev/null
1583 +++ b/mysql-test/suite/innodb/t/percona_flush_contiguous_neighbors-master.opt
1584 @@ -0,0 +1 @@
1585 +--innodb_flush_neighbor_pages=cont
1586 --- /dev/null
1587 +++ b/mysql-test/suite/innodb/t/percona_flush_contiguous_neighbors.test
1588 @@ -0,0 +1,36 @@
1589 +# Test for innodb_flush_neighbor_pages=cont (contiguous neighbor flushing).
1590 +# The test is very crude: we simply overflow the buffer pool with so many
1591 +# new/modified pages that some flushing is bound to happen.
1592 +
1593 +--source include/have_innodb.inc
1594 +
1595 +--disable_warnings
1596 +DROP TABLE IF EXISTS t1;
1597 +--enable_warnings
1598 +
1599 +CREATE TABLE t1 (id INT AUTO_INCREMENT, foo CHAR(255), PRIMARY KEY (id)) ENGINE=InnoDB;
1600 +
1601 +INSERT INTO t1(foo) VALUES ('a'), ('b');
1602 +INSERT INTO t1(foo) SELECT foo FROM t1;
1603 +INSERT INTO t1(foo) SELECT foo FROM t1;
1604 +INSERT INTO t1(foo) SELECT foo FROM t1;
1605 +INSERT INTO t1(foo) SELECT foo FROM t1;
1606 +INSERT INTO t1(foo) SELECT foo FROM t1;
1607 +INSERT INTO t1(foo) SELECT foo FROM t1;
1608 +INSERT INTO t1(foo) SELECT foo FROM t1;
1609 +INSERT INTO t1(foo) SELECT foo FROM t1;
1610 +INSERT INTO t1(foo) SELECT foo FROM t1;
1611 +INSERT INTO t1(foo) SELECT foo FROM t1;
1612 +INSERT INTO t1(foo) SELECT foo FROM t1;
1613 +INSERT INTO t1(foo) SELECT foo FROM t1;
1614 +INSERT INTO t1(foo) SELECT foo FROM t1;
1615 +INSERT INTO t1(foo) SELECT foo FROM t1;
1616 +INSERT INTO t1(foo) SELECT foo FROM t1;
1617 +INSERT INTO t1(foo) SELECT foo FROM t1;
1618 +INSERT INTO t1(foo) SELECT foo FROM t1;
1619 +
1620 +# TODO: cannot record a stable value here.  A check of > 0 should be enough,
1621 +# but the variable is not accessible through INFORMATION_SCHEMA currently.
1622 +# SHOW GLOBAL STATUS LIKE 'Innodb_buffer_pool_pages_flushed';
1623 +
1624 +DROP TABLE t1;
1625 --- a/mysql-test/suite/innodb/t/innodb_cmp_drop_table-master.opt
1626 +++ b/mysql-test/suite/innodb/t/innodb_cmp_drop_table-master.opt
1627 @@ -1 +1 @@
1628 ---innodb-buffer-pool-size=8M
1629 +--innodb-buffer-pool-size=32M
1630 --- a/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test
1631 +++ b/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test
1632 @@ -36,13 +36,14 @@
1633  
1634  -- disable_query_log
1635  
1636 --- let $i = 400
1637 +-- let $i = 4000
1638 +begin;
1639  while ($i)
1640  {
1641    insert into t2 values(repeat('abcdefghijklmnopqrstuvwxyz',1000));
1642    dec $i;
1643  }
1644 -
1645 +commit;
1646  -- enable_query_log
1647  
1648  # now there should be no 8K pages in the buffer pool
This page took 3.383458 seconds and 3 git commands to generate.