git.pld-linux.org Git - packages/mysql.git/blob - innodb_io_patches.patch
- rel 3; percona fixes
[packages/mysql.git] / innodb_io_patches.patch
1 # name       : innodb_io_patches.patch
2 # introduced : 11 or before
3 # maintainer : Yasufumi
4 #
5 #!!! notice !!!
6 # Any small change to this file in the main branch
7 # should be done or reviewed by the maintainer!
8 diff -ruN a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c
9 --- a/storage/innobase/buf/buf0buf.c    2010-12-03 15:09:51.273986410 +0900
10 +++ b/storage/innobase/buf/buf0buf.c    2010-12-03 15:10:08.934990091 +0900
11 @@ -320,6 +320,7 @@
12  
13         /* When we traverse all the flush lists we don't want another
14         thread to add a dirty page to any flush list. */
15 +       if (srv_buf_pool_instances > 1)
16         log_flush_order_mutex_enter();
17  
18         for (i = 0; i < srv_buf_pool_instances; i++) {
19 @@ -343,6 +344,7 @@
20                 }
21         }
22  
23 +       if (srv_buf_pool_instances > 1)
24         log_flush_order_mutex_exit();
25  
26         /* The returned answer may be out of date: the flush_list can
27 diff -ruN a/storage/innobase/buf/buf0flu.c b/storage/innobase/buf/buf0flu.c
28 --- a/storage/innobase/buf/buf0flu.c    2010-11-03 07:01:13.000000000 +0900
29 +++ b/storage/innobase/buf/buf0flu.c    2010-12-03 15:10:08.934990091 +0900
30 @@ -855,7 +855,7 @@
31  flush:
32         /* Now flush the doublewrite buffer data to disk */
33  
34 -       fil_flush(TRX_SYS_SPACE);
35 +       fil_flush(TRX_SYS_SPACE, FALSE);
36  
37         /* We know that the writes have been flushed to disk now
38         and in recovery we will find them in the doublewrite buffer
39 @@ -1376,7 +1376,7 @@
40  
41         ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
42  
43 -       if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
44 +       if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN || !srv_flush_neighbor_pages) {
45                 /* If there is little space, it is better not to flush
46                 any block except from the end of the LRU list */
47  
48 diff -ruN a/storage/innobase/buf/buf0rea.c b/storage/innobase/buf/buf0rea.c
49 --- a/storage/innobase/buf/buf0rea.c    2010-11-03 07:01:13.000000000 +0900
50 +++ b/storage/innobase/buf/buf0rea.c    2010-12-03 15:10:08.937050537 +0900
51 @@ -260,6 +260,10 @@
52                 = BUF_READ_AHEAD_LINEAR_AREA(buf_pool);
53         ulint           threshold;
54  
55 +       if (!(srv_read_ahead & 2)) {
56 +               return(0);
57 +       }
58 +
59         if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
60                 /* No read-ahead to avoid thread deadlocks */
61                 return(0);
62 diff -ruN a/storage/innobase/fil/fil0fil.c b/storage/innobase/fil/fil0fil.c
63 --- a/storage/innobase/fil/fil0fil.c    2011-06-29 17:48:24.797971571 +0900
64 +++ b/storage/innobase/fil/fil0fil.c    2011-06-29 18:04:02.548053286 +0900
65 @@ -2600,7 +2600,7 @@
66  
67                 os_thread_sleep(20000);
68  
69 -               fil_flush(id);
70 +               fil_flush(id, TRUE);
71  
72                 goto retry;
73  
74 @@ -2814,7 +2814,7 @@
75                 goto error_exit;
76         }
77  
78 -       ret = os_file_flush(file);
79 +       ret = os_file_flush(file, TRUE);
80  
81         if (!ret) {
82                 fputs("InnoDB: Error: file flush of tablespace ", stderr);
83 @@ -3000,7 +3000,7 @@
84                 }
85         }
86  
87 -       success = os_file_flush(file);
88 +       success = os_file_flush(file, TRUE);
89         if (!success) {
90  
91                 goto func_exit;
92 @@ -3022,7 +3022,7 @@
93  
94                 goto func_exit;
95         }
96 -       success = os_file_flush(file);
97 +       success = os_file_flush(file, TRUE);
98  func_exit:
99         os_file_close(file);
100         ut_free(buf2);
101 @@ -4005,7 +4005,7 @@
102         size_after_extend, *actual_size); */
103         mutex_exit(&fil_system->mutex);
104  
105 -       fil_flush(space_id);
106 +       fil_flush(space_id, TRUE);
107  
108         return(success);
109  }
110 @@ -4576,8 +4576,9 @@
111  void
112  fil_flush(
113  /*======*/
114 -       ulint   space_id)       /*!< in: file space id (this can be a group of
115 +       ulint   space_id,       /*!< in: file space id (this can be a group of
116                                 log files or a tablespace of the database) */
117 +       ibool   metadata)
118  {
119         fil_space_t*    space;
120         fil_node_t*     node;
121 @@ -4648,7 +4649,7 @@
122                         /* fprintf(stderr, "Flushing to file %s\n",
123                         node->name); */
124  
125 -                       os_file_flush(file);
126 +                       os_file_flush(file, metadata);
127  
128                         mutex_enter(&fil_system->mutex);
129  
130 @@ -4731,7 +4732,7 @@
131         a non-existing space id. */
132         for (i = 0; i < n_space_ids; i++) {
133  
134 -               fil_flush(space_ids[i]);
135 +               fil_flush(space_ids[i], TRUE);
136         }
137  
138         mem_free(space_ids);
139 diff -ruN a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
140 --- a/storage/innobase/handler/ha_innodb.cc     2010-12-03 15:09:51.283956391 +0900
141 +++ b/storage/innobase/handler/ha_innodb.cc     2010-12-03 15:10:08.963980444 +0900
142 @@ -444,6 +444,12 @@
143    "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
144    NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0);
145  
146 +static MYSQL_THDVAR_ULONG(flush_log_at_trx_commit, PLUGIN_VAR_OPCMDARG,
147 +  "Set to 0 (write and flush once per second),"
148 +  " 1 (write and flush at each commit)"
149 +  " or 2 (write at commit, flush once per second).",
150 +  NULL, NULL, 1, 0, 2, 0);
151 +
152  
153  static handler *innobase_create_handler(handlerton *hton,
154                                          TABLE_SHARE *table,
155 @@ -838,6 +844,17 @@
156         }
157  }
158  
159 +/******************************************************************//**
160 +Returns the flush_log_at_trx_commit THDVAR read through the given thread handle. */
161 +extern "C" UNIV_INTERN
162 +ulong
163 +thd_flush_log_at_trx_commit(
164 +/*================================*/
165 +       void*   thd)    /*!< in: thread handle, cast to THD* below */
166 +{
167 +       return(THDVAR((THD*) thd, flush_log_at_trx_commit));
168 +}
169 +
170  /********************************************************************//**
171  Obtain the InnoDB transaction of a MySQL thread.
172  @return        reference to transaction pointer */
173 @@ -2437,6 +2454,9 @@
174         srv_n_read_io_threads = (ulint) innobase_read_io_threads;
175         srv_n_write_io_threads = (ulint) innobase_write_io_threads;
176  
177 +       srv_read_ahead &= 3;
178 +       srv_adaptive_flushing_method %= 3;
179 +
180         srv_force_recovery = (ulint) innobase_force_recovery;
181  
182         srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
183 @@ -11025,7 +11045,7 @@
184    PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
185    "Purge threads can be either 0 or 1.",
186    NULL, NULL,
187 -  0,                   /* Default setting */
188 +  1,                   /* Default setting */
189    0,                   /* Minimum value */
190    1, 0);               /* Maximum value */
191  
192 @@ -11067,12 +11087,18 @@
193    innodb_file_format_max_validate,
194    innodb_file_format_max_update, "Antelope");
195  
196 -static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
197 -  PLUGIN_VAR_OPCMDARG,
198 -  "Set to 0 (write and flush once per second),"
199 -  " 1 (write and flush at each commit)"
200 -  " or 2 (write at commit, flush once per second).",
201 -  NULL, NULL, 1, 0, 2, 0);
202 +/* Changed to the THDVAR */
203 +//static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
204 +//  PLUGIN_VAR_OPCMDARG,
205 +//  "Set to 0 (write and flush once per second),"
206 +//  " 1 (write and flush at each commit)"
207 +//  " or 2 (write at commit, flush once per second).",
208 +//  NULL, NULL, 1, 0, 2, 0);
209 +
210 +static MYSQL_SYSVAR_BOOL(use_global_flush_log_at_trx_commit, srv_use_global_flush_log_at_trx_commit,
211 +  PLUGIN_VAR_NOCMDARG,
212 +  "Use global innodb_flush_log_at_trx_commit value. (default: ON).",
213 +  NULL, NULL, TRUE);
214  
215  static MYSQL_SYSVAR_STR(flush_method, innobase_file_flush_method,
216    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
217 @@ -11167,7 +11193,7 @@
218  static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size,
219    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
220    "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
221 -  NULL, NULL, 128*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L);
222 +  NULL, NULL, 128*1024*1024L, 32*1024*1024L, LONGLONG_MAX, 1024*1024L);
223  
224  static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
225    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
226 @@ -11319,6 +11345,95 @@
227    "trigger a readahead.",
228    NULL, NULL, 56, 0, 64, 0);
229  
230 +static MYSQL_SYSVAR_LONGLONG(ibuf_max_size, srv_ibuf_max_size,
231 +  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
232 +  "The maximum size of the insert buffer. (in bytes)",
233 +  NULL, NULL, LONGLONG_MAX, 0, LONGLONG_MAX, 0);
234 +
235 +static MYSQL_SYSVAR_ULONG(ibuf_active_contract, srv_ibuf_active_contract,
236 +  PLUGIN_VAR_RQCMDARG,
237 +  "Enable/Disable active_contract of insert buffer. 0:disable 1:enable",
238 +  NULL, NULL, 1, 0, 1, 0);
239 +
240 +static MYSQL_SYSVAR_ULONG(ibuf_accel_rate, srv_ibuf_accel_rate,
241 +  PLUGIN_VAR_RQCMDARG,
242 +  "Tunes amount of insert buffer processing of background, in addition to innodb_io_capacity. (in percentage)",
243 +  NULL, NULL, 100, 100, 999999999, 0);
244 +
245 +static MYSQL_SYSVAR_ULONG(checkpoint_age_target, srv_checkpoint_age_target,
246 +  PLUGIN_VAR_RQCMDARG,
247 +  "Control soft limit of checkpoint age. (0 : not control)",
248 +  NULL, NULL, 0, 0, ~0UL, 0);
249 +
250 +static MYSQL_SYSVAR_ULONG(flush_neighbor_pages, srv_flush_neighbor_pages,
251 +  PLUGIN_VAR_RQCMDARG,
252 +  "Enable/Disable flushing also neighbor pages. 0:disable 1:enable",
253 +  NULL, NULL, 1, 0, 1, 0);
254 +
255 +static
256 +void
257 +innodb_read_ahead_update(
258 +  THD* thd, /* unused in this function */
259 +  struct st_mysql_sys_var*     var, /* unused in this function */
260 +  void*        var_ptr, /* out: location the masked value is stored to */
261 +  const void*  save) /* in: new value, read as a long (presumably the typelib index -- confirm) */
262 +{
263 +  *(long *)var_ptr= (*(long *)save) & 3; /* keep the low two bits so entries 4..7 ("0".."3") map onto 0..3 */
264 +}
265 +const char *read_ahead_names[]=
266 +{
267 +  "none", /* 0 */
268 +  "random",
269 +  "linear",
270 +  "both", /* 3 */
271 +  /* For compatibility with the older patch */
272 +  "0", /* 4 ("none" + 4) */
273 +  "1",
274 +  "2",
275 +  "3", /* 7 ("both" + 4) */
276 +  NullS
277 +};
278 +TYPELIB read_ahead_typelib=
279 +{
280 +  array_elements(read_ahead_names) - 1, "read_ahead_typelib",
281 +  read_ahead_names, NULL
282 +};
283 +static MYSQL_SYSVAR_ENUM(read_ahead, srv_read_ahead,
284 +  PLUGIN_VAR_RQCMDARG,
285 +  "Control read ahead activity (none, random, [linear], both). [from 1.0.5: random read ahead is ignored]",
286 +  NULL, innodb_read_ahead_update, 2, &read_ahead_typelib);
287 +
288 +static
289 +void
290 +innodb_adaptive_flushing_method_update(
291 +  THD* thd, /* unused in this function */
292 +  struct st_mysql_sys_var*     var, /* unused in this function */
293 +  void*        var_ptr, /* out: location the reduced value is stored to */
294 +  const void*  save) /* in: new value, read as a long (presumably the typelib index -- confirm) */
295 +{
296 +  *(long *)var_ptr= (*(long *)save) % 3; /* three real methods: fold legacy entries 3..5 ("0".."2") onto 0..2, like the "%= 3" at startup */
297 +}
298 +const char *adaptive_flushing_method_names[]=
299 +{
300 +  "native", /* 0 */
301 +  "estimate", /* 1 */
302 +  "keep_average", /* 2 */
303 +  /* For compatibility with the older patch */
304 +  "0", /* 3 ("native" + 3) */
305 +  "1", /* 4 ("estimate" + 3) */
306 +  "2", /* 5 ("keep_average" + 3) */
307 +  NullS
308 +};
309 +TYPELIB adaptive_flushing_method_typelib=
310 +{
311 +  array_elements(adaptive_flushing_method_names) - 1, "adaptive_flushing_method_typelib",
312 +  adaptive_flushing_method_names, NULL
313 +};
314 +static MYSQL_SYSVAR_ENUM(adaptive_flushing_method, srv_adaptive_flushing_method,
315 +  PLUGIN_VAR_RQCMDARG,
316 +  "Choose method of innodb_adaptive_flushing. (native, [estimate], keep_average)",
317 +  NULL, innodb_adaptive_flushing_method_update, 1, &adaptive_flushing_method_typelib);
318 +
319  static struct st_mysql_sys_var* innobase_system_variables[]= {
320    MYSQL_SYSVAR(additional_mem_pool_size),
321    MYSQL_SYSVAR(autoextend_increment),
322 @@ -11339,6 +11454,7 @@
323    MYSQL_SYSVAR(file_format_check),
324    MYSQL_SYSVAR(file_format_max),
325    MYSQL_SYSVAR(flush_log_at_trx_commit),
326 +  MYSQL_SYSVAR(use_global_flush_log_at_trx_commit),
327    MYSQL_SYSVAR(flush_method),
328    MYSQL_SYSVAR(force_recovery),
329    MYSQL_SYSVAR(locks_unsafe_for_binlog),
330 @@ -11376,6 +11492,13 @@
331    MYSQL_SYSVAR(show_verbose_locks),
332    MYSQL_SYSVAR(show_locks_held),
333    MYSQL_SYSVAR(version),
334 +  MYSQL_SYSVAR(ibuf_max_size),
335 +  MYSQL_SYSVAR(ibuf_active_contract),
336 +  MYSQL_SYSVAR(ibuf_accel_rate),
337 +  MYSQL_SYSVAR(checkpoint_age_target),
338 +  MYSQL_SYSVAR(flush_neighbor_pages),
339 +  MYSQL_SYSVAR(read_ahead),
340 +  MYSQL_SYSVAR(adaptive_flushing_method),
341    MYSQL_SYSVAR(use_sys_malloc),
342    MYSQL_SYSVAR(use_native_aio),
343    MYSQL_SYSVAR(change_buffering),
344 diff -ruN a/storage/innobase/ibuf/ibuf0ibuf.c b/storage/innobase/ibuf/ibuf0ibuf.c
345 --- a/storage/innobase/ibuf/ibuf0ibuf.c 2010-11-03 07:01:13.000000000 +0900
346 +++ b/storage/innobase/ibuf/ibuf0ibuf.c 2010-12-03 15:10:09.073984282 +0900
347 @@ -514,8 +514,10 @@
348         grow in size, as the references on the upper levels of the tree can
349         change */
350  
351 -       ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE
352 -               / IBUF_POOL_SIZE_PER_MAX_SIZE;
353 +       ibuf->max_size = ut_min( buf_pool_get_curr_size() / UNIV_PAGE_SIZE
354 +               / IBUF_POOL_SIZE_PER_MAX_SIZE, (ulint) srv_ibuf_max_size / UNIV_PAGE_SIZE);
355 +
356 +       srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE;
357  
358         mutex_create(ibuf_pessimistic_insert_mutex_key,
359                      &ibuf_pessimistic_insert_mutex,
360 @@ -2753,9 +2755,11 @@
361         size = ibuf->size;
362         max_size = ibuf->max_size;
363  
364 +       if (!srv_ibuf_active_contract) {
365         if (size < max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
366                 return;
367         }
368 +       }
369  
370         sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC);
371  
372 diff -ruN a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
373 --- a/storage/innobase/include/buf0rea.h        2010-11-03 07:01:13.000000000 +0900
374 +++ b/storage/innobase/include/buf0rea.h        2010-12-03 15:10:09.076066335 +0900
375 @@ -124,8 +124,7 @@
376  
377  /** The size in pages of the area which the read-ahead algorithms read if
378  invoked */
379 -#define        BUF_READ_AHEAD_AREA(b)                                  \
380 -       ut_min(64, ut_2_power_up((b)->curr_size / 32))
381 +#define        BUF_READ_AHEAD_AREA(b)          64
382  
383  /** @name Modes used in read-ahead @{ */
384  /** read only pages belonging to the insert buffer tree */
385 diff -ruN a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
386 --- a/storage/innobase/include/fil0fil.h        2011-06-29 17:48:24.818969583 +0900
387 +++ b/storage/innobase/include/fil0fil.h        2011-06-29 17:58:49.215971540 +0900
388 @@ -658,8 +658,9 @@
389  void
390  fil_flush(
391  /*======*/
392 -       ulint   space_id);      /*!< in: file space id (this can be a group of
393 +       ulint   space_id,       /*!< in: file space id (this can be a group of
394                                 log files or a tablespace of the database) */
395 +       ibool   metadata);
396  /**********************************************************************//**
397  Flushes to disk writes in file spaces of the given type possibly cached by
398  the OS. */
399 diff -ruN a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
400 --- a/storage/innobase/include/ha_prototypes.h  2010-11-03 07:01:13.000000000 +0900
401 +++ b/storage/innobase/include/ha_prototypes.h  2010-12-03 15:10:09.078026360 +0900
402 @@ -284,6 +284,13 @@
403  /*===================*/
404          void*   thd,   /*!< in: thread handle (THD*) */
405          ulint   value);        /*!< in: time waited for the lock */
406 +/******************************************************************//**
407 +Returns the flush_log_at_trx_commit THDVAR read through the given thread handle. */
408 +
409 +ulong
410 +thd_flush_log_at_trx_commit(
411 +/*================================*/
412 +       void*   thd);   /*!< in: thread handle (THD*) */
413  
414  /**********************************************************************//**
415  Get the current setting of the lower_case_table_names global parameter from
416 diff -ruN a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
417 --- a/storage/innobase/include/os0file.h        2011-05-11 20:54:12.000000000 +0900
418 +++ b/storage/innobase/include/os0file.h        2011-06-29 17:55:21.947041132 +0900
419 @@ -296,8 +296,8 @@
420         pfs_os_file_write_func(name, file, buf, offset, offset_high,    \
421                                n, __FILE__, __LINE__)
422  
423 -# define os_file_flush(file)                                           \
424 -       pfs_os_file_flush_func(file, __FILE__, __LINE__)
425 +# define os_file_flush(file, metadata)                                 \
426 +       pfs_os_file_flush_func(file, metadata, __FILE__, __LINE__)
427  
428  # define os_file_rename(key, oldpath, newpath)                         \
429         pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__)
430 @@ -333,7 +333,7 @@
431  # define os_file_write(name, file, buf, offset, offset_high, n)                \
432         os_file_write_func(name, file, buf, offset, offset_high, n)
433  
434 -# define os_file_flush(file)   os_file_flush_func(file)
435 +# define os_file_flush(file, metadata) os_file_flush_func(file, metadata)
436  
437  # define os_file_rename(key, oldpath, newpath)                         \
438         os_file_rename_func(oldpath, newpath)
439 @@ -781,6 +781,7 @@
440  pfs_os_file_flush_func(
441  /*===================*/
442         os_file_t       file,   /*!< in, own: handle to a file */
443 +       ibool           metadata,
444         const char*     src_file,/*!< in: file name where func invoked */
445         ulint           src_line);/*!< in: line where the func invoked */
446  
447 @@ -860,7 +861,8 @@
448  ibool
449  os_file_flush_func(
450  /*===============*/
451 -       os_file_t       file);  /*!< in, own: handle to a file */
452 +       os_file_t       file,   /*!< in, own: handle to a file */
453 +       ibool           metadata);
454  /***********************************************************************//**
455  Retrieves the last error number if an error occurs in a file io function.
456  The number should be retrieved before any other OS calls (because they may
457 diff -ruN a/storage/innobase/include/os0file.ic b/storage/innobase/include/os0file.ic
458 --- a/storage/innobase/include/os0file.ic       2011-05-11 20:54:12.000000000 +0900
459 +++ b/storage/innobase/include/os0file.ic       2011-06-29 17:56:01.510958172 +0900
460 @@ -369,6 +369,7 @@
461  pfs_os_file_flush_func(
462  /*===================*/
463         os_file_t       file,   /*!< in, own: handle to a file */
464 +       ibool           metadata,
465         const char*     src_file,/*!< in: file name where func invoked */
466         ulint           src_line)/*!< in: line where the func invoked */
467  {
468 @@ -378,7 +379,7 @@
469  
470         register_pfs_file_io_begin(&state, locker, file, 0, PSI_FILE_SYNC,
471                                    src_file, src_line);
472 -       result = os_file_flush_func(file);
473 +       result = os_file_flush_func(file, metadata);
474  
475         register_pfs_file_io_end(locker, 0);
476  
477 diff -ruN a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
478 --- a/storage/innobase/include/srv0srv.h        2010-12-03 15:09:51.291955835 +0900
479 +++ b/storage/innobase/include/srv0srv.h        2010-12-03 15:10:09.079029047 +0900
480 @@ -138,7 +138,8 @@
481  extern ulint   srv_n_log_files;
482  extern ulint   srv_log_file_size;
483  extern ulint   srv_log_buffer_size;
484 -extern ulong   srv_flush_log_at_trx_commit;
485 +//extern ulong srv_flush_log_at_trx_commit;
486 +extern char    srv_use_global_flush_log_at_trx_commit;
487  extern char    srv_adaptive_flushing;
488  
489  
490 @@ -216,6 +217,16 @@
491  extern ulong   srv_max_purge_lag;
492  
493  extern ulong   srv_replication_delay;
494 +
495 +extern long long       srv_ibuf_max_size;
496 +extern ulint   srv_ibuf_active_contract;
497 +extern ulint   srv_ibuf_accel_rate;
498 +extern ulint   srv_checkpoint_age_target;
499 +extern ulint   srv_flush_neighbor_pages;
500 +extern ulint   srv_enable_unsafe_group_commit;
501 +extern ulint   srv_read_ahead;
502 +extern ulint   srv_adaptive_flushing_method;
503 +
504  /*-------------------------------------------*/
505  
506  extern ulint   srv_n_rows_inserted;
507 @@ -394,8 +405,9 @@
508                                 when writing data files, but do flush
509                                 after writing to log files */
510         SRV_UNIX_NOSYNC,        /*!< do not flush after writing */
511 -       SRV_UNIX_O_DIRECT       /*!< invoke os_file_set_nocache() on
512 +       SRV_UNIX_O_DIRECT,      /*!< invoke os_file_set_nocache() on
513                                 data files */
514 +       SRV_UNIX_ALL_O_DIRECT   /*!< experimental: log files are also opened with O_DIRECT */
515  };
516  
517  /** Alternatives for file i/o in Windows */
518 diff -ruN a/storage/innobase/log/log0log.c b/storage/innobase/log/log0log.c
519 --- a/storage/innobase/log/log0log.c    2010-11-03 07:01:13.000000000 +0900
520 +++ b/storage/innobase/log/log0log.c    2010-12-03 15:10:09.084023562 +0900
521 @@ -48,6 +48,7 @@
522  #include "srv0start.h"
523  #include "trx0sys.h"
524  #include "trx0trx.h"
525 +#include "ha_prototypes.h"
526  
527  /*
528  General philosophy of InnoDB redo-logs:
529 @@ -359,6 +360,33 @@
530  }
531  
532  /************************************************************//**
533 +Returns log_sys->max_modified_age_async, capped at srv_checkpoint_age_target minus one eighth of it when that target is non-zero. */
534 +UNIV_INLINE
535 +ulint
536 +log_max_modified_age_async()
537 +{
538 +       if (srv_checkpoint_age_target) {
539 +               return(ut_min(log_sys->max_modified_age_async,
540 +                               srv_checkpoint_age_target
541 +                               - srv_checkpoint_age_target / 8));
542 +       } else {
543 +               return(log_sys->max_modified_age_async);
544 +       }
545 +}
546 +
547 +UNIV_INLINE
548 +ulint
549 +log_max_checkpoint_age_async() /* returns log_sys->max_checkpoint_age_async, capped by srv_checkpoint_age_target when that is non-zero */
550 +{
551 +       if (srv_checkpoint_age_target) {
552 +               return(ut_min(log_sys->max_checkpoint_age_async,
553 +                               srv_checkpoint_age_target));
554 +       } else { /* target is 0: the log system's own limit is returned unchanged */
555 +               return(log_sys->max_checkpoint_age_async);
556 +       }
557 +}
558 +
559 +/************************************************************//**
560  Closes the log.
561  @return        lsn */
562  UNIV_INTERN
563 @@ -427,7 +455,7 @@
564                 }
565         }
566  
567 -       if (checkpoint_age <= log->max_modified_age_async) {
568 +       if (checkpoint_age <= log_max_modified_age_async()) {
569  
570                 goto function_exit;
571         }
572 @@ -435,8 +463,8 @@
573         oldest_lsn = buf_pool_get_oldest_modification();
574  
575         if (!oldest_lsn
576 -           || lsn - oldest_lsn > log->max_modified_age_async
577 -           || checkpoint_age > log->max_checkpoint_age_async) {
578 +           || lsn - oldest_lsn > log_max_modified_age_async()
579 +           || checkpoint_age > log_max_checkpoint_age_async()) {
580  
581                 log->check_flush_or_checkpoint = TRUE;
582         }
583 @@ -1100,9 +1128,10 @@
584                 group = (log_group_t*)((ulint)group - 1);
585  
586                 if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
587 +                   && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
588                     && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
589  
590 -                       fil_flush(group->space_id);
591 +                       fil_flush(group->space_id, FALSE);
592                 }
593  
594  #ifdef UNIV_DEBUG
595 @@ -1121,10 +1150,11 @@
596                         logs and cannot end up here! */
597  
598         if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
599 +           && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
600             && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
601 -           && srv_flush_log_at_trx_commit != 2) {
602 +           && thd_flush_log_at_trx_commit(NULL) != 2) {
603  
604 -               fil_flush(group->space_id);
605 +               fil_flush(group->space_id, FALSE);
606         }
607  
608         mutex_enter(&(log_sys->mutex));
609 @@ -1501,7 +1531,8 @@
610  
611         mutex_exit(&(log_sys->mutex));
612  
613 -       if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
614 +       if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC
615 +           || srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
616                 /* O_DSYNC means the OS did not buffer the log file at all:
617                 so we have also flushed to disk what we have written */
618  
619 @@ -1511,7 +1542,7 @@
620  
621                 group = UT_LIST_GET_FIRST(log_sys->log_groups);
622  
623 -               fil_flush(group->space_id);
624 +               fil_flush(group->space_id, FALSE);
625                 log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
626         }
627  
628 @@ -2120,10 +2151,10 @@
629  
630                 sync = TRUE;
631                 advance = 2 * (age - log->max_modified_age_sync);
632 -       } else if (age > log->max_modified_age_async) {
633 +       } else if (age > log_max_modified_age_async()) {
634  
635                 /* A flush is not urgent: we do an asynchronous preflush */
636 -               advance = age - log->max_modified_age_async;
637 +               advance = age - log_max_modified_age_async();
638         } else {
639                 advance = 0;
640         }
641 @@ -2137,7 +2168,7 @@
642  
643                 do_checkpoint = TRUE;
644  
645 -       } else if (checkpoint_age > log->max_checkpoint_age_async) {
646 +       } else if (checkpoint_age > log_max_checkpoint_age_async()) {
647                 /* A checkpoint is not urgent: do it asynchronously */
648  
649                 do_checkpoint = TRUE;
650 @@ -2607,7 +2638,7 @@
651  
652         mutex_exit(&(log_sys->mutex));
653  
654 -       fil_flush(group->archive_space_id);
655 +       fil_flush(group->archive_space_id, TRUE);
656  
657         mutex_enter(&(log_sys->mutex));
658  
659 @@ -3349,6 +3380,17 @@
660                 log_sys->flushed_to_disk_lsn,
661                 log_sys->last_checkpoint_lsn);
662  
663 +       fprintf(file,
664 +               "Max checkpoint age    %lu\n"
665 +               "Checkpoint age target %lu\n"
666 +               "Modified age          %lu\n"
667 +               "Checkpoint age        %lu\n",
668 +               (ulong) log_sys->max_checkpoint_age,
669 +               (ulong) log_max_checkpoint_age_async(),
670 +               (ulong) (log_sys->lsn -
671 +                               log_buf_pool_get_oldest_modification()),
672 +               (ulong) (log_sys->lsn - log_sys->last_checkpoint_lsn));
673 +
674         current_time = time(NULL);
675  
676         time_elapsed = 0.001 + difftime(current_time,
677 diff -ruN a/storage/innobase/log/log0recv.c b/storage/innobase/log/log0recv.c
678 --- a/storage/innobase/log/log0recv.c   2010-11-03 07:01:13.000000000 +0900
679 +++ b/storage/innobase/log/log0recv.c   2010-12-03 15:10:09.089024191 +0900
680 @@ -2906,9 +2906,12 @@
681         ib_uint64_t     archived_lsn;
682  #endif /* UNIV_LOG_ARCHIVE */
683         byte*           buf;
684 -       byte            log_hdr_buf[LOG_FILE_HDR_SIZE];
685 +       byte*           log_hdr_buf;
686 +       byte            log_hdr_buf_base[LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE];
687         ulint           err;
688  
689 +       log_hdr_buf = ut_align(log_hdr_buf_base, OS_FILE_LOG_BLOCK_SIZE);
690 +
691  #ifdef UNIV_LOG_ARCHIVE
692         ut_ad(type != LOG_CHECKPOINT || limit_lsn == IB_ULONGLONG_MAX);
693  /** TRUE when recovering from a checkpoint */
694 @@ -3468,7 +3471,7 @@
695                         exit(1);
696                 }
697  
698 -               os_file_flush(log_file);
699 +               os_file_flush(log_file, TRUE);
700                 os_file_close(log_file);
701         }
702  
703 @@ -3492,7 +3495,7 @@
704  
705         os_file_write(name, log_file, buf, 0, 0,
706                       LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
707 -       os_file_flush(log_file);
708 +       os_file_flush(log_file, TRUE);
709         os_file_close(log_file);
710  
711         ut_free(buf);
712 diff -ruN a/storage/innobase/os/os0file.c b/storage/innobase/os/os0file.c
713 --- a/storage/innobase/os/os0file.c     2010-11-03 07:01:13.000000000 +0900
714 +++ b/storage/innobase/os/os0file.c     2010-12-03 15:10:09.093023540 +0900
715 @@ -1424,7 +1424,7 @@
716  #endif
717  #ifdef UNIV_NON_BUFFERED_IO
718  # ifndef UNIV_HOTBACKUP
719 -               if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
720 +               if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) {
721                         /* Do not use unbuffered i/o to log files because
722                         value 2 denotes that we do not flush the log at every
723                         commit, but only once per second */
724 @@ -1440,7 +1440,7 @@
725                 attributes = 0;
726  #ifdef UNIV_NON_BUFFERED_IO
727  # ifndef UNIV_HOTBACKUP
728 -               if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
729 +               if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) {
730                         /* Do not use unbuffered i/o to log files because
731                         value 2 denotes that we do not flush the log at every
732                         commit, but only once per second */
733 @@ -1585,6 +1585,11 @@
734                 os_file_set_nocache(file, name, mode_str);
735         }
736  
737 +       /* ALL_O_DIRECT: O_DIRECT also for transaction log file */
738 +       if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
739 +               os_file_set_nocache(file, name, mode_str);
740 +       }
741 +
742  #ifdef USE_FILE_LOCK
743         if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
744  
745 @@ -2008,7 +2013,7 @@
746  
747         ut_free(buf2);
748  
749 -       ret = os_file_flush(file);
750 +       ret = os_file_flush(file, TRUE);
751  
752         if (ret) {
753                 return(TRUE);
754 @@ -2046,7 +2051,8 @@
755  int
756  os_file_fsync(
757  /*==========*/
758 -       os_file_t       file)   /*!< in: handle to a file */
759 +       os_file_t       file,   /*!< in: handle to a file */
760 +       ibool           metadata)
761  {
762         int     ret;
763         int     failures;
764 @@ -2055,7 +2061,15 @@
765         failures = 0;
766  
767         do {
768 +#ifdef HAVE_FDATASYNC
769 +               if (metadata) {
770 +                       ret = fsync(file);
771 +               } else {
772 +                       ret = fdatasync(file);
773 +               }
774 +#else
775                 ret = fsync(file);
776 +#endif
777  
778                 os_n_fsyncs++;
779  
780 @@ -2092,7 +2106,8 @@
781  ibool
782  os_file_flush_func(
783  /*===============*/
784 -       os_file_t       file)   /*!< in, own: handle to a file */
785 +       os_file_t       file,   /*!< in, own: handle to a file */
786 +       ibool           metadata)
787  {
788  #ifdef __WIN__
789         BOOL    ret;
790 @@ -2142,18 +2157,18 @@
791                 /* If we are not on an operating system that supports this,
792                 then fall back to a plain fsync. */
793  
794 -               ret = os_file_fsync(file);
795 +               ret = os_file_fsync(file, metadata);
796         } else {
797                 ret = fcntl(file, F_FULLFSYNC, NULL);
798  
799                 if (ret) {
800                         /* If we are not on a file system that supports this,
801                         then fall back to a plain fsync. */
802 -                       ret = os_file_fsync(file);
803 +                       ret = os_file_fsync(file, metadata);
804                 }
805         }
806  #else
807 -       ret = os_file_fsync(file);
808 +       ret = os_file_fsync(file, metadata);
809  #endif
810  
811         if (ret == 0) {
812 @@ -2336,7 +2351,7 @@
813                 the OS crashes, a database page is only partially
814                 physically written to disk. */
815  
816 -               ut_a(TRUE == os_file_flush(file));
817 +               ut_a(TRUE == os_file_flush(file, TRUE));
818         }
819  # endif /* UNIV_DO_FLUSH */
820  
821 @@ -2378,7 +2393,7 @@
822                         the OS crashes, a database page is only partially
823                         physically written to disk. */
824  
825 -                       ut_a(TRUE == os_file_flush(file));
826 +                       ut_a(TRUE == os_file_flush(file, TRUE));
827                 }
828  # endif /* UNIV_DO_FLUSH */
829  
830 @@ -2750,7 +2765,7 @@
831  
832  # ifdef UNIV_DO_FLUSH
833         if (!os_do_not_call_flush_at_each_write) {
834 -               ut_a(TRUE == os_file_flush(file));
835 +               ut_a(TRUE == os_file_flush(file, TRUE));
836         }
837  # endif /* UNIV_DO_FLUSH */
838  
839 @@ -4289,7 +4304,7 @@
840  #ifdef UNIV_DO_FLUSH
841                 if (slot->type == OS_FILE_WRITE
842                     && !os_do_not_call_flush_at_each_write) {
843 -                       if (!os_file_flush(slot->file)) {
844 +                       if (!os_file_flush(slot->file, TRUE)) {
845                                 ut_error;
846                         }
847                 }
848 @@ -4590,7 +4605,7 @@
849  #ifdef UNIV_DO_FLUSH
850                 if (slot->type == OS_FILE_WRITE
851                     && !os_do_not_call_flush_at_each_write)
852 -                   && !os_file_flush(slot->file) {
853 +                   && !os_file_flush(slot->file, TRUE) {
854                         ut_error;
855                 }
856  #endif /* UNIV_DO_FLUSH */
857 diff -ruN a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c
858 --- a/storage/innobase/srv/srv0srv.c    2010-12-03 15:09:51.301987792 +0900
859 +++ b/storage/innobase/srv/srv0srv.c    2010-12-03 15:13:29.369986988 +0900
860 @@ -183,7 +183,8 @@
861  UNIV_INTERN ulint      srv_log_file_size       = ULINT_MAX;
862  /* size in database pages */
863  UNIV_INTERN ulint      srv_log_buffer_size     = ULINT_MAX;
864 -UNIV_INTERN ulong      srv_flush_log_at_trx_commit = 1;
865 +//UNIV_INTERN ulong    srv_flush_log_at_trx_commit = 1;
866 +UNIV_INTERN char       srv_use_global_flush_log_at_trx_commit  = TRUE;
867  
868  /* Try to flush dirty pages so as to avoid IO bursts at
869  the checkpoints. */
870 @@ -402,6 +403,17 @@
871  
872  UNIV_INTERN ulong      srv_replication_delay           = 0;
873  
874 +UNIV_INTERN long long  srv_ibuf_max_size = 0;
875 +UNIV_INTERN ulint      srv_ibuf_active_contract = 0; /* 0:disable 1:enable */
876 +UNIV_INTERN ulint      srv_ibuf_accel_rate = 100;
877 +#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) pct / 10000.0)))
878 +
879 +UNIV_INTERN ulint      srv_checkpoint_age_target = 0;
880 +UNIV_INTERN ulint      srv_flush_neighbor_pages = 1; /* 0:disable 1:enable */
881 +
882 +UNIV_INTERN ulint      srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */
883 +UNIV_INTERN ulint      srv_read_ahead = 3; /* 1: random  2: linear  3: Both */
884 +UNIV_INTERN ulint      srv_adaptive_flushing_method = 0; /* 0: native  1: estimate  2: keep_average */
885  /*-------------------------------------------*/
886  UNIV_INTERN ulong      srv_n_spin_wait_rounds  = 30;
887  UNIV_INTERN ulong      srv_n_free_tickets_to_enter = 500;
888 @@ -2742,6 +2754,7 @@
889         ulint           n_pages_purged  = 0;
890         ulint           n_bytes_merged;
891         ulint           n_pages_flushed;
892 +       ulint           n_pages_flushed_prev = 0;
893         ulint           n_bytes_archived;
894         ulint           n_tables_to_drop;
895         ulint           n_ios;
896 @@ -2749,7 +2762,20 @@
897         ulint           n_ios_very_old;
898         ulint           n_pend_ios;
899         ulint           next_itr_time;
900 +       ulint           prev_adaptive_flushing_method = ULINT_UNDEFINED;
901 +       ulint           inner_loop = 0;
902 +       ibool           skip_sleep      = FALSE;
903         ulint           i;
904 +       struct t_prev_flush_info_struct { /* per-pool flush_list snapshot kept between loop passes (adaptive flushing method 2) */
905 +               ulint           count; /* UT_LIST_GET_LEN(flush_list) when the snapshot was taken */
906 +               unsigned        space:32; /* space of the first flush_list page, 0 if the list was empty */
907 +               unsigned        offset:32; /* offset of the first flush_list page, 0 if the list was empty */
908 +               ib_uint64_t     oldest_modification; /* oldest_modification of that first page, 0 if the list was empty */
909 +       } prev_flush_info[MAX_BUFFER_POOLS]; /* indexed by buffer pool instance number */
910 +
911 +       ib_uint64_t     lsn_old;
912 +
913 +       ib_uint64_t     oldest_lsn;
914  
915  #ifdef UNIV_DEBUG_THREAD_CREATION
916         fprintf(stderr, "Master thread starts, id %lu\n",
917 @@ -2771,6 +2797,9 @@
918  
919         mutex_exit(&kernel_mutex);
920  
921 +       mutex_enter(&(log_sys->mutex));
922 +       lsn_old = log_sys->lsn;
923 +       mutex_exit(&(log_sys->mutex));
924  loop:
925         /*****************************************************************/
926         /* ---- When there is database activity by users, we cycle in this
927 @@ -2801,9 +2830,13 @@
928         /* Sleep for 1 second on entrying the for loop below the first time. */
929         next_itr_time = ut_time_ms() + 1000;
930  
931 +       skip_sleep = FALSE;
932 +
933         for (i = 0; i < 10; i++) {
934                 ulint   cur_time = ut_time_ms();
935  
936 +               n_pages_flushed = 0; /* initialize */
937 +
938                 /* ALTER TABLE in MySQL requires on Unix that the table handler
939                 can drop tables lazily after there no longer are SELECT
940                 queries to them. */
941 @@ -2827,6 +2860,7 @@
942                 srv_main_thread_op_info = "sleeping";
943                 srv_main_1_second_loops++;
944  
945 +               if (!skip_sleep) {
946                 if (next_itr_time > cur_time
947                     && srv_shutdown_state == SRV_SHUTDOWN_NONE) {
948  
949 @@ -2837,10 +2871,26 @@
950                                         (next_itr_time - cur_time)
951                                          * 1000));
952                         srv_main_sleeps++;
953 +
954 +                       /*
955 +                       mutex_enter(&(log_sys->mutex));
956 +                       oldest_lsn = buf_pool_get_oldest_modification();
957 +                       ib_uint64_t     lsn = log_sys->lsn;
958 +                       mutex_exit(&(log_sys->mutex));
959 +
960 +                       if(oldest_lsn)
961 +                       fprintf(stderr,
962 +                               "InnoDB flush: age pct: %lu, lsn progress: %lu\n",
963 +                               (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
964 +                               lsn - lsn_old);
965 +                       */
966                 }
967  
968                 /* Each iteration should happen at 1 second interval. */
969                 next_itr_time = ut_time_ms() + 1000;
970 +               } /* if (!skip_sleep) */
971 +
972 +               skip_sleep = FALSE;
973  
974                 /* Flush logs if needed */
975                 srv_sync_log_buffer_in_background();
976 @@ -2860,7 +2910,7 @@
977                 if (n_pend_ios < SRV_PEND_IO_THRESHOLD
978                     && (n_ios - n_ios_old < SRV_RECENT_IO_ACTIVITY)) {
979                         srv_main_thread_op_info = "doing insert buffer merge";
980 -                       ibuf_contract_for_n_pages(FALSE, PCT_IO(5));
981 +                       ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
982  
983                         /* Flush logs if needed */
984                         srv_sync_log_buffer_in_background();
985 @@ -2877,7 +2927,11 @@
986                         n_pages_flushed = buf_flush_list(
987                                 PCT_IO(100), IB_ULONGLONG_MAX);
988  
989 -               } else if (srv_adaptive_flushing) {
990 +                       mutex_enter(&(log_sys->mutex));
991 +                       lsn_old = log_sys->lsn;
992 +                       mutex_exit(&(log_sys->mutex));
993 +                       prev_adaptive_flushing_method = ULINT_UNDEFINED;
994 +               } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 0) {
995  
996                         /* Try to keep the rate of flushing of dirty
997                         pages such that redo log generation does not
998 @@ -2893,6 +2947,224 @@
999                                                 n_flush,
1000                                                 IB_ULONGLONG_MAX);
1001                         }
1002 +
1003 +                       mutex_enter(&(log_sys->mutex));
1004 +                       lsn_old = log_sys->lsn;
1005 +                       mutex_exit(&(log_sys->mutex));
1006 +                       prev_adaptive_flushing_method = ULINT_UNDEFINED;
1007 +               } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 1) {
1008 +
1009 +                       /* Try to keep the modified age from exceeding
1010 +                       the max_checkpoint_age * 7/8 line */
1011 +
1012 +                       mutex_enter(&(log_sys->mutex));
1013 +
1014 +                       oldest_lsn = buf_pool_get_oldest_modification();
1015 +                       if (oldest_lsn == 0) {
1016 +                               lsn_old = log_sys->lsn;
1017 +                               mutex_exit(&(log_sys->mutex));
1018 +
1019 +                       } else {
1020 +                               if ((log_sys->lsn - oldest_lsn)
1021 +                                   > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
1022 +                                       /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
1023 +                                       /* We should not flush from here. */
1024 +                                       lsn_old = log_sys->lsn;
1025 +                                       mutex_exit(&(log_sys->mutex));
1026 +                               } else if ((log_sys->lsn - oldest_lsn)
1027 +                                          > (log_sys->max_checkpoint_age)/4 ) {
1028 +
1029 +                                       /* defence line (max_checkpoint_age * 1/4) */
1030 +                                       ib_uint64_t     lsn = log_sys->lsn;
1031 +
1032 +                                       ib_uint64_t     level, bpl;
1033 +                                       buf_page_t*     bpage;
1034 +                                       ulint           j;
1035 +
1036 +                                       mutex_exit(&(log_sys->mutex));
1037 +
1038 +                                       bpl = 0;
1039 +
1040 +                                       for (j = 0; j < srv_buf_pool_instances; j++) {
1041 +                                               buf_pool_t*     buf_pool;
1042 +                                               ulint           n_blocks;
1043 +
1044 +                                               buf_pool = buf_pool_from_array(j);
1045 +
1046 +                                               /* Scanning the flush_list here is optimistic */
1047 +
1048 +                                               level = 0;
1049 +                                               n_blocks = 0;
1050 +                                               bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
1051 +
1052 +                                               while (bpage != NULL) {
1053 +                                                       ib_uint64_t     oldest_modification = bpage->oldest_modification;
1054 +                                                       if (oldest_modification != 0) {
1055 +                                                               level += log_sys->max_checkpoint_age
1056 +                                                                        - (lsn - oldest_modification);
1057 +                                                       }
1058 +                                                       bpage = UT_LIST_GET_NEXT(list, bpage);
1059 +                                                       n_blocks++;
1060 +                                               }
1061 +
1062 +                                               if (level) {
1063 +                                                       bpl += ((ib_uint64_t) n_blocks * n_blocks
1064 +                                                               * (lsn - lsn_old)) / level;
1065 +                                               }
1066 +
1067 +                                       }
1068 +
1069 +                                       if (!srv_use_doublewrite_buf) {
1070 +                                               /* flush is faster than when doublewrite */
1071 +                                               bpl = (bpl * 7) / 8;
1072 +                                       }
1073 +
1074 +                                       if (bpl) {
1075 +retry_flush_batch:
1076 +                                               n_pages_flushed = buf_flush_list(bpl,
1077 +                                                                       oldest_lsn + (lsn - lsn_old));
1078 +                                               if (n_pages_flushed == ULINT_UNDEFINED) {
1079 +                                                       os_thread_sleep(5000);
1080 +                                                       goto retry_flush_batch;
1081 +                                               }
1082 +                                       }
1083 +
1084 +                                       lsn_old = lsn;
1085 +                                       /*
1086 +                                       fprintf(stderr,
1087 +                                               "InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n",
1088 +                                               (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
1089 +                                               lsn - lsn_old, bpl);
1090 +                                       */
1091 +                               } else {
1092 +                                       lsn_old = log_sys->lsn;
1093 +                                       mutex_exit(&(log_sys->mutex));
1094 +                               }
1095 +                       }
1096 +                       prev_adaptive_flushing_method = 1;
1097 +               } else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 2) {
1098 +                       buf_pool_t*     buf_pool;
1099 +                       buf_page_t*     bpage;
1100 +                       ib_uint64_t     lsn;
1101 +                       ulint           j;
1102 +
1103 +                       mutex_enter(&(log_sys->mutex));
1104 +                       oldest_lsn = buf_pool_get_oldest_modification();
1105 +                       lsn = log_sys->lsn;
1106 +                       mutex_exit(&(log_sys->mutex));
1107 +
1108 +                       /* raise the loop frequency 10x (about 10 loops/sec.) */
1109 +                       next_itr_time -= 900; /* 1000 - 900 == 100 */
1110 +                       inner_loop++;
1111 +                       if (inner_loop < 10) {
1112 +                               i--;
1113 +                       } else {
1114 +                               inner_loop = 0;
1115 +                       }
1116 +
1117 +                       if (prev_adaptive_flushing_method == 2) {
1118 +                               lint    n_flush;
1119 +                               lint    blocks_sum;
1120 +                               ulint   new_blocks_sum, flushed_blocks_sum;
1121 +
1122 +                               blocks_sum = new_blocks_sum = flushed_blocks_sum = 0;
1123 +
1124 +                               /* prev_flush_info[j] still holds the previous iteration's values here */
1125 +                               for (j = 0; j < srv_buf_pool_instances; j++) {
1126 +                                       lint    blocks_num, new_blocks_num, flushed_blocks_num;
1127 +                                       ibool   found;
1128 +
1129 +                                       buf_pool = buf_pool_from_array(j);
1130 +
1131 +                                       blocks_num = UT_LIST_GET_LEN(buf_pool->flush_list);
1132 +                                       bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
1133 +                                       new_blocks_num = 0;
1134 +
1135 +                                       found = FALSE;
1136 +                                       while (bpage != NULL) {
1137 +                                               if (prev_flush_info[j].space == bpage->space
1138 +                                                   && prev_flush_info[j].offset == bpage->offset
1139 +                                                   && prev_flush_info[j].oldest_modification
1140 +                                                               == bpage->oldest_modification) {
1141 +                                                       found = TRUE;
1142 +                                                       break;
1143 +                                               }
1144 +                                               bpage = UT_LIST_GET_NEXT(list, bpage);
1145 +                                               new_blocks_num++;
1146 +                                       }
1147 +                                       if (!found) {
1148 +                                               new_blocks_num = blocks_num;
1149 +                                       }
1150 +
1151 +                                       flushed_blocks_num = new_blocks_num + prev_flush_info[j].count
1152 +                                                               - blocks_num;
1153 +                                       if (flushed_blocks_num < 0) {
1154 +                                               flushed_blocks_num = 0;
1155 +                                       }
1156 +
1157 +                                       bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
1158 +
1159 +                                       prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list);
1160 +                                       if (bpage) {
1161 +                                               prev_flush_info[j].space = bpage->space;
1162 +                                               prev_flush_info[j].offset = bpage->offset;
1163 +                                               prev_flush_info[j].oldest_modification = bpage->oldest_modification;
1164 +                                       } else {
1165 +                                               prev_flush_info[j].space = 0;
1166 +                                               prev_flush_info[j].offset = 0;
1167 +                                               prev_flush_info[j].oldest_modification = 0;
1168 +                                       }
1169 +
1170 +                                       new_blocks_sum += new_blocks_num;
1171 +                                       flushed_blocks_sum += flushed_blocks_num;
1172 +                                       blocks_sum += blocks_num;
1173 +                               }
1174 +
1175 +                               n_flush = blocks_sum * (lsn - lsn_old) / log_sys->max_modified_age_async;
1176 +                               if (flushed_blocks_sum > n_pages_flushed_prev) {
1177 +                                       n_flush -= (flushed_blocks_sum - n_pages_flushed_prev);
1178 +                               }
1179 +
1180 +                               if (n_flush > 0) {
1181 +                                       n_flush++;
1182 +                                       n_pages_flushed = buf_flush_list(n_flush, oldest_lsn + (lsn - lsn_old));
1183 +                               } else {
1184 +                                       n_pages_flushed = 0;
1185 +                               }                                       
1186 +                       } else {
1187 +                               /* store previous first pages of the flush_list */
1188 +                               for (j = 0; j < srv_buf_pool_instances; j++) {
1189 +                                       buf_pool = buf_pool_from_array(j);
1190 +
1191 +                                       bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
1192 +
1193 +                                       prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list);
1194 +                                       if (bpage) {
1195 +                                               prev_flush_info[j].space = bpage->space;
1196 +                                               prev_flush_info[j].offset = bpage->offset;
1197 +                                               prev_flush_info[j].oldest_modification = bpage->oldest_modification;
1198 +                                       } else {
1199 +                                               prev_flush_info[j].space = 0;
1200 +                                               prev_flush_info[j].offset = 0;
1201 +                                               prev_flush_info[j].oldest_modification = 0;
1202 +                                       }
1203 +                               }
1204 +                               n_pages_flushed = 0;
1205 +                       }
1206 +
1207 +                       lsn_old = lsn;
1208 +                       prev_adaptive_flushing_method = 2;
1209 +               } else {
1210 +                       mutex_enter(&(log_sys->mutex));
1211 +                       lsn_old = log_sys->lsn;
1212 +                       mutex_exit(&(log_sys->mutex));
1213 +                       prev_adaptive_flushing_method = ULINT_UNDEFINED;
1214 +               }
1215 +
1216 +               if (n_pages_flushed == ULINT_UNDEFINED) {
1217 +                       n_pages_flushed_prev = 0;
1218 +               } else {
1219 +                       n_pages_flushed_prev = n_pages_flushed;
1220                 }
1221  
1222                 if (srv_activity_count == old_activity_count) {
1223 @@ -2941,7 +3213,7 @@
1224         even if the server were active */
1225  
1226         srv_main_thread_op_info = "doing insert buffer merge";
1227 -       ibuf_contract_for_n_pages(FALSE, PCT_IO(5));
1228 +       ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
1229  
1230         /* Flush logs if needed */
1231         srv_sync_log_buffer_in_background();
1232 @@ -3049,7 +3321,7 @@
1233                 buf_flush_list below. Otherwise, the system favors
1234                 clean pages over cleanup throughput. */
1235                 n_bytes_merged = ibuf_contract_for_n_pages(FALSE,
1236 -                                                          PCT_IO(100));
1237 +                                                          PCT_IBUF_IO(100));
1238         }
1239  
1240         srv_main_thread_op_info = "reserving kernel mutex";
1241 @@ -3189,6 +3461,7 @@
1242         srv_slot_t*     slot;
1243         ulint           retries = 0;
1244         ulint           n_total_purged = ULINT_UNDEFINED;
1245 +       ulint           next_itr_time;
1246  
1247         ut_a(srv_n_purge_threads == 1);
1248  
1249 @@ -3209,9 +3482,12 @@
1250  
1251         mutex_exit(&kernel_mutex);
1252  
1253 +       next_itr_time = ut_time_ms();
1254 +
1255         while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) {
1256  
1257                 ulint   n_pages_purged = 0;
1258 +               ulint   cur_time;
1259  
1260                 /* If there are very few records to purge or the last
1261                 purge didn't purge any records then wait for activity.
1262 @@ -3258,6 +3534,16 @@
1263                 } while (n_pages_purged > 0 && !srv_fast_shutdown);
1264  
1265                 srv_sync_log_buffer_in_background();
1266 +
1267 +               cur_time = ut_time_ms();
1268 +               if (next_itr_time > cur_time) {
1269 +                       os_thread_sleep(ut_min(1000000,
1270 +                                       (next_itr_time - cur_time)
1271 +                                        * 1000));
1272 +                       next_itr_time = ut_time_ms() + 1000;
1273 +               } else {
1274 +                       next_itr_time = cur_time + 1000;
1275 +               }
1276         }
1277  
1278         mutex_enter(&kernel_mutex);
1279 diff -ruN a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.c
1280 --- a/storage/innobase/srv/srv0start.c  2010-11-03 07:01:13.000000000 +0900
1281 +++ b/storage/innobase/srv/srv0start.c  2010-12-03 15:10:09.103023543 +0900
1282 @@ -1217,6 +1217,9 @@
1283         } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) {
1284                 srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
1285  
1286 +       } else if (0 == ut_strcmp(srv_file_flush_method_str, "ALL_O_DIRECT")) {
1287 +               srv_unix_file_flush_method = SRV_UNIX_ALL_O_DIRECT;
1288 +
1289         } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) {
1290                 srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
1291  
1292 diff -ruN a/storage/innobase/trx/trx0purge.c b/storage/innobase/trx/trx0purge.c
1293 --- a/storage/innobase/trx/trx0purge.c  2011-04-12 14:14:14.000000000 +0900
1294 +++ b/storage/innobase/trx/trx0purge.c  2011-04-12 14:15:44.000000000 +0900
1295 @@ -392,10 +392,10 @@
1296         trx_sys->rseg_history_len++;
1297         mutex_exit(&kernel_mutex);
1298  
1299 -       if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) {
1300 +//     if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) { /*should wake up always*/
1301                 /* Inform the purge thread that there is work to do. */
1302                 srv_wake_purge_thread_if_not_active();
1303 -       }
1304 +//     }
1305  }
1306  
1307  /**********************************************************************//**
1308 diff -ruN a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c
1309 --- a/storage/innobase/trx/trx0trx.c    2010-11-03 07:01:13.000000000 +0900
1310 +++ b/storage/innobase/trx/trx0trx.c    2010-12-03 15:10:09.106023937 +0900
1311 @@ -984,6 +984,7 @@
1312         trx->read_view = NULL;
1313  
1314         if (lsn) {
1315 +               ulint   flush_log_at_trx_commit;
1316  
1317                 mutex_exit(&kernel_mutex);
1318  
1319 @@ -992,6 +993,12 @@
1320                         trx_undo_insert_cleanup(trx);
1321                 }
1322  
1323 +               if (srv_use_global_flush_log_at_trx_commit) {
1324 +                       flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
1325 +               } else {
1326 +                       flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
1327 +               }
1328 +
1329                 /* NOTE that we could possibly make a group commit more
1330                 efficient here: call os_thread_yield here to allow also other
1331                 trxs to come to commit! */
1332 @@ -1023,9 +1030,9 @@
1333                 if (trx->flush_log_later) {
1334                         /* Do nothing yet */
1335                         trx->must_flush_log_later = TRUE;
1336 -               } else if (srv_flush_log_at_trx_commit == 0) {
1337 +               } else if (flush_log_at_trx_commit == 0) {
1338                         /* Do nothing */
1339 -               } else if (srv_flush_log_at_trx_commit == 1) {
1340 +               } else if (flush_log_at_trx_commit == 1) {
1341                         if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1342                                 /* Write the log but do not flush it to disk */
1343  
1344 @@ -1037,7 +1044,7 @@
1345  
1346                                 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1347                         }
1348 -               } else if (srv_flush_log_at_trx_commit == 2) {
1349 +               } else if (flush_log_at_trx_commit == 2) {
1350  
1351                         /* Write the log but do not flush it to disk */
1352  
1353 @@ -1701,16 +1708,23 @@
1354         trx_t*  trx)    /*!< in: trx handle */
1355  {
1356         ib_uint64_t     lsn     = trx->commit_lsn;
1357 +       ulint           flush_log_at_trx_commit;
1358  
1359         ut_a(trx);
1360  
1361         trx->op_info = "flushing log";
1362  
1363 +       if (srv_use_global_flush_log_at_trx_commit) {
1364 +               flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
1365 +       } else {
1366 +               flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
1367 +       }
1368 +
1369         if (!trx->must_flush_log_later) {
1370                 /* Do nothing */
1371 -       } else if (srv_flush_log_at_trx_commit == 0) {
1372 +       } else if (flush_log_at_trx_commit == 0) {
1373                 /* Do nothing */
1374 -       } else if (srv_flush_log_at_trx_commit == 1) {
1375 +       } else if (flush_log_at_trx_commit == 1) {
1376                 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1377                         /* Write the log but do not flush it to disk */
1378  
1379 @@ -1721,7 +1735,7 @@
1380  
1381                         log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1382                 }
1383 -       } else if (srv_flush_log_at_trx_commit == 2) {
1384 +       } else if (flush_log_at_trx_commit == 2) {
1385  
1386                 /* Write the log but do not flush it to disk */
1387  
1388 @@ -1969,6 +1983,8 @@
1389         /*--------------------------------------*/
1390  
1391         if (lsn) {
1392 +               ulint   flush_log_at_trx_commit;
1393 +
1394                 /* Depending on the my.cnf options, we may now write the log
1395                 buffer to the log files, making the prepared state of the
1396                 transaction durable if the OS does not crash. We may also
1397 @@ -1988,9 +2004,15 @@
1398  
1399                 mutex_exit(&kernel_mutex);
1400  
1401 -               if (srv_flush_log_at_trx_commit == 0) {
1402 +               if (srv_use_global_flush_log_at_trx_commit) {
1403 +                       flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
1404 +               } else {
1405 +                       flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
1406 +               }
1407 +
1408 +               if (flush_log_at_trx_commit == 0) {
1409                         /* Do nothing */
1410 -               } else if (srv_flush_log_at_trx_commit == 1) {
1411 +               } else if (flush_log_at_trx_commit == 1) {
1412                         if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1413                                 /* Write the log but do not flush it to disk */
1414  
1415 @@ -2002,7 +2024,7 @@
1416  
1417                                 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1418                         }
1419 -               } else if (srv_flush_log_at_trx_commit == 2) {
1420 +               } else if (flush_log_at_trx_commit == 2) {
1421  
1422                         /* Write the log but do not flush it to disk */
1423  
This page took 0.334324 seconds and 4 git commands to generate.