1 # name : innodb_separate_doublewrite.patch
2 # introduced : 11 or before
3 # maintainer : Yasufumi
6 # Any small change to this file in the main branch
7 # should be done or reviewed by the maintainer!
8 diff -ruN a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c
9 --- a/storage/innobase/buf/buf0buf.c 2010-12-03 17:49:11.574962867 +0900
10 +++ b/storage/innobase/buf/buf0buf.c 2010-12-04 15:35:58.624514033 +0900
12 read_space_id = mach_read_from_4(
13 frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
15 - if (bpage->space == TRX_SYS_SPACE
16 + if ((bpage->space == TRX_SYS_SPACE
17 + || (srv_doublewrite_file && bpage->space == TRX_DOUBLEWRITE_SPACE))
18 && trx_doublewrite_page_inside(bpage->offset)) {
20 ut_print_timestamp(stderr);
21 diff -ruN a/storage/innobase/buf/buf0flu.c b/storage/innobase/buf/buf0flu.c
22 --- a/storage/innobase/buf/buf0flu.c 2010-12-03 15:49:59.179956111 +0900
23 +++ b/storage/innobase/buf/buf0flu.c 2010-12-04 15:35:58.624514033 +0900
25 write_buf = trx_doublewrite->write_buf;
28 - fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
29 + fil_io(OS_FILE_WRITE, TRUE,
30 + (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
31 trx_doublewrite->block1, 0, len,
32 (void*) write_buf, NULL);
35 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
36 ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);
38 - fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
39 + fil_io(OS_FILE_WRITE, TRUE,
40 + (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
41 trx_doublewrite->block2, 0, len,
42 (void*) write_buf, NULL);
46 /* Now flush the doublewrite buffer data to disk */
48 - fil_flush(TRX_SYS_SPACE);
49 + fil_flush(srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE);
51 /* We know that the writes have been flushed to disk now
52 and in recovery we will find them in the doublewrite buffer
53 diff -ruN a/storage/innobase/buf/buf0rea.c b/storage/innobase/buf/buf0rea.c
54 --- a/storage/innobase/buf/buf0rea.c 2010-12-04 15:35:29.138514157 +0900
55 +++ b/storage/innobase/buf/buf0rea.c 2010-12-04 15:35:58.626486771 +0900
57 wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
58 mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
60 - if (trx_doublewrite && space == TRX_SYS_SPACE
62 + && (space == TRX_SYS_SPACE
63 + || (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE))
64 && ( (offset >= trx_doublewrite->block1
65 && offset < trx_doublewrite->block1
66 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
67 diff -ruN a/storage/innobase/dict/dict0load.c b/storage/innobase/dict/dict0load.c
68 --- a/storage/innobase/dict/dict0load.c 2010-12-03 17:30:16.252956569 +0900
69 +++ b/storage/innobase/dict/dict0load.c 2010-12-04 15:35:58.627482825 +0900
74 - if (space_id == 0) {
75 + if (trx_sys_sys_space(space_id)) {
76 /* The system tablespace always exists. */
77 } else if (in_crash_recovery) {
78 /* Check that the tablespace (the .ibd file) really
80 space = mach_read_from_4(field);
82 /* Check if the tablespace exists and has the right name */
84 + if (!trx_sys_sys_space(space)) {
85 flags = dict_sys_tables_get_flags(rec);
87 if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
92 - if (table->space == 0) {
93 + if (trx_sys_sys_space(table->space)) {
94 /* The system tablespace is always available. */
95 } else if (!fil_space_for_table_exists_in_mem(
97 diff -ruN a/storage/innobase/fil/fil0fil.c b/storage/innobase/fil/fil0fil.c
98 --- a/storage/innobase/fil/fil0fil.c 2010-12-04 15:35:29.143813775 +0900
99 +++ b/storage/innobase/fil/fil0fil.c 2010-12-04 15:35:58.628498870 +0900
102 UT_LIST_ADD_LAST(chain, space->chain, node);
104 - if (id < SRV_LOG_SPACE_FIRST_ID && fil_system->max_assigned_id < id) {
105 + if (id < SRV_EXTRA_SYS_SPACE_FIRST_ID && fil_system->max_assigned_id < id) {
107 fil_system->max_assigned_id = id;
109 @@ -691,14 +691,14 @@
110 size_bytes = (((ib_int64_t)size_high) << 32)
111 + (ib_int64_t)size_low;
112 #ifdef UNIV_HOTBACKUP
113 - if (space->id == 0) {
114 + if (trx_sys_sys_space(space->id)) {
115 node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
116 os_file_close(node->handle);
119 #endif /* UNIV_HOTBACKUP */
120 ut_a(space->purpose != FIL_LOG);
121 - ut_a(space->id != 0);
122 + ut_a(!trx_sys_sys_space(space->id));
124 if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
129 if (UNIV_UNLIKELY(space_id == ULINT_UNDEFINED
130 - || space_id == 0)) {
131 + || trx_sys_sys_space(space_id))) {
133 "InnoDB: Error: tablespace id %lu"
134 " in file %s is not sensible\n",
139 - if (space->purpose == FIL_TABLESPACE && space->id != 0) {
140 + if (space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(space->id)) {
141 /* Put the node to the LRU list */
142 UT_LIST_ADD_FIRST(LRU, system->LRU, node);
145 ut_a(system->n_open > 0);
148 - if (node->space->purpose == FIL_TABLESPACE && node->space->id != 0) {
149 + if (node->space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(node->space->id)) {
150 ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
152 /* The node is in the LRU list, remove it */
155 mutex_enter(&fil_system->mutex);
157 - if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) {
158 + if (trx_sys_sys_space(space_id) || space_id >= SRV_LOG_SPACE_FIRST_ID) {
159 /* We keep log files and system tablespace files always open;
160 this is important in preventing deadlocks in this module, as
161 a page read completion often performs another read from the
162 @@ -1162,7 +1162,7 @@
163 " tablespace memory cache!\n",
166 - if (id == 0 || purpose != FIL_TABLESPACE) {
167 + if (trx_sys_sys_space(id) || purpose != FIL_TABLESPACE) {
169 mutex_exit(&fil_system->mutex);
171 @@ -1224,6 +1224,7 @@
174 if (UNIV_LIKELY(purpose == FIL_TABLESPACE && !recv_recovery_on)
175 + && UNIV_UNLIKELY(id < SRV_EXTRA_SYS_SPACE_FIRST_ID)
176 && UNIV_UNLIKELY(id > fil_system->max_assigned_id)) {
177 if (!fil_system->space_id_reuse_warned) {
178 fil_system->space_id_reuse_warned = TRUE;
179 @@ -1307,7 +1308,7 @@
180 (ulong) SRV_LOG_SPACE_FIRST_ID);
183 - success = (id < SRV_LOG_SPACE_FIRST_ID);
184 + success = (id < SRV_EXTRA_SYS_SPACE_FIRST_ID);
187 *space_id = fil_system->max_assigned_id = id;
188 @@ -1570,6 +1571,8 @@
189 UT_LIST_INIT(fil_system->LRU);
191 fil_system->max_n_open = max_n_open;
193 + fil_system->max_assigned_id = TRX_SYS_SPACE_MAX;
196 /*******************************************************************//**
197 @@ -1591,7 +1594,7 @@
198 space = UT_LIST_GET_FIRST(fil_system->space_list);
200 while (space != NULL) {
201 - if (space->purpose != FIL_TABLESPACE || space->id == 0) {
202 + if (space->purpose != FIL_TABLESPACE || trx_sys_sys_space(space->id)) {
203 node = UT_LIST_GET_FIRST(space->chain);
205 while (node != NULL) {
206 @@ -1681,6 +1684,10 @@
210 + if (max_id >= SRV_EXTRA_SYS_SPACE_FIRST_ID) {
214 mutex_enter(&fil_system->mutex);
216 if (fil_system->max_assigned_id < max_id) {
217 @@ -1699,6 +1706,7 @@
219 fil_write_lsn_and_arch_no_to_file(
220 /*==============================*/
222 ulint sum_of_sizes, /*!< in: combined size of previous files
223 in space, in database pages */
224 ib_uint64_t lsn, /*!< in: lsn to write */
225 @@ -1708,14 +1716,16 @@
229 + ut_a(trx_sys_sys_space(space_id));
231 buf1 = mem_alloc(2 * UNIV_PAGE_SIZE);
232 buf = ut_align(buf1, UNIV_PAGE_SIZE);
234 - fil_read(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
235 + fil_read(TRUE, space_id, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
237 mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
239 - fil_write(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
240 + fil_write(TRUE, space_id, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
244 @@ -1751,7 +1761,7 @@
247 if (space->purpose == FIL_TABLESPACE
248 - && space->id == 0) {
249 + && trx_sys_sys_space(space->id)) {
252 node = UT_LIST_GET_FIRST(space->chain);
253 @@ -1759,7 +1769,7 @@
254 mutex_exit(&fil_system->mutex);
256 err = fil_write_lsn_and_arch_no_to_file(
257 - sum_of_sizes, lsn, arch_log_no);
258 + space->id, sum_of_sizes, lsn, arch_log_no);
259 if (err != DB_SUCCESS) {
262 @@ -3806,7 +3816,7 @@
265 #ifndef UNIV_HOTBACKUP
266 - if (space_id == ULINT_UNDEFINED || space_id == 0) {
267 + if (space_id == ULINT_UNDEFINED || trx_sys_sys_space(space_id)) {
269 "InnoDB: Error: tablespace id %lu in file %s"
270 " is not sensible\n",
271 @@ -3815,7 +3825,7 @@
275 - if (space_id == ULINT_UNDEFINED || space_id == 0) {
276 + if (space_id == ULINT_UNDEFINED || trx_sys_sys_space(space_id)) {
280 @@ -4636,7 +4646,7 @@
283 if (node->n_pending == 0 && space->purpose == FIL_TABLESPACE
284 - && space->id != 0) {
285 + && !trx_sys_sys_space(space->id)) {
286 /* The node is in the LRU list, remove it */
288 ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
289 @@ -4682,7 +4692,7 @@
292 if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE
293 - && node->space->id != 0) {
294 + && !trx_sys_sys_space(node->space->id)) {
295 /* The node must be put back to the LRU list */
296 UT_LIST_ADD_FIRST(LRU, system->LRU, node);
298 @@ -5298,7 +5308,7 @@
299 ut_a(fil_node->n_pending == 0);
300 ut_a(fil_node->open);
301 ut_a(fil_node->space->purpose == FIL_TABLESPACE);
302 - ut_a(fil_node->space->id != 0);
303 + ut_a(!trx_sys_sys_space(fil_node->space->id));
305 fil_node = UT_LIST_GET_NEXT(LRU, fil_node);
307 diff -ruN a/storage/innobase/fsp/fsp0fsp.c b/storage/innobase/fsp/fsp0fsp.c
308 --- a/storage/innobase/fsp/fsp0fsp.c 2010-11-03 07:01:13.000000000 +0900
309 +++ b/storage/innobase/fsp/fsp0fsp.c 2010-12-04 15:35:58.632513243 +0900
311 # include "log0log.h"
312 #endif /* UNIV_HOTBACKUP */
313 #include "dict0mem.h"
315 +#include "trx0sys.h"
317 #define FSP_HEADER_OFFSET FIL_PAGE_DATA /* Offset of the space header
318 within a file page */
319 @@ -999,10 +999,10 @@
320 flst_init(header + FSP_SEG_INODES_FREE, mtr);
322 mlog_write_ull(header + FSP_SEG_ID, 1, mtr);
324 + if (space == TRX_SYS_SPACE || space == TRX_DOUBLEWRITE_SPACE) {
325 fsp_fill_free_list(FALSE, space, header, mtr);
326 btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF,
327 - 0, 0, DICT_IBUF_ID_MIN + space,
328 + space, 0, DICT_IBUF_ID_MIN + space,
329 dict_ind_redundant, mtr);
331 fsp_fill_free_list(TRUE, space, header, mtr);
332 diff -ruN a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
333 --- a/storage/innobase/handler/ha_innodb.cc 2010-12-04 15:35:29.153514047 +0900
334 +++ b/storage/innobase/handler/ha_innodb.cc 2010-12-04 15:35:58.636549909 +0900
336 static char* innobase_log_group_home_dir = NULL;
337 static char* innobase_file_format_name = NULL;
338 static char* innobase_change_buffering = NULL;
339 +static char* innobase_doublewrite_file = NULL;
341 /* The highest file format being used in the database. The value can be
342 set by user, however, it will be adjusted to the newer file format if
343 @@ -2425,6 +2426,8 @@
347 + srv_doublewrite_file = innobase_doublewrite_file;
349 srv_use_sys_stats_table = (ibool) innobase_use_sys_stats_table;
351 /* -------------- Log files ---------------------------*/
352 @@ -11553,6 +11556,11 @@
353 "Path to individual files and their sizes.",
356 +static MYSQL_SYSVAR_STR(doublewrite_file, innobase_doublewrite_file,
357 + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
358 + "Path to special datafile for doublewrite buffer. (default is \"\": not used) ### ONLY FOR EXPERTS!!! ###",
361 static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode,
362 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
363 "The AUTOINC lock modes supported by InnoDB: "
364 @@ -11723,6 +11731,7 @@
365 MYSQL_SYSVAR(commit_concurrency),
366 MYSQL_SYSVAR(concurrency_tickets),
367 MYSQL_SYSVAR(data_file_path),
368 + MYSQL_SYSVAR(doublewrite_file),
369 MYSQL_SYSVAR(data_home_dir),
370 MYSQL_SYSVAR(doublewrite),
371 MYSQL_SYSVAR(recovery_stats),
372 diff -ruN a/storage/innobase/include/mtr0log.ic b/storage/innobase/include/mtr0log.ic
373 --- a/storage/innobase/include/mtr0log.ic 2010-11-03 07:01:13.000000000 +0900
374 +++ b/storage/innobase/include/mtr0log.ic 2010-12-04 15:35:58.644607059 +0900
378 #include "fsp0types.h"
379 +#include "srv0srv.h"
382 /********************************************************//**
383 Opens a buffer to mlog. It must be closed with mlog_close.
384 @return buffer, NULL if log mode MTR_LOG_NONE */
386 the doublewrite buffer is located in pages
387 FSP_EXTENT_SIZE, ..., 3 * FSP_EXTENT_SIZE - 1 in the
389 - if (space == TRX_SYS_SPACE
390 + if ((space == TRX_SYS_SPACE
391 + || (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE))
392 && offset >= FSP_EXTENT_SIZE && offset < 3 * FSP_EXTENT_SIZE) {
393 if (trx_doublewrite_buf_is_being_created) {
394 /* Do nothing: we only come to this branch in an
395 diff -ruN a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
396 --- a/storage/innobase/include/srv0srv.h 2010-12-04 15:35:29.177480351 +0900
397 +++ b/storage/innobase/include/srv0srv.h 2010-12-04 15:35:58.646556250 +0900
399 extern ulint* srv_data_file_sizes;
400 extern ulint* srv_data_file_is_raw_partition;
402 +extern char* srv_doublewrite_file;
404 extern ibool srv_recovery_stats;
406 extern ibool srv_auto_extend_last_data_file;
407 diff -ruN a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h
408 --- a/storage/innobase/include/srv0start.h 2010-11-03 07:01:13.000000000 +0900
409 +++ b/storage/innobase/include/srv0start.h 2010-12-08 17:15:07.602605797 +0900
411 /** Log 'spaces' have id's >= this */
412 #define SRV_LOG_SPACE_FIRST_ID 0xFFFFFFF0UL
414 +/** First space id reserved for extra system tablespaces */
415 +#define SRV_EXTRA_SYS_SPACE_FIRST_ID 0xFFFFFFE0UL
418 diff -ruN a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
419 --- a/storage/innobase/include/trx0sys.h 2010-12-03 15:41:52.047049291 +0900
420 +++ b/storage/innobase/include/trx0sys.h 2010-12-04 15:35:58.647551222 +0900
423 ulint space, /*!< in: space */
424 ulint page_no);/*!< in: page number */
425 +/***************************************************************//**
426 +Checks if a space is one of the system tablespaces.
427 +@return TRUE if system tablespace */
432 + ulint space); /*!< in: space */
433 +/***************************************************************//**
434 +Checks if a space is the doublewrite tablespace.
435 +@return TRUE if doublewrite tablespace */
438 +trx_sys_doublewrite_space(
439 +/*======================*/
440 + ulint space); /*!< in: space */
441 /*****************************************************************//**
442 Creates and initializes the central memory structures for the transaction
443 system. This is called when the database is started. */
446 trx_sys_create(void);
448 +/*****************************************************************//**
449 +Creates and initializes the dummy transaction system page for the given tablespace. */
452 +trx_sys_dummy_create(
453 +/*=================*/
455 /****************************************************************//**
456 Looks for a free slot for a rollback segment in the trx system file copy.
457 @return slot index or ULINT_UNDEFINED if not found */
460 /* Space id and page no where the trx system file copy resides */
461 #define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */
462 +#define TRX_DOUBLEWRITE_SPACE 0xFFFFFFE0UL /* the doublewrite buffer tablespace if used */
463 +#define TRX_SYS_SPACE_MAX 9 /* reserved max space id for system tablespaces */
465 #define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO
467 diff -ruN a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic
468 --- a/storage/innobase/include/trx0sys.ic 2010-11-03 07:01:13.000000000 +0900
469 +++ b/storage/innobase/include/trx0sys.ic 2010-12-04 15:35:58.649473284 +0900
473 /***************************************************************//**
474 +Checks if a space is one of the system tablespaces.
475 +@return TRUE if system tablespace */
480 + ulint space) /*!< in: space */
482 + if (srv_doublewrite_file) {
483 + /* several spaces are reserved */
484 + return((ibool)(space == TRX_SYS_SPACE || space == TRX_DOUBLEWRITE_SPACE));
486 + return((ibool)(space == TRX_SYS_SPACE));
490 +/***************************************************************//**
491 +Checks if a space is the doublewrite tablespace.
492 +@return TRUE if doublewrite tablespace */
495 +trx_sys_doublewrite_space(
496 +/*======================*/
497 + ulint space) /*!< in: space */
499 + if (srv_doublewrite_file) {
500 + /* doublewrite buffer is separated */
501 + return((ibool)(space == TRX_DOUBLEWRITE_SPACE));
503 + return((ibool)(space == TRX_SYS_SPACE));
507 +/***************************************************************//**
508 Gets the pointer in the nth slot of the rseg array.
509 @return pointer to rseg object, NULL if slot not in use */
511 diff -ruN a/storage/innobase/row/row0mysql.c b/storage/innobase/row/row0mysql.c
512 --- a/storage/innobase/row/row0mysql.c 2010-12-03 17:30:16.334989510 +0900
513 +++ b/storage/innobase/row/row0mysql.c 2010-12-04 15:35:58.652496484 +0900
514 @@ -3423,7 +3423,7 @@
515 /* Do not drop possible .ibd tablespace if something went
516 wrong: we do not want to delete valuable data of the user */
518 - if (err == DB_SUCCESS && space_id > 0) {
519 + if (err == DB_SUCCESS && !trx_sys_sys_space(space_id)) {
520 if (!fil_space_for_table_exists_in_mem(space_id,
523 diff -ruN a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c
524 --- a/storage/innobase/srv/srv0srv.c 2010-12-04 15:35:29.180483212 +0900
525 +++ b/storage/innobase/srv/srv0srv.c 2010-12-04 15:35:58.656550107 +0900
527 /* size in database pages */
528 UNIV_INTERN ulint* srv_data_file_sizes = NULL;
530 +UNIV_INTERN char* srv_doublewrite_file = NULL;
532 UNIV_INTERN ibool srv_recovery_stats = FALSE;
534 /* if TRUE, then we auto-extend the last data file */
535 diff -ruN a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.c
536 --- a/storage/innobase/srv/srv0start.c 2010-12-04 15:35:29.183481330 +0900
537 +++ b/storage/innobase/srv/srv0start.c 2010-12-04 15:35:58.661550545 +0900
539 /*======================*/
540 ibool* create_new_db, /*!< out: TRUE if new database should be
542 + ibool* create_new_doublewrite_file,
543 #ifdef UNIV_LOG_ARCHIVE
544 ulint* min_arch_log_no,/*!< out: min of archived log
545 numbers in data files */
547 *sum_of_new_sizes = 0;
549 *create_new_db = FALSE;
550 + *create_new_doublewrite_file = FALSE;
552 srv_normalize_path_for_win(srv_data_home);
554 @@ -984,6 +986,142 @@
555 srv_data_file_is_raw_partition[i] != 0);
558 + /* special file for doublewrite buffer */
559 + if (srv_doublewrite_file)
561 + srv_normalize_path_for_win(srv_doublewrite_file);
564 + "InnoDB: Notice: innodb_doublewrite_file is specified.\n"
565 + "InnoDB: This is for expert only. Don't use if you don't understand what is it 'WELL'.\n"
566 + "InnoDB: ### Don't specify older file than the last checkpoint ###\n"
567 + "InnoDB: otherwise the older doublewrite buffer will break your data during recovery!\n");
569 + strcpy(name, srv_doublewrite_file);
571 + /* First we try to create the file: if it already
572 + exists, ret will get value FALSE */
574 + files[i] = os_file_create(innodb_file_data_key, name, OS_FILE_CREATE,
576 + OS_DATA_FILE, &ret);
578 + if (ret == FALSE && os_file_get_last_error(FALSE)
579 + != OS_FILE_ALREADY_EXISTS
581 + /* AIX 5.1 after security patch ML7 may have
582 + errno set to 0 here, which causes our function
583 + to return 100; work around that AIX problem */
584 + && os_file_get_last_error(FALSE) != 100
588 + "InnoDB: Error in creating"
589 + " or opening %s\n",
595 + if (ret == FALSE) {
596 + /* We open the data file */
598 + files[i] = os_file_create(innodb_file_data_key,
599 + name, OS_FILE_OPEN, OS_FILE_NORMAL,
600 + OS_DATA_FILE, &ret);
604 + "InnoDB: Error in opening %s\n", name);
605 + os_file_get_last_error(TRUE);
610 + ret = os_file_get_size(files[i], &size, &size_high);
612 + /* Round size downward to megabytes */
615 + = (size / (1024 * 1024) + 4096 * size_high)
616 + << (20 - UNIV_PAGE_SIZE_SHIFT);
618 + if (rounded_size_pages != TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9) {
621 + "InnoDB: Warning: doublewrite buffer file %s"
622 + " is of a different size\n"
623 + "InnoDB: %lu pages"
624 + " (rounded down to MB)\n"
625 + "InnoDB: than intended size"
628 + (ulong) rounded_size_pages,
629 + (ulong) TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9);
632 + fil_read_flushed_lsn_and_arch_log_no(
633 + files[i], one_opened,
634 +#ifdef UNIV_LOG_ARCHIVE
635 + min_arch_log_no, max_arch_log_no,
636 +#endif /* UNIV_LOG_ARCHIVE */
637 + min_flushed_lsn, max_flushed_lsn);
640 + /* We created the data file and now write it full of
643 + *create_new_doublewrite_file = TRUE;
645 + ut_print_timestamp(stderr);
647 + " InnoDB: Doublewrite buffer file %s did not"
648 + " exist: new to be created\n",
651 + if (*create_new_db == FALSE) {
653 + "InnoDB: Warning: Previous version's ibdata files may cause crash.\n"
654 + " If you use that, please use the ibdata files of this version.\n");
657 + ut_print_timestamp(stderr);
659 + " InnoDB: Setting file %s size to %lu MB\n",
661 + (ulong) ((TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9)
662 + >> (20 - UNIV_PAGE_SIZE_SHIFT)));
665 + "InnoDB: Database physically writes the"
666 + " file full: wait...\n");
668 + ret = os_file_set_size(
670 + srv_calc_low32(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9),
671 + srv_calc_high32(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9));
675 + "InnoDB: Error in creating %s:"
676 + " probably out of disk space\n", name);
682 + ret = os_file_close(files[i]);
685 + fil_space_create(name, TRX_DOUBLEWRITE_SPACE, 0, FIL_TABLESPACE);
687 + ut_a(fil_validate());
689 + fil_node_create(name, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, TRX_DOUBLEWRITE_SPACE, FALSE);
698 /*====================================*/
701 + ibool create_new_doublewrite_file;
702 ibool log_file_created;
703 ibool log_created = FALSE;
704 ibool log_opened = FALSE;
705 @@ -1416,6 +1555,7 @@
708 err = open_or_create_data_files(&create_new_db,
709 + &create_new_doublewrite_file,
710 #ifdef UNIV_LOG_ARCHIVE
711 &min_arch_log_no, &max_arch_log_no,
712 #endif /* UNIV_LOG_ARCHIVE */
713 @@ -1545,6 +1685,14 @@
714 after the double write buffer has been created. */
717 + if (create_new_doublewrite_file) {
719 + fsp_header_init(TRX_DOUBLEWRITE_SPACE, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, &mtr);
722 + trx_sys_dummy_create(TRX_DOUBLEWRITE_SPACE);
727 srv_startup_is_before_trx_rollback_phase = FALSE;
728 @@ -1577,6 +1725,13 @@
729 recv_recovery_from_archive_finish();
730 #endif /* UNIV_LOG_ARCHIVE */
732 + char* save_srv_doublewrite_file = NULL;
734 + if (create_new_doublewrite_file) {
735 + /* doublewrite_file cannot be used for recovery yet. */
736 + save_srv_doublewrite_file = srv_doublewrite_file;
737 + srv_doublewrite_file = NULL;
740 /* Check if we support the max format that is stamped
741 on the system tablespace.
742 @@ -1663,6 +1818,17 @@
743 we have finished the recovery process so that the
744 image of TRX_SYS_PAGE_NO is not stale. */
745 trx_sys_file_format_tag_init();
747 + if (create_new_doublewrite_file) {
748 + /* restore the value */
749 + srv_doublewrite_file = save_srv_doublewrite_file;
752 + fsp_header_init(TRX_DOUBLEWRITE_SPACE, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, &mtr);
755 + trx_sys_dummy_create(TRX_DOUBLEWRITE_SPACE);
759 if (!create_new_db && sum_of_new_sizes > 0) {
760 diff -ruN a/storage/innobase/trx/trx0sys.c b/storage/innobase/trx/trx0sys.c
761 --- a/storage/innobase/trx/trx0sys.c 2010-12-03 17:32:15.651024019 +0900
762 +++ b/storage/innobase/trx/trx0sys.c 2010-12-04 15:35:58.664550291 +0900
763 @@ -414,6 +414,152 @@
768 + if (srv_doublewrite_file) {
769 + /* The doublewrite file should contain the same doublewrite buffer as
770 + TRX_SYS_SPACE would; check for it and create it if it does not exist. */
773 + trx_doublewrite_buf_is_being_created = TRUE;
775 + block = buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, TRX_SYS_PAGE_NO,
777 + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
779 + doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
781 + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
782 + == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
783 + /* The doublewrite buffer has already been created:
784 + just read in some numbers */
789 + "InnoDB: Doublewrite buffer not found in the doublewrite file:"
790 + " creating new\n");
792 + if (buf_pool_get_curr_size()
793 + < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
794 + + FSP_EXTENT_SIZE / 2 + 100)
795 + * UNIV_PAGE_SIZE)) {
797 + "InnoDB: Cannot create doublewrite buffer:"
799 + "InnoDB: increase your buffer pool size.\n"
800 + "InnoDB: Cannot continue operation.\n");
805 + block2 = fseg_create(TRX_DOUBLEWRITE_SPACE, TRX_SYS_PAGE_NO,
806 + TRX_SYS_DOUBLEWRITE
807 + + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
809 + /* fseg_create acquires a second latch on the page,
810 + therefore we must declare it: */
812 + buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
814 + if (block2 == NULL) {
816 + "InnoDB: Cannot create doublewrite buffer:"
818 + "InnoDB: increase your tablespace size.\n"
819 + "InnoDB: Cannot continue operation.\n");
821 + /* We exit without committing the mtr to prevent
822 + its modifications to the database getting to disk */
827 + fseg_header = buf_block_get_frame(block)
828 + + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG;
831 + for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
832 + + FSP_EXTENT_SIZE / 2; i++) {
833 + page_no = fseg_alloc_free_page(fseg_header,
836 + if (page_no == FIL_NULL) {
838 + "InnoDB: Cannot create doublewrite"
839 + " buffer: you must\n"
840 + "InnoDB: increase your"
841 + " tablespace size.\n"
842 + "InnoDB: Cannot continue operation.\n"
848 + /* We read the allocated pages to the buffer pool;
849 + when they are written to disk in a flush, the space
850 + id and page number fields are also written to the
851 + pages. When we at database startup read pages
852 + from the doublewrite buffer, we know that if the
853 + space id and page number in them are the same as
854 + the page position in the tablespace, then the page
855 + has not been written to in doublewrite. */
857 +#ifdef UNIV_SYNC_DEBUG
859 +#endif /* UNIV_SYNC_DEBUG */
860 + buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, page_no,
862 + buf_block_dbg_add_level(new_block,
863 + SYNC_NO_ORDER_CHECK);
865 + if (i == FSP_EXTENT_SIZE / 2) {
866 + ut_a(page_no == FSP_EXTENT_SIZE);
867 + mlog_write_ulint(doublewrite
868 + + TRX_SYS_DOUBLEWRITE_BLOCK1,
869 + page_no, MLOG_4BYTES, &mtr);
870 + mlog_write_ulint(doublewrite
871 + + TRX_SYS_DOUBLEWRITE_REPEAT
872 + + TRX_SYS_DOUBLEWRITE_BLOCK1,
873 + page_no, MLOG_4BYTES, &mtr);
874 + } else if (i == FSP_EXTENT_SIZE / 2
875 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
876 + ut_a(page_no == 2 * FSP_EXTENT_SIZE);
877 + mlog_write_ulint(doublewrite
878 + + TRX_SYS_DOUBLEWRITE_BLOCK2,
879 + page_no, MLOG_4BYTES, &mtr);
880 + mlog_write_ulint(doublewrite
881 + + TRX_SYS_DOUBLEWRITE_REPEAT
882 + + TRX_SYS_DOUBLEWRITE_BLOCK2,
883 + page_no, MLOG_4BYTES, &mtr);
884 + } else if (i > FSP_EXTENT_SIZE / 2) {
885 + ut_a(page_no == prev_page_no + 1);
888 + prev_page_no = page_no;
891 + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
892 + TRX_SYS_DOUBLEWRITE_MAGIC_N,
893 + MLOG_4BYTES, &mtr);
894 + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
895 + + TRX_SYS_DOUBLEWRITE_REPEAT,
896 + TRX_SYS_DOUBLEWRITE_MAGIC_N,
897 + MLOG_4BYTES, &mtr);
899 + mlog_write_ulint(doublewrite
900 + + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
901 + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
902 + MLOG_4BYTES, &mtr);
905 + /* Flush the modified pages to disk and make a checkpoint */
906 + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
908 + fprintf(stderr, "InnoDB: Doublewrite buffer created in the doublewrite file\n");
909 + trx_sys_multiple_tablespace_format = TRUE;
911 + trx_doublewrite_buf_is_being_created = FALSE;
915 /****************************************************************//**
916 @@ -437,10 +583,19 @@
917 ulint source_page_no;
920 + ulint doublewrite_space_id;
925 + doublewrite_space_id = (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE);
927 + if (srv_doublewrite_file) {
929 + "InnoDB: doublewrite file '%s' is used.\n",
930 + srv_doublewrite_file);
933 /* We do the file i/o past the buffer pool */
935 unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
937 /* Read the trx sys header to check if we are using the doublewrite
940 - fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0,
941 + fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, TRX_SYS_PAGE_NO, 0,
942 UNIV_PAGE_SIZE, read_buf, NULL);
943 doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
945 @@ -487,10 +642,10 @@
947 /* Read the pages from the doublewrite buffer to memory */
949 - fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block1, 0,
950 + fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block1, 0,
951 TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
953 - fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block2, 0,
954 + fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block2, 0,
955 TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
956 buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
959 " doublewrite buf.\n",
960 (ulong) space_id, (ulong) page_no, (ulong) i);
962 - } else if (space_id == TRX_SYS_SPACE
963 + } else if ((space_id == TRX_SYS_SPACE
964 + || (srv_doublewrite_file && space_id == TRX_DOUBLEWRITE_SPACE))
965 && ((page_no >= block1
967 < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
968 @@ -990,6 +1146,83 @@
971 /*****************************************************************//**
972 +Creates a dummy file page for the transaction system in the given tablespace. */
975 +trx_sysf_dummy_create(
976 +/*==================*/
980 + buf_block_t* block;
985 + /* Note that below we first reserve the file space x-latch, and
986 + then enter the kernel: we must do it in this order to conform
987 + to the latching order rules. */
989 + mtr_x_lock(fil_space_get_latch(space, NULL), mtr);
990 + mutex_enter(&kernel_mutex);
992 + /* Create the trx sys file block in a new allocated file segment */
993 + block = fseg_create(space, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
995 + buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
997 + fprintf(stderr, "%lu\n", buf_block_get_page_no(block));
998 + ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
1000 + page = buf_block_get_frame(block);
1002 + mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
1003 + MLOG_2BYTES, mtr);
1005 + /* Reset the doublewrite buffer magic number to zero so that we
1006 + know that the doublewrite buffer has not yet been created (this
1007 + suppresses a Valgrind warning) */
1009 + mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
1010 + + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
1013 + /* TODO: REMOVE IT: The below is not needed, I think */
1014 + sys_header = trx_sysf_get(mtr);
1016 + /* Start counting transaction ids from number 1 up */
1017 + mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE,
1018 + ut_dulint_create(0, 1), mtr);
1020 + /* Reset the rollback segment slots */
1021 + for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
1023 + trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr);
1024 + trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr);
1027 + /* The remaining area (up to the page trailer) is uninitialized.
1028 + Silence Valgrind warnings about it. */
1029 + UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS
1030 + + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
1031 + + TRX_SYS_RSEG_SPACE),
1032 + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
1034 + + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
1035 + + TRX_SYS_RSEG_SPACE))
1036 + + page - sys_header);
1038 + /* Create the first rollback segment in the given tablespace */
1039 + page_no = trx_rseg_header_create(space, 0, ULINT_MAX, &slot_no,
1041 + ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
1042 + ut_a(page_no != FIL_NULL);
1045 + mutex_exit(&kernel_mutex);
1048 +/*****************************************************************//**
1049 Creates and initializes the central memory structures for the transaction
1050 system. This is called when the database is started. */
1052 @@ -1351,6 +1584,26 @@
1053 /* Does nothing at the moment */
1056 +/*****************************************************************//**
1057 +Creates and initializes the dummy transaction system page for the given tablespace. */
1060 +trx_sys_dummy_create(
1061 +/*=================*/
1066 + /* This function is only for doublewrite file for now */
1067 + ut_a(space == TRX_DOUBLEWRITE_SPACE);
1071 + trx_sysf_dummy_create(space, &mtr);
1076 /*********************************************************************
1077 Creates the rollback segments */