# name : innodb_separate_doublewrite.patch # introduced : 11 or before # maintainer : Yasufumi # #!!! notice !!! # Any small change to this file in the main branch # should be done or reviewed by the maintainer! --- a/storage/innobase/buf/buf0buf.c +++ b/storage/innobase/buf/buf0buf.c @@ -3807,7 +3807,8 @@ read_space_id = mach_read_from_4( frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); - if (bpage->space == TRX_SYS_SPACE + if ((bpage->space == TRX_SYS_SPACE + || (srv_doublewrite_file && bpage->space == TRX_DOUBLEWRITE_SPACE)) && trx_doublewrite_page_inside(bpage->offset)) { ut_print_timestamp(stderr); --- a/storage/innobase/buf/buf0flu.c +++ b/storage/innobase/buf/buf0flu.c @@ -793,7 +793,8 @@ write_buf = trx_doublewrite->write_buf; i = 0; - fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + fil_io(OS_FILE_WRITE, TRUE, + (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0, trx_doublewrite->block1, 0, len, (void*) write_buf, NULL); @@ -830,7 +831,8 @@ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE); - fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + fil_io(OS_FILE_WRITE, TRUE, + (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0, trx_doublewrite->block2, 0, len, (void*) write_buf, NULL); @@ -860,7 +862,7 @@ flush: /* Now flush the doublewrite buffer data to disk */ - fil_flush(TRX_SYS_SPACE, FALSE); + fil_flush(srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE, FALSE); /* We know that the writes have been flushed to disk now and in recovery we will find them in the doublewrite buffer --- a/storage/innobase/buf/buf0rea.c +++ b/storage/innobase/buf/buf0rea.c @@ -90,7 +90,9 @@ wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER; - if (trx_doublewrite && space == TRX_SYS_SPACE + if (trx_doublewrite + && (space == TRX_SYS_SPACE + || (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE)) && ( (offset >= trx_doublewrite->block1 && offset < trx_doublewrite->block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) --- a/storage/innobase/dict/dict0load.c +++ b/storage/innobase/dict/dict0load.c @@ -41,6 +41,7 @@ #include "srv0start.h" #include "srv0srv.h" #include "ha_prototypes.h" /* innobase_casedn_str() */ +#include "trx0sys.h" /** Following are six InnoDB system tables */ @@ -816,7 +817,7 @@ mtr_commit(&mtr); - if (space_id == 0) { + if (trx_sys_sys_space(space_id)) { /* The system tablespace always exists. */ } else if (in_crash_recovery) { /* Check that the tablespace (the .ibd file) really @@ -1727,7 +1728,7 @@ space = mach_read_from_4(field); /* Check if the tablespace exists and has the right name */ - if (space != 0) { + if (!trx_sys_sys_space(space)) { flags = dict_sys_tables_get_flags(rec); if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) { @@ -1880,7 +1881,7 @@ goto err_exit; } - if (table->space == 0) { + if (trx_sys_sys_space(table->space)) { /* The system tablespace is always available. */ } else if (!fil_space_for_table_exists_in_mem( table->space, name, --- a/storage/innobase/fil/fil0fil.c +++ b/storage/innobase/fil/fil0fil.c @@ -657,7 +657,7 @@ UT_LIST_ADD_LAST(chain, space->chain, node); - if (id < SRV_LOG_SPACE_FIRST_ID && fil_system->max_assigned_id < id) { + if (id < SRV_EXTRA_SYS_SPACE_FIRST_ID && fil_system->max_assigned_id < id) { fil_system->max_assigned_id = id; } @@ -721,14 +721,14 @@ size_bytes = (((ib_int64_t)size_high) << 32) + (ib_int64_t)size_low; #ifdef UNIV_HOTBACKUP - if (space->id == 0) { + if (trx_sys_sys_space(space->id)) { node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE); os_file_close(node->handle); goto add_size; } #endif /* UNIV_HOTBACKUP */ ut_a(space->purpose != FIL_LOG); - ut_a(space->id != 0); + ut_a(!trx_sys_sys_space(space->id)); if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) { fprintf(stderr, @@ -774,7 +774,7 @@ } if (UNIV_UNLIKELY(space_id == ULINT_UNDEFINED - || space_id == 0)) { + || trx_sys_sys_space(space_id))) { fprintf(stderr, "InnoDB: Error: tablespace id %lu" " in file %s is not sensible\n", @@ -842,7 +842,7 @@ system->n_open++; - if (space->purpose == FIL_TABLESPACE && space->id != 0) { + if (space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(space->id)) { /* Put the node to the LRU list */ UT_LIST_ADD_FIRST(LRU, system->LRU, node); } @@ -876,7 +876,7 @@ ut_a(system->n_open > 0); system->n_open--; - if (node->space->purpose == FIL_TABLESPACE && node->space->id != 0) { + if (node->space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(node->space->id)) { ut_a(UT_LIST_GET_LEN(system->LRU) > 0); /* The node is in the LRU list, remove it */ @@ -962,7 +962,7 @@ retry: mutex_enter(&fil_system->mutex); - if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) { + if (trx_sys_sys_space(space_id) || space_id >= SRV_LOG_SPACE_FIRST_ID) { /* We keep log files and system tablespace files always open; this is important in preventing deadlocks in this module, as a page read completion often performs another read from the @@ -1193,7 +1193,7 @@ " tablespace memory cache!\n", (ulong) space->id); - if (id == 0 || purpose != FIL_TABLESPACE) { + if (trx_sys_sys_space(id) || purpose != FIL_TABLESPACE) { mutex_exit(&fil_system->mutex); @@ -1255,6 +1255,7 @@ space->mark = FALSE; if (UNIV_LIKELY(purpose == FIL_TABLESPACE && !recv_recovery_on) + && UNIV_UNLIKELY(id < SRV_EXTRA_SYS_SPACE_FIRST_ID) && UNIV_UNLIKELY(id > fil_system->max_assigned_id)) { if (!fil_system->space_id_reuse_warned) { fil_system->space_id_reuse_warned = TRUE; @@ -1338,7 +1339,7 @@ (ulong) SRV_LOG_SPACE_FIRST_ID); } - success = (id < SRV_LOG_SPACE_FIRST_ID); + success = (id < SRV_EXTRA_SYS_SPACE_FIRST_ID); if (success) { *space_id = fil_system->max_assigned_id = id; @@ -1601,6 +1602,8 @@ UT_LIST_INIT(fil_system->LRU); fil_system->max_n_open = max_n_open; + + fil_system->max_assigned_id = TRX_SYS_SPACE_MAX; } /*******************************************************************//** @@ -1622,7 +1625,7 @@ space = UT_LIST_GET_FIRST(fil_system->space_list); while (space != NULL) { - if (space->purpose != FIL_TABLESPACE || space->id == 0) { + if (space->purpose != FIL_TABLESPACE || trx_sys_sys_space(space->id)) { node = UT_LIST_GET_FIRST(space->chain); while (node != NULL) { @@ -1712,6 +1715,10 @@ ut_error; } + if (max_id >= SRV_EXTRA_SYS_SPACE_FIRST_ID) { + return; + } + mutex_enter(&fil_system->mutex); if (fil_system->max_assigned_id < max_id) { @@ -1730,6 +1737,7 @@ ulint fil_write_lsn_and_arch_no_to_file( /*==============================*/ + ulint space_id, ulint sum_of_sizes, /*!< in: combined size of previous files in space, in database pages */ ib_uint64_t lsn, /*!< in: lsn to write */ @@ -1739,14 +1747,16 @@ byte* buf1; byte* buf; + ut_a(trx_sys_sys_space(space_id)); + buf1 = mem_alloc(2 * UNIV_PAGE_SIZE); buf = ut_align(buf1, UNIV_PAGE_SIZE); - fil_read(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); + fil_read(TRUE, space_id, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); - fil_write(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); + fil_write(TRUE, space_id, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); mem_free(buf1); @@ -1782,7 +1792,7 @@ always open. */ if (space->purpose == FIL_TABLESPACE - && space->id == 0) { + && trx_sys_sys_space(space->id)) { sum_of_sizes = 0; node = UT_LIST_GET_FIRST(space->chain); @@ -1790,7 +1800,7 @@ mutex_exit(&fil_system->mutex); err = fil_write_lsn_and_arch_no_to_file( - sum_of_sizes, lsn, arch_log_no); + space->id, sum_of_sizes, lsn, arch_log_no); if (err != DB_SUCCESS) { return(err); @@ -4176,7 +4186,7 @@ } #ifndef UNIV_HOTBACKUP - if (space_id == ULINT_UNDEFINED || space_id == 0) { + if (space_id == ULINT_UNDEFINED || trx_sys_sys_space(space_id)) { fprintf(stderr, "InnoDB: Error: tablespace id %lu in file %s" " is not sensible\n", @@ -4185,7 +4195,7 @@ goto func_exit; } #else - if (space_id == ULINT_UNDEFINED || space_id == 0) { + if (space_id == ULINT_UNDEFINED || trx_sys_sys_space(space_id)) { char* new_path; fprintf(stderr, @@ -5006,7 +5016,7 @@ } if (node->n_pending == 0 && space->purpose == FIL_TABLESPACE - && space->id != 0) { + && !trx_sys_sys_space(space->id)) { /* The node is in the LRU list, remove it */ ut_a(UT_LIST_GET_LEN(system->LRU) > 0); @@ -5052,7 +5062,7 @@ } if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE - && node->space->id != 0) { + && !trx_sys_sys_space(node->space->id)) { /* The node must be put back to the LRU list */ UT_LIST_ADD_FIRST(LRU, system->LRU, node); } @@ -5663,7 +5673,7 @@ ut_a(fil_node->n_pending == 0); ut_a(fil_node->open); ut_a(fil_node->space->purpose == FIL_TABLESPACE); - ut_a(fil_node->space->id != 0); + ut_a(!trx_sys_sys_space(fil_node->space->id)); fil_node = UT_LIST_GET_NEXT(LRU, fil_node); } --- a/storage/innobase/fsp/fsp0fsp.c +++ b/storage/innobase/fsp/fsp0fsp.c @@ -48,7 +48,7 @@ # include "log0log.h" #endif /* UNIV_HOTBACKUP */ #include "dict0mem.h" - +#include "trx0sys.h" /* FILE SEGMENT INODE ================== @@ -938,10 +938,10 @@ flst_init(header + FSP_SEG_INODES_FREE, mtr); mlog_write_ull(header + FSP_SEG_ID, 1, mtr); - if (space == 0) { + if (space == TRX_SYS_SPACE || space == TRX_DOUBLEWRITE_SPACE) { fsp_fill_free_list(FALSE, space, header, mtr); btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, - 0, 0, DICT_IBUF_ID_MIN + space, + space, 0, DICT_IBUF_ID_MIN + space, dict_ind_redundant, mtr); } else { fsp_fill_free_list(TRUE, space, header, mtr); --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -163,6 +163,7 @@ static char* innobase_log_group_home_dir = NULL; static char* innobase_file_format_name = NULL; static char* innobase_change_buffering = NULL; +static char* innobase_doublewrite_file = NULL; /* The highest file format being used in the database. The value can be set by user, however, it will be adjusted to the newer file format if @@ -2508,6 +2509,8 @@ goto error; } + srv_doublewrite_file = innobase_doublewrite_file; + srv_use_sys_stats_table = (ibool) innobase_use_sys_stats_table; /* -------------- Log files ---------------------------*/ @@ -11771,6 +11774,11 @@ "Path to individual files and their sizes.", NULL, NULL, NULL); +static MYSQL_SYSVAR_STR(doublewrite_file, innobase_doublewrite_file, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to special datafile for doublewrite buffer. (default is "": not used) ### ONLY FOR EXPERTS!!! ###", + NULL, NULL, NULL); + static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "The AUTOINC lock modes supported by InnoDB: " @@ -11990,6 +11998,7 @@ MYSQL_SYSVAR(commit_concurrency), MYSQL_SYSVAR(concurrency_tickets), MYSQL_SYSVAR(data_file_path), + MYSQL_SYSVAR(doublewrite_file), MYSQL_SYSVAR(data_home_dir), MYSQL_SYSVAR(doublewrite), MYSQL_SYSVAR(recovery_stats), --- a/storage/innobase/include/mtr0log.ic +++ b/storage/innobase/include/mtr0log.ic @@ -27,8 +27,8 @@ #include "ut0lst.h" #include "buf0buf.h" #include "fsp0types.h" +#include "srv0srv.h" #include "trx0sys.h" - /********************************************************//** Opens a buffer to mlog. It must be closed with mlog_close. @return buffer, NULL if log mode MTR_LOG_NONE */ @@ -201,7 +201,8 @@ the doublewrite buffer is located in pages FSP_EXTENT_SIZE, ..., 3 * FSP_EXTENT_SIZE - 1 in the system tablespace */ - if (space == TRX_SYS_SPACE + if ((space == TRX_SYS_SPACE + || (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE)) && offset >= FSP_EXTENT_SIZE && offset < 3 * FSP_EXTENT_SIZE) { if (trx_doublewrite_buf_is_being_created) { /* Do nothing: we only come to this branch in an --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -129,6 +129,8 @@ extern ulint* srv_data_file_sizes; extern ulint* srv_data_file_is_raw_partition; +extern char* srv_doublewrite_file; + extern ibool srv_recovery_stats; extern ibool srv_auto_extend_last_data_file; --- a/storage/innobase/include/srv0start.h +++ b/storage/innobase/include/srv0start.h @@ -127,4 +127,7 @@ /** Log 'spaces' have id's >= this */ #define SRV_LOG_SPACE_FIRST_ID 0xFFFFFFF0UL +/** reserved for extra system tables */ +#define SRV_EXTRA_SYS_SPACE_FIRST_ID 0xFFFFFFE0UL + #endif --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -125,6 +125,22 @@ /*=============*/ ulint space, /*!< in: space */ ulint page_no);/*!< in: page number */ +/***************************************************************//** +Checks if a space is the system tablespaces. +@return TRUE if system tablespace */ +UNIV_INLINE +ibool +trx_sys_sys_space( +/*==============*/ + ulint space); /*!< in: space */ +/***************************************************************//** +Checks if a space is the doublewrite tablespace. +@return TRUE if doublewrite tablespace */ +UNIV_INLINE +ibool +trx_sys_doublewrite_space( +/*======================*/ + ulint space); /*!< in: space */ /*****************************************************************//** Creates and initializes the central memory structures for the transaction system. This is called when the database is started. */ @@ -138,6 +154,13 @@ void trx_sys_create(void); /*================*/ +/*****************************************************************//** +Creates and initializes the dummy transaction system page for tablespace. */ +UNIV_INTERN +void +trx_sys_dummy_create( +/*=================*/ + ulint space); /****************************************************************//** Looks for a free slot for a rollback segment in the trx system file copy. @return slot index or ULINT_UNDEFINED if not found */ @@ -453,6 +476,8 @@ /* Space id and page no where the trx system file copy resides */ #define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */ +#define TRX_DOUBLEWRITE_SPACE 0xFFFFFFE0UL /* the doublewrite buffer tablespace if used */ +#define TRX_SYS_SPACE_MAX 9 /* reserved max space id for system tablespaces */ #include "fsp0fsp.h" #define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO --- a/storage/innobase/include/trx0sys.ic +++ b/storage/innobase/include/trx0sys.ic @@ -71,6 +71,40 @@ } /***************************************************************//** +Checks if a space is the system tablespaces. +@return TRUE if system tablespace */ +UNIV_INLINE +ibool +trx_sys_sys_space( +/*==============*/ + ulint space) /*!< in: space */ +{ + if (srv_doublewrite_file) { + /* several spaces are reserved */ + return((ibool)(space == TRX_SYS_SPACE || space == TRX_DOUBLEWRITE_SPACE)); + } else { + return((ibool)(space == TRX_SYS_SPACE)); + } +} + +/***************************************************************//** +Checks if a space is the doublewrite tablespace. +@return TRUE if doublewrite tablespace */ +UNIV_INLINE +ibool +trx_sys_doublewrite_space( +/*======================*/ + ulint space) /*!< in: space */ +{ + if (srv_doublewrite_file) { + /* doublewrite buffer is separated */ + return((ibool)(space == TRX_DOUBLEWRITE_SPACE)); + } else { + return((ibool)(space == TRX_SYS_SPACE)); + } +} + +/***************************************************************//** Gets the pointer in the nth slot of the rseg array. @return pointer to rseg object, NULL if slot not in use */ UNIV_INLINE --- a/storage/innobase/row/row0mysql.c +++ b/storage/innobase/row/row0mysql.c @@ -3436,7 +3436,7 @@ /* Do not drop possible .ibd tablespace if something went wrong: we do not want to delete valuable data of the user */ - if (err == DB_SUCCESS && space_id > 0) { + if (err == DB_SUCCESS && !trx_sys_sys_space(space_id)) { if (!fil_space_for_table_exists_in_mem(space_id, name_or_path, is_temp, FALSE, --- a/storage/innobase/srv/srv0srv.c +++ b/storage/innobase/srv/srv0srv.c @@ -163,6 +163,8 @@ /* size in database pages */ UNIV_INTERN ulint* srv_data_file_sizes = NULL; +UNIV_INTERN char* srv_doublewrite_file = NULL; + UNIV_INTERN ibool srv_recovery_stats = FALSE; /* if TRUE, then we auto-extend the last data file */ --- a/storage/innobase/srv/srv0start.c +++ b/storage/innobase/srv/srv0start.c @@ -715,6 +715,7 @@ /*======================*/ ibool* create_new_db, /*!< out: TRUE if new database should be created */ + ibool* create_new_doublewrite_file, #ifdef UNIV_LOG_ARCHIVE ulint* min_arch_log_no,/*!< out: min of archived log numbers in data files */ @@ -748,6 +749,7 @@ *sum_of_new_sizes = 0; *create_new_db = FALSE; + *create_new_doublewrite_file = FALSE; srv_normalize_path_for_win(srv_data_home); @@ -1004,6 +1006,142 @@ srv_data_file_is_raw_partition[i] != 0); } + /* special file for doublewrite buffer */ + if (srv_doublewrite_file) + { + srv_normalize_path_for_win(srv_doublewrite_file); + + fprintf(stderr, + "InnoDB: Note: The innodb_doublewrite_file option has been specified.\n" + "InnoDB: This option is for experts only. Don't use it unless you understand WELL what it is.\n" + "InnoDB: ### Don't specify a file older than the last checkpoint. ###\n" + "InnoDB: Otherwise, the older doublewrite buffer will break your data during recovery!\n"); + + strcpy(name, srv_doublewrite_file); + + /* First we try to create the file: if it already + exists, ret will get value FALSE */ + + files[i] = os_file_create(innodb_file_data_key, name, OS_FILE_CREATE, + OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + + if (ret == FALSE && os_file_get_last_error(FALSE) + != OS_FILE_ALREADY_EXISTS +#ifdef UNIV_AIX + /* AIX 5.1 after security patch ML7 may have + errno set to 0 here, which causes our function + to return 100; work around that AIX problem */ + && os_file_get_last_error(FALSE) != 100 +#endif + ) { + fprintf(stderr, + "InnoDB: Error in creating" + " or opening %s\n", + name); + + return(DB_ERROR); + } + + if (ret == FALSE) { + /* We open the data file */ + + files[i] = os_file_create(innodb_file_data_key, + name, OS_FILE_OPEN, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + + if (!ret) { + fprintf(stderr, + "InnoDB: Error in opening %s\n", name); + os_file_get_last_error(TRUE); + + return(DB_ERROR); + } + + ret = os_file_get_size(files[i], &size, &size_high); + ut_a(ret); + /* Round size downward to megabytes */ + + rounded_size_pages + = (size / (1024 * 1024) + 4096 * size_high) + << (20 - UNIV_PAGE_SIZE_SHIFT); + + if (rounded_size_pages != TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9) { + + fprintf(stderr, + "InnoDB: Warning: doublewrite buffer file %s" + " is of a different size\n" + "InnoDB: %lu pages" + " (rounded down to MB)\n" + "InnoDB: than intended size" + " %lu pages...\n", + name, + (ulong) rounded_size_pages, + (ulong) TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9); + } + + fil_read_first_page( + files[i], one_opened, &flags, +#ifdef UNIV_LOG_ARCHIVE + min_arch_log_no, max_arch_log_no, +#endif /* UNIV_LOG_ARCHIVE */ + min_flushed_lsn, max_flushed_lsn); + one_opened = TRUE; + } else { + /* We created the data file and now write it full of + zeros */ + + *create_new_doublewrite_file = TRUE; + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Doublewrite buffer file %s did not" + " exist. It will be be created.\n", + name); + + if (*create_new_db == FALSE) { + fprintf(stderr, + "InnoDB: Notice: Previous version's ibdata files may cause crash.\n" + " If you use that, please use the ibdata files of this version.\n"); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Setting file %s size to %lu MB\n", + name, + (ulong) ((TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9) + >> (20 - UNIV_PAGE_SIZE_SHIFT))); + + fprintf(stderr, + "InnoDB: Database physically writes the" + " file full: wait...\n"); + + ret = os_file_set_size( + name, files[i], + srv_calc_low32(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9), + srv_calc_high32(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9)); + + if (!ret) { + fprintf(stderr, + "InnoDB: Error in creating %s:" + " probably out of disk space\n", name); + + return(DB_ERROR); + } + } + + ret = os_file_close(files[i]); + ut_a(ret); + + fil_space_create(name, TRX_DOUBLEWRITE_SPACE, 0, FIL_TABLESPACE); + + ut_a(fil_validate()); + + fil_node_create(name, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, TRX_DOUBLEWRITE_SPACE, FALSE); + + i++; + } + return(DB_SUCCESS); } @@ -1017,6 +1155,7 @@ /*====================================*/ { ibool create_new_db; + ibool create_new_doublewrite_file; ibool log_file_created; ibool log_created = FALSE; ibool log_opened = FALSE; @@ -1482,6 +1621,7 @@ } err = open_or_create_data_files(&create_new_db, + &create_new_doublewrite_file, #ifdef UNIV_LOG_ARCHIVE &min_arch_log_no, &max_arch_log_no, #endif /* UNIV_LOG_ARCHIVE */ @@ -1649,6 +1789,14 @@ after the double write buffer has been created. */ trx_sys_create(); + if (create_new_doublewrite_file) { + mtr_start(&mtr); + fsp_header_init(TRX_DOUBLEWRITE_SPACE, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, &mtr); + mtr_commit(&mtr); + + trx_sys_dummy_create(TRX_DOUBLEWRITE_SPACE); + } + dict_create(); srv_startup_is_before_trx_rollback_phase = FALSE; @@ -1682,6 +1830,13 @@ recv_recovery_from_archive_finish(); #endif /* UNIV_LOG_ARCHIVE */ } else { + char* save_srv_doublewrite_file = NULL; + + if (create_new_doublewrite_file) { + /* doublewrite_file cannot be used for recovery yet. */ + save_srv_doublewrite_file = srv_doublewrite_file; + srv_doublewrite_file = NULL; + } /* Check if we support the max format that is stamped on the system tablespace. @@ -1768,6 +1923,17 @@ we have finished the recovery process so that the image of TRX_SYS_PAGE_NO is not stale. */ trx_sys_file_format_tag_init(); + + if (create_new_doublewrite_file) { + /* restore the value */ + srv_doublewrite_file = save_srv_doublewrite_file; + + mtr_start(&mtr); + fsp_header_init(TRX_DOUBLEWRITE_SPACE, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, &mtr); + mtr_commit(&mtr); + + trx_sys_dummy_create(TRX_DOUBLEWRITE_SPACE); + } } if (!create_new_db && sum_of_new_sizes > 0) { --- a/storage/innobase/trx/trx0sys.c +++ b/storage/innobase/trx/trx0sys.c @@ -415,6 +415,152 @@ goto start_again; } + + if (srv_doublewrite_file) { + /* the same doublewrite buffer to TRX_SYS_SPACE should exist. + check and create if not exist.*/ + + mtr_start(&mtr); + trx_doublewrite_buf_is_being_created = TRUE; + + block = buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, TRX_SYS_PAGE_NO, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE; + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) + == TRX_SYS_DOUBLEWRITE_MAGIC_N) { + /* The doublewrite buffer has already been created: + just read in some numbers */ + + mtr_commit(&mtr); + } else { + fprintf(stderr, + "InnoDB: Doublewrite buffer not found in the doublewrite file:" + " creating new doublewrite buffer.\n"); + + if (buf_pool_get_curr_size() + < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2 + 100) + * UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Cannot create the doublewrite buffer:" + " You must\n" + "InnoDB: increase your buffer pool size.\n" + "InnoDB: Cannot continue processing.\n"); + + exit(1); + } + + block2 = fseg_create(TRX_DOUBLEWRITE_SPACE, TRX_SYS_PAGE_NO, + TRX_SYS_DOUBLEWRITE + + TRX_SYS_DOUBLEWRITE_FSEG, &mtr); + + /* fseg_create acquires a second latch on the page, + therefore we must declare it: */ + + buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK); + + if (block2 == NULL) { + fprintf(stderr, + "InnoDB: Cannot create the doublewrite buffer:" + " You must\n" + "InnoDB: increase your tablespace size.\n" + "InnoDB: Cannot continue processing.\n"); + + /* We exit without committing the mtr to prevent + its modifications to the database getting to disk */ + + exit(1); + } + + fseg_header = buf_block_get_frame(block) + + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG; + prev_page_no = 0; + + for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2; i++) { + page_no = fseg_alloc_free_page(fseg_header, + prev_page_no + 1, + FSP_UP, &mtr); + if (page_no == FIL_NULL) { + fprintf(stderr, + "InnoDB: Cannot create the doublewrite" + " buffer: You must\n" + "InnoDB: increase your" + " tablespace size.\n" + "InnoDB: Cannot continue operation.\n" + ); + + exit(1); + } + + /* We read the allocated pages to the buffer pool; + when they are written to disk in a flush, the space + id and page number fields are also written to the + pages. When we at database startup read pages + from the doublewrite buffer, we know that if the + space id and page number in them are the same as + the page position in the tablespace, then the page + has not been written to in doublewrite. */ + +#ifdef UNIV_SYNC_DEBUG + new_block = +#endif /* UNIV_SYNC_DEBUG */ + buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(new_block, + SYNC_NO_ORDER_CHECK); + + if (i == FSP_EXTENT_SIZE / 2) { + ut_a(page_no == FSP_EXTENT_SIZE); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + } else if (i == FSP_EXTENT_SIZE / 2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + ut_a(page_no == 2 * FSP_EXTENT_SIZE); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + } else if (i > FSP_EXTENT_SIZE / 2) { + ut_a(page_no == prev_page_no + 1); + } + + prev_page_no = page_no; + } + + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC, + TRX_SYS_DOUBLEWRITE_MAGIC_N, + MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC + + TRX_SYS_DOUBLEWRITE_REPEAT, + TRX_SYS_DOUBLEWRITE_MAGIC_N, + MLOG_4BYTES, &mtr); + + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, + MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + + /* Flush the modified pages to disk and make a checkpoint */ + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + + fprintf(stderr, "InnoDB: Doublewrite buffer created in the doublewrite file\n"); + trx_sys_multiple_tablespace_format = TRUE; + } + trx_doublewrite_buf_is_being_created = FALSE; + } } /****************************************************************//** @@ -438,10 +584,19 @@ ulint source_page_no; byte* page; byte* doublewrite; + ulint doublewrite_space_id; ulint space_id; ulint page_no; ulint i; + doublewrite_space_id = (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE); + + if (srv_doublewrite_file) { + fprintf(stderr, + "InnoDB: doublewrite file '%s' is used.\n", + srv_doublewrite_file); + } + /* We do the file i/o past the buffer pool */ unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE); @@ -450,7 +605,7 @@ /* Read the trx sys header to check if we are using the doublewrite buffer */ - fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0, + fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, TRX_SYS_PAGE_NO, 0, UNIV_PAGE_SIZE, read_buf, NULL); doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; @@ -488,10 +643,10 @@ /* Read the pages from the doublewrite buffer to memory */ - fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block1, 0, + fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block1, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, buf, NULL); - fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block2, 0, + fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block2, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, NULL); @@ -547,7 +702,8 @@ " doublewrite buf.\n", (ulong) space_id, (ulong) page_no, (ulong) i); - } else if (space_id == TRX_SYS_SPACE + } else if ((space_id == TRX_SYS_SPACE + || (srv_doublewrite_file && space_id == TRX_DOUBLEWRITE_SPACE)) && ((page_no >= block1 && page_no < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) @@ -1016,6 +1172,83 @@ } /*****************************************************************//** +Creates dummy of the file page for the transaction system. */ +static +void +trx_sysf_dummy_create( +/*==================*/ + ulint space, + mtr_t* mtr) +{ + buf_block_t* block; + page_t* page; + + ut_ad(mtr); + + /* Note that below we first reserve the file space x-latch, and + then enter the kernel: we must do it in this order to conform + to the latching order rules. */ + + mtr_x_lock(fil_space_get_latch(space, NULL), mtr); + mutex_enter(&kernel_mutex); + + /* Create the trx sys file block in a new allocated file segment */ + block = fseg_create(space, 0, TRX_SYS + TRX_SYS_FSEG_HEADER, + mtr); + buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); + + fprintf(stderr, "%lu\n", buf_block_get_page_no(block)); + ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO); + + page = buf_block_get_frame(block); + + mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS, + MLOG_2BYTES, mtr); + + /* Reset the doublewrite buffer magic number to zero so that we + know that the doublewrite buffer has not yet been created (this + suppresses a Valgrind warning) */ + + mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE + + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr); + +#ifdef UNDEFINED + /* TODO: REMOVE IT: The bellow is not needed, I think */ + sys_header = trx_sysf_get(mtr); + + /* Start counting transaction ids from number 1 up */ + mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE, + ut_dulint_create(0, 1), mtr); + + /* Reset the rollback segment slots */ + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + + trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr); + trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr); + } + + /* The remaining area (up to the page trailer) is uninitialized. + Silence Valgrind warnings about it. */ + UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS + + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_SPACE), + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + - (TRX_SYS_RSEGS + + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_SPACE)) + + page - sys_header); + + /* Create the first rollback segment in the SYSTEM tablespace */ + page_no = trx_rseg_header_create(space, 0, ULINT_MAX, &slot_no, + mtr); + ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID); + ut_a(page_no != FIL_NULL); +#endif + + mutex_exit(&kernel_mutex); +} + +/*****************************************************************//** Creates and initializes the central memory structures for the transaction system. This is called when the database is started. */ UNIV_INTERN @@ -1387,6 +1620,26 @@ /* Does nothing at the moment */ } +/*****************************************************************//** +Creates and initializes the dummy transaction system page for tablespace. */ +UNIV_INTERN +void +trx_sys_dummy_create( +/*=================*/ + ulint space) +{ + mtr_t mtr; + + /* This function is only for doublewrite file for now */ + ut_a(space == TRX_DOUBLEWRITE_SPACE); + + mtr_start(&mtr); + + trx_sysf_dummy_create(space, &mtr); + + mtr_commit(&mtr); +} + /********************************************************************* Creates the rollback segments */ UNIV_INTERN --- /dev/null +++ b/mysql-test/r/percona_innodb_doublewrite_file.result @@ -0,0 +1,4 @@ +show variables like 'innodb_doublewrite%'; +Variable_name Value +innodb_doublewrite ON +innodb_doublewrite_file ib_doublewrite --- /dev/null +++ b/mysql-test/t/percona_innodb_doublewrite_file-master.opt @@ -0,0 +1 @@ +--innodb_doublewrite_file=ib_doublewrite --- /dev/null +++ b/mysql-test/t/percona_innodb_doublewrite_file.test @@ -0,0 +1,2 @@ +--source include/have_innodb.inc +show variables like 'innodb_doublewrite%';