# name : innodb_lru_dump_restore.patch # introduced : 11 or before # maintainer : Yasufumi # #!!! notice !!! # Any small change to this file in the main branch # should be done or reviewed by the maintainer! --- a/storage/innobase/buf/buf0lru.c +++ b/storage/innobase/buf/buf0lru.c @@ -2197,6 +2197,289 @@ memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur); } +/********************************************************************//** +Dump the LRU page list to the specific file. */ +#define LRU_DUMP_FILE "ib_lru_dump" + +UNIV_INTERN +ibool +buf_LRU_file_dump(void) +/*===================*/ +{ + os_file_t dump_file = -1; + ibool success; + byte* buffer_base = NULL; + byte* buffer = NULL; + buf_page_t* bpage; + ulint buffers; + ulint offset; + ibool ret = FALSE; + ulint i; + + for (i = 0; i < srv_n_data_files; i++) { + if (strstr(srv_data_file_names[i], LRU_DUMP_FILE) != NULL) { + fprintf(stderr, + " InnoDB: The name '%s' seems to be used for" + " innodb_data_file_path. For safety, dumping of the LRU list" + " is not being done.\n", LRU_DUMP_FILE); + goto end; + } + } + + buffer_base = ut_malloc(2 * UNIV_PAGE_SIZE); + buffer = ut_align(buffer_base, UNIV_PAGE_SIZE); + if (!buffer) { + fprintf(stderr, + " InnoDB: cannot allocate buffer.\n"); + goto end; + } + + dump_file = os_file_create(innodb_file_temp_key, LRU_DUMP_FILE, OS_FILE_OVERWRITE, + OS_FILE_NORMAL, OS_DATA_FILE, &success); + if (!success) { + os_file_get_last_error(TRUE); + fprintf(stderr, + " InnoDB: cannot open %s\n", LRU_DUMP_FILE); + goto end; + } + + buffers = offset = 0; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + mutex_enter(&buf_pool->LRU_list_mutex); + bpage = UT_LIST_GET_LAST(buf_pool->LRU); + + while (bpage != NULL) { + if (offset == 0) { + memset(buffer, 0, UNIV_PAGE_SIZE); + } + + mach_write_to_4(buffer + offset * 4, bpage->space); + offset++; + mach_write_to_4(buffer + offset * 4, bpage->offset); + offset++; + + if 
(offset == UNIV_PAGE_SIZE/4) { + success = os_file_write(LRU_DUMP_FILE, dump_file, buffer, + (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL, + (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)), + UNIV_PAGE_SIZE); + if (!success) { + mutex_exit(&buf_pool->LRU_list_mutex); + fprintf(stderr, + " InnoDB: cannot write page %lu of %s\n", + buffers, LRU_DUMP_FILE); + goto end; + } + buffers++; + offset = 0; + } + + bpage = UT_LIST_GET_PREV(LRU, bpage); + } + mutex_exit(&buf_pool->LRU_list_mutex); + } + + if (offset == 0) { + memset(buffer, 0, UNIV_PAGE_SIZE); + } + + mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL); + offset++; + mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL); + offset++; + + success = os_file_write(LRU_DUMP_FILE, dump_file, buffer, + (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL, + (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)), + UNIV_PAGE_SIZE); + if (!success) { + goto end; + } + + ret = TRUE; +end: + if (dump_file != -1) + os_file_close(dump_file); + if (buffer_base) + ut_free(buffer_base); + + return(ret); +} + +typedef struct { + ib_uint32_t space_id; + ib_uint32_t page_no; +} dump_record_t; + +static int dump_record_cmp(const void *a, const void *b) +{ + const dump_record_t *rec1 = (dump_record_t *) a; + const dump_record_t *rec2 = (dump_record_t *) b; + + if (rec1->space_id < rec2->space_id) + return -1; + if (rec1->space_id > rec2->space_id) + return 1; + if (rec1->page_no < rec2->page_no) + return -1; + return rec1->page_no > rec2->page_no; +} + +/********************************************************************//** +Read the pages based on the specific file.*/ +UNIV_INTERN +ibool +buf_LRU_file_restore(void) +/*======================*/ +{ + os_file_t dump_file = -1; + ibool success; + byte* buffer_base = NULL; + byte* buffer = NULL; + ulint buffers; + ulint offset; + ulint reads = 0; + ulint req = 0; + ibool terminated = FALSE; + ibool ret = FALSE; + dump_record_t* records = NULL; + ulint size; + ulint size_high; + ulint length; + + dump_file = 
os_file_create_simple_no_error_handling(innodb_file_temp_key, + LRU_DUMP_FILE, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success); + if (!success || !os_file_get_size(dump_file, &size, &size_high)) { + os_file_get_last_error(TRUE); + fprintf(stderr, + " InnoDB: cannot open %s\n", LRU_DUMP_FILE); + goto end; + } + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Restoring buffer pool pages from %s\n", + LRU_DUMP_FILE); + + if (size == 0 || size_high > 0 || size % 8) { + fprintf(stderr, " InnoDB: broken LRU dump file\n"); + goto end; + } + buffer_base = ut_malloc(2 * UNIV_PAGE_SIZE); + buffer = ut_align(buffer_base, UNIV_PAGE_SIZE); + records = ut_malloc(size); + if (!buffer || !records) { + fprintf(stderr, + " InnoDB: cannot allocate buffer.\n"); + goto end; + } + + buffers = 0; + length = 0; + while (!terminated) { + success = os_file_read(dump_file, buffer, + (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL, + (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)), + UNIV_PAGE_SIZE); + if (!success) { + fprintf(stderr, + " InnoDB: either could not read page %lu of %s," + " or terminated unexpectedly.\n", + buffers, LRU_DUMP_FILE); + goto end; + } + + for (offset = 0; offset < UNIV_PAGE_SIZE/4; offset += 2) { + ulint space_id; + ulint page_no; + + space_id = mach_read_from_4(buffer + offset * 4); + page_no = mach_read_from_4(buffer + (offset + 1) * 4); + if (space_id == 0xFFFFFFFFUL + || page_no == 0xFFFFFFFFUL) { + terminated = TRUE; + break; + } + + records[length].space_id = space_id; + records[length].page_no = page_no; + length++; + if (length * 8 >= size) { + fprintf(stderr, + " InnoDB: could not find the " + "end-of-file marker after reading " + "the expected %lu bytes from the " + "LRU dump file.\n" + " InnoDB: this could be caused by a " + "broken or incomplete file.\n" + " InnoDB: trying to process what has " + "been read so far.\n", + size); + terminated= TRUE; + break; + } + } + buffers++; + } + + qsort(records, length, sizeof(dump_record_t), dump_record_cmp); + + 
for (offset = 0; offset < length; offset++) { + ulint space_id; + ulint page_no; + ulint zip_size; + ulint err; + ib_int64_t tablespace_version; + + space_id = records[offset].space_id; + page_no = records[offset].page_no; + + if (offset % 16 == 15) { + os_aio_simulated_wake_handler_threads(); + buf_flush_free_margins(FALSE); + } + + zip_size = fil_space_get_zip_size(space_id); + if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { + continue; + } + + if (fil_is_exist(space_id, page_no)) { + + tablespace_version = fil_space_get_version(space_id); + + req++; + reads += buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE + | OS_AIO_SIMULATED_WAKE_LATER, + space_id, zip_size, TRUE, + tablespace_version, page_no, NULL); + buf_LRU_stat_inc_io(); + } + } + + os_aio_simulated_wake_handler_threads(); + buf_flush_free_margins(FALSE); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Completed reading buffer pool pages" + " (requested: %lu, read: %lu)\n", req, reads); + ret = TRUE; +end: + if (dump_file != -1) + os_file_close(dump_file); + if (buffer_base) + ut_free(buffer_base); + if (records) + ut_free(records); + + return(ret); +} + #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /**********************************************************************//** Validates the LRU list for one buffer pool instance. */ --- a/storage/innobase/buf/buf0rea.c +++ b/storage/innobase/buf/buf0rea.c @@ -60,7 +60,7 @@ which case it is never read into the pool, or if the tablespace does not exist or is being dropped @return 1 if read request is issued. 
0 if it is not */ -static +UNIV_INTERN ulint buf_read_page_low( /*==============*/ --- a/storage/innobase/fil/fil0fil.c +++ b/storage/innobase/fil/fil0fil.c @@ -5307,6 +5307,70 @@ return(DB_SUCCESS); } +/********************************************************************//** +Confirm whether the parameters are valid or not */ +UNIV_INTERN +ibool +fil_is_exist( +/*==============*/ + ulint space_id, /*!< in: space id */ + ulint block_offset) /*!< in: offset in number of blocks */ +{ + fil_space_t* space; + fil_node_t* node; + + /* Reserve the fil_system mutex and make sure that we can open at + least one file while holding it, if the file is not already open */ + + fil_mutex_enter_and_prepare_for_io(space_id); + + space = fil_space_get_by_id(space_id); + + if (!space) { + mutex_exit(&fil_system->mutex); + return(FALSE); + } + + node = UT_LIST_GET_FIRST(space->chain); + + for (;;) { + if (UNIV_UNLIKELY(node == NULL)) { + mutex_exit(&fil_system->mutex); + return(FALSE); + } + + if (space->id != 0 && node->size == 0) { + /* We do not know the size of a single-table tablespace + before we open the file */ + + break; + } + + if (node->size > block_offset) { + /* Found! */ + break; + } else { + block_offset -= node->size; + node = UT_LIST_GET_NEXT(chain, node); + } + } + + /* Open file if closed */ + fil_node_prepare_for_io(node, fil_system, space); + fil_node_complete_io(node, fil_system, OS_FILE_READ); + + /* Check that at least the start offset is within the bounds of a + single-table tablespace */ + if (UNIV_UNLIKELY(node->size <= block_offset) + && space->id != 0 && space->purpose == FIL_TABLESPACE) { + mutex_exit(&fil_system->mutex); + return(FALSE); + } + + mutex_exit(&fil_system->mutex); + return(TRUE); +} + #ifndef UNIV_HOTBACKUP /**********************************************************************//** Waits for an aio operation to complete. 
This function is used to write the --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -196,6 +196,8 @@ static char* innodb_version_str = (char*) INNODB_VERSION_STR; +static my_bool innobase_blocking_lru_restore = FALSE; + /** Possible values for system variable "innodb_stats_method". The values are defined the same as its corresponding MyISAM system variable "myisam_stats_method"(see "myisam_stats_method_names"), for better usability */ @@ -2652,6 +2654,8 @@ srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; srv_use_checksums = (ibool) innobase_use_checksums; + srv_blocking_lru_restore = (ibool) innobase_blocking_lru_restore; + #ifdef HAVE_LARGE_PAGES if ((os_use_large_pages = (ibool) my_use_large_pages)) os_large_page_size = (ulint) opt_large_page_size; @@ -11964,6 +11968,19 @@ "Limit the allocated memory for dictionary cache. (0: unlimited)", NULL, NULL, 0, 0, LONG_MAX, 0); +static MYSQL_SYSVAR_UINT(buffer_pool_restore_at_startup, srv_auto_lru_dump, + PLUGIN_VAR_RQCMDARG, + "Time in seconds between automatic buffer pool dumps. " + "0 (the default) disables automatic dumps.", + NULL, NULL, 0, 0, UINT_MAX32, 0); + +static MYSQL_SYSVAR_BOOL(blocking_buffer_pool_restore, + innobase_blocking_lru_restore, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Block XtraDB startup process until buffer pool is full restored from a " + "dump file (if present). 
Disabled by default.", + NULL, NULL, FALSE); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(autoextend_increment), @@ -12048,6 +12065,8 @@ MYSQL_SYSVAR(random_read_ahead), MYSQL_SYSVAR(read_ahead_threshold), MYSQL_SYSVAR(io_capacity), + MYSQL_SYSVAR(buffer_pool_restore_at_startup), + MYSQL_SYSVAR(blocking_buffer_pool_restore), MYSQL_SYSVAR(purge_threads), MYSQL_SYSVAR(purge_batch_size), MYSQL_SYSVAR(rollback_segments), --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -50,6 +50,7 @@ #include "trx0rseg.h" /* for trx_rseg_struct */ #include "trx0sys.h" /* for trx_sys */ #include "dict0dict.h" /* for dict_sys */ +#include "buf0lru.h" /* for XTRA_LRU_[DUMP/RESTORE] */ } #define OK(expr) \ @@ -4336,6 +4337,36 @@ "Hello!"); goto end_func; } + else if (!strncasecmp("XTRA_LRU_DUMP", ptr, 13)) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Administrative command 'XTRA_LRU_DUMP'" + " was detected.\n"); + + if (buf_LRU_file_dump()) { + field_store_string(i_s_table->field[0], + "XTRA_LRU_DUMP was succeeded."); + } else { + field_store_string(i_s_table->field[0], + "XTRA_LRU_DUMP was failed."); + } + + goto end_func; + } + else if (!strncasecmp("XTRA_LRU_RESTORE", ptr, 16)) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Administrative command 'XTRA_LRU_RESTORE'" + " was detected.\n"); + + if (buf_LRU_file_restore()) { + field_store_string(i_s_table->field[0], + "XTRA_LRU_RESTORE was succeeded."); + } else { + field_store_string(i_s_table->field[0], + "XTRA_LRU_RESTORE was failed."); + } + + goto end_func; + } field_store_string(i_s_table->field[0], "Undefined XTRA_* command."); --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -204,6 +204,18 @@ void buf_LRU_stat_update(void); /*=====================*/ +/********************************************************************//** +Dump the LRU page list to the specific file. 
*/ +UNIV_INTERN +ibool +buf_LRU_file_dump(void); +/*===================*/ +/********************************************************************//** +Read the pages based on the specific file.*/ +UNIV_INTERN +ibool +buf_LRU_file_restore(void); +/*======================*/ /******************************************************************//** Remove one page from LRU list and put it to free list */ --- a/storage/innobase/include/buf0rea.h +++ b/storage/innobase/include/buf0rea.h @@ -31,6 +31,37 @@ #include "buf0types.h" /********************************************************************//** +Low-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there, in which case does nothing. +Sets the io_fix flag and sets an exclusive lock on the buffer frame. The +flag is cleared and the x-lock released by an i/o-handler thread. +@return 1 if a read request was queued, 0 if the page already resided +in buf_pool, or if the page is in the doublewrite buffer blocks in +which case it is never read into the pool, or if the tablespace does +not exist or is being dropped +@return 1 if read request is issued. 
0 if it is not */ +UNIV_INTERN +ulint +buf_read_page_low( +/*==============*/ + ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are + trying to read from a non-existent tablespace, or a + tablespace which is just now being dropped */ + ibool sync, /*!< in: TRUE if synchronous aio is desired */ + ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ..., + ORed to OS_AIO_SIMULATED_WAKE_LATER (see below + at read-ahead functions) */ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size, or 0 */ + ibool unzip, /*!< in: TRUE=request uncompressed page */ + ib_int64_t tablespace_version, /*!< in: if the space memory object has + this timestamp different from what we are giving here, + treat the tablespace as dropped; this is a timestamp we + use to stop dangling page reads from a tablespace + which we have DISCARDed + IMPORTed back */ + ulint offset, /*!< in: page number */ + trx_t* trx); +/********************************************************************//** High-level function which reads a page asynchronously from a file to the buffer buf_pool if it is not already there. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -653,6 +653,14 @@ void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ trx_t* trx); +/********************************************************************//** +Confirm whether the parameters are valid or not */ +UNIV_INTERN +ibool +fil_is_exist( +/*==============*/ + ulint space_id, /*!< in: space id */ + ulint block_offset); /*!< in: offset in number of blocks */ /**********************************************************************//** Waits for an aio operation to complete. This function is used to write the handler for completed requests. 
The aio array of pending requests is divided --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -364,6 +364,12 @@ reading of a disk page */ extern ulint srv_buf_pool_reads; +/** Time in seconds between automatic buffer pool dumps */ +extern uint srv_auto_lru_dump; + +/** Whether startup should be blocked until buffer pool is fully restored */ +extern ibool srv_blocking_lru_restore; + /** Status variables to be passed to MySQL */ typedef struct export_var_struct export_struc; @@ -669,6 +675,16 @@ /*=====================*/ void* arg); /*!< in: a dummy parameter required by os_thread_create */ +/*********************************************************************//** +A thread which restores the buffer pool from a dump file on startup and does +periodic buffer pool dumps. +@return a dummy parameter */ +UNIV_INTERN +os_thread_ret_t +srv_LRU_dump_restore_thread( +/*====================*/ + void* arg); /*!< in: a dummy parameter required by + os_thread_create */ /******************************************************************//** Outputs to a file the output of the InnoDB Monitor. @return FALSE if not all information printed --- a/storage/innobase/srv/srv0srv.c +++ b/storage/innobase/srv/srv0srv.c @@ -332,6 +332,12 @@ reading of a disk page */ UNIV_INTERN ulint srv_buf_pool_reads = 0; +/** Time in seconds between automatic buffer pool dumps */ +UNIV_INTERN uint srv_auto_lru_dump = 0; + +/** Whether startup should be blocked until buffer pool is fully restored */ +UNIV_INTERN ibool srv_blocking_lru_restore; + /* structure to pass status variables to MySQL */ UNIV_INTERN export_struc export_vars; @@ -2708,6 +2714,58 @@ /* We count the number of threads in os_thread_exit(). A created thread should always use that to exit and not use return() to exit. 
*/ + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} + +/*********************************************************************//** +A thread which restores the buffer pool from a dump file on startup and does +periodic buffer pool dumps. +@return a dummy parameter */ +UNIV_INTERN +os_thread_ret_t +srv_LRU_dump_restore_thread( +/*====================*/ + void* arg __attribute__((unused))) + /*!< in: a dummy parameter required by + os_thread_create */ +{ + uint auto_lru_dump; + time_t last_dump_time; + time_t time_elapsed; + +#ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "The LRU dump/restore thread has started, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif + + /* If srv_blocking_lru_restore is TRUE, restore will be done + synchronously on startup. */ + if (srv_auto_lru_dump && !srv_blocking_lru_restore) + buf_LRU_file_restore(); + + last_dump_time = time(NULL); + +loop: + os_thread_sleep(5000000); + + if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) { + goto exit_func; + } + + time_elapsed = time(NULL) - last_dump_time; + auto_lru_dump = srv_auto_lru_dump; + if (auto_lru_dump > 0 && (time_t) auto_lru_dump < time_elapsed) { + last_dump_time = time(NULL); + buf_LRU_file_dump(); + } + + goto loop; +exit_func: + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. 
*/ + os_thread_exit(NULL); OS_THREAD_DUMMY_RETURN; --- a/storage/innobase/srv/srv0start.c +++ b/storage/innobase/srv/srv0start.c @@ -87,6 +87,7 @@ # include "btr0pcur.h" # include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */ # include "zlib.h" /* for ZLIB_VERSION */ +# include "buf0lru.h" /* for buf_LRU_file_restore() */ /** Log sequence number immediately after startup */ UNIV_INTERN ib_uint64_t srv_start_lsn; @@ -120,9 +121,9 @@ static os_file_t files[1000]; /** io_handler_thread parameters for thread identification */ -static ulint n[SRV_MAX_N_IO_THREADS + 6]; +static ulint n[SRV_MAX_N_IO_THREADS + 7]; /** io_handler_thread identifiers */ -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6]; +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 7]; /** We use this mutex to test the return value of pthread_mutex_trylock on successful locking. HP-UX does NOT return 0, though Linux et al do. */ @@ -1841,6 +1842,15 @@ os_thread_create(&srv_monitor_thread, NULL, thread_ids + 4 + SRV_MAX_N_IO_THREADS); + /* Create the thread which automatically dumps/restores the buffer pool */ + os_thread_create(&srv_LRU_dump_restore_thread, NULL, + thread_ids + 5 + SRV_MAX_N_IO_THREADS); + + /* If srv_blocking_lru_restore is TRUE, load buffer pool contents + synchronously */ + if (srv_auto_lru_dump && srv_blocking_lru_restore) + buf_LRU_file_restore(); + srv_is_being_started = FALSE; err = dict_create_or_check_foreign_constraint_tables(); --- /dev/null +++ b/mysql-test/suite/sys_vars/r/innodb_blocking_buffer_pool_restore_basic.result @@ -0,0 +1,3 @@ +SELECT @@global.innodb_blocking_buffer_pool_restore; +@@global.innodb_blocking_buffer_pool_restore +0 --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_blocking_buffer_pool_restore_basic.test @@ -0,0 +1 @@ +SELECT @@global.innodb_blocking_buffer_pool_restore;