# name : innodb_expand_import.patch # introduced : 11 or before # maintainer : Yasufumi # #!!! notice !!! # Any small change to this file in the main branch # should be done or reviewed by the maintainer! --- a/storage/innobase/btr/btr0btr.c +++ b/storage/innobase/btr/btr0btr.c @@ -838,7 +838,7 @@ /**************************************************************//** Creates a new index page (not the root, and also not used in page reorganization). @see btr_page_empty(). */ -static +UNIV_INTERN void btr_page_create( /*============*/ @@ -1712,7 +1712,7 @@ #ifndef UNIV_HOTBACKUP /*************************************************************//** Empties an index page. @see btr_page_create(). */ -static +UNIV_INTERN void btr_page_empty( /*===========*/ @@ -2274,7 +2274,7 @@ /**************************************************************//** Attaches the halves of an index page on the appropriate level in an index tree. */ -static +UNIV_INTERN void btr_attach_half_pages( /*==================*/ --- a/storage/innobase/fil/fil0fil.c +++ b/storage/innobase/fil/fil0fil.c @@ -40,6 +40,14 @@ #include "dict0dict.h" #include "page0page.h" #include "page0zip.h" +#include "trx0trx.h" +#include "trx0sys.h" +#include "pars0pars.h" +#include "row0mysql.h" +#include "row0row.h" +#include "que0que.h" +#include "btr0btr.h" +#include "btr0sea.h" #ifndef UNIV_HOTBACKUP # include "buf0lru.h" # include "ibuf0ibuf.h" @@ -3041,6 +3049,84 @@ } /********************************************************************//** +Checks if a page is corrupt. 
(for offline page) +@return TRUE if the page fails one of the consistency checks below */ +static +ibool +fil_page_buf_page_is_corrupted_offline( +/*===================================*/ + const byte* page, /*!< in: a database page */ + ulint zip_size) /*!< in: size of compressed page; + 0 for uncompressed pages */ +{ + ulint checksum_field; + ulint old_checksum_field; + + if (!zip_size + && memcmp(page + FIL_PAGE_LSN + 4, + page + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { /* uncompressed page: the 4 bytes at FIL_PAGE_LSN + 4 must equal the 4 bytes stored at the end of the page */ + return(TRUE); + } + + checksum_field = mach_read_from_4(page + + FIL_PAGE_SPACE_OR_CHKSUM); + + if (zip_size) { /* compressed page: only this one checksum field is checked */ + return(checksum_field != BUF_NO_CHECKSUM_MAGIC + && checksum_field + != page_zip_calc_checksum(page, zip_size)); + } + + old_checksum_field = mach_read_from_4( + page + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM); + + if (old_checksum_field != mach_read_from_4(page + + FIL_PAGE_LSN) + && old_checksum_field != BUF_NO_CHECKSUM_MAGIC + && old_checksum_field + != buf_calc_page_old_checksum(page)) { /* the trailing field may hold the value at FIL_PAGE_LSN, the magic, or the old-style checksum */ + return(TRUE); + } + + if (checksum_field != 0 + && checksum_field != BUF_NO_CHECKSUM_MAGIC + && checksum_field + != buf_calc_page_new_checksum(page)) { /* the leading field may hold 0, the magic, or the new-style checksum */ + return(TRUE); + } + + return(FALSE); +} + +/********************************************************************//** +Writes the checksum field(s) of a page: the calculated checksum when srv_use_checksums is set, BUF_NO_CHECKSUM_MAGIC otherwise. 
*/ +static +void +fil_page_buf_page_store_checksum( +/*=============================*/ + byte* page, /*!< in/out: page whose checksum field(s) are written */ + ulint zip_size) /*!< in: size of compressed page; 0 for uncompressed pages */ +{ + if (!zip_size) { /* uncompressed page: both the leading and the trailing checksum field are written */ + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + srv_use_checksums + ? buf_calc_page_new_checksum(page) + : BUF_NO_CHECKSUM_MAGIC); + mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + srv_use_checksums + ? buf_calc_page_old_checksum(page) + : BUF_NO_CHECKSUM_MAGIC); + } else { /* compressed page: only the leading checksum field is written */ + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + srv_use_checksums + ? page_zip_calc_checksum(page, zip_size) + : BUF_NO_CHECKSUM_MAGIC); + } +} + +/********************************************************************//** Tries to open a single-table tablespace and optionally checks the space id is right in it. 
If does not succeed, prints an error message to the .err log. This function is used to open a tablespace when we start up mysqld, and also in @@ -3063,8 +3149,11 @@ accessing the first page of the file */ ulint id, /*!< in: space id */ ulint flags, /*!< in: tablespace flags */ - const char* name) /*!< in: table name in the + const char* name, /*!< in: table name in the databasename/tablename format */ + trx_t* trx) /*!< in: transaction. This is only used + for IMPORT TABLESPACE, must be NULL + otherwise */ { os_file_t file; char* filepath; @@ -3087,7 +3176,7 @@ file = os_file_create_simple_no_error_handling( innodb_file_data_key, filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &success); + OS_FILE_READ_WRITE, &success); if (!success) { /* The following call prints an error message */ os_file_get_last_error(TRUE); @@ -3134,6 +3223,472 @@ space_id = fsp_header_get_space_id(page); space_flags = fsp_header_get_flags(page); + if (srv_expand_import) { + + ibool file_is_corrupt = FALSE; + byte* buf3; + byte* descr_page; + ibool descr_is_corrupt = FALSE; + index_id_t old_id[31]; + index_id_t new_id[31]; + ulint root_page[31]; + ulint n_index; + os_file_t info_file = -1; + char* info_file_path; + ulint i; + int len; + ib_uint64_t current_lsn; + ulint size_low, size_high, size, free_limit; + ib_int64_t size_bytes, free_limit_bytes; + dict_table_t* table; + dict_index_t* index; + fil_system_t* system; + fil_node_t* node = NULL; + fil_space_t* space; + ulint zip_size; + + buf3 = ut_malloc(2 * UNIV_PAGE_SIZE); + descr_page = ut_align(buf3, UNIV_PAGE_SIZE); + + current_lsn = log_get_lsn(); + + /* check the header page's consistency */ + if (buf_page_is_corrupted(page, + dict_table_flags_to_zip_size(space_flags))) { + fprintf(stderr, "InnoDB: page 0 of %s seems corrupt.\n", filepath); + file_is_corrupt = TRUE; + descr_is_corrupt = TRUE; + } + + /* store as first descr page */ + memcpy(descr_page, page, UNIV_PAGE_SIZE); + + zip_size = dict_table_flags_to_zip_size(flags); + 
ut_a(zip_size == dict_table_flags_to_zip_size(space_flags)); + + /* get free limit (page number) of the table space */ +/* these should be same to the definition in fsp0fsp.c */ +#define FSP_HEADER_OFFSET FIL_PAGE_DATA +#define FSP_FREE_LIMIT 12 + free_limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + page); + free_limit_bytes = (ib_int64_t)free_limit * (ib_int64_t)(zip_size ? zip_size : UNIV_PAGE_SIZE); + + /* overwrite fsp header */ + fsp_header_init_fields(page, id, flags); + mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id); + space_id = id; + space_flags = flags; + if (mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN) > current_lsn) + mach_write_to_8(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn); + + fil_page_buf_page_store_checksum(page, zip_size); + + success = os_file_write(filepath, file, page, 0, 0, UNIV_PAGE_SIZE); + + /* get file size */ + os_file_get_size(file, &size_low, &size_high); + size_bytes = (((ib_int64_t)size_high) << 32) + + (ib_int64_t)size_low; + + if (size_bytes < free_limit_bytes) { + free_limit_bytes = size_bytes; + if (size_bytes >= (lint)FSP_EXTENT_SIZE * (lint)(zip_size ? 
zip_size : UNIV_PAGE_SIZE)) { + fprintf(stderr, "InnoDB: free limit of %s is larger than its real size.\n", filepath); + file_is_corrupt = TRUE; + } + } + + /* get clustered index information */ + table = dict_table_get_low(name); + index = dict_table_get_first_index(table); + ut_a(index->page==3); + + /* read metadata from .exp file (a header, then one 512-byte slot per index) */ + n_index = 0; + memset(old_id, 0, sizeof(old_id)); + memset(new_id, 0, sizeof(new_id)); + memset(root_page, 0, sizeof(root_page)); + + info_file_path = fil_make_ibd_name(name, FALSE); + len = strlen(info_file_path); + info_file_path[len - 3] = 'e'; + info_file_path[len - 2] = 'x'; + info_file_path[len - 1] = 'p'; + + info_file = os_file_create_simple_no_error_handling(innodb_file_data_key, + info_file_path, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success); + if (!success) { + fprintf(stderr, "InnoDB: Cannot open the file: %s\n", info_file_path); + file_is_corrupt = TRUE; + goto skip_info; + } + success = os_file_read(info_file, page, 0, 0, UNIV_PAGE_SIZE); + if (!success) { + fprintf(stderr, "InnoDB: Cannot read the file: %s\n", info_file_path); + file_is_corrupt = TRUE; + goto skip_info; + } + if (mach_read_from_4(page) != 0x78706f72UL + || mach_read_from_4(page + 4) != 0x74696e66UL) { /* the file must start with the ASCII magic "xportinf" */ + fprintf(stderr, "InnoDB: %s seems to be an incorrect .exp file.\n", info_file_path); + file_is_corrupt = TRUE; + goto skip_info; + } + + fprintf(stderr, "InnoDB: Import: The extended import of %s is being started.\n", name); + + n_index = mach_read_from_4(page + 8); + fprintf(stderr, "InnoDB: Import: %lu indexes have been detected.\n", (ulong)n_index); + /* n_index is read from the file: it must fit old_id[], new_id[] and root_page[] */ + if (n_index > sizeof(old_id) / sizeof(old_id[0])) { + fprintf(stderr, "InnoDB: %s lists too many indexes.\n", info_file_path); + n_index = 0; + file_is_corrupt = TRUE; + goto skip_info; + } + for (i = 0; i < n_index; i++) { + 
dict_index_t* exp_index; + + exp_index = dict_table_get_index_on_name(table, + (char*)(page + (i + 1) * 512 + 12)); + if (!exp_index) { + /* unknown index name: do not keep partially read metadata */ + fprintf(stderr, "InnoDB: %s names an index which the table does not have.\n", info_file_path); + memset(old_id, 0, sizeof(old_id)); + memset(new_id, 0, sizeof(new_id)); + memset(root_page, 0, sizeof(root_page)); + n_index = 0; + file_is_corrupt = TRUE; + goto skip_info; + } + new_id[i] = exp_index->id; + old_id[i] = mach_read_from_8(page + (i + 1) * 512); + root_page[i] = mach_read_from_4(page + (i + 1) * 512 + 8); + } + +skip_info: + if (info_file != -1) + os_file_close(info_file); + + /* + if (size_bytes >= 1024 * 1024) { + size_bytes = ut_2pow_round(size_bytes, 1024 * 1024); 
+ } + */ + + if (zip_size) { + fprintf(stderr, "InnoDB: Warning: importing compressed table is still EXPERIMENTAL, currently.\n"); + } + + { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ib_int64_t offset; + + size = (ulint) (size_bytes / (zip_size ? zip_size : UNIV_PAGE_SIZE)); + /* overwrite space id of all pages */ + rec_offs_init(offsets_); + + /* Unlock the data dictionary to not block queries + accessing other tables */ + ut_a(trx); + row_mysql_unlock_data_dictionary(trx); + + fprintf(stderr, "InnoDB: Progress in %%:"); + + for (offset = 0; offset < free_limit_bytes; + offset += zip_size ? zip_size : UNIV_PAGE_SIZE) { + ibool page_is_corrupt; + + success = os_file_read(file, page, + (ulint)(offset & 0xFFFFFFFFUL), + (ulint)(offset >> 32), + zip_size ? zip_size : UNIV_PAGE_SIZE); + + page_is_corrupt = FALSE; + + /* check consistency */ + if (fil_page_buf_page_is_corrupted_offline(page, zip_size)) { + page_is_corrupt = TRUE; + } + + if (mach_read_from_4(page + FIL_PAGE_OFFSET) + != offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) { + + page_is_corrupt = TRUE; + } + + /* if it is free page, inconsistency is acceptable */ + if (!offset) { + /* header page*/ + /* it should be overwritten already */ + ut_a(!page_is_corrupt); + + } else if (!((offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) + % (zip_size ? zip_size : UNIV_PAGE_SIZE))) { + /* descr page (not header) */ + if (page_is_corrupt) { + file_is_corrupt = TRUE; + descr_is_corrupt = TRUE; + } else { + ut_ad(fil_page_get_type(page) == FIL_PAGE_TYPE_XDES); + descr_is_corrupt = FALSE; + } + + /* store as descr page */ + memcpy(descr_page, page, (zip_size ? 
zip_size : UNIV_PAGE_SIZE)); + + } else if (descr_is_corrupt) { + /* unknown state of the page */ + if (page_is_corrupt) { + file_is_corrupt = TRUE; + } + + } else { + /* check free page or not */ + /* These definitions should be same to fsp0fsp.c */ +#define FSP_HEADER_SIZE (32 + 5 * FLST_BASE_NODE_SIZE) + +#define XDES_BITMAP (FLST_NODE_SIZE + 12) +#define XDES_BITS_PER_PAGE 2 +#define XDES_FREE_BIT 0 +#define XDES_SIZE \ + (XDES_BITMAP + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE)) +#define XDES_ARR_OFFSET (FSP_HEADER_OFFSET + FSP_HEADER_SIZE) + + /*descr = descr_page + XDES_ARR_OFFSET + XDES_SIZE * xdes_calc_descriptor_index(zip_size, offset)*/ + /*xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)*/ + byte* descr; + ulint index; + ulint byte_index; + ulint bit_index; + + descr = descr_page + XDES_ARR_OFFSET + + XDES_SIZE * (ut_2pow_remainder( + (offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)), + (zip_size ? zip_size : UNIV_PAGE_SIZE)) / FSP_EXTENT_SIZE); + + index = XDES_FREE_BIT + + XDES_BITS_PER_PAGE * ((offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) % FSP_EXTENT_SIZE); + byte_index = index / 8; + bit_index = index % 8; + + if (ut_bit_get_nth(mach_read_from_1(descr + XDES_BITMAP + byte_index), bit_index)) { + /* free page */ + if (page_is_corrupt) { + goto skip_write; + } + } else { + /* not free */ + if (page_is_corrupt) { + file_is_corrupt = TRUE; + } + } + } + + if (page_is_corrupt) { + fprintf(stderr, " [errp:%lld]", offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)); + + /* cannot treat corrupt page */ + goto skip_write; + } + + if (mach_read_from_4(page + FIL_PAGE_OFFSET) || !offset) { + mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id); + + for (i = 0; i < n_index; i++) { + if (offset / (zip_size ? zip_size : UNIV_PAGE_SIZE) == root_page[i]) { + if (fil_page_get_type(page) != FIL_PAGE_INDEX) { + file_is_corrupt = TRUE; + fprintf(stderr, " [etyp:%lld]", + offset / (zip_size ? 
zip_size : UNIV_PAGE_SIZE)); + goto skip_write; + } + /* this is index root page */ + mach_write_to_4(page + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + FSEG_HDR_SPACE, id); + mach_write_to_4(page + FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + FSEG_HDR_SPACE, id); + break; + } + } + + if (fil_page_get_type(page) == FIL_PAGE_INDEX) { + index_id_t tmp = mach_read_from_8(page + (PAGE_HEADER + PAGE_INDEX_ID)); + + for (i = 0; i < n_index; i++) { + if (old_id[i] == tmp) { + mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), new_id[i]); + break; + } + } + + if (!zip_size && mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0 + && old_id[0] == tmp) { + /* leaf page of clustered index, reset trx_id of records */ + rec_t* rec; + rec_t* supremum; + ulint n_recs; + + supremum = page_get_supremum_rec(page); + rec = page_rec_get_next(page_get_infimum_rec(page)); + n_recs = page_get_n_recs(page); + + while (rec && rec != supremum && n_recs > 0) { + ulint n_fields; + ulint i; + ulint offset = index->trx_id_offset; + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + n_fields = rec_offs_n_fields(offsets); + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + trx_write_trx_id(rec + offset, 1); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint local_len; + byte* data; + + data = rec_get_nth_field(rec, offsets, i, &local_len); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + mach_write_to_4(data + local_len + BTR_EXTERN_SPACE_ID, id); + } + } + + rec = page_rec_get_next(rec); + n_recs--; + } + } else if (mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0 + && old_id[0] != tmp) { + mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 1); + } + } + + if (mach_read_from_8(page + FIL_PAGE_LSN) > current_lsn) { + mach_write_to_8(page + FIL_PAGE_LSN, current_lsn); + if (!zip_size) { + mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + current_lsn); + } + } + + 
fil_page_buf_page_store_checksum(page, zip_size); + + success = os_file_write(filepath, file, page, + (ulint)(offset & 0xFFFFFFFFUL), + (ulint)(offset >> 32), + zip_size ? zip_size : UNIV_PAGE_SIZE); + } + +skip_write: + if (free_limit_bytes + && ((ib_int64_t)((offset + (zip_size ? zip_size : UNIV_PAGE_SIZE)) * 100) / free_limit_bytes) + != ((offset * 100) / free_limit_bytes)) { + fprintf(stderr, " %lu", + (ulong)((ib_int64_t)((offset + (zip_size ? zip_size : UNIV_PAGE_SIZE)) * 100) / free_limit_bytes)); + } + } + + fprintf(stderr, " done.\n"); + + /* Reacquire the data dictionary lock */ + row_mysql_lock_data_dictionary(trx); + + /* update SYS_INDEXES set root page */ + index = dict_table_get_first_index(table); + while (index) { + for (i = 0; i < n_index; i++) { + if (new_id[i] == index->id) { + break; + } + } + + if (i != n_index + && root_page[i] != index->page) { + /* must update */ + ulint error; + trx_t* trx; + pars_info_t* info = NULL; + + trx = trx_allocate_for_mysql(); + trx->op_info = "extended import"; + + info = pars_info_create(); + + pars_info_add_ull_literal(info, "indexid", new_id[i]); + pars_info_add_int4_literal(info, "new_page", (lint) root_page[i]); + + error = que_eval_sql(info, + "PROCEDURE UPDATE_INDEX_PAGE () IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES" + " SET PAGE_NO = :new_page" + " WHERE ID = :indexid;\n" + "COMMIT WORK;\n" + "END;\n", + FALSE, trx); + + if (error != DB_SUCCESS) { + fprintf(stderr, "InnoDB: failed to update SYS_INDEXES\n"); + } + + trx_commit_for_mysql(trx); + + trx_free_for_mysql(trx); + + index->page = root_page[i]; + } + + index = dict_table_get_next_index(index); + } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + /* .exp file should be removed */ + success = os_file_delete(info_file_path); + if (!success) { + success = os_file_delete_if_exists(info_file_path); + } + mem_free(info_file_path); + + system = fil_system; + mutex_enter(&(system->mutex)); + space = fil_space_get_by_id(id); + if (space) + node 
= UT_LIST_GET_FIRST(space->chain); + if (node && node->size < size) { + space->size += (size - node->size); + node->size = size; + } + mutex_exit(&(system->mutex)); + + ut_free(buf3); + + if (file_is_corrupt) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: file ", + stderr); + ut_print_filename(stderr, filepath); + fprintf(stderr, " seems to be corrupt.\n" + "InnoDB: An attempt to convert and salvage all corrupt pages was not made.\n" + "InnoDB: ##### CAUTION #####\n" + "InnoDB: ## The .ibd file may cause InnoDB to crash, even though its re-import seems to have succeeded.\n" + "InnoDB: ## If you don't know how to salvage data from a .ibd, you should not use the file.\n" + "InnoDB: ###################\n"); + success = FALSE; + + ut_free(buf2); + + goto func_exit; + } + } + ut_free(buf2); if (UNIV_UNLIKELY(space_id != id @@ -3175,6 +3730,273 @@ os_file_close(file); mem_free(filepath); + if (srv_expand_import && dict_table_flags_to_zip_size(flags)) { + ulint page_no; + ulint zip_size; + ulint height; + rec_t* node_ptr; + dict_table_t* table; + dict_index_t* index; + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + mtr_t mtr; + + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + rec_offs_init(offsets_); + + zip_size = dict_table_flags_to_zip_size(flags); + + table = dict_table_get_low(name); + index = dict_table_get_first_index(table); + page_no = dict_index_get_page(index); + ut_a(page_no == 3); + + fprintf(stderr, "InnoDB: It is compressed .ibd file. 
need to convert additionaly on buffer pool.\n"); + + /* down to leaf */ + mtr_start(&mtr); + mtr_set_log_mode(&mtr, MTR_LOG_NONE); + + height = ULINT_UNDEFINED; + + for (;;) { + block = buf_page_get(space_id, zip_size, page_no, + RW_NO_LATCH, &mtr); + page = buf_block_get_frame(block); + + block->check_index_page_at_flush = TRUE; + + if (height == ULINT_UNDEFINED) { + height = btr_page_get_level(page, &mtr); + } + + if (height == 0) { + break; + } + + node_ptr = page_rec_get_next(page_get_infimum_rec(page)); + + height--; + + offsets = rec_get_offsets(node_ptr, index, offsets, ULINT_UNDEFINED, &heap); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + mtr_commit(&mtr); + + fprintf(stderr, "InnoDB: pages needs split are ..."); + + /* scan leaf pages */ + while (page_no != FIL_NULL) { + rec_t* rec; + rec_t* supremum; + ulint n_recs; + + mtr_start(&mtr); + + block = buf_page_get(space_id, zip_size, page_no, + RW_X_LATCH, &mtr); + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + if (!page_zip) { + /*something wrong*/ + fprintf(stderr, "InnoDB: Something wrong with reading page %lu.\n", page_no); +convert_err_exit: + mtr_commit(&mtr); + mutex_enter(&fil_system->mutex); + fil_space_free(space_id, FALSE); + mutex_exit(&fil_system->mutex); + success = FALSE; + goto convert_exit; + } + + supremum = page_get_supremum_rec(page); + rec = page_rec_get_next(page_get_infimum_rec(page)); + n_recs = page_get_n_recs(page); + + /* illegal operation as InnoDB online system. 
so not logged */ + while (rec && rec != supremum && n_recs > 0) { + ulint n_fields; + ulint i; + ulint offset = index->trx_id_offset; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + n_fields = rec_offs_n_fields(offsets); + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + trx_write_trx_id(rec + offset, 1); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint local_len; + byte* data; + + data = rec_get_nth_field(rec, offsets, i, &local_len); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + mach_write_to_4(data + local_len + BTR_EXTERN_SPACE_ID, id); + } + } + + rec = page_rec_get_next(rec); + n_recs--; + } + + /* dummy logged update for along with modified page path */ + if (index->id != btr_page_get_index_id(page)) { + /* this should be adjusted already */ + fprintf(stderr, "InnoDB: The page %lu seems to be converted wrong.\n", page_no); + goto convert_err_exit; + } + btr_page_set_index_id(page, page_zip, index->id, &mtr); + + /* confirm whether fits to the page size or not */ + if (!page_zip_compress(page_zip, page, index, &mtr) + && !btr_page_reorganize(block, index, &mtr)) { + buf_block_t* new_block; + page_t* new_page; + page_zip_des_t* new_page_zip; + rec_t* split_rec; + ulint n_uniq; + + /* split page is needed */ + fprintf(stderr, " %lu", page_no); + + mtr_x_lock(dict_index_get_lock(index), &mtr); + + n_uniq = dict_index_get_n_unique_in_tree(index); + + if(page_get_n_recs(page) < 2) { + /* no way to make smaller */ + fprintf(stderr, "InnoDB: The page %lu cannot be store to the page size.\n", page_no); + goto convert_err_exit; + } + + if (UNIV_UNLIKELY(page_no == dict_index_get_page(index))) { + ulint new_page_no; + dtuple_t* node_ptr; + ulint level; + rec_t* node_ptr_rec; + page_cur_t page_cursor; + + /* it is root page, need to raise before split */ + + level = btr_page_get_level(page, &mtr); + + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, &mtr); + new_page = 
buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + btr_page_create(new_block, new_page_zip, index, level, &mtr); + + btr_page_set_next(new_page, new_page_zip, FIL_NULL, &mtr); + btr_page_set_prev(new_page, new_page_zip, FIL_NULL, &mtr); + + page_zip_copy_recs(new_page_zip, new_page, + page_zip, page, index, &mtr); + btr_search_move_or_delete_hash_entries(new_block, block, index); + + rec = page_rec_get_next(page_get_infimum_rec(new_page)); + new_page_no = buf_block_get_page_no(new_block); + + /* the node pointer tuple needs a heap; heap is still NULL unless rec_get_offsets() had to create one */ + if (!heap) { + heap = mem_heap_create(1000); + } + node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap, + level); + dtuple_set_info_bits(node_ptr, + dtuple_get_info_bits(node_ptr) + | REC_INFO_MIN_REC_FLAG); + btr_page_empty(block, page_zip, index, level + 1, &mtr); + + btr_page_set_next(page, page_zip, FIL_NULL, &mtr); + btr_page_set_prev(page, page_zip, FIL_NULL, &mtr); + + page_cur_set_before_first(block, &page_cursor); + + node_ptr_rec = page_cur_tuple_insert(&page_cursor, node_ptr, + index, 0, &mtr); + ut_a(node_ptr_rec); + + if (!btr_page_reorganize(block, index, &mtr)) { + fprintf(stderr, "InnoDB: failed to store the page %lu.\n", page_no); + goto convert_err_exit; + } + + /* move to the raised page */ + page_no = new_page_no; + block = new_block; + page = new_page; + page_zip = new_page_zip; + + fprintf(stderr, "(raise_to:%lu)", page_no); + } + + split_rec = page_get_middle_rec(page); + + new_block = btr_page_alloc(index, page_no + 1, FSP_UP, + btr_page_get_level(page, &mtr), &mtr); + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + btr_page_create(new_block, new_page_zip, index, + btr_page_get_level(page, &mtr), &mtr); + + offsets = rec_get_offsets(split_rec, index, offsets, n_uniq, &heap); + + btr_attach_half_pages(index, block, + split_rec, new_block, FSP_UP, &mtr); + + 
page_zip_copy_recs(new_page_zip, new_page, + page_zip, page, index, &mtr); + page_delete_rec_list_start(split_rec - page + new_page, + new_block, index, 
&mtr); + btr_search_move_or_delete_hash_entries(new_block, block, index); + page_delete_rec_list_end(split_rec, block, index, + ULINT_UNDEFINED, ULINT_UNDEFINED, &mtr); + + fprintf(stderr, "(new:%lu)", buf_block_get_page_no(new_block)); + + /* Are they needed? */ + if (!btr_page_reorganize(block, index, &mtr)) { + fprintf(stderr, "InnoDB: failed to store the page %lu.\n", page_no); + goto convert_err_exit; + } + if (!btr_page_reorganize(new_block, index, &mtr)) { + fprintf(stderr, "InnoDB: failed to store the page %lu.\n", buf_block_get_page_no(new_block)); + goto convert_err_exit; + } + } + + page_no = btr_page_get_next(page, &mtr); + + mtr_commit(&mtr); + + if (heap) { + mem_heap_empty(heap); + } + } + + fprintf(stderr, "...done.\nInnoDB: waiting the flush batch of the additional conversion.\n"); + + /* should wait for the not-logged changes are all flushed */ + buf_flush_list(ULINT_MAX, mtr.end_lsn + 1); + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + + fprintf(stderr, "InnoDB: done.\n"); +convert_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + return(success); } #endif /* !UNIV_HOTBACKUP */ --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -7421,6 +7421,14 @@ err = row_discard_tablespace_for_mysql(dict_table->name, trx); } else { err = row_import_tablespace_for_mysql(dict_table->name, trx); + + /* in expanded import mode re-initialize auto_increment again */ + if ((err == DB_SUCCESS) && srv_expand_import && + (table->found_next_number_field != NULL)) { + dict_table_autoinc_lock(dict_table); + innobase_initialize_autoinc(); + dict_table_autoinc_unlock(dict_table); + } } err = convert_error_code_to_mysql(err, dict_table->flags, NULL); @@ -11820,6 +11828,11 @@ NULL, NULL, 0, 0, 1, 0); #endif +static MYSQL_SYSVAR_ULONG(import_table_from_xtrabackup, srv_expand_import, + PLUGIN_VAR_RQCMDARG, + "Enable/Disable converting automatically *.ibd files when import tablespace.", + NULL, NULL, 0, 0, 1, 0); + 
static MYSQL_SYSVAR_ULONG(dict_size_limit, srv_dict_size_limit, PLUGIN_VAR_RQCMDARG, "Limit the allocated memory for dictionary cache. (0: unlimited)", @@ -11894,6 +11907,7 @@ MYSQL_SYSVAR(flush_neighbor_pages), MYSQL_SYSVAR(read_ahead), MYSQL_SYSVAR(adaptive_flushing_method), + MYSQL_SYSVAR(import_table_from_xtrabackup), MYSQL_SYSVAR(dict_size_limit), MYSQL_SYSVAR(use_sys_malloc), MYSQL_SYSVAR(use_native_aio), --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -238,6 +238,17 @@ @return the uncompressed page frame */ # define btr_page_get(space,zip_size,page_no,mode,idx,mtr) \ buf_block_get_frame(btr_block_get(space,zip_size,page_no,mode,idx,mtr)) +/**************************************************************//** +Sets the index id field of a page. */ +UNIV_INLINE +void +btr_page_set_index_id( +/*==================*/ + page_t* page, /*!< in: page whose index id field is set */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + index_id_t id, /*!< in: index id */ + mtr_t* mtr); /*!< in: mtr */ #endif /* !UNIV_HOTBACKUP */ /**************************************************************//** Gets the index id field of a page. @@ -275,6 +286,17 @@ const page_t* page, /*!< in: index page */ mtr_t* mtr); /*!< in: mini-transaction handle */ /********************************************************//** +Sets the next index page field. */ +UNIV_INLINE +void +btr_page_set_next( +/*==============*/ + page_t* page, /*!< in: index page */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + ulint next, /*!< in: next page number */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** Gets the previous index page number. 
@return prev page number */ UNIV_INLINE @@ -283,6 +305,17 @@ /*==============*/ const page_t* page, /*!< in: index page */ mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** +Sets the previous index page field. */ +UNIV_INLINE +void +btr_page_set_prev( +/*==============*/ + page_t* page, /*!< in: index page */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + ulint prev, /*!< in: previous page number */ + mtr_t* mtr); /*!< in: mini-transaction handle */ /*************************************************************//** Gets pointer to the previous user record in the tree. It is assumed that the caller has appropriate latches on the page and its neighbor. @@ -328,6 +361,18 @@ /*===========================*/ const rec_t* rec, /*!< in: node pointer record */ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ +/**************************************************************//** +Creates a new index page (not the root, and also not +used in page reorganization). @see btr_page_empty(). */ +UNIV_INTERN +void +btr_page_create( +/*============*/ + buf_block_t* block, /*!< in/out: page to be created */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: the B-tree level of the page */ + mtr_t* mtr); /*!< in: mtr */ /************************************************************//** Creates the root node for a new index tree. @return page number of the created root, FIL_NULL if did not succeed */ @@ -397,6 +442,17 @@ dict_index_t* index, /*!< in: record descriptor */ mtr_t* mtr); /*!< in: mtr */ /*************************************************************//** +Empties an index page. @see btr_page_create(). 
*/ +UNIV_INTERN +void +btr_page_empty( +/*===========*/ + buf_block_t* block, /*!< in: page to be emptied */ + page_zip_des_t* page_zip,/*!< out: compressed page, or NULL */ + dict_index_t* index, /*!< in: index of the page */ + ulint level, /*!< in: the B-tree level of the page */ + mtr_t* mtr); /*!< in: mtr */ +/*************************************************************//** Decides if the page should be split at the convergence point of inserts converging to left. @return TRUE if split recommended */ @@ -455,6 +511,20 @@ # define btr_insert_on_non_leaf_level(i,l,t,m) \ btr_insert_on_non_leaf_level_func(i,l,t,__FILE__,__LINE__,m) #endif /* !UNIV_HOTBACKUP */ +/**************************************************************//** +Attaches the halves of an index page on the appropriate level in an +index tree. */ +UNIV_INTERN +void +btr_attach_half_pages( +/*==================*/ + dict_index_t* index, /*!< in: the index tree */ + buf_block_t* block, /*!< in/out: page to be split */ + rec_t* split_rec, /*!< in: first record on upper + half page */ + buf_block_t* new_block, /*!< in/out: the new half page */ + ulint direction, /*!< in: FSP_UP or FSP_DOWN */ + mtr_t* mtr); /*!< in: mtr */ /****************************************************************//** Sets a record as the predefined minimum record. */ UNIV_INTERN --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -234,6 +234,8 @@ extern ulint srv_read_ahead; extern ulint srv_adaptive_flushing_method; +extern ulint srv_expand_import; + extern ulint srv_dict_size_limit; /*-------------------------------------------*/ --- a/storage/innobase/row/row0mysql.c +++ b/storage/innobase/row/row0mysql.c @@ -2547,6 +2547,11 @@ current_lsn = log_get_lsn(); + /* Enlarge the fatal semaphore wait threshold during import. 
*/ + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + /* It is possible, though very improbable, that the lsn's in the tablespace to be imported have risen above the current system lsn, if a lengthy purge, ibuf merge, or rollback was performed on a backup @@ -2632,7 +2637,7 @@ success = fil_open_single_table_tablespace( TRUE, table->space, table->flags == DICT_TF_COMPACT ? 0 : table->flags, - table->name); + table->name, trx); if (success) { table->ibd_file_missing = FALSE; table->tablespace_discarded = FALSE; @@ -2658,6 +2663,11 @@ trx->op_info = ""; + /* Restore the fatal semaphore wait threshold enlarged above */ + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + return((int) err); } --- a/storage/innobase/srv/srv0srv.c +++ b/storage/innobase/srv/srv0srv.c @@ -418,6 +418,8 @@ UNIV_INTERN ulint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */ UNIV_INTERN ulint srv_adaptive_flushing_method = 0; /* 0: native 1: estimate 2: keep_average */ +UNIV_INTERN ulint srv_expand_import = 0; /* 0:disable 1:enable */ + UNIV_INTERN ulint srv_dict_size_limit = 0; /*-------------------------------------------*/ UNIV_INTERN ulong srv_n_spin_wait_rounds = 30; --- a/storage/innobase/dict/dict0load.c +++ b/storage/innobase/dict/dict0load.c @@ -778,7 +778,7 @@ object and check that the .ibd file exists. */ fil_open_single_table_tablespace(FALSE, space_id, - flags, name); + flags, name, NULL); } mem_free(name); @@ -1833,7 +1833,7 @@ if (!fil_open_single_table_tablespace( TRUE, table->space, table->flags == DICT_TF_COMPACT ? 
0 : - table->flags & ~(~0 << DICT_TF_BITS), name)) { + table->flags & ~(~0 << DICT_TF_BITS), name, NULL)) { /* We failed to find a sensible tablespace file */ --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -34,6 +34,7 @@ #include "sync0rw.h" #include "ibuf0types.h" #endif /* !UNIV_HOTBACKUP */ +#include "trx0types.h" /** When mysqld is run, the default directory "." is the mysqld datadir, but in the MySQL Embedded Server Library and ibbackup it is not the default @@ -478,8 +479,11 @@ accessing the first page of the file */ ulint id, /*!< in: space id */ ulint flags, /*!< in: tablespace flags */ - const char* name); /*!< in: table name in the + const char* name, /*!< in: table name in the databasename/tablename format */ + trx_t* trx); /*!< in: transaction. This is only used + for IMPORT TABLESPACE, must be NULL + otherwise */ /********************************************************************//** It is possible, though very improbable, that the lsn's in the tablespace to be imported have risen above the current system lsn, if a lengthy purge, ibuf