# name : innodb_buffer_pool_shm.patch # introduced : 12 # maintainer : Yasufumi # #!!! notice !!! # Any small change to this file in the main branch # should be done or reviewed by the maintainer! diff -ruN a/storage/innobase/buf/buf0buddy.c b/storage/innobase/buf/buf0buddy.c --- a/storage/innobase/buf/buf0buddy.c 2010-12-04 19:46:39.372513543 +0900 +++ b/storage/innobase/buf/buf0buddy.c 2010-12-07 17:56:28.302087851 +0900 @@ -183,7 +183,7 @@ void* buf, /*!< in: buffer frame to deallocate */ ibool have_page_hash_mutex) { - const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf); + const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf_pool, buf); buf_page_t* bpage; buf_block_t* block; @@ -227,7 +227,7 @@ buf_block_t* block) /*!< in: buffer frame to allocate */ { buf_pool_t* buf_pool = buf_pool_from_block(block); - const ulint fold = BUF_POOL_ZIP_FOLD(block); + const ulint fold = BUF_POOL_ZIP_FOLD(buf_pool, block); //ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE); diff -ruN a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c --- a/storage/innobase/buf/buf0buf.c 2010-12-06 20:16:21.726195340 +0900 +++ b/storage/innobase/buf/buf0buf.c 2010-12-07 20:40:30.824749814 +0900 @@ -53,6 +53,10 @@ #include "page0zip.h" #include "trx0trx.h" #include "srv0start.h" +#include "que0que.h" +#include "read0read.h" +#include "row0row.h" +#include "ha_prototypes.h" /* prototypes for new functions added to ha_innodb.cc */ trx_t* innobase_get_trx(); @@ -342,6 +346,31 @@ // was allocated for the frames */ // buf_block_t* blocks; /*!< array of buffer control blocks */ //}; + +/* Buffer pool shared memory segment information */ +typedef struct buf_shm_info_struct buf_shm_info_t; + +struct buf_shm_info_struct { + char head_str[8]; + ulint binary_id; + ibool is_new; /* during initializing */ + ibool clean; /* clean shutdowned and free */ + ibool reusable; /* reusable */ + ulint buf_pool_size; /* backup value */ + ulint page_size; /* backup value */ + ulint frame_offset; /* offset of the first frame based on chunk->mem */ + ulint zip_hash_offset; + ulint zip_hash_n; + + ulint checksum; + + buf_pool_t buf_pool_backup; + buf_chunk_t chunk_backup; + + ib_uint64_t dummy; +}; + +#define BUF_SHM_INFO_HEAD "XTRA_SHM" #endif /* !UNIV_HOTBACKUP */ /********************************************************************//** @@ -988,6 +1017,58 @@ #endif /* UNIV_SYNC_DEBUG */ } +static +void +buf_block_reuse( +/*============*/ + buf_block_t* block, + ptrdiff_t frame_offset) +{ + /* block_init */ + block->frame += frame_offset; + + UNIV_MEM_DESC(block->frame, UNIV_PAGE_SIZE, block); + + block->index = NULL; + block->btr_search_latch = NULL; + +#ifdef UNIV_DEBUG + /* recreate later */ + block->page.in_page_hash = FALSE; + block->page.in_zip_hash = FALSE; +#endif /* UNIV_DEBUG */ + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + block->n_pointers = 0; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + + if (block->page.zip.data) + block->page.zip.data += frame_offset; + + block->is_hashed = FALSE; + +#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC + /* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration + of buffer block mutex/rwlock with performance schema. If + PFS_GROUP_BUFFER_SYNC is defined, skip the registration + since buffer block mutex/rwlock will be registered later in + pfs_register_buffer_block() */ + + mutex_create(PFS_NOT_INSTRUMENTED, &block->mutex, SYNC_BUF_BLOCK); + rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING); +#else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */ + mutex_create(buffer_block_mutex_key, &block->mutex, SYNC_BUF_BLOCK); + rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING); +#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */ + + ut_ad(rw_lock_validate(&(block->lock))); + +#ifdef UNIV_SYNC_DEBUG + rw_lock_create(buf_block_debug_latch_key, + &block->debug_latch, SYNC_NO_ORDER_CHECK); +#endif /* UNIV_SYNC_DEBUG */ +} + /********************************************************************//** Allocates a chunk of buffer frames. @return chunk, or NULL on failure */ @@ -1001,26 +1082,190 @@ { buf_block_t* block; byte* frame; + ulint zip_hash_n = 0; + ulint zip_hash_mem_size = 0; + hash_table_t* zip_hash_tmp = NULL; ulint i; + ulint size_target; + buf_shm_info_t* shm_info = NULL; /* Round down to a multiple of page size, although it already should be. */ mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE); + size_target = (mem_size / UNIV_PAGE_SIZE) - 1; + + srv_buffer_pool_shm_is_reused = FALSE; + + if (srv_buffer_pool_shm_key) { + /* zip_hash size */ + zip_hash_n = (mem_size / UNIV_PAGE_SIZE) * 2; + zip_hash_mem_size = ut_2pow_round(hash_create_needed(zip_hash_n) + + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE); + } + /* Reserve space for the block descriptors. */ mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block) + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE); + if (srv_buffer_pool_shm_key) { + mem_size += ut_2pow_round(sizeof(buf_shm_info_t) + + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE); + mem_size += zip_hash_mem_size; + } chunk->mem_size = mem_size; + + if (srv_buffer_pool_shm_key) { + ulint binary_id; + ibool is_new; + + ut_a(buf_pool->n_chunks == 1); + + fprintf(stderr, + "InnoDB: Notice: The innodb_buffer_pool_shm_key option has been specified.\n" + "InnoDB: Do not change the following between restarts of the server while this option is being used:\n" + "InnoDB: * the mysqld executable between restarts of the server.\n" + "InnoDB: * the value of innodb_buffer_pool_size.\n" + "InnoDB: * the value of innodb_page_size.\n" + "InnoDB: * datafiles created by InnoDB during this session.\n" + "InnoDB: Otherwise, data corruption in datafiles may result.\n"); + + /* FIXME: This is vague id still */ + binary_id = (ulint) ((byte*)mtr_commit - (byte*)btr_root_get) + + (ulint) ((byte*)os_file_get_last_error - (byte*)buf_calc_page_new_checksum) + + (ulint) ((byte*)page_dir_find_owner_slot - (byte*)dfield_data_is_binary_equal) + + (ulint) ((byte*)que_graph_publish - (byte*)dict_casedn_str) + + (ulint) ((byte*)read_view_oldest_copy_or_open_new - (byte*)fil_space_get_version) + + (ulint) ((byte*)rec_get_n_extern_new - (byte*)fsp_get_size_low) + + (ulint) ((byte*)row_get_trx_id_offset - (byte*)ha_create_func) + + (ulint) ((byte*)srv_set_io_thread_op_info - (byte*)thd_is_replication_slave_thread) + + (ulint) ((byte*)mutex_create_func - (byte*)ibuf_inside) + + (ulint) ((byte*)trx_set_detailed_error - (byte*)lock_check_trx_id_sanity) + + (ulint) ((byte*)ut_time - (byte*)mem_heap_strdup); + + chunk->mem = os_shm_alloc(&chunk->mem_size, srv_buffer_pool_shm_key, &is_new); + + if (UNIV_UNLIKELY(chunk->mem == NULL)) { + return(NULL); + } +init_again: +#ifdef UNIV_SET_MEM_TO_ZERO + if (is_new) { + memset(chunk->mem, '\0', chunk->mem_size); + } +#endif + /* for ut_fold_binary_32(), these values should be 32-bit aligned */ + ut_a(sizeof(buf_shm_info_t) % 4 == 0); + ut_a((ulint)chunk->mem % 4 == 0); + ut_a(chunk->mem_size % 4 == 0); + + shm_info = chunk->mem; + + zip_hash_tmp = (hash_table_t*)((byte*)chunk->mem + chunk->mem_size - zip_hash_mem_size); + + if (is_new) { + strncpy(shm_info->head_str, BUF_SHM_INFO_HEAD, 8); + shm_info->binary_id = binary_id; + shm_info->is_new = TRUE; /* changed to FALSE when the initialization is finished */ + shm_info->clean = FALSE; /* changed to TRUE when free the segment. */ + shm_info->reusable = FALSE; /* changed to TRUE when validation is finished. */ + shm_info->buf_pool_size = srv_buf_pool_size; + shm_info->page_size = srv_page_size; + shm_info->zip_hash_offset = chunk->mem_size - zip_hash_mem_size; + shm_info->zip_hash_n = zip_hash_n; + } else { + ulint checksum; + + if (strncmp(shm_info->head_str, BUF_SHM_INFO_HEAD, 8)) { + fprintf(stderr, + "InnoDB: Error: The shared memory segment seems not to be for buffer pool.\n"); + return(NULL); + } + if (shm_info->binary_id != binary_id) { + fprintf(stderr, + "InnoDB: Error: The shared memory segment seems not to be for this binary.\n"); + return(NULL); + } + if (shm_info->is_new) { + fprintf(stderr, + "InnoDB: Error: The shared memory was not initialized yet.\n"); + return(NULL); + } + if (shm_info->buf_pool_size != srv_buf_pool_size) { + fprintf(stderr, + "InnoDB: Error: srv_buf_pool_size is different (shm=%lu current=%lu).\n", + shm_info->buf_pool_size, srv_buf_pool_size); + return(NULL); + } + if (shm_info->page_size != srv_page_size) { + fprintf(stderr, + "InnoDB: Error: srv_page_size is different (shm=%lu current=%lu).\n", + shm_info->page_size, srv_page_size); + return(NULL); + } + if (!shm_info->reusable) { + fprintf(stderr, + "InnoDB: Warning: The shared memory has unrecoverable contents.\n" + "InnoDB: The shared memory segment is initialized.\n"); + is_new = TRUE; + goto init_again; + } + if (!shm_info->clean) { + fprintf(stderr, + "InnoDB: Warning: The shared memory was not shut down cleanly.\n" + "InnoDB: The shared memory segment is initialized.\n"); + is_new = TRUE; + goto init_again; + } + + ut_a(shm_info->zip_hash_offset == chunk->mem_size - zip_hash_mem_size); + ut_a(shm_info->zip_hash_n == zip_hash_n); + + /* check checksum */ + if (srv_buffer_pool_shm_checksum) { + checksum = ut_fold_binary_32((byte*)chunk->mem + sizeof(buf_shm_info_t), + chunk->mem_size - sizeof(buf_shm_info_t)); + } else { + checksum = BUF_NO_CHECKSUM_MAGIC; + } + + if (shm_info->checksum != BUF_NO_CHECKSUM_MAGIC + && shm_info->checksum != checksum) { + fprintf(stderr, + "InnoDB: Error: checksum of the shared memory is not match. " + "(stored=%lu calculated=%lu)\n", + shm_info->checksum, checksum); + return(NULL); + } + + /* flag to use the segment. */ + shm_info->clean = FALSE; /* changed to TRUE when free the segment. */ + } + + /* init zip_hash contents */ + if (is_new) { + hash_create_init(zip_hash_tmp, zip_hash_n); + } else { + /* adjust offset is done later */ + hash_create_reuse(zip_hash_tmp); + + srv_buffer_pool_shm_is_reused = TRUE; + } + } else { chunk->mem = os_mem_alloc_large(&chunk->mem_size); if (UNIV_UNLIKELY(chunk->mem == NULL)) { return(NULL); } + } /* Allocate the block descriptors from the start of the memory block. */ + if (srv_buffer_pool_shm_key) { + chunk->blocks = (buf_block_t*)((byte*)chunk->mem + sizeof(buf_shm_info_t)); + } else { chunk->blocks = chunk->mem; + } /* Align a pointer to the first frame. Note that when os_large_page_size is smaller than UNIV_PAGE_SIZE, @@ -1028,8 +1273,13 @@ it is bigger, we may allocate more blocks than requested. */ frame = ut_align(chunk->mem, UNIV_PAGE_SIZE); + if (srv_buffer_pool_shm_key) { + /* reserve zip_hash space and always -1 for reproductibity */ + chunk->size = (chunk->mem_size - zip_hash_mem_size) / UNIV_PAGE_SIZE - 1; + } else { chunk->size = chunk->mem_size / UNIV_PAGE_SIZE - (frame != chunk->mem); + } /* Subtract the space needed for block descriptors. */ { @@ -1043,6 +1293,102 @@ chunk->size = size; } + if (chunk->size > size_target) { + chunk->size = size_target; + } + + if (shm_info && !(shm_info->is_new)) { + /* convert the shared memory segment for reuse */ + ptrdiff_t phys_offset; + ptrdiff_t logi_offset; + ptrdiff_t blocks_offset; + void* previous_frame_address; + + if (chunk->size < shm_info->chunk_backup.size) { + fprintf(stderr, + "InnoDB: Error: The buffer pool became smaller because of allocated address.\n" + "InnoDB: Retrying may avoid this situation.\n"); + shm_info->clean = TRUE; /* release the flag for retrying */ + return(NULL); + } + + chunk->size = shm_info->chunk_backup.size; + phys_offset = frame - ((byte*)chunk->mem + shm_info->frame_offset); + logi_offset = frame - chunk->blocks[0].frame; + previous_frame_address = chunk->blocks[0].frame; + blocks_offset = (byte*)chunk->blocks - (byte*)shm_info->chunk_backup.blocks; + + if (phys_offset || logi_offset || blocks_offset) { + fprintf(stderr, + "InnoDB: Buffer pool in the shared memory segment should be converted.\n" + "InnoDB: Previous frames in address : %p\n" + "InnoDB: Previous frames were located : %p\n" + "InnoDB: Current frames should be located: %p\n" + "InnoDB: Pysical offset : %ld (%#lx)\n" + "InnoDB: Logical offset (frames) : %ld (%#lx)\n" + "InnoDB: Logical offset (blocks) : %ld (%#lx)\n", + (byte*)chunk->mem + shm_info->frame_offset, + chunk->blocks[0].frame, frame, + phys_offset, phys_offset, logi_offset, logi_offset, + blocks_offset, blocks_offset); + } else { + fprintf(stderr, + "InnoDB: Buffer pool in the shared memory segment can be used as it is.\n"); + } + + if (phys_offset) { + fprintf(stderr, + "InnoDB: Aligning physical offset..."); + + memmove(frame, (byte*)chunk->mem + shm_info->frame_offset, + chunk->size * UNIV_PAGE_SIZE); + + fprintf(stderr, + " Done.\n"); + } + + /* buf_block_t */ + block = chunk->blocks; + for (i = chunk->size; i--; ) { + buf_block_reuse(block, logi_offset); + block++; + } + + if (logi_offset || blocks_offset) { + fprintf(stderr, + "InnoDB: Aligning logical offset..."); + + + /* buf_pool_t buf_pool_backup */ + UT_LIST_OFFSET(flush_list, buf_page_t, shm_info->buf_pool_backup.flush_list, + previous_frame_address, logi_offset, blocks_offset); + UT_LIST_OFFSET(free, buf_page_t, shm_info->buf_pool_backup.free, + previous_frame_address, logi_offset, blocks_offset); + UT_LIST_OFFSET(LRU, buf_page_t, shm_info->buf_pool_backup.LRU, + previous_frame_address, logi_offset, blocks_offset); + if (shm_info->buf_pool_backup.LRU_old) + shm_info->buf_pool_backup.LRU_old = + (buf_page_t*)((byte*)(shm_info->buf_pool_backup.LRU_old) + + (((void*)shm_info->buf_pool_backup.LRU_old > previous_frame_address) + ? logi_offset : blocks_offset)); + + UT_LIST_OFFSET(unzip_LRU, buf_block_t, shm_info->buf_pool_backup.unzip_LRU, + previous_frame_address, logi_offset, blocks_offset); + + UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_clean, + previous_frame_address, logi_offset, blocks_offset); + for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) { + UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_free[i], + previous_frame_address, logi_offset, blocks_offset); + } + + HASH_OFFSET(zip_hash_tmp, buf_page_t, hash, + previous_frame_address, logi_offset, blocks_offset); + + fprintf(stderr, + " Done.\n"); + } + } else { /* Init block structs and assign frames for them. Then we assign the frames to the first blocks (we already mapped the memory above). */ @@ -1068,6 +1414,11 @@ block++; frame += UNIV_PAGE_SIZE; } + } + + if (shm_info) { + shm_info->frame_offset = chunk->blocks[0].frame - (byte*)chunk->mem; + } #ifdef PFS_GROUP_BUFFER_SYNC pfs_register_buffer_block(chunk); @@ -1249,6 +1600,8 @@ UNIV_MEM_UNDESC(block); } + ut_a(!srv_buffer_pool_shm_key); + os_mem_free_large(chunk->mem, chunk->mem_size); } @@ -1289,7 +1642,7 @@ ulint instance_no) /*!< in: id of the instance */ { ulint i; - buf_chunk_t* chunk; + buf_chunk_t* chunk = NULL; /* 1. Initialize general fields ------------------------------- */ @@ -1335,7 +1688,10 @@ buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE; buf_pool->page_hash = hash_create(2 * buf_pool->curr_size); + /* zip_hash is allocated to shm when srv_buffer_pool_shm_key is enabled */ + if (!srv_buffer_pool_shm_key) { buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size); + } buf_pool->last_printout_time = ut_time(); } @@ -1354,6 +1710,86 @@ /* All fields are initialized by mem_zalloc(). */ + if (chunk && srv_buffer_pool_shm_key) { + buf_shm_info_t* shm_info; + + ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t)); + shm_info = chunk->mem; + + buf_pool->zip_hash = (hash_table_t*)((byte*)chunk->mem + shm_info->zip_hash_offset); + + if(shm_info->is_new) { + shm_info->is_new = FALSE; /* initialization was finished */ + } else { + buf_block_t* block = chunk->blocks; + buf_page_t* b; + + /* shm_info->buf_pool_backup should be converted */ + /* at buf_chunk_init(). So copy simply. */ + buf_pool->flush_list = shm_info->buf_pool_backup.flush_list; + buf_pool->freed_page_clock = shm_info->buf_pool_backup.freed_page_clock; + buf_pool->free = shm_info->buf_pool_backup.free; + buf_pool->LRU = shm_info->buf_pool_backup.LRU; + buf_pool->LRU_old = shm_info->buf_pool_backup.LRU_old; + buf_pool->LRU_old_len = shm_info->buf_pool_backup.LRU_old_len; + buf_pool->unzip_LRU = shm_info->buf_pool_backup.unzip_LRU; + buf_pool->zip_clean = shm_info->buf_pool_backup.zip_clean; + for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) { + buf_pool->zip_free[i] = shm_info->buf_pool_backup.zip_free[i]; + } + + for (i = 0; i < chunk->size; i++, block++) { + if (buf_block_get_state(block) + == BUF_BLOCK_FILE_PAGE) { + ut_d(block->page.in_page_hash = TRUE); + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, + buf_page_address_fold( + block->page.space, + block->page.offset), + &block->page); + } + } + + for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; + b = UT_LIST_GET_NEXT(zip_list, b)) { + ut_ad(!b->in_flush_list); + ut_ad(b->in_LRU_list); + + ut_d(b->in_page_hash = TRUE); + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, + buf_page_address_fold(b->space, b->offset), b); + } + + for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; + b = UT_LIST_GET_NEXT(flush_list, b)) { + ut_ad(b->in_flush_list); + ut_ad(b->in_LRU_list); + + switch (buf_page_get_state(b)) { + case BUF_BLOCK_ZIP_DIRTY: + ut_d(b->in_page_hash = TRUE); + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, + buf_page_address_fold(b->space, + b->offset), b); + break; + case BUF_BLOCK_FILE_PAGE: + /* uncompressed page */ + break; + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + } + } + + + } + } + mutex_exit(&buf_pool->LRU_list_mutex); rw_lock_x_unlock(&buf_pool->page_hash_latch); buf_pool_mutex_exit(buf_pool); @@ -1373,6 +1809,42 @@ buf_chunk_t* chunk; buf_chunk_t* chunks; + if (srv_buffer_pool_shm_key) { + buf_shm_info_t* shm_info; + + ut_a(buf_pool->n_chunks == 1); + + chunk = buf_pool->chunks; + shm_info = chunk->mem; + ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t)); + + /* if opened, close shm. */ + if (!shm_info->clean) { + /* validation the shared memory segment doesn't have unrecoverable contents. */ + /* Currently, validation became not needed */ + shm_info->reusable = TRUE; + + memcpy(&(shm_info->buf_pool_backup), buf_pool, sizeof(buf_pool_t)); + memcpy(&(shm_info->chunk_backup), chunk, sizeof(buf_chunk_t)); + + if (srv_fast_shutdown < 2) { + if (srv_buffer_pool_shm_checksum) { + shm_info->checksum = + ut_fold_binary_32( + (byte*)chunk->mem + sizeof(buf_shm_info_t), + chunk->mem_size - sizeof(buf_shm_info_t)); + } else { + shm_info->checksum = BUF_NO_CHECKSUM_MAGIC; + } + shm_info->clean = TRUE; + } + + fprintf(stderr, + "InnoDB: The shared memory was closed.\n"); + } + + os_shm_free(chunk->mem, chunk->mem_size); + } else { chunks = buf_pool->chunks; chunk = chunks + buf_pool->n_chunks; @@ -1381,10 +1853,13 @@ would fail at shutdown. */ os_mem_free_large(chunk->mem, chunk->mem_size); } + } mem_free(buf_pool->chunks); hash_table_free(buf_pool->page_hash); + if (!srv_buffer_pool_shm_key) { hash_table_free(buf_pool->zip_hash); + } } /********************************************************************//** @@ -1668,6 +2143,11 @@ //buf_pool_mutex_enter(buf_pool); mutex_enter(&buf_pool->LRU_list_mutex); + if (srv_buffer_pool_shm_key) { + /* Cannot support shrink */ + goto func_done; + } + shrink_again: if (buf_pool->n_chunks <= 1) { @@ -1848,7 +2328,7 @@ zip_hash = hash_create(2 * buf_pool->curr_size); HASH_MIGRATE(buf_pool->zip_hash, zip_hash, buf_page_t, hash, - BUF_POOL_ZIP_FOLD_BPAGE); + buf_pool, BUF_POOL_ZIP_FOLD_BPAGE); hash_table_free(buf_pool->zip_hash); buf_pool->zip_hash = zip_hash; @@ -2130,6 +2610,11 @@ ulint change_size; ulint min_change_size = 1048576 * srv_buf_pool_instances; + if (srv_buffer_pool_shm_key) { + /* Cannot support resize */ + return; + } + buf_pool_mutex_enter_all(); if (srv_buf_pool_old_size == srv_buf_pool_size) { diff -ruN a/storage/innobase/ha/hash0hash.c b/storage/innobase/ha/hash0hash.c --- a/storage/innobase/ha/hash0hash.c 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/ha/hash0hash.c 2010-12-07 16:10:14.937749140 +0900 @@ -133,6 +133,70 @@ } /*************************************************************//** +*/ +UNIV_INTERN +ulint +hash_create_needed( +/*===============*/ + ulint n) +{ + ulint prime; + ulint offset; + + prime = ut_find_prime(n); + + offset = (sizeof(hash_table_t) + 7) / 8; + offset *= 8; + + return(offset + sizeof(hash_cell_t) * prime); +} + +UNIV_INTERN +void +hash_create_init( +/*=============*/ + hash_table_t* table, + ulint n) +{ + ulint prime; + ulint offset; + + prime = ut_find_prime(n); + + offset = (sizeof(hash_table_t) + 7) / 8; + offset *= 8; + + table->array = (hash_cell_t*)(((byte*)table) + offset); + table->n_cells = prime; +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + table->adaptive = FALSE; +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + table->n_mutexes = 0; + table->mutexes = NULL; + table->heaps = NULL; + table->heap = NULL; + ut_d(table->magic_n = HASH_TABLE_MAGIC_N); + + /* Initialize the cell array */ + hash_table_clear(table); +} + +UNIV_INTERN +void +hash_create_reuse( +/*==============*/ + hash_table_t* table) +{ + ulint offset; + + offset = (sizeof(hash_table_t) + 7) / 8; + offset *= 8; + + table->array = (hash_cell_t*)(((byte*)table) + offset); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); +} + +/*************************************************************//** Frees a hash table. */ UNIV_INTERN void diff -ruN a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc --- a/storage/innobase/handler/ha_innodb.cc 2010-12-06 20:16:21.733263627 +0900 +++ b/storage/innobase/handler/ha_innodb.cc 2010-12-07 17:56:28.316139830 +0900 @@ -194,6 +194,7 @@ static my_bool innobase_create_status_file = FALSE; static my_bool innobase_stats_on_metadata = TRUE; static my_bool innobase_use_sys_stats_table = FALSE; +static my_bool innobase_buffer_pool_shm_checksum = TRUE; static char* internal_innobase_data_file_path = NULL; @@ -2643,6 +2644,14 @@ srv_buf_pool_size = (ulint) innobase_buffer_pool_size; srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances; + if (srv_buffer_pool_shm_key && srv_buf_pool_instances > 1) { + fprintf(stderr, + "InnoDB: Warning: innodb_buffer_pool_shm_key cannot be used with several innodb_buffer_pool_instances.\n" + "InnoDB: innodb_buffer_pool_instances was set to 1.\n"); + srv_buf_pool_instances = 1; + innobase_buffer_pool_instances = 1; + } + srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size; srv_n_file_io_threads = (ulint) innobase_file_io_threads; @@ -2659,6 +2668,7 @@ srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; srv_use_checksums = (ibool) innobase_use_checksums; srv_fast_checksum = (ibool) innobase_fast_checksum; + srv_buffer_pool_shm_checksum = (ibool) innobase_buffer_pool_shm_checksum; #ifdef HAVE_LARGE_PAGES if ((os_use_large_pages = (ibool) my_use_large_pages)) @@ -11702,6 +11712,16 @@ "Number of buffer pool instances, set to higher value on high-end machines to increase scalability", NULL, NULL, 1L, 1L, MAX_BUFFER_POOLS, 1L); +static MYSQL_SYSVAR_UINT(buffer_pool_shm_key, srv_buffer_pool_shm_key, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "[experimental] The key value of shared memory segment for the buffer pool. 0 (default) disables the feature.", + NULL, NULL, 0, 0, INT_MAX32, 0); + +static MYSQL_SYSVAR_BOOL(buffer_pool_shm_checksum, innobase_buffer_pool_shm_checksum, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Enable buffer_pool_shm checksum validation (enabled by default).", + NULL, NULL, TRUE); + static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency, PLUGIN_VAR_RQCMDARG, "Helps in performance tuning in heavily concurrent environments.", @@ -12000,6 +12020,8 @@ MYSQL_SYSVAR(autoextend_increment), MYSQL_SYSVAR(buffer_pool_size), MYSQL_SYSVAR(buffer_pool_instances), + MYSQL_SYSVAR(buffer_pool_shm_key), + MYSQL_SYSVAR(buffer_pool_shm_checksum), MYSQL_SYSVAR(checksums), MYSQL_SYSVAR(fast_checksum), MYSQL_SYSVAR(commit_concurrency), diff -ruN a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h --- a/storage/innobase/include/buf0buf.h 2010-12-06 20:16:21.778264552 +0900 +++ b/storage/innobase/include/buf0buf.h 2010-12-07 17:56:28.322749380 +0900 @@ -36,6 +36,7 @@ #ifndef UNIV_HOTBACKUP #include "ut0rbt.h" #include "os0proc.h" +#include "srv0srv.h" /** @name Modes for buf_page_get_gen */ /* @{ */ @@ -1592,9 +1593,12 @@ /**********************************************************************//** Compute the hash fold value for blocks in buf_pool->zip_hash. */ /* @{ */ -#define BUF_POOL_ZIP_FOLD_PTR(ptr) ((ulint) (ptr) / UNIV_PAGE_SIZE) -#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame) -#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b)) +/* the fold should be relative when srv_buffer_pool_shm_key is enabled */ +#define BUF_POOL_ZIP_FOLD_PTR(bpool, ptr) (!srv_buffer_pool_shm_key\ + ?((ulint) (ptr) / UNIV_PAGE_SIZE)\ + :((ulint) ((byte*)ptr - (byte*)(buf_page_from_array(bpool, 0)->frame)) / UNIV_PAGE_SIZE)) +#define BUF_POOL_ZIP_FOLD(bpool, b) BUF_POOL_ZIP_FOLD_PTR(bpool, (b)->frame) +#define BUF_POOL_ZIP_FOLD_BPAGE(bpool, b) BUF_POOL_ZIP_FOLD(bpool, (buf_block_t*) (b)) /* @} */ /** A chunk of buffers. The buffer pool is allocated in chunks. */ diff -ruN a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h --- a/storage/innobase/include/hash0hash.h 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/include/hash0hash.h 2010-12-07 17:56:28.324726446 +0900 @@ -49,6 +49,28 @@ hash_create( /*========*/ ulint n); /*!< in: number of array cells */ + +/*************************************************************//** +*/ +UNIV_INTERN +ulint +hash_create_needed( +/*===============*/ + ulint n); + +UNIV_INTERN +void +hash_create_init( +/*=============*/ + hash_table_t* table, + ulint n); + +UNIV_INTERN +void +hash_create_reuse( +/*==============*/ + hash_table_t* table); + #ifndef UNIV_HOTBACKUP /*************************************************************//** Creates a mutex array to protect a hash table. */ @@ -306,7 +328,7 @@ /****************************************************************//** Move all hash table entries from OLD_TABLE to NEW_TABLE. */ -#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \ +#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, BPOOL, FOLD_FUNC) \ do {\ ulint i2222;\ ulint cell_count2222;\ @@ -318,7 +340,7 @@ \ while (node2222) {\ NODE_TYPE* next2222 = node2222->PTR_NAME;\ - ulint fold2222 = FOLD_FUNC(node2222);\ + ulint fold2222 = FOLD_FUNC(BPOOL, node2222);\ \ HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\ fold2222, node2222);\ @@ -327,6 +349,33 @@ }\ }\ } while (0) + +/********************************************************************//** +Align nodes with moving location.*/ +#define HASH_OFFSET(TABLE, NODE_TYPE, PTR_NAME, FADDR, FOFFSET, BOFFSET) \ +do {\ + ulint i2222;\ + ulint cell_count2222;\ +\ + cell_count2222 = hash_get_n_cells(TABLE);\ +\ + for (i2222 = 0; i2222 < cell_count2222; i2222++) {\ + NODE_TYPE* node2222;\ +\ + if ((TABLE)->array[i2222].node) \ + (TABLE)->array[i2222].node = (void*)((byte*)(TABLE)->array[i2222].node \ + + (((TABLE)->array[i2222].node > (void*)FADDR)?FOFFSET:BOFFSET));\ + node2222 = HASH_GET_FIRST((TABLE), i2222);\ +\ + while (node2222) {\ + if (node2222->PTR_NAME) \ + node2222->PTR_NAME = (void*)((byte*)(node2222->PTR_NAME) \ + + ((((void*)node2222->PTR_NAME) > (void*)FADDR)?FOFFSET:BOFFSET));\ +\ + node2222 = node2222->PTR_NAME;\ + }\ + }\ +} while (0) /************************************************************//** Gets the mutex index for a fold value in a hash table. diff -ruN a/storage/innobase/include/os0proc.h b/storage/innobase/include/os0proc.h --- a/storage/innobase/include/os0proc.h 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/include/os0proc.h 2010-12-07 16:10:14.955718750 +0900 @@ -32,6 +32,11 @@ #ifdef UNIV_LINUX #include #include +#else +# if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H +#include +#include +# endif #endif typedef void* os_process_t; @@ -70,6 +75,29 @@ ulint size); /*!< in: size returned by os_mem_alloc_large() */ + +/****************************************************************//** +Allocates or attaches and reuses shared memory segment. +The content is not cleared automatically. +@return allocated memory */ +UNIV_INTERN +void* +os_shm_alloc( +/*=========*/ + ulint* n, /*!< in/out: number of bytes */ + uint key, + ibool* is_new); + +/****************************************************************//** +Detach shared memory segment. */ +UNIV_INTERN +void +os_shm_free( +/*========*/ + void *ptr, /*!< in: pointer returned by + os_shm_alloc() */ + ulint size); /*!< in: size returned by + os_shm_alloc() */ #ifndef UNIV_NONINL #include "os0proc.ic" #endif diff -ruN a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h --- a/storage/innobase/include/srv0srv.h 2010-12-04 20:20:28.016566697 +0900 +++ b/storage/innobase/include/srv0srv.h 2010-12-07 16:10:14.956717659 +0900 @@ -171,6 +171,10 @@ extern ulint srv_mem_pool_size; extern ulint srv_lock_table_size; +extern uint srv_buffer_pool_shm_key; +extern ibool srv_buffer_pool_shm_is_reused; +extern ibool srv_buffer_pool_shm_checksum; + extern ibool srv_thread_concurrency_timer_based; extern ulint srv_n_file_io_threads; diff -ruN a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h --- a/storage/innobase/include/ut0lst.h 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/include/ut0lst.h 2010-12-07 16:10:14.957785525 +0900 @@ -257,5 +257,48 @@ ut_a(ut_list_node_313 == NULL); \ } while (0) +/********************************************************************//** +Align nodes with moving location. +@param NAME the name of the list +@param TYPE node type +@param BASE base node (not a pointer to it) +@param OFFSET offset moved */ +#define UT_LIST_OFFSET(NAME, TYPE, BASE, FADDR, FOFFSET, BOFFSET) \ +do { \ + ulint ut_list_i_313; \ + TYPE* ut_list_node_313; \ + \ + if ((BASE).start) \ + (BASE).start = (void*)((byte*)((BASE).start) \ + + (((void*)((BASE).start) > (void*)FADDR)?FOFFSET:BOFFSET));\ + if ((BASE).end) \ + (BASE).end = (void*)((byte*)((BASE).end) \ + + (((void*)((BASE).end) > (void*)FADDR)?FOFFSET:BOFFSET));\ + \ + ut_list_node_313 = (BASE).start; \ + \ + for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \ + ut_a(ut_list_node_313); \ + if ((ut_list_node_313->NAME).prev) \ + (ut_list_node_313->NAME).prev = (void*)((byte*)((ut_list_node_313->NAME).prev)\ + + (((void*)((ut_list_node_313->NAME).prev) > (void*)FADDR)?FOFFSET:BOFFSET));\ + if ((ut_list_node_313->NAME).next) \ + (ut_list_node_313->NAME).next = (void*)((byte*)((ut_list_node_313->NAME).next)\ + + (((void*)((ut_list_node_313->NAME).next)> (void*)FADDR)?FOFFSET:BOFFSET));\ + ut_list_node_313 = (ut_list_node_313->NAME).next; \ + } \ + \ + ut_a(ut_list_node_313 == NULL); \ + \ + ut_list_node_313 = (BASE).end; \ + \ + for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \ + ut_a(ut_list_node_313); \ + ut_list_node_313 = (ut_list_node_313->NAME).prev; \ + } \ + \ + ut_a(ut_list_node_313 == NULL); \ +} while (0) + #endif diff -ruN a/storage/innobase/log/log0recv.c b/storage/innobase/log/log0recv.c --- a/storage/innobase/log/log0recv.c 2010-12-04 19:46:40.212513377 +0900 +++ b/storage/innobase/log/log0recv.c 2010-12-07 16:10:14.959785817 +0900 @@ -2912,6 +2912,7 @@ /*==========================*/ { ut_a(!recv_needed_recovery); + ut_a(!srv_buffer_pool_shm_is_reused); recv_needed_recovery = TRUE; diff -ruN a/storage/innobase/os/os0proc.c b/storage/innobase/os/os0proc.c --- a/storage/innobase/os/os0proc.c 2010-11-03 07:01:13.000000000 +0900 +++ b/storage/innobase/os/os0proc.c 2010-12-07 16:10:14.960800123 +0900 @@ -229,3 +229,173 @@ } #endif } + +/****************************************************************//** +Allocates or attaches and reuses shared memory segment. +The content is not cleared automatically. +@return allocated memory */ +UNIV_INTERN +void* +os_shm_alloc( +/*=========*/ + ulint* n, /*!< in/out: number of bytes */ + uint key, + ibool* is_new) +{ + void* ptr; +#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H + ulint size; + int shmid; + + *is_new = FALSE; + fprintf(stderr, + "InnoDB: The shared memory segment containing the buffer pool is: key %#x (%d).\n", + key, key); +# if defined HAVE_LARGE_PAGES && defined UNIV_LINUX + if (!os_use_large_pages || !os_large_page_size) { + goto skip; + } + + /* Align block size to os_large_page_size */ + ut_ad(ut_is_2pow(os_large_page_size)); + size = ut_2pow_round(*n + (os_large_page_size - 1), + os_large_page_size); + + shmid = shmget((key_t)key, (size_t)size, + IPC_CREAT | IPC_EXCL | SHM_HUGETLB | SHM_R | SHM_W); + if (shmid < 0) { + if (errno == EEXIST) { + fprintf(stderr, + "InnoDB: HugeTLB: The shared memory segment exists.\n"); + shmid = shmget((key_t)key, (size_t)size, + SHM_HUGETLB | SHM_R | SHM_W); + if (shmid < 0) { + fprintf(stderr, + "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n", + size, errno); + goto skip; + } else { + fprintf(stderr, + "InnoDB: HugeTLB: The existent shared memory segment is used.\n"); + } + } else { + fprintf(stderr, + "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (new) errno %d\n", + size, errno); + goto skip; + } + } else { + *is_new = TRUE; + fprintf(stderr, + "InnoDB: HugeTLB: A new shared memory segment has been created .\n"); + } + + ptr = shmat(shmid, NULL, 0); + if (ptr == (void *)-1) { + fprintf(stderr, + "InnoDB: HugeTLB: Warning: Failed to attach shared memory segment, errno %d\n", + errno); + ptr = NULL; + } + + if (ptr) { + *n = size; + os_fast_mutex_lock(&ut_list_mutex); + ut_total_allocated_memory += size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_ALLOC(ptr, size); + return(ptr); + } +skip: + *is_new = FALSE; +# endif /* HAVE_LARGE_PAGES && defined UNIV_LINUX */ +# ifdef HAVE_GETPAGESIZE + size = getpagesize(); +# else + size = UNIV_PAGE_SIZE; +# endif + /* Align block size to system page size */ + ut_ad(ut_is_2pow(size)); + size = *n = ut_2pow_round(*n + (size - 1), size); + + shmid = shmget((key_t)key, (size_t)size, + IPC_CREAT | IPC_EXCL | SHM_R | SHM_W); + if (shmid < 0) { + if (errno == EEXIST) { + fprintf(stderr, + "InnoDB: A shared memory segment containing the buffer pool seems to already exist.\n"); + shmid = shmget((key_t)key, (size_t)size, + SHM_R | SHM_W); + if (shmid < 0) { + fprintf(stderr, + "InnoDB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n", + size, errno); + ptr = NULL; + goto end; + } else { + fprintf(stderr, + "InnoDB: The existent shared memory segment is used.\n"); + } + } else { + fprintf(stderr, + "InnoDB: Warning: Failed to allocate %lu bytes. (new) errno %d\n", + size, errno); + ptr = NULL; + goto end; + } + } else { + *is_new = TRUE; + fprintf(stderr, + "InnoDB: A new shared memory segment has been created.\n"); + } + + ptr = shmat(shmid, NULL, 0); + if (ptr == (void *)-1) { + fprintf(stderr, + "InnoDB: Warning: Failed to attach shared memory segment, errno %d\n", + errno); + ptr = NULL; + } + + if (ptr) { + *n = size; + os_fast_mutex_lock(&ut_list_mutex); + ut_total_allocated_memory += size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_ALLOC(ptr, size); + } +end: +#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */ + fprintf(stderr, "InnoDB: shared memory segment is not supported.\n"); + ptr = NULL; +#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */ + return(ptr); +} + +/****************************************************************//** +Detach shared memory segment. */ +UNIV_INTERN +void +os_shm_free( +/*========*/ + void *ptr, /*!< in: pointer returned by + os_shm_alloc() */ + ulint size) /*!< in: size returned by + os_shm_alloc() */ +{ + os_fast_mutex_lock(&ut_list_mutex); + ut_a(ut_total_allocated_memory >= size); + os_fast_mutex_unlock(&ut_list_mutex); + +#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H + if (!shmdt(ptr)) { + os_fast_mutex_lock(&ut_list_mutex); + ut_a(ut_total_allocated_memory >= size); + ut_total_allocated_memory -= size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_FREE(ptr, size); + } +#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */ + fprintf(stderr, "InnoDB: shared memory segment is not supported.\n"); +#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */ +} diff -ruN a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c --- a/storage/innobase/srv/srv0srv.c 2010-12-04 20:20:44.687550693 +0900 +++ b/storage/innobase/srv/srv0srv.c 2010-12-07 16:10:14.962785720 +0900 @@ -235,6 +235,11 @@ UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX; UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX; +/* key value for shm */ +UNIV_INTERN uint srv_buffer_pool_shm_key = 0; +UNIV_INTERN ibool srv_buffer_pool_shm_is_reused = FALSE; +UNIV_INTERN ibool srv_buffer_pool_shm_checksum = TRUE; + /* This parameter is deprecated. Use srv_n_io_[read|write]_threads instead. */ UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX; diff -ruN a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.c --- a/storage/innobase/srv/srv0start.c 2010-12-04 20:19:29.806482628 +0900 +++ b/storage/innobase/srv/srv0start.c 2010-12-07 16:10:14.964785346 +0900 @@ -1838,6 +1838,8 @@ Note that this is not as heavy weight as it seems. At this point there will be only ONE page in the buf_LRU and there must be no page in the buf_flush list. */ + /* buffer_pool_shm should not be reused when recovery was needed. */ + if (!srv_buffer_pool_shm_is_reused) buf_pool_invalidate(); /* We always try to do a recovery, even if the database had