1 # name : innodb_buffer_pool_shm.patch
3 # maintainer : Yasufumi
6 # Any small change to this file in the main branch
7 # should be done or reviewed by the maintainer!
8 diff -ruN a/storage/innobase/buf/buf0buddy.c b/storage/innobase/buf/buf0buddy.c
9 --- a/storage/innobase/buf/buf0buddy.c 2010-12-04 19:46:39.372513543 +0900
10 +++ b/storage/innobase/buf/buf0buddy.c 2010-12-07 17:56:28.302087851 +0900
12 void* buf, /*!< in: buffer frame to deallocate */
13 ibool have_page_hash_mutex)
15 - const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf);
16 + const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf_pool, buf);
21 buf_block_t* block) /*!< in: buffer frame to allocate */
23 buf_pool_t* buf_pool = buf_pool_from_block(block);
24 - const ulint fold = BUF_POOL_ZIP_FOLD(block);
25 + const ulint fold = BUF_POOL_ZIP_FOLD(buf_pool, block);
26 //ut_ad(buf_pool_mutex_own(buf_pool));
27 ut_ad(!mutex_own(&buf_pool->zip_mutex));
28 ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE);
29 diff -ruN a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c
30 --- a/storage/innobase/buf/buf0buf.c 2010-12-06 20:16:21.726195340 +0900
31 +++ b/storage/innobase/buf/buf0buf.c 2010-12-07 20:40:30.824749814 +0900
35 #include "srv0start.h"
37 +#include "read0read.h"
39 +#include "ha_prototypes.h"
41 /* prototypes for new functions added to ha_innodb.cc */
42 trx_t* innobase_get_trx();
44 was allocated for the frames */
45 buf_block_t* blocks; /*!< array of buffer control blocks */
48 +/* Buffer pool shared memory segment information */
49 +typedef struct buf_shm_info_struct buf_shm_info_t;
51 +struct buf_shm_info_struct {
54 + ibool is_new; /* during initializing */
55 + ibool clean; /* clean shutdowned and free */
56 + ibool reusable; /* reusable */
57 + ulint buf_pool_size; /* backup value */
58 + ulint page_size; /* backup value */
59 + ulint frame_offset; /* offset of the first frame based on chunk->mem */
60 + ulint zip_hash_offset;
65 + buf_pool_t buf_pool_backup;
66 + buf_chunk_t chunk_backup;
71 +#define BUF_SHM_INFO_HEAD "XTRA_SHM"
72 #endif /* !UNIV_HOTBACKUP */
74 /********************************************************************//**
76 #endif /* UNIV_SYNC_DEBUG */
84 + ptrdiff_t frame_offset)
87 + block->frame += frame_offset;
89 + UNIV_MEM_DESC(block->frame, UNIV_PAGE_SIZE, block);
91 + block->index = NULL;
92 + block->btr_search_latch = NULL;
95 + /* recreate later */
96 + block->page.in_page_hash = FALSE;
97 + block->page.in_zip_hash = FALSE;
98 +#endif /* UNIV_DEBUG */
100 +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
101 + block->n_pointers = 0;
102 +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
104 + if (block->page.zip.data)
105 + block->page.zip.data += frame_offset;
107 + block->is_hashed = FALSE;
109 +#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC
110 + /* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration
111 + of buffer block mutex/rwlock with performance schema. If
112 + PFS_GROUP_BUFFER_SYNC is defined, skip the registration
113 + since buffer block mutex/rwlock will be registered later in
114 + pfs_register_buffer_block() */
116 + mutex_create(PFS_NOT_INSTRUMENTED, &block->mutex, SYNC_BUF_BLOCK);
117 + rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);
118 +#else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
119 + mutex_create(buffer_block_mutex_key, &block->mutex, SYNC_BUF_BLOCK);
120 + rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING);
121 +#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
123 + ut_ad(rw_lock_validate(&(block->lock)));
125 +#ifdef UNIV_SYNC_DEBUG
126 + rw_lock_create(buf_block_debug_latch_key,
127 + &block->debug_latch, SYNC_NO_ORDER_CHECK);
128 +#endif /* UNIV_SYNC_DEBUG */
131 /********************************************************************//**
132 Allocates a chunk of buffer frames.
133 @return chunk, or NULL on failure */
134 @@ -1001,26 +1082,188 @@
138 + ulint zip_hash_n = 0;
139 + ulint zip_hash_mem_size = 0;
140 + hash_table_t* zip_hash_tmp = NULL;
142 + buf_shm_info_t* shm_info = NULL;
144 /* Round down to a multiple of page size,
145 although it already should be. */
146 mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
148 + srv_buffer_pool_shm_is_reused = FALSE;
150 + if (srv_buffer_pool_shm_key) {
151 + /* zip_hash size */
152 + zip_hash_n = (mem_size / UNIV_PAGE_SIZE) * 2;
153 + zip_hash_mem_size = ut_2pow_round(hash_create_needed(zip_hash_n)
154 + + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
157 /* Reserve space for the block descriptors. */
158 mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block)
159 + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
160 + if (srv_buffer_pool_shm_key) {
161 + mem_size += ut_2pow_round(sizeof(buf_shm_info_t)
162 + + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
163 + mem_size += zip_hash_mem_size;
166 chunk->mem_size = mem_size;
168 + if (srv_buffer_pool_shm_key) {
172 + ut_a(buf_pool->n_chunks == 1);
175 + "InnoDB: Notice: The innodb_buffer_pool_shm_key option has been specified.\n"
176 + "InnoDB: Do not change the following between restarts of the server while this option is being used:\n"
177 + "InnoDB: * the mysqld executable between restarts of the server.\n"
178 + "InnoDB: * the value of innodb_buffer_pool_size.\n"
179 + "InnoDB: * the value of innodb_page_size.\n"
180 + "InnoDB: * datafiles created by InnoDB during this session.\n"
181 + "InnoDB: Otherwise, data corruption in datafiles may result.\n");
183 + /* FIXME: This id is still vague */
184 + binary_id = (ulint) ((byte*)mtr_commit - (byte*)btr_root_get)
185 + + (ulint) ((byte*)os_get_os_version - (byte*)buf_calc_page_new_checksum)
186 + + (ulint) ((byte*)page_dir_find_owner_slot - (byte*)dfield_data_is_binary_equal)
187 + + (ulint) ((byte*)que_graph_publish - (byte*)dict_casedn_str)
188 + + (ulint) ((byte*)read_view_oldest_copy_or_open_new - (byte*)fil_space_get_version)
189 + + (ulint) ((byte*)rec_get_n_extern_new - (byte*)fsp_get_size_low)
190 + + (ulint) ((byte*)row_get_trx_id_offset - (byte*)ha_create_func)
191 + + (ulint) ((byte*)srv_set_io_thread_op_info - (byte*)thd_is_replication_slave_thread)
192 + + (ulint) ((byte*)mutex_create_func - (byte*)ibuf_inside)
193 + + (ulint) ((byte*)trx_set_detailed_error - (byte*)lock_check_trx_id_sanity)
194 + + (ulint) ((byte*)ut_time - (byte*)mem_heap_strdup);
196 + chunk->mem = os_shm_alloc(&chunk->mem_size, srv_buffer_pool_shm_key, &is_new);
198 + if (UNIV_UNLIKELY(chunk->mem == NULL)) {
202 +#ifdef UNIV_SET_MEM_TO_ZERO
204 + memset(chunk->mem, '\0', chunk->mem_size);
207 + /* for ut_fold_binary_32(), these values should be 32-bit aligned */
208 + ut_a(sizeof(buf_shm_info_t) % 4 == 0);
209 + ut_a((ulint)chunk->mem % 4 == 0);
210 + ut_a(chunk->mem_size % 4 == 0);
212 + shm_info = chunk->mem;
214 + zip_hash_tmp = (hash_table_t*)((byte*)chunk->mem + chunk->mem_size - zip_hash_mem_size);
217 + strncpy(shm_info->head_str, BUF_SHM_INFO_HEAD, 8);
218 + shm_info->binary_id = binary_id;
219 + shm_info->is_new = TRUE; /* changed to FALSE when the initialization is finished */
220 + shm_info->clean = FALSE; /* changed to TRUE when free the segment. */
221 + shm_info->reusable = FALSE; /* changed to TRUE when validation is finished. */
222 + shm_info->buf_pool_size = srv_buf_pool_size;
223 + shm_info->page_size = srv_page_size;
224 + shm_info->zip_hash_offset = chunk->mem_size - zip_hash_mem_size;
225 + shm_info->zip_hash_n = zip_hash_n;
229 + if (strncmp(shm_info->head_str, BUF_SHM_INFO_HEAD, 8)) {
231 + "InnoDB: Error: The shared memory segment seems not to be for buffer pool.\n");
234 + if (shm_info->binary_id != binary_id) {
236 + "InnoDB: Error: The shared memory segment seems not to be for this binary.\n");
239 + if (shm_info->is_new) {
241 + "InnoDB: Error: The shared memory was not initialized yet.\n");
244 + if (shm_info->buf_pool_size != srv_buf_pool_size) {
246 + "InnoDB: Error: srv_buf_pool_size is different (shm=%lu current=%lu).\n",
247 + shm_info->buf_pool_size, srv_buf_pool_size);
250 + if (shm_info->page_size != srv_page_size) {
252 + "InnoDB: Error: srv_page_size is different (shm=%lu current=%lu).\n",
253 + shm_info->page_size, srv_page_size);
256 + if (!shm_info->reusable) {
258 + "InnoDB: Warning: The shared memory has unrecoverable contents.\n"
259 + "InnoDB: The shared memory segment is initialized.\n");
263 + if (!shm_info->clean) {
265 + "InnoDB: Warning: The shared memory was not shut down cleanly.\n"
266 + "InnoDB: The shared memory segment is initialized.\n");
271 + ut_a(shm_info->zip_hash_offset == chunk->mem_size - zip_hash_mem_size);
272 + ut_a(shm_info->zip_hash_n == zip_hash_n);
274 + /* check checksum */
275 + if (srv_buffer_pool_shm_checksum) {
276 + checksum = ut_fold_binary_32((byte*)chunk->mem + sizeof(buf_shm_info_t),
277 + chunk->mem_size - sizeof(buf_shm_info_t));
279 + checksum = BUF_NO_CHECKSUM_MAGIC;
282 + if (shm_info->checksum != BUF_NO_CHECKSUM_MAGIC
283 + && shm_info->checksum != checksum) {
285 + "InnoDB: Error: checksum of the shared memory is not match. "
286 + "(stored=%lu calculated=%lu)\n",
287 + shm_info->checksum, checksum);
291 + /* flag to use the segment. */
292 + shm_info->clean = FALSE; /* changed to TRUE when free the segment. */
295 + /* init zip_hash contents */
297 + hash_create_init(zip_hash_tmp, zip_hash_n);
299 + /* offset adjustment is done later */
300 + hash_create_reuse(zip_hash_tmp);
302 + srv_buffer_pool_shm_is_reused = TRUE;
305 chunk->mem = os_mem_alloc_large(&chunk->mem_size);
307 if (UNIV_UNLIKELY(chunk->mem == NULL)) {
313 /* Allocate the block descriptors from
314 the start of the memory block. */
315 + if (srv_buffer_pool_shm_key) {
316 + chunk->blocks = (buf_block_t*)((byte*)chunk->mem + sizeof(buf_shm_info_t));
318 chunk->blocks = chunk->mem;
321 /* Align a pointer to the first frame. Note that when
322 os_large_page_size is smaller than UNIV_PAGE_SIZE,
323 @@ -1028,8 +1271,13 @@
324 it is bigger, we may allocate more blocks than requested. */
326 frame = ut_align(chunk->mem, UNIV_PAGE_SIZE);
327 + if (srv_buffer_pool_shm_key) {
328 + /* reserve zip_hash space and always -1 for reproducibility */
329 + chunk->size = (chunk->mem_size - zip_hash_mem_size) / UNIV_PAGE_SIZE - 1;
331 chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
332 - (frame != chunk->mem);
335 /* Subtract the space needed for block descriptors. */
337 @@ -1043,6 +1291,98 @@
341 + if (shm_info && !(shm_info->is_new)) {
342 + /* convert the shared memory segment for reuse */
343 + ptrdiff_t phys_offset;
344 + ptrdiff_t logi_offset;
345 + ptrdiff_t blocks_offset;
346 + void* previous_frame_address;
348 + if (chunk->size < shm_info->chunk_backup.size) {
350 + "InnoDB: Error: The buffer pool became smaller because of allocated address.\n"
351 + "InnoDB: Retrying may avoid this situation.\n");
352 + shm_info->clean = TRUE; /* release the flag for retrying */
356 + chunk->size = shm_info->chunk_backup.size;
357 + phys_offset = frame - ((byte*)chunk->mem + shm_info->frame_offset);
358 + logi_offset = frame - chunk->blocks[0].frame;
359 + previous_frame_address = chunk->blocks[0].frame;
360 + blocks_offset = (byte*)chunk->blocks - (byte*)shm_info->chunk_backup.blocks;
362 + if (phys_offset || logi_offset || blocks_offset) {
364 + "InnoDB: Buffer pool in the shared memory segment should be converted.\n"
365 + "InnoDB: Previous frames in address : %p\n"
366 + "InnoDB: Previous frames were located : %p\n"
367 + "InnoDB: Current frames should be located: %p\n"
368 + "InnoDB: Pysical offset : %ld (%#lx)\n"
369 + "InnoDB: Logical offset (frames) : %ld (%#lx)\n"
370 + "InnoDB: Logical offset (blocks) : %ld (%#lx)\n",
371 + (byte*)chunk->mem + shm_info->frame_offset,
372 + chunk->blocks[0].frame, frame,
373 + phys_offset, phys_offset, logi_offset, logi_offset,
374 + blocks_offset, blocks_offset);
377 + "InnoDB: Buffer pool in the shared memory segment can be used as it is.\n");
382 + "InnoDB: Aligning physical offset...");
384 + memmove(frame, (byte*)chunk->mem + shm_info->frame_offset,
385 + chunk->size * UNIV_PAGE_SIZE);
392 + block = chunk->blocks;
393 + for (i = chunk->size; i--; ) {
394 + buf_block_reuse(block, logi_offset);
398 + if (logi_offset || blocks_offset) {
400 + "InnoDB: Aligning logical offset...");
403 + /* buf_pool_t buf_pool_backup */
404 + UT_LIST_OFFSET(flush_list, buf_page_t, shm_info->buf_pool_backup.flush_list,
405 + previous_frame_address, logi_offset, blocks_offset);
406 + UT_LIST_OFFSET(free, buf_page_t, shm_info->buf_pool_backup.free,
407 + previous_frame_address, logi_offset, blocks_offset);
408 + UT_LIST_OFFSET(LRU, buf_page_t, shm_info->buf_pool_backup.LRU,
409 + previous_frame_address, logi_offset, blocks_offset);
410 + if (shm_info->buf_pool_backup.LRU_old)
411 + shm_info->buf_pool_backup.LRU_old =
412 + (buf_page_t*)((byte*)(shm_info->buf_pool_backup.LRU_old)
413 + + (((void*)shm_info->buf_pool_backup.LRU_old > previous_frame_address)
414 + ? logi_offset : blocks_offset));
416 + UT_LIST_OFFSET(unzip_LRU, buf_block_t, shm_info->buf_pool_backup.unzip_LRU,
417 + previous_frame_address, logi_offset, blocks_offset);
419 + UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_clean,
420 + previous_frame_address, logi_offset, blocks_offset);
421 + for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) {
422 + UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_free[i],
423 + previous_frame_address, logi_offset, blocks_offset);
426 + HASH_OFFSET(zip_hash_tmp, buf_page_t, hash,
427 + previous_frame_address, logi_offset, blocks_offset);
433 /* Init block structs and assign frames for them. Then we
434 assign the frames to the first blocks (we already mapped the
436 @@ -1068,6 +1408,11 @@
438 frame += UNIV_PAGE_SIZE;
443 + shm_info->frame_offset = chunk->blocks[0].frame - (byte*)chunk->mem;
446 #ifdef PFS_GROUP_BUFFER_SYNC
447 pfs_register_buffer_block(chunk);
448 @@ -1249,6 +1594,8 @@
449 UNIV_MEM_UNDESC(block);
452 + ut_a(!srv_buffer_pool_shm_key);
454 os_mem_free_large(chunk->mem, chunk->mem_size);
457 @@ -1289,7 +1636,7 @@
458 ulint instance_no) /*!< in: id of the instance */
461 - buf_chunk_t* chunk;
462 + buf_chunk_t* chunk = NULL;
464 /* 1. Initialize general fields
465 ------------------------------- */
466 @@ -1335,7 +1682,10 @@
467 buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
469 buf_pool->page_hash = hash_create(2 * buf_pool->curr_size);
470 + /* zip_hash is allocated to shm when srv_buffer_pool_shm_key is enabled */
471 + if (!srv_buffer_pool_shm_key) {
472 buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
475 buf_pool->last_printout_time = ut_time();
477 @@ -1354,6 +1704,86 @@
479 /* All fields are initialized by mem_zalloc(). */
481 + if (chunk && srv_buffer_pool_shm_key) {
482 + buf_shm_info_t* shm_info;
484 + ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t));
485 + shm_info = chunk->mem;
487 + buf_pool->zip_hash = (hash_table_t*)((byte*)chunk->mem + shm_info->zip_hash_offset);
489 + if(shm_info->is_new) {
490 + shm_info->is_new = FALSE; /* initialization was finished */
492 + buf_block_t* block = chunk->blocks;
495 + /* shm_info->buf_pool_backup should have been converted */
496 + /* at buf_chunk_init(), so simply copy it. */
497 + buf_pool->flush_list = shm_info->buf_pool_backup.flush_list;
498 + buf_pool->freed_page_clock = shm_info->buf_pool_backup.freed_page_clock;
499 + buf_pool->free = shm_info->buf_pool_backup.free;
500 + buf_pool->LRU = shm_info->buf_pool_backup.LRU;
501 + buf_pool->LRU_old = shm_info->buf_pool_backup.LRU_old;
502 + buf_pool->LRU_old_len = shm_info->buf_pool_backup.LRU_old_len;
503 + buf_pool->unzip_LRU = shm_info->buf_pool_backup.unzip_LRU;
504 + buf_pool->zip_clean = shm_info->buf_pool_backup.zip_clean;
505 + for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) {
506 + buf_pool->zip_free[i] = shm_info->buf_pool_backup.zip_free[i];
509 + for (i = 0; i < chunk->size; i++, block++) {
510 + if (buf_block_get_state(block)
511 + == BUF_BLOCK_FILE_PAGE) {
512 + ut_d(block->page.in_page_hash = TRUE);
513 + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
514 + buf_page_address_fold(
516 + block->page.offset),
521 + for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
522 + b = UT_LIST_GET_NEXT(zip_list, b)) {
523 + ut_ad(!b->in_flush_list);
524 + ut_ad(b->in_LRU_list);
526 + ut_d(b->in_page_hash = TRUE);
527 + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
528 + buf_page_address_fold(b->space, b->offset), b);
531 + for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
532 + b = UT_LIST_GET_NEXT(flush_list, b)) {
533 + ut_ad(b->in_flush_list);
534 + ut_ad(b->in_LRU_list);
536 + switch (buf_page_get_state(b)) {
537 + case BUF_BLOCK_ZIP_DIRTY:
538 + ut_d(b->in_page_hash = TRUE);
539 + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
540 + buf_page_address_fold(b->space,
543 + case BUF_BLOCK_FILE_PAGE:
544 + /* uncompressed page */
546 + case BUF_BLOCK_ZIP_FREE:
547 + case BUF_BLOCK_ZIP_PAGE:
548 + case BUF_BLOCK_NOT_USED:
549 + case BUF_BLOCK_READY_FOR_USE:
550 + case BUF_BLOCK_MEMORY:
551 + case BUF_BLOCK_REMOVE_HASH:
561 mutex_exit(&buf_pool->LRU_list_mutex);
562 rw_lock_x_unlock(&buf_pool->page_hash_latch);
563 buf_pool_mutex_exit(buf_pool);
564 @@ -1373,6 +1803,42 @@
568 + if (srv_buffer_pool_shm_key) {
569 + buf_shm_info_t* shm_info;
571 + ut_a(buf_pool->n_chunks == 1);
573 + chunk = buf_pool->chunks;
574 + shm_info = chunk->mem;
575 + ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t));
577 + /* if opened, close shm. */
578 + if (!shm_info->clean) {
579 + /* validate that the shared memory segment doesn't have unrecoverable contents. */
580 + /* Currently, validation became not needed */
581 + shm_info->reusable = TRUE;
583 + memcpy(&(shm_info->buf_pool_backup), buf_pool, sizeof(buf_pool_t));
584 + memcpy(&(shm_info->chunk_backup), chunk, sizeof(buf_chunk_t));
586 + if (srv_fast_shutdown < 2) {
587 + if (srv_buffer_pool_shm_checksum) {
588 + shm_info->checksum =
590 + (byte*)chunk->mem + sizeof(buf_shm_info_t),
591 + chunk->mem_size - sizeof(buf_shm_info_t));
593 + shm_info->checksum = BUF_NO_CHECKSUM_MAGIC;
595 + shm_info->clean = TRUE;
599 + "InnoDB: The shared memory was closed.\n");
602 + os_shm_free(chunk->mem, chunk->mem_size);
604 chunks = buf_pool->chunks;
605 chunk = chunks + buf_pool->n_chunks;
607 @@ -1381,10 +1847,13 @@
608 would fail at shutdown. */
609 os_mem_free_large(chunk->mem, chunk->mem_size);
613 mem_free(buf_pool->chunks);
614 hash_table_free(buf_pool->page_hash);
615 + if (!srv_buffer_pool_shm_key) {
616 hash_table_free(buf_pool->zip_hash);
620 /********************************************************************//**
621 @@ -1668,6 +2137,11 @@
622 //buf_pool_mutex_enter(buf_pool);
623 mutex_enter(&buf_pool->LRU_list_mutex);
625 + if (srv_buffer_pool_shm_key) {
626 + /* Cannot support shrink */
631 if (buf_pool->n_chunks <= 1) {
633 @@ -1848,7 +2322,7 @@
634 zip_hash = hash_create(2 * buf_pool->curr_size);
636 HASH_MIGRATE(buf_pool->zip_hash, zip_hash, buf_page_t, hash,
637 - BUF_POOL_ZIP_FOLD_BPAGE);
638 + buf_pool, BUF_POOL_ZIP_FOLD_BPAGE);
640 hash_table_free(buf_pool->zip_hash);
641 buf_pool->zip_hash = zip_hash;
642 @@ -2130,6 +2604,11 @@
644 ulint min_change_size = 1048576 * srv_buf_pool_instances;
646 + if (srv_buffer_pool_shm_key) {
647 + /* Cannot support resize */
651 buf_pool_mutex_enter_all();
653 if (srv_buf_pool_old_size == srv_buf_pool_size) {
654 diff -ruN a/storage/innobase/ha/hash0hash.c b/storage/innobase/ha/hash0hash.c
655 --- a/storage/innobase/ha/hash0hash.c 2010-11-03 07:01:13.000000000 +0900
656 +++ b/storage/innobase/ha/hash0hash.c 2010-12-07 16:10:14.937749140 +0900
660 /*************************************************************//**
671 + prime = ut_find_prime(n);
673 + offset = (sizeof(hash_table_t) + 7) / 8;
676 + return(offset + sizeof(hash_cell_t) * prime);
683 + hash_table_t* table,
689 + prime = ut_find_prime(n);
691 + offset = (sizeof(hash_table_t) + 7) / 8;
694 + table->array = (hash_cell_t*)(((byte*)table) + offset);
695 + table->n_cells = prime;
696 +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
697 + table->adaptive = FALSE;
698 +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
699 + table->n_mutexes = 0;
700 + table->mutexes = NULL;
701 + table->heaps = NULL;
702 + table->heap = NULL;
703 + ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
705 + /* Initialize the cell array */
706 + hash_table_clear(table);
713 + hash_table_t* table)
717 + offset = (sizeof(hash_table_t) + 7) / 8;
720 + table->array = (hash_cell_t*)(((byte*)table) + offset);
721 + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
724 +/*************************************************************//**
725 Frees a hash table. */
728 diff -ruN a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
729 --- a/storage/innobase/handler/ha_innodb.cc 2010-12-06 20:16:21.733263627 +0900
730 +++ b/storage/innobase/handler/ha_innodb.cc 2010-12-07 17:56:28.316139830 +0900
732 static my_bool innobase_create_status_file = FALSE;
733 static my_bool innobase_stats_on_metadata = TRUE;
734 static my_bool innobase_use_sys_stats_table = FALSE;
735 +static my_bool innobase_buffer_pool_shm_checksum = TRUE;
738 static char* internal_innobase_data_file_path = NULL;
739 @@ -2620,6 +2621,14 @@
740 srv_buf_pool_size = (ulint) innobase_buffer_pool_size;
741 srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances;
743 + if (srv_buffer_pool_shm_key && srv_buf_pool_instances > 1) {
745 + "InnoDB: Warning: innodb_buffer_pool_shm_key cannot be used with several innodb_buffer_pool_instances.\n"
746 + "InnoDB: innodb_buffer_pool_instances was set to 1.\n");
747 + srv_buf_pool_instances = 1;
748 + innobase_buffer_pool_instances = 1;
751 srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
753 srv_n_file_io_threads = (ulint) innobase_file_io_threads;
754 @@ -2636,6 +2645,7 @@
755 srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
756 srv_use_checksums = (ibool) innobase_use_checksums;
757 srv_fast_checksum = (ibool) innobase_fast_checksum;
758 + srv_buffer_pool_shm_checksum = (ibool) innobase_buffer_pool_shm_checksum;
760 #ifdef HAVE_LARGE_PAGES
761 if ((os_use_large_pages = (ibool) my_use_large_pages))
762 @@ -11642,6 +11652,16 @@
763 "Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
764 NULL, NULL, 1L, 1L, MAX_BUFFER_POOLS, 1L);
766 +static MYSQL_SYSVAR_UINT(buffer_pool_shm_key, srv_buffer_pool_shm_key,
767 + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
768 + "[experimental] The key value of shared memory segment for the buffer pool. 0 (default) disables the feature.",
769 + NULL, NULL, 0, 0, INT_MAX32, 0);
771 +static MYSQL_SYSVAR_BOOL(buffer_pool_shm_checksum, innobase_buffer_pool_shm_checksum,
772 + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
773 + "Enable buffer_pool_shm checksum validation (enabled by default).",
776 static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency,
778 "Helps in performance tuning in heavily concurrent environments.",
779 @@ -11921,6 +11941,8 @@
780 MYSQL_SYSVAR(autoextend_increment),
781 MYSQL_SYSVAR(buffer_pool_size),
782 MYSQL_SYSVAR(buffer_pool_instances),
783 + MYSQL_SYSVAR(buffer_pool_shm_key),
784 + MYSQL_SYSVAR(buffer_pool_shm_checksum),
785 MYSQL_SYSVAR(checksums),
786 MYSQL_SYSVAR(fast_checksum),
787 MYSQL_SYSVAR(commit_concurrency),
788 diff -ruN a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
789 --- a/storage/innobase/include/buf0buf.h 2010-12-06 20:16:21.778264552 +0900
790 +++ b/storage/innobase/include/buf0buf.h 2010-12-07 17:56:28.322749380 +0900
792 #ifndef UNIV_HOTBACKUP
795 +#include "srv0srv.h"
797 /** @name Modes for buf_page_get_gen */
799 @@ -1520,9 +1521,12 @@
800 /**********************************************************************//**
801 Compute the hash fold value for blocks in buf_pool->zip_hash. */
803 -#define BUF_POOL_ZIP_FOLD_PTR(ptr) ((ulint) (ptr) / UNIV_PAGE_SIZE)
804 -#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame)
805 -#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
806 +/* the fold should be relative when srv_buffer_pool_shm_key is enabled */
807 +#define BUF_POOL_ZIP_FOLD_PTR(bpool, ptr) (!srv_buffer_pool_shm_key\
808 + ?((ulint) (ptr) / UNIV_PAGE_SIZE)\
809 + :((ulint) ((byte*)ptr - (byte*)(buf_page_from_array(bpool, 0)->frame)) / UNIV_PAGE_SIZE))
810 +#define BUF_POOL_ZIP_FOLD(bpool, b) BUF_POOL_ZIP_FOLD_PTR(bpool, (b)->frame)
811 +#define BUF_POOL_ZIP_FOLD_BPAGE(bpool, b) BUF_POOL_ZIP_FOLD(bpool, (buf_block_t*) (b))
814 /** @brief The buffer pool statistics structure. */
815 diff -ruN a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
816 --- a/storage/innobase/include/hash0hash.h 2010-11-03 07:01:13.000000000 +0900
817 +++ b/storage/innobase/include/hash0hash.h 2010-12-07 17:56:28.324726446 +0900
821 ulint n); /*!< in: number of array cells */
823 +/*************************************************************//**
835 + hash_table_t* table,
842 + hash_table_t* table);
844 #ifndef UNIV_HOTBACKUP
845 /*************************************************************//**
846 Creates a mutex array to protect a hash table. */
848 /****************************************************************//**
849 Move all hash table entries from OLD_TABLE to NEW_TABLE. */
851 -#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \
852 +#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, BPOOL, FOLD_FUNC) \
855 ulint cell_count2222;\
859 NODE_TYPE* next2222 = node2222->PTR_NAME;\
860 - ulint fold2222 = FOLD_FUNC(node2222);\
861 + ulint fold2222 = FOLD_FUNC(BPOOL, node2222);\
863 HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\
864 fold2222, node2222);\
870 +/********************************************************************//**
871 +Align nodes with moving location.*/
872 +#define HASH_OFFSET(TABLE, NODE_TYPE, PTR_NAME, FADDR, FOFFSET, BOFFSET) \
875 + ulint cell_count2222;\
877 + cell_count2222 = hash_get_n_cells(TABLE);\
879 + for (i2222 = 0; i2222 < cell_count2222; i2222++) {\
880 + NODE_TYPE* node2222;\
882 + if ((TABLE)->array[i2222].node) \
883 + (TABLE)->array[i2222].node = (void*)((byte*)(TABLE)->array[i2222].node \
884 + + (((TABLE)->array[i2222].node > (void*)FADDR)?FOFFSET:BOFFSET));\
885 + node2222 = HASH_GET_FIRST((TABLE), i2222);\
887 + while (node2222) {\
888 + if (node2222->PTR_NAME) \
889 + node2222->PTR_NAME = (void*)((byte*)(node2222->PTR_NAME) \
890 + + ((((void*)node2222->PTR_NAME) > (void*)FADDR)?FOFFSET:BOFFSET));\
892 + node2222 = node2222->PTR_NAME;\
897 /************************************************************//**
898 Gets the mutex index for a fold value in a hash table.
899 diff -ruN a/storage/innobase/include/os0proc.h b/storage/innobase/include/os0proc.h
900 --- a/storage/innobase/include/os0proc.h 2010-11-03 07:01:13.000000000 +0900
901 +++ b/storage/innobase/include/os0proc.h 2010-12-07 16:10:14.955718750 +0900
907 +# if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
908 +#include <sys/ipc.h>
909 +#include <sys/shm.h>
913 typedef void* os_process_t;
915 ulint size); /*!< in: size returned by
916 os_mem_alloc_large() */
919 +/****************************************************************//**
920 +Allocates or attaches and reuses shared memory segment.
921 +The content is not cleared automatically.
922 +@return allocated memory */
927 + ulint* n, /*!< in/out: number of bytes */
931 +/****************************************************************//**
932 +Detach shared memory segment. */
937 + void *ptr, /*!< in: pointer returned by
939 + ulint size); /*!< in: size returned by
942 #include "os0proc.ic"
944 diff -ruN a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
945 --- a/storage/innobase/include/srv0srv.h 2010-12-04 20:20:28.016566697 +0900
946 +++ b/storage/innobase/include/srv0srv.h 2010-12-07 16:10:14.956717659 +0900
948 extern ulint srv_mem_pool_size;
949 extern ulint srv_lock_table_size;
951 +extern uint srv_buffer_pool_shm_key;
952 +extern ibool srv_buffer_pool_shm_is_reused;
953 +extern ibool srv_buffer_pool_shm_checksum;
955 extern ibool srv_thread_concurrency_timer_based;
957 extern ulint srv_n_file_io_threads;
958 diff -ruN a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h
959 --- a/storage/innobase/include/ut0lst.h 2010-11-03 07:01:13.000000000 +0900
960 +++ b/storage/innobase/include/ut0lst.h 2010-12-07 16:10:14.957785525 +0900
962 ut_a(ut_list_node_313 == NULL); \
965 +/********************************************************************//**
966 +Align nodes with moving location.
967 +@param NAME the name of the list
968 +@param TYPE node type
969 +@param BASE base node (not a pointer to it)
970 +@param OFFSET offset moved */
971 +#define UT_LIST_OFFSET(NAME, TYPE, BASE, FADDR, FOFFSET, BOFFSET) \
973 + ulint ut_list_i_313; \
974 + TYPE* ut_list_node_313; \
976 + if ((BASE).start) \
977 + (BASE).start = (void*)((byte*)((BASE).start) \
978 + + (((void*)((BASE).start) > (void*)FADDR)?FOFFSET:BOFFSET));\
980 + (BASE).end = (void*)((byte*)((BASE).end) \
981 + + (((void*)((BASE).end) > (void*)FADDR)?FOFFSET:BOFFSET));\
983 + ut_list_node_313 = (BASE).start; \
985 + for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \
986 + ut_a(ut_list_node_313); \
987 + if ((ut_list_node_313->NAME).prev) \
988 + (ut_list_node_313->NAME).prev = (void*)((byte*)((ut_list_node_313->NAME).prev)\
989 + + (((void*)((ut_list_node_313->NAME).prev) > (void*)FADDR)?FOFFSET:BOFFSET));\
990 + if ((ut_list_node_313->NAME).next) \
991 + (ut_list_node_313->NAME).next = (void*)((byte*)((ut_list_node_313->NAME).next)\
992 + + (((void*)((ut_list_node_313->NAME).next)> (void*)FADDR)?FOFFSET:BOFFSET));\
993 + ut_list_node_313 = (ut_list_node_313->NAME).next; \
996 + ut_a(ut_list_node_313 == NULL); \
998 + ut_list_node_313 = (BASE).end; \
1000 + for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \
1001 + ut_a(ut_list_node_313); \
1002 + ut_list_node_313 = (ut_list_node_313->NAME).prev; \
1005 + ut_a(ut_list_node_313 == NULL); \
1010 diff -ruN a/storage/innobase/log/log0recv.c b/storage/innobase/log/log0recv.c
1011 --- a/storage/innobase/log/log0recv.c 2010-12-04 19:46:40.212513377 +0900
1012 +++ b/storage/innobase/log/log0recv.c 2010-12-07 16:10:14.959785817 +0900
1013 @@ -2912,6 +2912,7 @@
1014 /*==========================*/
1016 ut_a(!recv_needed_recovery);
1017 + ut_a(!srv_buffer_pool_shm_is_reused);
1019 recv_needed_recovery = TRUE;
1021 diff -ruN a/storage/innobase/os/os0proc.c b/storage/innobase/os/os0proc.c
1022 --- a/storage/innobase/os/os0proc.c 2010-11-03 07:01:13.000000000 +0900
1023 +++ b/storage/innobase/os/os0proc.c 2010-12-07 16:10:14.960800123 +0900
1024 @@ -229,3 +229,173 @@
1029 +/****************************************************************//**
1030 +Allocates or attaches and reuses shared memory segment.
1031 +The content is not cleared automatically.
1032 +@return allocated memory */
1037 + ulint* n, /*!< in/out: number of bytes */
1042 +#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
1048 + "InnoDB: The shared memory segment containing the buffer pool is: key %#x (%d).\n",
1050 +# if defined HAVE_LARGE_PAGES && defined UNIV_LINUX
1051 + if (!os_use_large_pages || !os_large_page_size) {
1055 + /* Align block size to os_large_page_size */
1056 + ut_ad(ut_is_2pow(os_large_page_size));
1057 + size = ut_2pow_round(*n + (os_large_page_size - 1),
1058 + os_large_page_size);
1060 + shmid = shmget((key_t)key, (size_t)size,
1061 + IPC_CREAT | IPC_EXCL | SHM_HUGETLB | SHM_R | SHM_W);
1063 + if (errno == EEXIST) {
1065 + "InnoDB: HugeTLB: The shared memory segment exists.\n");
1066 + shmid = shmget((key_t)key, (size_t)size,
1067 + SHM_HUGETLB | SHM_R | SHM_W);
1070 + "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n",
1075 + "InnoDB: HugeTLB: The existent shared memory segment is used.\n");
1079 + "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (new) errno %d\n",
1086 + "InnoDB: HugeTLB: A new shared memory segment has been created .\n");
1089 + ptr = shmat(shmid, NULL, 0);
1090 + if (ptr == (void *)-1) {
1092 + "InnoDB: HugeTLB: Warning: Failed to attach shared memory segment, errno %d\n",
1099 + os_fast_mutex_lock(&ut_list_mutex);
1100 + ut_total_allocated_memory += size;
1101 + os_fast_mutex_unlock(&ut_list_mutex);
1102 + UNIV_MEM_ALLOC(ptr, size);
1107 +# endif /* HAVE_LARGE_PAGES && defined UNIV_LINUX */
1108 +# ifdef HAVE_GETPAGESIZE
1109 + size = getpagesize();
1111 + size = UNIV_PAGE_SIZE;
1113 + /* Align block size to system page size */
1114 + ut_ad(ut_is_2pow(size));
1115 + size = *n = ut_2pow_round(*n + (size - 1), size);
1117 + shmid = shmget((key_t)key, (size_t)size,
1118 + IPC_CREAT | IPC_EXCL | SHM_R | SHM_W);
1120 + if (errno == EEXIST) {
1122 + "InnoDB: A shared memory segment containing the buffer pool seems to already exist.\n");
1123 + shmid = shmget((key_t)key, (size_t)size,
1127 + "InnoDB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n",
1133 + "InnoDB: The existent shared memory segment is used.\n");
1137 + "InnoDB: Warning: Failed to allocate %lu bytes. (new) errno %d\n",
1145 + "InnoDB: A new shared memory segment has been created.\n");
1148 + ptr = shmat(shmid, NULL, 0);
1149 + if (ptr == (void *)-1) {
1151 + "InnoDB: Warning: Failed to attach shared memory segment, errno %d\n",
1158 + os_fast_mutex_lock(&ut_list_mutex);
1159 + ut_total_allocated_memory += size;
1160 + os_fast_mutex_unlock(&ut_list_mutex);
1161 + UNIV_MEM_ALLOC(ptr, size);
1164 +#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
1165 + fprintf(stderr, "InnoDB: shared memory segment is not supported.\n");
1167 +#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
1171 +/****************************************************************//**
1172 +Detach shared memory segment. */
1177 + void *ptr, /*!< in: pointer returned by
1179 + ulint size) /*!< in: size returned by
1182 + os_fast_mutex_lock(&ut_list_mutex);
1183 + ut_a(ut_total_allocated_memory >= size);
1184 + os_fast_mutex_unlock(&ut_list_mutex);
1186 +#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
1187 + if (!shmdt(ptr)) {
1188 + os_fast_mutex_lock(&ut_list_mutex);
1189 + ut_a(ut_total_allocated_memory >= size);
1190 + ut_total_allocated_memory -= size;
1191 + os_fast_mutex_unlock(&ut_list_mutex);
1192 + UNIV_MEM_FREE(ptr, size);
1194 +#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
1195 + fprintf(stderr, "InnoDB: shared memory segment is not supported.\n");
1196 +#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
1198 diff -ruN a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c
1199 --- a/storage/innobase/srv/srv0srv.c 2010-12-04 20:20:44.687550693 +0900
1200 +++ b/storage/innobase/srv/srv0srv.c 2010-12-07 16:10:14.962785720 +0900
1201 @@ -233,6 +233,11 @@
1202 UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX;
1203 UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX;
1205 +/* key value for shm */
1206 +UNIV_INTERN uint srv_buffer_pool_shm_key = 0;
1207 +UNIV_INTERN ibool srv_buffer_pool_shm_is_reused = FALSE;
1208 +UNIV_INTERN ibool srv_buffer_pool_shm_checksum = TRUE;
1210 /* This parameter is deprecated. Use srv_n_io_[read|write]_threads
1212 UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX;
1213 diff -ruN a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.c
1214 --- a/storage/innobase/srv/srv0start.c 2010-12-04 20:19:29.806482628 +0900
1215 +++ b/storage/innobase/srv/srv0start.c 2010-12-07 16:10:14.964785346 +0900
1216 @@ -1759,6 +1759,8 @@
1217 Note that this is not as heavy weight as it seems. At
1218 this point there will be only ONE page in the buf_LRU
1219 and there must be no page in the buf_flush list. */
1220 + /* buffer_pool_shm should not be reused when recovery was needed. */
1221 + if (!srv_buffer_pool_shm_is_reused)
1222 buf_pool_invalidate();
1224 /* We always try to do a recovery, even if the database had