1 # name : innodb_buffer_pool_shm.patch
3 # maintainer : Yasufumi
6 # Any small change to this file in the main branch
7 # should be done or reviewed by the maintainer!
8 diff -ruN a/storage/innobase/buf/buf0buddy.c b/storage/innobase/buf/buf0buddy.c
9 --- a/storage/innobase/buf/buf0buddy.c 2010-12-04 19:46:39.372513543 +0900
10 +++ b/storage/innobase/buf/buf0buddy.c 2010-12-07 17:56:28.302087851 +0900
12 void* buf, /*!< in: buffer frame to deallocate */
13 ibool have_page_hash_mutex)
15 - const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf);
16 + const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf_pool, buf);
21 buf_block_t* block) /*!< in: buffer frame to allocate */
23 buf_pool_t* buf_pool = buf_pool_from_block(block);
24 - const ulint fold = BUF_POOL_ZIP_FOLD(block);
25 + const ulint fold = BUF_POOL_ZIP_FOLD(buf_pool, block);
26 //ut_ad(buf_pool_mutex_own(buf_pool));
27 ut_ad(!mutex_own(&buf_pool->zip_mutex));
28 ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE);
29 diff -ruN a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c
30 --- a/storage/innobase/buf/buf0buf.c 2010-12-06 20:16:21.726195340 +0900
31 +++ b/storage/innobase/buf/buf0buf.c 2010-12-07 20:40:30.824749814 +0900
35 #include "srv0start.h"
37 +#include "read0read.h"
39 +#include "ha_prototypes.h"
41 /* prototypes for new functions added to ha_innodb.cc */
42 trx_t* innobase_get_trx();
44 // was allocated for the frames */
45 // buf_block_t* blocks; /*!< array of buffer control blocks */
48 +/* Buffer pool shared memory segment information */
49 +typedef struct buf_shm_info_struct buf_shm_info_t;
51 +struct buf_shm_info_struct {
54 + ibool is_new; /* during initializing */
55 + ibool clean; /* cleanly shut down and freed */
56 + ibool reusable; /* reusable */
57 + ulint buf_pool_size; /* backup value */
58 + ulint page_size; /* backup value */
59 + ulint frame_offset; /* offset of the first frame based on chunk->mem */
60 + ulint zip_hash_offset;
65 + buf_pool_t buf_pool_backup;
66 + buf_chunk_t chunk_backup;
71 +#define BUF_SHM_INFO_HEAD "XTRA_SHM"
72 #endif /* !UNIV_HOTBACKUP */
74 /********************************************************************//**
76 #endif /* UNIV_SYNC_DEBUG */
84 + ptrdiff_t frame_offset)
87 + block->frame += frame_offset;
89 + UNIV_MEM_DESC(block->frame, UNIV_PAGE_SIZE, block);
91 + block->index = NULL;
92 + block->btr_search_latch = NULL;
95 + /* recreate later */
96 + block->page.in_page_hash = FALSE;
97 + block->page.in_zip_hash = FALSE;
98 +#endif /* UNIV_DEBUG */
100 +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
101 + block->n_pointers = 0;
102 +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
104 + if (block->page.zip.data)
105 + block->page.zip.data += frame_offset;
107 + block->is_hashed = FALSE;
109 +#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC
110 + /* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration
111 + of buffer block mutex/rwlock with performance schema. If
112 + PFS_GROUP_BUFFER_SYNC is defined, skip the registration
113 + since buffer block mutex/rwlock will be registered later in
114 + pfs_register_buffer_block() */
116 + mutex_create(PFS_NOT_INSTRUMENTED, &block->mutex, SYNC_BUF_BLOCK);
117 + rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);
118 +#else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
119 + mutex_create(buffer_block_mutex_key, &block->mutex, SYNC_BUF_BLOCK);
120 + rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING);
121 +#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
123 + ut_ad(rw_lock_validate(&(block->lock)));
125 +#ifdef UNIV_SYNC_DEBUG
126 + rw_lock_create(buf_block_debug_latch_key,
127 + &block->debug_latch, SYNC_NO_ORDER_CHECK);
128 +#endif /* UNIV_SYNC_DEBUG */
131 /********************************************************************//**
132 Allocates a chunk of buffer frames.
133 @return chunk, or NULL on failure */
134 @@ -1001,26 +1082,190 @@
138 + ulint zip_hash_n = 0;
139 + ulint zip_hash_mem_size = 0;
140 + hash_table_t* zip_hash_tmp = NULL;
143 + buf_shm_info_t* shm_info = NULL;
145 /* Round down to a multiple of page size,
146 although it already should be. */
147 mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
148 + size_target = (mem_size / UNIV_PAGE_SIZE) - 1;
150 + srv_buffer_pool_shm_is_reused = FALSE;
152 + if (srv_buffer_pool_shm_key) {
153 + /* zip_hash size */
154 + zip_hash_n = (mem_size / UNIV_PAGE_SIZE) * 2;
155 + zip_hash_mem_size = ut_2pow_round(hash_create_needed(zip_hash_n)
156 + + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
159 /* Reserve space for the block descriptors. */
160 mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block)
161 + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
162 + if (srv_buffer_pool_shm_key) {
163 + mem_size += ut_2pow_round(sizeof(buf_shm_info_t)
164 + + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
165 + mem_size += zip_hash_mem_size;
168 chunk->mem_size = mem_size;
170 + if (srv_buffer_pool_shm_key) {
174 + ut_a(buf_pool->n_chunks == 1);
177 + "InnoDB: Notice: The innodb_buffer_pool_shm_key option has been specified.\n"
178 + "InnoDB: Do not change the following between restarts of the server while this option is being used:\n"
179 + "InnoDB: * the mysqld executable between restarts of the server.\n"
180 + "InnoDB: * the value of innodb_buffer_pool_size.\n"
181 + "InnoDB: * the value of innodb_page_size.\n"
182 + "InnoDB: * datafiles created by InnoDB during this session.\n"
183 + "InnoDB: Otherwise, data corruption in datafiles may result.\n");
185 + /* FIXME: This id is still vague */
186 + binary_id = (ulint) ((byte*)mtr_commit - (byte*)btr_root_get)
187 + + (ulint) ((byte*)os_file_get_last_error - (byte*)buf_calc_page_new_checksum)
188 + + (ulint) ((byte*)page_dir_find_owner_slot - (byte*)dfield_data_is_binary_equal)
189 + + (ulint) ((byte*)que_graph_publish - (byte*)dict_casedn_str)
190 + + (ulint) ((byte*)read_view_oldest_copy_or_open_new - (byte*)fil_space_get_version)
191 + + (ulint) ((byte*)rec_get_n_extern_new - (byte*)fsp_get_size_low)
192 + + (ulint) ((byte*)row_get_trx_id_offset - (byte*)ha_create_func)
193 + + (ulint) ((byte*)srv_set_io_thread_op_info - (byte*)thd_is_replication_slave_thread)
194 + + (ulint) ((byte*)mutex_create_func - (byte*)ibuf_inside)
195 + + (ulint) ((byte*)trx_set_detailed_error - (byte*)lock_check_trx_id_sanity)
196 + + (ulint) ((byte*)ut_time - (byte*)mem_heap_strdup);
198 + chunk->mem = os_shm_alloc(&chunk->mem_size, srv_buffer_pool_shm_key, &is_new);
200 + if (UNIV_UNLIKELY(chunk->mem == NULL)) {
204 +#ifdef UNIV_SET_MEM_TO_ZERO
206 + memset(chunk->mem, '\0', chunk->mem_size);
209 + /* for ut_fold_binary_32(), these values should be 32-bit aligned */
210 + ut_a(sizeof(buf_shm_info_t) % 4 == 0);
211 + ut_a((ulint)chunk->mem % 4 == 0);
212 + ut_a(chunk->mem_size % 4 == 0);
214 + shm_info = chunk->mem;
216 + zip_hash_tmp = (hash_table_t*)((byte*)chunk->mem + chunk->mem_size - zip_hash_mem_size);
219 + strncpy(shm_info->head_str, BUF_SHM_INFO_HEAD, 8);
220 + shm_info->binary_id = binary_id;
221 + shm_info->is_new = TRUE; /* changed to FALSE when the initialization is finished */
222 + shm_info->clean = FALSE; /* changed to TRUE when free the segment. */
223 + shm_info->reusable = FALSE; /* changed to TRUE when validation is finished. */
224 + shm_info->buf_pool_size = srv_buf_pool_size;
225 + shm_info->page_size = srv_page_size;
226 + shm_info->zip_hash_offset = chunk->mem_size - zip_hash_mem_size;
227 + shm_info->zip_hash_n = zip_hash_n;
231 + if (strncmp(shm_info->head_str, BUF_SHM_INFO_HEAD, 8)) {
233 + "InnoDB: Error: The shared memory segment seems not to be for buffer pool.\n");
236 + if (shm_info->binary_id != binary_id) {
238 + "InnoDB: Error: The shared memory segment seems not to be for this binary.\n");
241 + if (shm_info->is_new) {
243 + "InnoDB: Error: The shared memory was not initialized yet.\n");
246 + if (shm_info->buf_pool_size != srv_buf_pool_size) {
248 + "InnoDB: Error: srv_buf_pool_size is different (shm=%lu current=%lu).\n",
249 + shm_info->buf_pool_size, srv_buf_pool_size);
252 + if (shm_info->page_size != srv_page_size) {
254 + "InnoDB: Error: srv_page_size is different (shm=%lu current=%lu).\n",
255 + shm_info->page_size, srv_page_size);
258 + if (!shm_info->reusable) {
260 + "InnoDB: Warning: The shared memory has unrecoverable contents.\n"
261 + "InnoDB: The shared memory segment is initialized.\n");
265 + if (!shm_info->clean) {
267 + "InnoDB: Warning: The shared memory was not shut down cleanly.\n"
268 + "InnoDB: The shared memory segment is initialized.\n");
273 + ut_a(shm_info->zip_hash_offset == chunk->mem_size - zip_hash_mem_size);
274 + ut_a(shm_info->zip_hash_n == zip_hash_n);
276 + /* check checksum */
277 + if (srv_buffer_pool_shm_checksum) {
278 + checksum = ut_fold_binary_32((byte*)chunk->mem + sizeof(buf_shm_info_t),
279 + chunk->mem_size - sizeof(buf_shm_info_t));
281 + checksum = BUF_NO_CHECKSUM_MAGIC;
284 + if (shm_info->checksum != BUF_NO_CHECKSUM_MAGIC
285 + && shm_info->checksum != checksum) {
287 + "InnoDB: Error: checksum of the shared memory is not match. "
288 + "(stored=%lu calculated=%lu)\n",
289 + shm_info->checksum, checksum);
293 + /* flag to use the segment. */
294 + shm_info->clean = FALSE; /* changed to TRUE when free the segment. */
297 + /* init zip_hash contents */
299 + hash_create_init(zip_hash_tmp, zip_hash_n);
301 + /* adjusting the offsets is done later */
302 + hash_create_reuse(zip_hash_tmp);
304 + srv_buffer_pool_shm_is_reused = TRUE;
307 chunk->mem = os_mem_alloc_large(&chunk->mem_size);
309 if (UNIV_UNLIKELY(chunk->mem == NULL)) {
315 /* Allocate the block descriptors from
316 the start of the memory block. */
317 + if (srv_buffer_pool_shm_key) {
318 + chunk->blocks = (buf_block_t*)((byte*)chunk->mem + sizeof(buf_shm_info_t));
320 chunk->blocks = chunk->mem;
323 /* Align a pointer to the first frame. Note that when
324 os_large_page_size is smaller than UNIV_PAGE_SIZE,
325 @@ -1028,8 +1273,13 @@
326 it is bigger, we may allocate more blocks than requested. */
328 frame = ut_align(chunk->mem, UNIV_PAGE_SIZE);
329 + if (srv_buffer_pool_shm_key) {
330 + /* reserve zip_hash space and always -1 for reproducibility */
331 + chunk->size = (chunk->mem_size - zip_hash_mem_size) / UNIV_PAGE_SIZE - 1;
333 chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
334 - (frame != chunk->mem);
337 /* Subtract the space needed for block descriptors. */
339 @@ -1043,6 +1293,102 @@
343 + if (chunk->size > size_target) {
344 + chunk->size = size_target;
347 + if (shm_info && !(shm_info->is_new)) {
348 + /* convert the shared memory segment for reuse */
349 + ptrdiff_t phys_offset;
350 + ptrdiff_t logi_offset;
351 + ptrdiff_t blocks_offset;
352 + void* previous_frame_address;
354 + if (chunk->size < shm_info->chunk_backup.size) {
356 + "InnoDB: Error: The buffer pool became smaller because of allocated address.\n"
357 + "InnoDB: Retrying may avoid this situation.\n");
358 + shm_info->clean = TRUE; /* release the flag for retrying */
362 + chunk->size = shm_info->chunk_backup.size;
363 + phys_offset = frame - ((byte*)chunk->mem + shm_info->frame_offset);
364 + logi_offset = frame - chunk->blocks[0].frame;
365 + previous_frame_address = chunk->blocks[0].frame;
366 + blocks_offset = (byte*)chunk->blocks - (byte*)shm_info->chunk_backup.blocks;
368 + if (phys_offset || logi_offset || blocks_offset) {
370 + "InnoDB: Buffer pool in the shared memory segment should be converted.\n"
371 + "InnoDB: Previous frames in address : %p\n"
372 + "InnoDB: Previous frames were located : %p\n"
373 + "InnoDB: Current frames should be located: %p\n"
374 + "InnoDB: Pysical offset : %ld (%#lx)\n"
375 + "InnoDB: Logical offset (frames) : %ld (%#lx)\n"
376 + "InnoDB: Logical offset (blocks) : %ld (%#lx)\n",
377 + (byte*)chunk->mem + shm_info->frame_offset,
378 + chunk->blocks[0].frame, frame,
379 + phys_offset, phys_offset, logi_offset, logi_offset,
380 + blocks_offset, blocks_offset);
383 + "InnoDB: Buffer pool in the shared memory segment can be used as it is.\n");
388 + "InnoDB: Aligning physical offset...");
390 + memmove(frame, (byte*)chunk->mem + shm_info->frame_offset,
391 + chunk->size * UNIV_PAGE_SIZE);
398 + block = chunk->blocks;
399 + for (i = chunk->size; i--; ) {
400 + buf_block_reuse(block, logi_offset);
404 + if (logi_offset || blocks_offset) {
406 + "InnoDB: Aligning logical offset...");
409 + /* buf_pool_t buf_pool_backup */
410 + UT_LIST_OFFSET(flush_list, buf_page_t, shm_info->buf_pool_backup.flush_list,
411 + previous_frame_address, logi_offset, blocks_offset);
412 + UT_LIST_OFFSET(free, buf_page_t, shm_info->buf_pool_backup.free,
413 + previous_frame_address, logi_offset, blocks_offset);
414 + UT_LIST_OFFSET(LRU, buf_page_t, shm_info->buf_pool_backup.LRU,
415 + previous_frame_address, logi_offset, blocks_offset);
416 + if (shm_info->buf_pool_backup.LRU_old)
417 + shm_info->buf_pool_backup.LRU_old =
418 + (buf_page_t*)((byte*)(shm_info->buf_pool_backup.LRU_old)
419 + + (((void*)shm_info->buf_pool_backup.LRU_old > previous_frame_address)
420 + ? logi_offset : blocks_offset));
422 + UT_LIST_OFFSET(unzip_LRU, buf_block_t, shm_info->buf_pool_backup.unzip_LRU,
423 + previous_frame_address, logi_offset, blocks_offset);
425 + UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_clean,
426 + previous_frame_address, logi_offset, blocks_offset);
427 + for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) {
428 + UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_free[i],
429 + previous_frame_address, logi_offset, blocks_offset);
432 + HASH_OFFSET(zip_hash_tmp, buf_page_t, hash,
433 + previous_frame_address, logi_offset, blocks_offset);
439 /* Init block structs and assign frames for them. Then we
440 assign the frames to the first blocks (we already mapped the
442 @@ -1068,6 +1414,11 @@
444 frame += UNIV_PAGE_SIZE;
449 + shm_info->frame_offset = chunk->blocks[0].frame - (byte*)chunk->mem;
452 #ifdef PFS_GROUP_BUFFER_SYNC
453 pfs_register_buffer_block(chunk);
454 @@ -1249,6 +1600,8 @@
455 UNIV_MEM_UNDESC(block);
458 + ut_a(!srv_buffer_pool_shm_key);
460 os_mem_free_large(chunk->mem, chunk->mem_size);
463 @@ -1289,7 +1642,7 @@
464 ulint instance_no) /*!< in: id of the instance */
467 - buf_chunk_t* chunk;
468 + buf_chunk_t* chunk = NULL;
470 /* 1. Initialize general fields
471 ------------------------------- */
472 @@ -1335,7 +1688,10 @@
473 buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
475 buf_pool->page_hash = hash_create(2 * buf_pool->curr_size);
476 + /* zip_hash is allocated to shm when srv_buffer_pool_shm_key is enabled */
477 + if (!srv_buffer_pool_shm_key) {
478 buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
481 buf_pool->last_printout_time = ut_time();
483 @@ -1354,6 +1710,86 @@
485 /* All fields are initialized by mem_zalloc(). */
487 + if (chunk && srv_buffer_pool_shm_key) {
488 + buf_shm_info_t* shm_info;
490 + ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t));
491 + shm_info = chunk->mem;
493 + buf_pool->zip_hash = (hash_table_t*)((byte*)chunk->mem + shm_info->zip_hash_offset);
495 + if(shm_info->is_new) {
496 + shm_info->is_new = FALSE; /* initialization was finished */
498 + buf_block_t* block = chunk->blocks;
501 + /* shm_info->buf_pool_backup should be converted */
502 + /* at buf_chunk_init(). So copy simply. */
503 + buf_pool->flush_list = shm_info->buf_pool_backup.flush_list;
504 + buf_pool->freed_page_clock = shm_info->buf_pool_backup.freed_page_clock;
505 + buf_pool->free = shm_info->buf_pool_backup.free;
506 + buf_pool->LRU = shm_info->buf_pool_backup.LRU;
507 + buf_pool->LRU_old = shm_info->buf_pool_backup.LRU_old;
508 + buf_pool->LRU_old_len = shm_info->buf_pool_backup.LRU_old_len;
509 + buf_pool->unzip_LRU = shm_info->buf_pool_backup.unzip_LRU;
510 + buf_pool->zip_clean = shm_info->buf_pool_backup.zip_clean;
511 + for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) {
512 + buf_pool->zip_free[i] = shm_info->buf_pool_backup.zip_free[i];
515 + for (i = 0; i < chunk->size; i++, block++) {
516 + if (buf_block_get_state(block)
517 + == BUF_BLOCK_FILE_PAGE) {
518 + ut_d(block->page.in_page_hash = TRUE);
519 + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
520 + buf_page_address_fold(
522 + block->page.offset),
527 + for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
528 + b = UT_LIST_GET_NEXT(zip_list, b)) {
529 + ut_ad(!b->in_flush_list);
530 + ut_ad(b->in_LRU_list);
532 + ut_d(b->in_page_hash = TRUE);
533 + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
534 + buf_page_address_fold(b->space, b->offset), b);
537 + for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
538 + b = UT_LIST_GET_NEXT(flush_list, b)) {
539 + ut_ad(b->in_flush_list);
540 + ut_ad(b->in_LRU_list);
542 + switch (buf_page_get_state(b)) {
543 + case BUF_BLOCK_ZIP_DIRTY:
544 + ut_d(b->in_page_hash = TRUE);
545 + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
546 + buf_page_address_fold(b->space,
549 + case BUF_BLOCK_FILE_PAGE:
550 + /* uncompressed page */
552 + case BUF_BLOCK_ZIP_FREE:
553 + case BUF_BLOCK_ZIP_PAGE:
554 + case BUF_BLOCK_NOT_USED:
555 + case BUF_BLOCK_READY_FOR_USE:
556 + case BUF_BLOCK_MEMORY:
557 + case BUF_BLOCK_REMOVE_HASH:
567 mutex_exit(&buf_pool->LRU_list_mutex);
568 rw_lock_x_unlock(&buf_pool->page_hash_latch);
569 buf_pool_mutex_exit(buf_pool);
570 @@ -1373,6 +1809,42 @@
574 + if (srv_buffer_pool_shm_key) {
575 + buf_shm_info_t* shm_info;
577 + ut_a(buf_pool->n_chunks == 1);
579 + chunk = buf_pool->chunks;
580 + shm_info = chunk->mem;
581 + ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t));
583 + /* if opened, close shm. */
584 + if (!shm_info->clean) {
585 + /* validate that the shared memory segment doesn't have unrecoverable contents. */
586 + /* Currently, this validation is no longer needed */
587 + shm_info->reusable = TRUE;
589 + memcpy(&(shm_info->buf_pool_backup), buf_pool, sizeof(buf_pool_t));
590 + memcpy(&(shm_info->chunk_backup), chunk, sizeof(buf_chunk_t));
592 + if (srv_fast_shutdown < 2) {
593 + if (srv_buffer_pool_shm_checksum) {
594 + shm_info->checksum =
596 + (byte*)chunk->mem + sizeof(buf_shm_info_t),
597 + chunk->mem_size - sizeof(buf_shm_info_t));
599 + shm_info->checksum = BUF_NO_CHECKSUM_MAGIC;
601 + shm_info->clean = TRUE;
605 + "InnoDB: The shared memory was closed.\n");
608 + os_shm_free(chunk->mem, chunk->mem_size);
610 chunks = buf_pool->chunks;
611 chunk = chunks + buf_pool->n_chunks;
613 @@ -1381,10 +1853,13 @@
614 would fail at shutdown. */
615 os_mem_free_large(chunk->mem, chunk->mem_size);
619 mem_free(buf_pool->chunks);
620 hash_table_free(buf_pool->page_hash);
621 + if (!srv_buffer_pool_shm_key) {
622 hash_table_free(buf_pool->zip_hash);
626 /********************************************************************//**
627 @@ -1668,6 +2143,11 @@
628 //buf_pool_mutex_enter(buf_pool);
629 mutex_enter(&buf_pool->LRU_list_mutex);
631 + if (srv_buffer_pool_shm_key) {
632 + /* Cannot support shrink */
637 if (buf_pool->n_chunks <= 1) {
639 @@ -1848,7 +2328,7 @@
640 zip_hash = hash_create(2 * buf_pool->curr_size);
642 HASH_MIGRATE(buf_pool->zip_hash, zip_hash, buf_page_t, hash,
643 - BUF_POOL_ZIP_FOLD_BPAGE);
644 + buf_pool, BUF_POOL_ZIP_FOLD_BPAGE);
646 hash_table_free(buf_pool->zip_hash);
647 buf_pool->zip_hash = zip_hash;
648 @@ -2130,6 +2610,11 @@
650 ulint min_change_size = 1048576 * srv_buf_pool_instances;
652 + if (srv_buffer_pool_shm_key) {
653 + /* Cannot support resize */
657 buf_pool_mutex_enter_all();
659 if (srv_buf_pool_old_size == srv_buf_pool_size) {
660 diff -ruN a/storage/innobase/ha/hash0hash.c b/storage/innobase/ha/hash0hash.c
661 --- a/storage/innobase/ha/hash0hash.c 2010-11-03 07:01:13.000000000 +0900
662 +++ b/storage/innobase/ha/hash0hash.c 2010-12-07 16:10:14.937749140 +0900
666 /*************************************************************//**
677 + prime = ut_find_prime(n);
679 + offset = (sizeof(hash_table_t) + 7) / 8;
682 + return(offset + sizeof(hash_cell_t) * prime);
689 + hash_table_t* table,
695 + prime = ut_find_prime(n);
697 + offset = (sizeof(hash_table_t) + 7) / 8;
700 + table->array = (hash_cell_t*)(((byte*)table) + offset);
701 + table->n_cells = prime;
702 +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
703 + table->adaptive = FALSE;
704 +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
705 + table->n_mutexes = 0;
706 + table->mutexes = NULL;
707 + table->heaps = NULL;
708 + table->heap = NULL;
709 + ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
711 + /* Initialize the cell array */
712 + hash_table_clear(table);
719 + hash_table_t* table)
723 + offset = (sizeof(hash_table_t) + 7) / 8;
726 + table->array = (hash_cell_t*)(((byte*)table) + offset);
727 + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
730 +/*************************************************************//**
731 Frees a hash table. */
734 diff -ruN a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
735 --- a/storage/innobase/handler/ha_innodb.cc 2010-12-06 20:16:21.733263627 +0900
736 +++ b/storage/innobase/handler/ha_innodb.cc 2010-12-07 17:56:28.316139830 +0900
738 static my_bool innobase_create_status_file = FALSE;
739 static my_bool innobase_stats_on_metadata = TRUE;
740 static my_bool innobase_use_sys_stats_table = FALSE;
741 +static my_bool innobase_buffer_pool_shm_checksum = TRUE;
744 static char* internal_innobase_data_file_path = NULL;
745 @@ -2624,6 +2625,14 @@
746 srv_buf_pool_size = (ulint) innobase_buffer_pool_size;
747 srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances;
749 + if (srv_buffer_pool_shm_key && srv_buf_pool_instances > 1) {
751 + "InnoDB: Warning: innodb_buffer_pool_shm_key cannot be used with several innodb_buffer_pool_instances.\n"
752 + "InnoDB: innodb_buffer_pool_instances was set to 1.\n");
753 + srv_buf_pool_instances = 1;
754 + innobase_buffer_pool_instances = 1;
757 srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
759 srv_n_file_io_threads = (ulint) innobase_file_io_threads;
760 @@ -2640,6 +2649,7 @@
761 srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
762 srv_use_checksums = (ibool) innobase_use_checksums;
763 srv_fast_checksum = (ibool) innobase_fast_checksum;
764 + srv_buffer_pool_shm_checksum = (ibool) innobase_buffer_pool_shm_checksum;
766 #ifdef HAVE_LARGE_PAGES
767 if ((os_use_large_pages = (ibool) my_use_large_pages))
768 @@ -11648,6 +11658,16 @@
769 "Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
770 NULL, NULL, 1L, 1L, MAX_BUFFER_POOLS, 1L);
772 +static MYSQL_SYSVAR_UINT(buffer_pool_shm_key, srv_buffer_pool_shm_key,
773 + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
774 + "[experimental] The key value of shared memory segment for the buffer pool. 0 (default) disables the feature.",
775 + NULL, NULL, 0, 0, INT_MAX32, 0);
777 +static MYSQL_SYSVAR_BOOL(buffer_pool_shm_checksum, innobase_buffer_pool_shm_checksum,
778 + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
779 + "Enable buffer_pool_shm checksum validation (enabled by default).",
782 static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency,
784 "Helps in performance tuning in heavily concurrent environments.",
785 @@ -11939,6 +11959,8 @@
786 MYSQL_SYSVAR(autoextend_increment),
787 MYSQL_SYSVAR(buffer_pool_size),
788 MYSQL_SYSVAR(buffer_pool_instances),
789 + MYSQL_SYSVAR(buffer_pool_shm_key),
790 + MYSQL_SYSVAR(buffer_pool_shm_checksum),
791 MYSQL_SYSVAR(checksums),
792 MYSQL_SYSVAR(fast_checksum),
793 MYSQL_SYSVAR(commit_concurrency),
794 diff -ruN a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
795 --- a/storage/innobase/include/buf0buf.h 2010-12-06 20:16:21.778264552 +0900
796 +++ b/storage/innobase/include/buf0buf.h 2010-12-07 17:56:28.322749380 +0900
798 #ifndef UNIV_HOTBACKUP
801 +#include "srv0srv.h"
803 /** @name Modes for buf_page_get_gen */
805 @@ -1592,9 +1593,12 @@
806 /**********************************************************************//**
807 Compute the hash fold value for blocks in buf_pool->zip_hash. */
809 -#define BUF_POOL_ZIP_FOLD_PTR(ptr) ((ulint) (ptr) / UNIV_PAGE_SIZE)
810 -#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame)
811 -#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
812 +/* the fold should be relative when srv_buffer_pool_shm_key is enabled */
813 +#define BUF_POOL_ZIP_FOLD_PTR(bpool, ptr) (!srv_buffer_pool_shm_key\
814 + ?((ulint) (ptr) / UNIV_PAGE_SIZE)\
815 + :((ulint) ((byte*)ptr - (byte*)(buf_page_from_array(bpool, 0)->frame)) / UNIV_PAGE_SIZE))
816 +#define BUF_POOL_ZIP_FOLD(bpool, b) BUF_POOL_ZIP_FOLD_PTR(bpool, (b)->frame)
817 +#define BUF_POOL_ZIP_FOLD_BPAGE(bpool, b) BUF_POOL_ZIP_FOLD(bpool, (buf_block_t*) (b))
820 /** A chunk of buffers. The buffer pool is allocated in chunks. */
821 diff -ruN a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
822 --- a/storage/innobase/include/hash0hash.h 2010-11-03 07:01:13.000000000 +0900
823 +++ b/storage/innobase/include/hash0hash.h 2010-12-07 17:56:28.324726446 +0900
827 ulint n); /*!< in: number of array cells */
829 +/*************************************************************//**
841 + hash_table_t* table,
848 + hash_table_t* table);
850 #ifndef UNIV_HOTBACKUP
851 /*************************************************************//**
852 Creates a mutex array to protect a hash table. */
854 /****************************************************************//**
855 Move all hash table entries from OLD_TABLE to NEW_TABLE. */
857 -#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \
858 +#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, BPOOL, FOLD_FUNC) \
861 ulint cell_count2222;\
865 NODE_TYPE* next2222 = node2222->PTR_NAME;\
866 - ulint fold2222 = FOLD_FUNC(node2222);\
867 + ulint fold2222 = FOLD_FUNC(BPOOL, node2222);\
869 HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\
870 fold2222, node2222);\
876 +/********************************************************************//**
877 +Align nodes with moving location.*/
878 +#define HASH_OFFSET(TABLE, NODE_TYPE, PTR_NAME, FADDR, FOFFSET, BOFFSET) \
881 + ulint cell_count2222;\
883 + cell_count2222 = hash_get_n_cells(TABLE);\
885 + for (i2222 = 0; i2222 < cell_count2222; i2222++) {\
886 + NODE_TYPE* node2222;\
888 + if ((TABLE)->array[i2222].node) \
889 + (TABLE)->array[i2222].node = (void*)((byte*)(TABLE)->array[i2222].node \
890 + + (((TABLE)->array[i2222].node > (void*)FADDR)?FOFFSET:BOFFSET));\
891 + node2222 = HASH_GET_FIRST((TABLE), i2222);\
893 + while (node2222) {\
894 + if (node2222->PTR_NAME) \
895 + node2222->PTR_NAME = (void*)((byte*)(node2222->PTR_NAME) \
896 + + ((((void*)node2222->PTR_NAME) > (void*)FADDR)?FOFFSET:BOFFSET));\
898 + node2222 = node2222->PTR_NAME;\
903 /************************************************************//**
904 Gets the mutex index for a fold value in a hash table.
905 diff -ruN a/storage/innobase/include/os0proc.h b/storage/innobase/include/os0proc.h
906 --- a/storage/innobase/include/os0proc.h 2010-11-03 07:01:13.000000000 +0900
907 +++ b/storage/innobase/include/os0proc.h 2010-12-07 16:10:14.955718750 +0900
913 +# if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
914 +#include <sys/ipc.h>
915 +#include <sys/shm.h>
919 typedef void* os_process_t;
921 ulint size); /*!< in: size returned by
922 os_mem_alloc_large() */
925 +/****************************************************************//**
926 +Allocates or attaches and reuses shared memory segment.
927 +The content is not cleared automatically.
928 +@return allocated memory */
933 + ulint* n, /*!< in/out: number of bytes */
937 +/****************************************************************//**
938 +Detach shared memory segment. */
943 + void *ptr, /*!< in: pointer returned by
945 + ulint size); /*!< in: size returned by
948 #include "os0proc.ic"
950 diff -ruN a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
951 --- a/storage/innobase/include/srv0srv.h 2010-12-04 20:20:28.016566697 +0900
952 +++ b/storage/innobase/include/srv0srv.h 2010-12-07 16:10:14.956717659 +0900
954 extern ulint srv_mem_pool_size;
955 extern ulint srv_lock_table_size;
957 +extern uint srv_buffer_pool_shm_key;
958 +extern ibool srv_buffer_pool_shm_is_reused;
959 +extern ibool srv_buffer_pool_shm_checksum;
961 extern ibool srv_thread_concurrency_timer_based;
963 extern ulint srv_n_file_io_threads;
964 diff -ruN a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h
965 --- a/storage/innobase/include/ut0lst.h 2010-11-03 07:01:13.000000000 +0900
966 +++ b/storage/innobase/include/ut0lst.h 2010-12-07 16:10:14.957785525 +0900
968 ut_a(ut_list_node_313 == NULL); \
971 +/********************************************************************//**
972 +Align nodes with moving location.
973 +@param NAME the name of the list
974 +@param TYPE node type
975 +@param BASE base node (not a pointer to it)
976 +@param OFFSET offset moved */
977 +#define UT_LIST_OFFSET(NAME, TYPE, BASE, FADDR, FOFFSET, BOFFSET) \
979 + ulint ut_list_i_313; \
980 + TYPE* ut_list_node_313; \
982 + if ((BASE).start) \
983 + (BASE).start = (void*)((byte*)((BASE).start) \
984 + + (((void*)((BASE).start) > (void*)FADDR)?FOFFSET:BOFFSET));\
986 + (BASE).end = (void*)((byte*)((BASE).end) \
987 + + (((void*)((BASE).end) > (void*)FADDR)?FOFFSET:BOFFSET));\
989 + ut_list_node_313 = (BASE).start; \
991 + for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \
992 + ut_a(ut_list_node_313); \
993 + if ((ut_list_node_313->NAME).prev) \
994 + (ut_list_node_313->NAME).prev = (void*)((byte*)((ut_list_node_313->NAME).prev)\
995 + + (((void*)((ut_list_node_313->NAME).prev) > (void*)FADDR)?FOFFSET:BOFFSET));\
996 + if ((ut_list_node_313->NAME).next) \
997 + (ut_list_node_313->NAME).next = (void*)((byte*)((ut_list_node_313->NAME).next)\
998 + + (((void*)((ut_list_node_313->NAME).next)> (void*)FADDR)?FOFFSET:BOFFSET));\
999 + ut_list_node_313 = (ut_list_node_313->NAME).next; \
1002 + ut_a(ut_list_node_313 == NULL); \
1004 + ut_list_node_313 = (BASE).end; \
1006 + for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \
1007 + ut_a(ut_list_node_313); \
1008 + ut_list_node_313 = (ut_list_node_313->NAME).prev; \
1011 + ut_a(ut_list_node_313 == NULL); \
1016 diff -ruN a/storage/innobase/log/log0recv.c b/storage/innobase/log/log0recv.c
1017 --- a/storage/innobase/log/log0recv.c 2010-12-04 19:46:40.212513377 +0900
1018 +++ b/storage/innobase/log/log0recv.c 2010-12-07 16:10:14.959785817 +0900
1019 @@ -2912,6 +2912,7 @@
1020 /*==========================*/
1022 ut_a(!recv_needed_recovery);
1023 + ut_a(!srv_buffer_pool_shm_is_reused);
1025 recv_needed_recovery = TRUE;
1027 diff -ruN a/storage/innobase/os/os0proc.c b/storage/innobase/os/os0proc.c
1028 --- a/storage/innobase/os/os0proc.c 2010-11-03 07:01:13.000000000 +0900
1029 +++ b/storage/innobase/os/os0proc.c 2010-12-07 16:10:14.960800123 +0900
1030 @@ -229,3 +229,173 @@
1035 +/****************************************************************//**
1036 +Allocates or attaches and reuses shared memory segment.
1037 +The content is not cleared automatically.
1038 +@return allocated memory */
1043 + ulint* n, /*!< in/out: number of bytes */
1048 +#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
1054 + "InnoDB: The shared memory segment containing the buffer pool is: key %#x (%d).\n",
1056 +# if defined HAVE_LARGE_PAGES && defined UNIV_LINUX
1057 + if (!os_use_large_pages || !os_large_page_size) {
1061 + /* Align block size to os_large_page_size */
1062 + ut_ad(ut_is_2pow(os_large_page_size));
1063 + size = ut_2pow_round(*n + (os_large_page_size - 1),
1064 + os_large_page_size);
1066 + shmid = shmget((key_t)key, (size_t)size,
1067 + IPC_CREAT | IPC_EXCL | SHM_HUGETLB | SHM_R | SHM_W);
1069 + if (errno == EEXIST) {
1071 + "InnoDB: HugeTLB: The shared memory segment exists.\n");
1072 + shmid = shmget((key_t)key, (size_t)size,
1073 + SHM_HUGETLB | SHM_R | SHM_W);
1076 + "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n",
1081 + "InnoDB: HugeTLB: The existent shared memory segment is used.\n");
1085 + "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (new) errno %d\n",
1092 + "InnoDB: HugeTLB: A new shared memory segment has been created.\n");
1095 + ptr = shmat(shmid, NULL, 0);
1096 + if (ptr == (void *)-1) {
1098 + "InnoDB: HugeTLB: Warning: Failed to attach shared memory segment, errno %d\n",
1105 + os_fast_mutex_lock(&ut_list_mutex);
1106 + ut_total_allocated_memory += size;
1107 + os_fast_mutex_unlock(&ut_list_mutex);
1108 + UNIV_MEM_ALLOC(ptr, size);
1113 +# endif /* HAVE_LARGE_PAGES && defined UNIV_LINUX */
1114 +# ifdef HAVE_GETPAGESIZE
1115 + size = getpagesize();
1117 + size = UNIV_PAGE_SIZE;
1119 + /* Align block size to system page size */
1120 + ut_ad(ut_is_2pow(size));
1121 + size = *n = ut_2pow_round(*n + (size - 1), size);
1123 + shmid = shmget((key_t)key, (size_t)size,
1124 + IPC_CREAT | IPC_EXCL | SHM_R | SHM_W);
1126 + if (errno == EEXIST) {
1128 + "InnoDB: A shared memory segment containing the buffer pool seems to already exist.\n");
1129 + shmid = shmget((key_t)key, (size_t)size,
1133 + "InnoDB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n",
1139 + "InnoDB: The existing shared memory segment is used.\n");
1143 + "InnoDB: Warning: Failed to allocate %lu bytes. (new) errno %d\n",
1151 + "InnoDB: A new shared memory segment has been created.\n");
1154 + ptr = shmat(shmid, NULL, 0);
1155 + if (ptr == (void *)-1) {
1157 + "InnoDB: Warning: Failed to attach shared memory segment, errno %d\n",
1164 + os_fast_mutex_lock(&ut_list_mutex);
1165 + ut_total_allocated_memory += size;
1166 + os_fast_mutex_unlock(&ut_list_mutex);
1167 + UNIV_MEM_ALLOC(ptr, size);
1170 +#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
1171 + fprintf(stderr, "InnoDB: shared memory segment is not supported.\n");
1173 +#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
1177 +/****************************************************************//**
1178 +Detach shared memory segment. */
1183 + void *ptr, /*!< in: pointer returned by
1185 + ulint size) /*!< in: size returned by
1188 + os_fast_mutex_lock(&ut_list_mutex);
1189 + ut_a(ut_total_allocated_memory >= size);
1190 + os_fast_mutex_unlock(&ut_list_mutex);
1192 +#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
1193 + if (!shmdt(ptr)) {
1194 + os_fast_mutex_lock(&ut_list_mutex);
1195 + ut_a(ut_total_allocated_memory >= size);
1196 + ut_total_allocated_memory -= size;
1197 + os_fast_mutex_unlock(&ut_list_mutex);
1198 + UNIV_MEM_FREE(ptr, size);
1200 +#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
1201 + fprintf(stderr, "InnoDB: shared memory segment is not supported.\n");
1202 +#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
1204 diff -ruN a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c
1205 --- a/storage/innobase/srv/srv0srv.c 2010-12-04 20:20:44.687550693 +0900
1206 +++ b/storage/innobase/srv/srv0srv.c 2010-12-07 16:10:14.962785720 +0900
1207 @@ -235,6 +235,11 @@
1208 UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX;
1209 UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX;
1211 +/* key value for shm */
1212 +UNIV_INTERN uint srv_buffer_pool_shm_key = 0;
1213 +UNIV_INTERN ibool srv_buffer_pool_shm_is_reused = FALSE;
1214 +UNIV_INTERN ibool srv_buffer_pool_shm_checksum = TRUE;
1216 /* This parameter is deprecated. Use srv_n_io_[read|write]_threads
1218 UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX;
1219 diff -ruN a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.c
1220 --- a/storage/innobase/srv/srv0start.c 2010-12-04 20:19:29.806482628 +0900
1221 +++ b/storage/innobase/srv/srv0start.c 2010-12-07 16:10:14.964785346 +0900
1222 @@ -1835,6 +1835,8 @@
1223 Note that this is not as heavy weight as it seems. At
1224 this point there will be only ONE page in the buf_LRU
1225 and there must be no page in the buf_flush list. */
1226 + /* buffer_pool_shm should not be reused when recovery was needed. */
1227 + if (!srv_buffer_pool_shm_is_reused)
1228 buf_pool_invalidate();
1230 /* We always try to do a recovery, even if the database had