# name : innodb_split_buf_pool_mutex.patch # introduced : 11 or before # maintainer : Yasufumi # #!!! notice !!! # Any small change to this file in the main branch # should be done or reviewed by the maintainer! diff -ruN a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c --- a/storage/innodb_plugin/btr/btr0cur.c 2010-08-04 02:24:19.000000000 +0900 +++ b/storage/innodb_plugin/btr/btr0cur.c 2010-08-27 16:11:40.593021205 +0900 @@ -3764,7 +3764,8 @@ mtr_commit(mtr); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); mutex_enter(&block->mutex); /* Only free the block if it is still allocated to @@ -3775,17 +3776,22 @@ && buf_block_get_space(block) == space && buf_block_get_page_no(block) == page_no) { - if (buf_LRU_free_block(&block->page, all, NULL) + if (buf_LRU_free_block(&block->page, all, NULL, TRUE) != BUF_LRU_FREED - && all && block->page.zip.data) { + && all && block->page.zip.data + /* Now, buf_LRU_free_block() may release mutex temporarily */ + && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE + && buf_block_get_space(block) == space + && buf_block_get_page_no(block) == page_no) { /* Attempt to deallocate the uncompressed page if the whole block cannot be deallocted. 
*/ - buf_LRU_free_block(&block->page, FALSE, NULL); + buf_LRU_free_block(&block->page, FALSE, NULL, TRUE); } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); mutex_exit(&block->mutex); } diff -ruN a/storage/innodb_plugin/btr/btr0sea.c b/storage/innodb_plugin/btr/btr0sea.c --- a/storage/innodb_plugin/btr/btr0sea.c 2010-08-27 16:11:12.151975789 +0900 +++ b/storage/innodb_plugin/btr/btr0sea.c 2010-08-27 16:11:40.593021205 +0900 @@ -1199,7 +1199,7 @@ ulint* offsets; rw_lock_x_lock(&btr_search_latch); - buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); table = btr_search_sys->hash_index; @@ -1285,7 +1285,7 @@ bpage = UT_LIST_GET_PREV(LRU, bpage); } - buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); rw_lock_x_unlock(&btr_search_latch); if (UNIV_LIKELY_NULL(heap)) { @@ -1878,7 +1878,8 @@ rec_offs_init(offsets_); rw_lock_x_lock(&btr_search_latch); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_x_lock(&page_hash_latch); cell_count = hash_get_n_cells(btr_search_sys->hash_index); @@ -1886,11 +1887,13 @@ /* We release btr_search_latch every once in a while to give other queries a chance to run. */ if ((i != 0) && ((i % chunk_size) == 0)) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_x_unlock(&page_hash_latch); rw_lock_x_unlock(&btr_search_latch); os_thread_yield(); rw_lock_x_lock(&btr_search_latch); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_x_lock(&page_hash_latch); } node = hash_get_nth_cell(btr_search_sys->hash_index, i)->node; @@ -1997,11 +2000,13 @@ /* We release btr_search_latch every once in a while to give other queries a chance to run. 
*/ if (i != 0) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_x_unlock(&page_hash_latch); rw_lock_x_unlock(&btr_search_latch); os_thread_yield(); rw_lock_x_lock(&btr_search_latch); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_x_lock(&page_hash_latch); } if (!ha_validate(btr_search_sys->hash_index, i, end_index)) { @@ -2009,7 +2014,8 @@ } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_x_unlock(&page_hash_latch); rw_lock_x_unlock(&btr_search_latch); if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); diff -ruN a/storage/innodb_plugin/buf/buf0buddy.c b/storage/innodb_plugin/buf/buf0buddy.c --- a/storage/innodb_plugin/buf/buf0buddy.c 2010-08-27 15:54:59.015990108 +0900 +++ b/storage/innodb_plugin/buf/buf0buddy.c 2010-08-27 16:11:40.596022762 +0900 @@ -82,10 +82,11 @@ if (b) UNIV_MEM_VALID(b, BUF_BUDDY_LOW << i); #endif /* UNIV_DEBUG_VALGRIND */ - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&zip_free_mutex)); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); ut_ad(buf_pool->zip_free[i].start != bpage); - UT_LIST_ADD_FIRST(list, buf_pool->zip_free[i], bpage); + UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_free[i], bpage); #ifdef UNIV_DEBUG_VALGRIND if (b) UNIV_MEM_FREE(b, BUF_BUDDY_LOW << i); @@ -103,8 +104,8 @@ ulint i) /*!< in: index of buf_pool->zip_free[] */ { #ifdef UNIV_DEBUG_VALGRIND - buf_page_t* prev = UT_LIST_GET_PREV(list, bpage); - buf_page_t* next = UT_LIST_GET_NEXT(list, bpage); + buf_page_t* prev = UT_LIST_GET_PREV(zip_list, bpage); + buf_page_t* next = UT_LIST_GET_NEXT(zip_list, bpage); if (prev) UNIV_MEM_VALID(prev, BUF_BUDDY_LOW << i); if (next) UNIV_MEM_VALID(next, BUF_BUDDY_LOW << i); @@ -113,9 +114,10 @@ ut_ad(!next || buf_page_get_state(next) == BUF_BLOCK_ZIP_FREE); #endif /* UNIV_DEBUG_VALGRIND */ - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&zip_free_mutex)); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); - 
UT_LIST_REMOVE(list, buf_pool->zip_free[i], bpage); + UT_LIST_REMOVE(zip_list, buf_pool->zip_free[i], bpage); #ifdef UNIV_DEBUG_VALGRIND if (prev) UNIV_MEM_FREE(prev, BUF_BUDDY_LOW << i); @@ -134,12 +136,13 @@ { buf_page_t* bpage; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&zip_free_mutex)); ut_a(i < BUF_BUDDY_SIZES); #ifndef UNIV_DEBUG_VALGRIND /* Valgrind would complain about accessing free memory. */ - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i], + ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i], ut_ad(buf_page_get_state(ut_list_node_313) == BUF_BLOCK_ZIP_FREE))); #endif /* !UNIV_DEBUG_VALGRIND */ @@ -182,16 +185,19 @@ void buf_buddy_block_free( /*=================*/ - void* buf) /*!< in: buffer frame to deallocate */ + void* buf, /*!< in: buffer frame to deallocate */ + ibool have_page_hash_mutex) { const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf); buf_page_t* bpage; buf_block_t* block; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(!mutex_own(&buf_pool_zip_mutex)); ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE)); + mutex_enter(&zip_hash_mutex); + HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage, ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY && bpage->in_zip_hash && !bpage->in_page_hash), @@ -203,12 +209,14 @@ ut_d(bpage->in_zip_hash = FALSE); HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage); + mutex_exit(&zip_hash_mutex); + ut_d(memset(buf, 0, UNIV_PAGE_SIZE)); UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE); block = (buf_block_t*) bpage; mutex_enter(&block->mutex); - buf_LRU_block_free_non_file_page(block); + buf_LRU_block_free_non_file_page(block, have_page_hash_mutex); mutex_exit(&block->mutex); ut_ad(buf_buddy_n_frames > 0); @@ -224,7 +232,7 @@ buf_block_t* block) /*!< in: buffer frame to allocate */ { const ulint fold = BUF_POOL_ZIP_FOLD(block); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); 
ut_ad(!mutex_own(&buf_pool_zip_mutex)); ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE); @@ -236,7 +244,10 @@ ut_ad(!block->page.in_page_hash); ut_ad(!block->page.in_zip_hash); ut_d(block->page.in_zip_hash = TRUE); + + mutex_enter(&zip_hash_mutex); HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page); + mutex_exit(&zip_hash_mutex); ut_d(buf_buddy_n_frames++); } @@ -270,7 +281,7 @@ bpage->state = BUF_BLOCK_ZIP_FREE; #ifndef UNIV_DEBUG_VALGRIND /* Valgrind would complain about accessing free memory. */ - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i], + ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i], ut_ad(buf_page_get_state( ut_list_node_313) == BUF_BLOCK_ZIP_FREE))); @@ -292,24 +303,28 @@ /*================*/ ulint i, /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ - ibool* lru) /*!< in: pointer to a variable that will be assigned + ibool* lru, /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list and buf_pool_mutex was temporarily released, or NULL if the LRU list should not be used */ + ibool have_page_hash_mutex) { buf_block_t* block; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(!mutex_own(&buf_pool_zip_mutex)); if (i < BUF_BUDDY_SIZES) { /* Try to allocate from the buddy system. */ + mutex_enter(&zip_free_mutex); block = buf_buddy_alloc_zip(i); if (block) { goto func_exit; } + + mutex_exit(&zip_free_mutex); } /* Try allocating from the buf_pool->free list. */ @@ -326,18 +341,29 @@ } /* Try replacing an uncompressed page in the buffer pool. 
*/ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + if (have_page_hash_mutex) { + rw_lock_x_unlock(&page_hash_latch); + } block = buf_LRU_get_free_block(0); *lru = TRUE; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + if (have_page_hash_mutex) { + rw_lock_x_lock(&page_hash_latch); + } alloc_big: buf_buddy_block_register(block); + mutex_enter(&zip_free_mutex); block = buf_buddy_alloc_from(block->frame, i, BUF_BUDDY_SIZES); func_exit: buf_buddy_stat[i].used++; + mutex_exit(&zip_free_mutex); + return(block); } @@ -353,7 +379,10 @@ { buf_page_t* b; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX)); +#endif switch (buf_page_get_state(bpage)) { case BUF_BLOCK_ZIP_FREE: @@ -362,7 +391,7 @@ case BUF_BLOCK_FILE_PAGE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: - ut_error; + /* ut_error; */ /* optimistic */ case BUF_BLOCK_ZIP_DIRTY: /* Cannot relocate dirty pages. 
*/ return(FALSE); @@ -372,9 +401,17 @@ } mutex_enter(&buf_pool_zip_mutex); + mutex_enter(&zip_free_mutex); if (!buf_page_can_relocate(bpage)) { mutex_exit(&buf_pool_zip_mutex); + mutex_exit(&zip_free_mutex); + return(FALSE); + } + + if (bpage != buf_page_hash_get(bpage->space, bpage->offset)) { + mutex_exit(&buf_pool_zip_mutex); + mutex_exit(&zip_free_mutex); return(FALSE); } @@ -382,18 +419,21 @@ ut_d(bpage->state = BUF_BLOCK_ZIP_FREE); /* relocate buf_pool->zip_clean */ - b = UT_LIST_GET_PREV(list, dpage); - UT_LIST_REMOVE(list, buf_pool->zip_clean, dpage); + mutex_enter(&flush_list_mutex); + b = UT_LIST_GET_PREV(zip_list, dpage); + UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, dpage); if (b) { - UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, dpage); + UT_LIST_INSERT_AFTER(zip_list, buf_pool->zip_clean, b, dpage); } else { - UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, dpage); + UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, dpage); } + mutex_exit(&flush_list_mutex); UNIV_MEM_INVALID(bpage, sizeof *bpage); mutex_exit(&buf_pool_zip_mutex); + mutex_exit(&zip_free_mutex); return(TRUE); } @@ -406,13 +446,15 @@ /*===============*/ void* src, /*!< in: block to relocate */ void* dst, /*!< in: free block to relocate to */ - ulint i) /*!< in: index of buf_pool->zip_free[] */ + ulint i, /*!< in: index of buf_pool->zip_free[] */ + ibool have_page_hash_mutex) { buf_page_t* bpage; const ulint size = BUF_BUDDY_LOW << i; ullint usec = ut_time_us(NULL); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&zip_free_mutex)); ut_ad(!mutex_own(&buf_pool_zip_mutex)); ut_ad(!ut_align_offset(src, size)); ut_ad(!ut_align_offset(dst, size)); @@ -434,6 +476,12 @@ /* This is a compressed page. */ mutex_t* mutex; + if (!have_page_hash_mutex) { + mutex_exit(&zip_free_mutex); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); + } + /* The src block may be split into smaller blocks, some of which may be free. 
Thus, the mach_read_from_4() calls below may attempt to read @@ -458,6 +506,11 @@ added to buf_pool->page_hash yet. Obviously, it cannot be relocated. */ + if (!have_page_hash_mutex) { + mutex_enter(&zip_free_mutex); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } return(FALSE); } @@ -467,18 +520,27 @@ For the sake of simplicity, give up. */ ut_ad(page_zip_get_size(&bpage->zip) < size); + if (!have_page_hash_mutex) { + mutex_enter(&zip_free_mutex); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } return(FALSE); } + /* To keep latch order */ + if (have_page_hash_mutex) + mutex_exit(&zip_free_mutex); + /* The block must have been allocated, but it may contain uninitialized data. */ UNIV_MEM_ASSERT_W(src, size); - mutex = buf_page_get_mutex(bpage); + mutex = buf_page_get_mutex_enter(bpage); - mutex_enter(mutex); + mutex_enter(&zip_free_mutex); - if (buf_page_can_relocate(bpage)) { + if (mutex && buf_page_can_relocate(bpage)) { /* Relocate the compressed page. */ ut_a(bpage->zip.data == src); memcpy(dst, src, size); @@ -493,10 +555,22 @@ buddy_stat->relocated_usec += ut_time_us(NULL) - usec; } + + if (!have_page_hash_mutex) { + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } return(TRUE); } - mutex_exit(mutex); + if (!have_page_hash_mutex) { + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } + + if (mutex) { + mutex_exit(mutex); + } } else if (i == buf_buddy_get_slot(sizeof(buf_page_t))) { /* This must be a buf_page_t object. */ #if UNIV_WORD_SIZE == 4 @@ -505,10 +579,31 @@ about uninitialized pad bytes. 
*/ UNIV_MEM_ASSERT_RW(src, size); #endif + + mutex_exit(&zip_free_mutex); + + if (!have_page_hash_mutex) { + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); + } + if (buf_buddy_relocate_block(src, dst)) { + mutex_enter(&zip_free_mutex); + + if (!have_page_hash_mutex) { + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } goto success; } + + mutex_enter(&zip_free_mutex); + + if (!have_page_hash_mutex) { + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } } return(FALSE); @@ -522,13 +617,15 @@ /*===============*/ void* buf, /*!< in: block to be freed, must not be pointed to by the buffer pool */ - ulint i) /*!< in: index of buf_pool->zip_free[], + ulint i, /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ + ibool have_page_hash_mutex) { buf_page_t* bpage; buf_page_t* buddy; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&zip_free_mutex)); ut_ad(!mutex_own(&buf_pool_zip_mutex)); ut_ad(i <= BUF_BUDDY_SIZES); ut_ad(buf_buddy_stat[i].used > 0); @@ -539,7 +636,9 @@ ut_d(((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE); if (i == BUF_BUDDY_SIZES) { - buf_buddy_block_free(buf); + mutex_exit(&zip_free_mutex); + buf_buddy_block_free(buf, have_page_hash_mutex); + mutex_enter(&zip_free_mutex); return; } @@ -584,7 +683,7 @@ ut_a(bpage != buf); { - buf_page_t* next = UT_LIST_GET_NEXT(list, bpage); + buf_page_t* next = UT_LIST_GET_NEXT(zip_list, bpage); UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i); bpage = next; } @@ -593,13 +692,13 @@ #ifndef UNIV_DEBUG_VALGRIND buddy_nonfree: /* Valgrind would complain about accessing free memory. */ - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i], + ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i], ut_ad(buf_page_get_state(ut_list_node_313) == BUF_BLOCK_ZIP_FREE))); #endif /* UNIV_DEBUG_VALGRIND */ /* The buddy is not free. Is there a free block of this size? 
*/ - bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); + bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]); if (bpage) { /* Remove the block from the free list, because a successful @@ -609,7 +708,7 @@ buf_buddy_remove_from_free(bpage, i); /* Try to relocate the buddy of buf to the free block. */ - if (buf_buddy_relocate(buddy, bpage, i)) { + if (buf_buddy_relocate(buddy, bpage, i, have_page_hash_mutex)) { ut_d(buddy->state = BUF_BLOCK_ZIP_FREE); goto buddy_free2; @@ -629,14 +728,14 @@ (Parts of the buddy can be free in buf_pool->zip_free[j] with j < i.) */ - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i], + ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i], ut_ad(buf_page_get_state( ut_list_node_313) == BUF_BLOCK_ZIP_FREE && ut_list_node_313 != buddy))); #endif /* !UNIV_DEBUG_VALGRIND */ - if (buf_buddy_relocate(buddy, buf, i)) { + if (buf_buddy_relocate(buddy, buf, i, have_page_hash_mutex)) { buf = bpage; UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); diff -ruN a/storage/innodb_plugin/buf/buf0buf.c b/storage/innodb_plugin/buf/buf0buf.c --- a/storage/innodb_plugin/buf/buf0buf.c 2010-08-27 15:55:39.385322978 +0900 +++ b/storage/innodb_plugin/buf/buf0buf.c 2010-08-27 16:11:40.603021006 +0900 @@ -251,6 +251,12 @@ /** mutex protecting the buffer pool struct and control blocks, except the read-write lock in them */ UNIV_INTERN mutex_t buf_pool_mutex; +UNIV_INTERN mutex_t LRU_list_mutex; +UNIV_INTERN mutex_t flush_list_mutex; +UNIV_INTERN rw_lock_t page_hash_latch; +UNIV_INTERN mutex_t free_list_mutex; +UNIV_INTERN mutex_t zip_free_mutex; +UNIV_INTERN mutex_t zip_hash_mutex; /** mutex protecting the control blocks of compressed-only pages (of type buf_page_t, not buf_block_t) */ UNIV_INTERN mutex_t buf_pool_zip_mutex; @@ -661,9 +667,9 @@ block->page.in_zip_hash = FALSE; block->page.in_flush_list = FALSE; block->page.in_free_list = FALSE; - block->in_unzip_LRU_list = FALSE; #endif /* UNIV_DEBUG */ block->page.in_LRU_list = FALSE; + 
block->in_unzip_LRU_list = FALSE; #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG block->n_pointers = 0; #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ @@ -748,8 +754,10 @@ memset(block->frame, '\0', UNIV_PAGE_SIZE); #endif /* Add the block to the free list */ - UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page)); + mutex_enter(&free_list_mutex); + UT_LIST_ADD_LAST(free, buf_pool->free, (&block->page)); ut_d(block->page.in_free_list = TRUE); + mutex_exit(&free_list_mutex); block++; frame += UNIV_PAGE_SIZE; @@ -774,7 +782,7 @@ ulint i; ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); block = chunk->blocks; @@ -826,7 +834,7 @@ ulint i; ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); /*optimistic...*/ block = chunk->blocks; @@ -878,7 +886,7 @@ ulint i; ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own()); /* but we need all mutex here */ block = chunk->blocks; @@ -904,7 +912,7 @@ buf_block_t* block; const buf_block_t* block_end; - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own()); /* but we need all mutex here */ block_end = chunk->blocks + chunk->size; @@ -916,8 +924,10 @@ ut_ad(!block->in_unzip_LRU_list); ut_ad(!block->page.in_flush_list); /* Remove the block from the free list. */ + mutex_enter(&free_list_mutex); ut_ad(block->page.in_free_list); - UT_LIST_REMOVE(list, buf_pool->free, (&block->page)); + UT_LIST_REMOVE(free, buf_pool->free, (&block->page)); + mutex_exit(&free_list_mutex); /* Free the latches. */ mutex_free(&block->mutex); @@ -947,8 +957,17 @@ /* 1. 
Initialize general fields ------------------------------- */ mutex_create(&buf_pool_mutex, SYNC_BUF_POOL); + mutex_create(&LRU_list_mutex, SYNC_BUF_LRU_LIST); + mutex_create(&flush_list_mutex, SYNC_BUF_FLUSH_LIST); + rw_lock_create(&page_hash_latch, SYNC_BUF_PAGE_HASH); + mutex_create(&free_list_mutex, SYNC_BUF_FREE_LIST); + mutex_create(&zip_free_mutex, SYNC_BUF_ZIP_FREE); + mutex_create(&zip_hash_mutex, SYNC_BUF_ZIP_HASH); + mutex_create(&buf_pool_zip_mutex, SYNC_BUF_BLOCK); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); buf_pool_mutex_enter(); buf_pool->n_chunks = 1; @@ -983,6 +1002,8 @@ --------------------------- */ /* All fields are initialized by mem_zalloc(). */ + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); buf_pool_mutex_exit(); btr_search_sys_create(buf_pool->curr_size @@ -1120,7 +1141,11 @@ buf_page_t* b; ulint fold; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX)); +#endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); ut_a(bpage->buf_fix_count == 0); @@ -1204,7 +1229,8 @@ try_again: btr_search_disable(); /* Empty the adaptive hash index again */ - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); shrink_again: if (buf_pool->n_chunks <= 1) { @@ -1275,7 +1301,7 @@ buf_LRU_make_block_old(&block->page); dirty++; - } else if (buf_LRU_free_block(&block->page, TRUE, NULL) + } else if (buf_LRU_free_block(&block->page, TRUE, NULL, FALSE) != BUF_LRU_FREED) { nonfree++; } @@ -1283,7 +1309,8 @@ mutex_exit(&block->mutex); } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); /* Request for a flush of the chunk if it helps. 
Do not flush if there are non-free blocks, since @@ -1332,7 +1359,8 @@ func_done: srv_buf_pool_old_size = srv_buf_pool_size; func_exit: - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); btr_search_enable(); } @@ -1350,7 +1378,11 @@ hash_table_t* zip_hash; buf_page_t* b; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); + mutex_enter(&flush_list_mutex); + /* Free, create, and populate the hash table. */ hash_table_free(buf_pool->page_hash); @@ -1392,7 +1424,7 @@ in buf_pool->flush_list. */ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(zip_list, b)) { ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); ut_ad(!b->in_flush_list); ut_ad(b->in_LRU_list); @@ -1404,7 +1436,7 @@ } for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(flush_list, b)) { ut_ad(b->in_flush_list); ut_ad(b->in_LRU_list); ut_ad(b->in_page_hash); @@ -1430,7 +1462,10 @@ } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + mutex_exit(&flush_list_mutex); } /********************************************************************//** @@ -1440,17 +1475,20 @@ buf_pool_resize(void) /*=================*/ { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); if (srv_buf_pool_old_size == srv_buf_pool_size) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); return; } if (srv_buf_pool_curr_size + 1048576 > srv_buf_pool_size) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); /* Disable adaptive hash indexes and empty the index in order to free up memory in the buffer pool chunks. 
*/ @@ -1484,7 +1522,8 @@ } srv_buf_pool_old_size = srv_buf_pool_size; - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); } buf_pool_page_hash_rebuild(); @@ -1500,13 +1539,15 @@ /*================*/ buf_page_t* bpage) /*!< in: buffer block of a file page */ { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); ut_a(buf_page_in_file(bpage)); buf_LRU_make_block_young(bpage); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); } /********************************************************************//** @@ -1528,14 +1569,20 @@ ut_a(buf_page_in_file(bpage)); if (buf_page_peek_if_too_old(bpage)) { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); buf_LRU_make_block_young(bpage); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); } else if (!access_time) { ulint time_ms = ut_time_ms(); - buf_pool_mutex_enter(); + mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); + //buf_pool_mutex_enter(); + if (block_mutex) { buf_page_set_accessed(bpage, time_ms); - buf_pool_mutex_exit(); + mutex_exit(block_mutex); + } + //buf_pool_mutex_exit(); } } @@ -1551,7 +1598,8 @@ { buf_block_t* block; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); block = (buf_block_t*) buf_page_hash_get(space, offset); @@ -1559,7 +1607,8 @@ block->check_index_page_at_flush = FALSE; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); } /********************************************************************//** @@ -1577,7 +1626,8 @@ buf_block_t* block; ibool is_hashed; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); block = (buf_block_t*) buf_page_hash_get(space, offset); @@ -1587,7 +1637,8 @@ is_hashed = block->is_hashed; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); 
return(is_hashed); } @@ -1608,7 +1659,8 @@ { buf_page_t* bpage; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); bpage = buf_page_hash_get(space, offset); @@ -1616,7 +1668,8 @@ bpage->file_page_was_freed = TRUE; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(bpage); } @@ -1636,7 +1689,8 @@ { buf_page_t* bpage; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); bpage = buf_page_hash_get(space, offset); @@ -1644,7 +1698,8 @@ bpage->file_page_was_freed = FALSE; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(bpage); } @@ -1678,8 +1733,9 @@ buf_pool->stat.n_page_gets++; for (;;) { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); lookup: + rw_lock_s_lock(&page_hash_latch); bpage = buf_page_hash_get(space, offset); if (bpage) { break; @@ -1687,7 +1743,8 @@ /* Page not in buf_pool: needs to be read from file */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); buf_read_page(space, zip_size, offset); @@ -1699,29 +1756,34 @@ if (UNIV_UNLIKELY(!bpage->zip.data)) { /* There is no compressed page. 
*/ err_exit: - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(NULL); } + block_mutex = buf_page_get_mutex_enter(bpage); + + rw_lock_s_unlock(&page_hash_latch); + switch (buf_page_get_state(bpage)) { case BUF_BLOCK_NOT_USED: case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: case BUF_BLOCK_ZIP_FREE: + if (block_mutex) + mutex_exit(block_mutex); break; case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: - block_mutex = &buf_pool_zip_mutex; - mutex_enter(block_mutex); + ut_a(block_mutex == &buf_pool_zip_mutex); bpage->buf_fix_count++; goto got_block; case BUF_BLOCK_FILE_PAGE: - block_mutex = &((buf_block_t*) bpage)->mutex; - mutex_enter(block_mutex); + ut_a(block_mutex == &((buf_block_t*) bpage)->mutex); /* Discard the uncompressed page frame if possible. */ - if (buf_LRU_free_block(bpage, FALSE, NULL) + if (buf_LRU_free_block(bpage, FALSE, NULL, FALSE) == BUF_LRU_FREED) { mutex_exit(block_mutex); @@ -1740,7 +1802,7 @@ must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ; access_time = buf_page_is_accessed(bpage); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); mutex_exit(block_mutex); @@ -1995,7 +2057,7 @@ const buf_block_t* block) /*!< in: pointer to block, not dereferenced */ { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); if (UNIV_UNLIKELY((((ulint) block) % sizeof *block) != 0)) { /* The pointer should be aligned. */ @@ -2029,6 +2091,7 @@ ulint fix_type; ibool must_read; ulint retries = 0; + mutex_t* block_mutex; ut_ad(mtr); ut_ad(mtr->state == MTR_ACTIVE); @@ -2046,9 +2109,11 @@ buf_pool->stat.n_page_gets++; loop: block = guess; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); if (block) { + block_mutex = buf_page_get_mutex_enter((buf_page_t*)block); + /* If the guess is a compressed page descriptor that has been allocated by buf_buddy_alloc(), it may have been invalidated by buf_buddy_relocate(). 
In that @@ -2057,11 +2122,15 @@ the guess may be pointing to a buffer pool chunk that has been released when resizing the buffer pool. */ - if (!buf_block_is_uncompressed(block) + if (!block_mutex) { + block = guess = NULL; + } else if (!buf_block_is_uncompressed(block) || offset != block->page.offset || space != block->page.space || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { + mutex_exit(block_mutex); + block = guess = NULL; } else { ut_ad(!block->page.in_zip_hash); @@ -2070,14 +2139,20 @@ } if (block == NULL) { + rw_lock_s_lock(&page_hash_latch); block = (buf_block_t*) buf_page_hash_get(space, offset); + if (block) { + block_mutex = buf_page_get_mutex_enter((buf_page_t*)block); + ut_a(block_mutex); + } + rw_lock_s_unlock(&page_hash_latch); } loop2: if (block == NULL) { /* Page not in buf_pool: needs to be read from file */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); if (mode == BUF_GET_IF_IN_POOL) { @@ -2120,7 +2195,8 @@ if (must_read && mode == BUF_GET_IF_IN_POOL) { /* The page is only being read to buffer */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(block_mutex); return(NULL); } @@ -2130,38 +2206,50 @@ ibool success; case BUF_BLOCK_FILE_PAGE: + if (block_mutex == &buf_pool_zip_mutex) { + /* it is wrong mutex... */ + mutex_exit(block_mutex); + goto loop; + } break; case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: + ut_ad(block_mutex == &buf_pool_zip_mutex); bpage = &block->page; /* Protect bpage->buf_fix_count. */ - mutex_enter(&buf_pool_zip_mutex); + /* Already protected here. */ + //mutex_enter(&buf_pool_zip_mutex); if (bpage->buf_fix_count || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { /* This condition often occurs when the buffer is not buffer-fixed, but I/O-fixed by buf_page_init_for_read(). */ - mutex_exit(&buf_pool_zip_mutex); + //mutex_exit(&buf_pool_zip_mutex); wait_until_unfixed: /* The block is buffer-fixed or I/O-fixed. Try again later. 
*/ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(block_mutex); os_thread_sleep(WAIT_FOR_READ); goto loop; } /* Allocate an uncompressed page. */ - buf_pool_mutex_exit(); - mutex_exit(&buf_pool_zip_mutex); + //buf_pool_mutex_exit(); + //mutex_exit(&buf_pool_zip_mutex); + mutex_exit(block_mutex); block = buf_LRU_get_free_block(0); ut_a(block); + block_mutex = &block->mutex; - buf_pool_mutex_enter(); - mutex_enter(&block->mutex); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); + mutex_enter(block_mutex); { buf_page_t* hash_bpage @@ -2172,35 +2260,49 @@ while buf_pool_mutex was released. Free the block that was allocated. */ - buf_LRU_block_free_non_file_page(block); - mutex_exit(&block->mutex); + buf_LRU_block_free_non_file_page(block, TRUE); + mutex_exit(block_mutex); block = (buf_block_t*) hash_bpage; + if (block) { + block_mutex = buf_page_get_mutex_enter((buf_page_t*)block); + ut_a(block_mutex); + } + rw_lock_x_unlock(&page_hash_latch); + mutex_exit(&LRU_list_mutex); goto loop2; } } + mutex_enter(&buf_pool_zip_mutex); + if (UNIV_UNLIKELY (bpage->buf_fix_count || buf_page_get_io_fix(bpage) != BUF_IO_NONE)) { + mutex_exit(&buf_pool_zip_mutex); /* The block was buffer-fixed or I/O-fixed while buf_pool_mutex was not held by this thread. Free the block that was allocated and try again. This should be extremely unlikely. */ - buf_LRU_block_free_non_file_page(block); - mutex_exit(&block->mutex); + buf_LRU_block_free_non_file_page(block, TRUE); + //mutex_exit(&block->mutex); + rw_lock_x_unlock(&page_hash_latch); + mutex_exit(&LRU_list_mutex); goto wait_until_unfixed; } /* Move the compressed page from bpage to block, and uncompress it. 
*/ - mutex_enter(&buf_pool_zip_mutex); + mutex_enter(&flush_list_mutex); buf_relocate(bpage, &block->page); + + rw_lock_x_unlock(&page_hash_latch); + buf_block_init_low(block); block->lock_hash_val = lock_rec_hash(space, offset); @@ -2209,7 +2311,7 @@ if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) { - UT_LIST_REMOVE(list, buf_pool->zip_clean, + UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, &block->page); ut_ad(!block->page.in_flush_list); } else { @@ -2218,6 +2320,8 @@ &block->page); } + mutex_exit(&flush_list_mutex); + /* Buffer-fix, I/O-fix, and X-latch the block for the duration of the decompression. Also add the block to the unzip_LRU list. */ @@ -2226,19 +2330,24 @@ /* Insert at the front of unzip_LRU list */ buf_unzip_LRU_add_block(block, FALSE); + mutex_exit(&LRU_list_mutex); + block->page.buf_fix_count = 1; buf_block_set_io_fix(block, BUF_IO_READ); rw_lock_x_lock_func(&block->lock, 0, file, line); UNIV_MEM_INVALID(bpage, sizeof *bpage); - mutex_exit(&block->mutex); + mutex_exit(block_mutex); mutex_exit(&buf_pool_zip_mutex); + + mutex_enter(&buf_pool_mutex); buf_pool->n_pend_unzip++; + mutex_exit(&buf_pool_mutex); - buf_buddy_free(bpage, sizeof *bpage); + buf_buddy_free(bpage, sizeof *bpage, FALSE); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); /* Decompress the page and apply buffered operations while not holding buf_pool_mutex or block->mutex. */ @@ -2251,12 +2360,15 @@ } /* Unfix and unlatch the block. 
*/ - buf_pool_mutex_enter(); - mutex_enter(&block->mutex); + //buf_pool_mutex_enter(); + block_mutex = &block->mutex; + mutex_enter(block_mutex); block->page.buf_fix_count--; buf_block_set_io_fix(block, BUF_IO_NONE); - mutex_exit(&block->mutex); + + mutex_enter(&buf_pool_mutex); buf_pool->n_pend_unzip--; + mutex_exit(&buf_pool_mutex); rw_lock_x_unlock(&block->lock); break; @@ -2271,7 +2383,7 @@ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - mutex_enter(&block->mutex); + //mutex_enter(&block->mutex); #if UNIV_WORD_SIZE == 4 /* On 32-bit systems, there is no padding in buf_page_t. On other systems, Valgrind could complain about uninitialized pad @@ -2305,13 +2417,14 @@ buf_block_buf_fix_inc(block, file, line); - mutex_exit(&block->mutex); + //mutex_exit(&block->mutex); /* Check if this is the first access to the page */ access_time = buf_page_is_accessed(&block->page); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(block_mutex); buf_page_set_accessed_make_young(&block->page, access_time); @@ -2539,9 +2652,11 @@ mutex_exit(&block->mutex); if (mode == BUF_MAKE_YOUNG && buf_page_peek_if_too_old(&block->page)) { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); buf_LRU_make_block_young(&block->page); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); } else if (!buf_page_is_accessed(&block->page)) { /* Above, we do a dirty read on purpose, to avoid mutex contention. The field buf_page_t::access_time @@ -2549,9 +2664,11 @@ field must be protected by mutex, however. 
*/ ulint time_ms = ut_time_ms(); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&block->mutex); buf_page_set_accessed(&block->page, time_ms); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&block->mutex); } ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD)); @@ -2617,16 +2734,19 @@ ut_ad(mtr); ut_ad(mtr->state == MTR_ACTIVE); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); block = buf_block_hash_get(space_id, page_no); if (!block) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(NULL); } mutex_enter(&block->mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); @@ -2713,7 +2833,10 @@ { buf_page_t* hash_page; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX)); +#endif ut_ad(mutex_own(&(block->mutex))); ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); @@ -2746,7 +2869,8 @@ (const void*) hash_page, (const void*) block); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG mutex_exit(&block->mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_x_unlock(&page_hash_latch); buf_print(); buf_LRU_print(); buf_validate(); @@ -2825,16 +2949,24 @@ ut_ad(block); } - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); if (buf_page_hash_get(space, offset)) { /* The page is already in the buffer pool. 
*/ err_exit: if (block) { mutex_enter(&block->mutex); - buf_LRU_block_free_non_file_page(block); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + buf_LRU_block_free_non_file_page(block, FALSE); mutex_exit(&block->mutex); } + else { + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } bpage = NULL; goto func_exit; @@ -2854,6 +2986,8 @@ mutex_enter(&block->mutex); buf_page_init(space, offset, block); + rw_lock_x_unlock(&page_hash_latch); + /* The block must be put to the LRU list, to the old blocks */ buf_LRU_add_block(bpage, TRUE/* to old blocks */); @@ -2881,7 +3015,7 @@ been added to buf_pool->LRU and buf_pool->page_hash. */ mutex_exit(&block->mutex); - data = buf_buddy_alloc(zip_size, &lru); + data = buf_buddy_alloc(zip_size, &lru, FALSE); mutex_enter(&block->mutex); block->page.zip.data = data; @@ -2894,6 +3028,7 @@ buf_unzip_LRU_add_block(block, TRUE); } + mutex_exit(&LRU_list_mutex); mutex_exit(&block->mutex); } else { /* Defer buf_buddy_alloc() until after the block has @@ -2905,8 +3040,8 @@ control block (bpage), in order to avoid the invocation of buf_buddy_relocate_block() on uninitialized data. */ - data = buf_buddy_alloc(zip_size, &lru); - bpage = buf_buddy_alloc(sizeof *bpage, &lru); + data = buf_buddy_alloc(zip_size, &lru, TRUE); + bpage = buf_buddy_alloc(sizeof *bpage, &lru, TRUE); /* If buf_buddy_alloc() allocated storage from the LRU list, it released and reacquired buf_pool_mutex. Thus, we must @@ -2915,8 +3050,11 @@ && UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) { /* The block was added by some other thread. 
*/ - buf_buddy_free(bpage, sizeof *bpage); - buf_buddy_free(data, zip_size); + buf_buddy_free(bpage, sizeof *bpage, TRUE); + buf_buddy_free(data, zip_size, TRUE); + + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); bpage = NULL; goto func_exit; @@ -2946,18 +3084,26 @@ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, buf_page_address_fold(space, offset), bpage); + rw_lock_x_unlock(&page_hash_latch); + /* The block must be put to the LRU list, to the old blocks */ buf_LRU_add_block(bpage, TRUE/* to old blocks */); + mutex_enter(&flush_list_mutex); buf_LRU_insert_zip_clean(bpage); + mutex_exit(&flush_list_mutex); + + mutex_exit(&LRU_list_mutex); buf_page_set_io_fix(bpage, BUF_IO_READ); mutex_exit(&buf_pool_zip_mutex); } + mutex_enter(&buf_pool_mutex); buf_pool->n_pend_reads++; + mutex_exit(&buf_pool_mutex); func_exit: - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); if (mode == BUF_READ_IBUF_PAGES_ONLY) { @@ -2995,7 +3141,9 @@ free_block = buf_LRU_get_free_block(0); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); block = (buf_block_t*) buf_page_hash_get(space, offset); @@ -3008,7 +3156,9 @@ #endif /* UNIV_DEBUG_FILE_ACCESSES */ /* Page can be found in buf_pool */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); buf_block_free(free_block); @@ -3030,6 +3180,7 @@ mutex_enter(&block->mutex); buf_page_init(space, offset, block); + rw_lock_x_unlock(&page_hash_latch); /* The block must be put to the LRU list */ buf_LRU_add_block(&block->page, FALSE); @@ -3056,7 +3207,7 @@ the reacquisition of buf_pool_mutex. We also must defer this operation until after the block descriptor has been added to buf_pool->LRU and buf_pool->page_hash. 
*/ - data = buf_buddy_alloc(zip_size, &lru); + data = buf_buddy_alloc(zip_size, &lru, FALSE); mutex_enter(&block->mutex); block->page.zip.data = data; @@ -3074,7 +3225,8 @@ buf_page_set_accessed(&block->page, time_ms); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); @@ -3124,6 +3276,8 @@ enum buf_io_fix io_type; const ibool uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + enum buf_flush flush_type; + mutex_t* block_mutex; ut_a(buf_page_in_file(bpage)); @@ -3257,8 +3411,17 @@ } } - buf_pool_mutex_enter(); - mutex_enter(buf_page_get_mutex(bpage)); + //buf_pool_mutex_enter(); + if (io_type == BUF_IO_WRITE) { + flush_type = buf_page_get_flush_type(bpage); + /* to keep consistency at buf_LRU_insert_zip_clean() */ + //if (flush_type == BUF_FLUSH_LRU) { /* optimistic! */ + mutex_enter(&LRU_list_mutex); + //} + } + block_mutex = buf_page_get_mutex_enter(bpage); + ut_a(block_mutex); + mutex_enter(&buf_pool_mutex); #ifdef UNIV_IBUF_COUNT_DEBUG if (io_type == BUF_IO_WRITE || uncompressed) { @@ -3298,6 +3461,11 @@ buf_flush_write_complete(bpage); + /* to keep consistency at buf_LRU_insert_zip_clean() */ + //if (flush_type == BUF_FLUSH_LRU) { /* optimistic! 
*/ + mutex_exit(&LRU_list_mutex); + //} + if (uncompressed) { rw_lock_s_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_WRITE); @@ -3320,8 +3488,9 @@ } #endif /* UNIV_DEBUG */ - mutex_exit(buf_page_get_mutex(bpage)); - buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); + mutex_exit(block_mutex); + //buf_pool_mutex_exit(); } /*********************************************************************//** @@ -3368,7 +3537,8 @@ freed = buf_LRU_search_and_free_block(100); } - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0); ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0); @@ -3381,7 +3551,8 @@ memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat)); buf_refresh_io_stats(); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); } #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG @@ -3406,7 +3577,10 @@ ut_ad(buf_pool); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); + /* for keep the new latch order, it cannot validate correctly... */ chunk = buf_pool->chunks; @@ -3505,7 +3679,7 @@ /* Check clean compressed-only blocks. */ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(zip_list, b)) { ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); switch (buf_page_get_io_fix(b)) { case BUF_IO_NONE: @@ -3530,8 +3704,9 @@ /* Check dirty compressed-only blocks. 
*/ + mutex_enter(&flush_list_mutex); for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(flush_list, b)) { ut_ad(b->in_flush_list); switch (buf_page_get_state(b)) { @@ -3576,6 +3751,7 @@ } ut_a(buf_page_hash_get(b->space, b->offset) == b); } + mutex_exit(&flush_list_mutex); mutex_exit(&buf_pool_zip_mutex); @@ -3587,19 +3763,27 @@ } ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru); + /* because of latching order with block->mutex, we cannot get free_list_mutex before that */ +/* if (UT_LIST_GET_LEN(buf_pool->free) != n_free) { fprintf(stderr, "Free list len %lu, free blocks %lu\n", (ulong) UT_LIST_GET_LEN(buf_pool->free), (ulong) n_free); ut_error; } +*/ + /* because of latching order with block->mutex, we cannot get flush_list_mutex before that */ +/* ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); +*/ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); ut_a(buf_LRU_validate()); ut_a(buf_flush_validate()); @@ -3633,7 +3817,10 @@ index_ids = mem_alloc(sizeof(dulint) * size); counts = mem_alloc(sizeof(ulint) * size); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + mutex_enter(&free_list_mutex); + mutex_enter(&flush_list_mutex); fprintf(stderr, "buf_pool size %lu\n" @@ -3700,7 +3887,10 @@ } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + mutex_exit(&free_list_mutex); + mutex_exit(&flush_list_mutex); for (i = 0; i < n_found; i++) { index = dict_index_get_if_in_cache(index_ids[i]); @@ -3739,7 +3929,7 @@ ulint i; ulint fixed_pages_number = 0; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); chunk = buf_pool->chunks; @@ -3773,7 +3963,7 @@ /* Traverse the lists of clean and dirty 
compressed-only blocks. */ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(zip_list, b)) { ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE); @@ -3783,8 +3973,9 @@ } } + mutex_enter(&flush_list_mutex); for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(flush_list, b)) { ut_ad(b->in_flush_list); switch (buf_page_get_state(b)) { @@ -3807,9 +3998,10 @@ break; } } + mutex_exit(&flush_list_mutex); mutex_exit(&buf_pool_zip_mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); return(fixed_pages_number); } @@ -3867,7 +4059,11 @@ ut_ad(buf_pool); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + mutex_enter(&free_list_mutex); + mutex_enter(&buf_pool_mutex); + mutex_enter(&flush_list_mutex); fprintf(file, "Buffer pool size %lu\n" @@ -3966,7 +4162,11 @@ buf_LRU_stat_sum.unzip, buf_LRU_stat_cur.unzip); buf_refresh_io_stats(); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + mutex_exit(&free_list_mutex); + mutex_exit(&buf_pool_mutex); + mutex_exit(&flush_list_mutex); } /**********************************************************************//** @@ -3993,7 +4193,7 @@ ut_ad(buf_pool); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); /* optimistic */ chunk = buf_pool->chunks; @@ -4010,7 +4210,7 @@ } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); /* optimistic */ return(TRUE); } @@ -4026,7 +4226,8 @@ { ibool ret; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); if (buf_pool->n_pend_reads + buf_pool->n_flush[BUF_FLUSH_LRU] + buf_pool->n_flush[BUF_FLUSH_LIST] @@ -4036,7 +4237,8 @@ ret = TRUE; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); return(ret); } @@ -4051,11 +4253,13 @@ { ulint len; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + 
mutex_enter(&free_list_mutex); len = UT_LIST_GET_LEN(buf_pool->free); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&free_list_mutex); return(len); } diff -ruN a/storage/innodb_plugin/buf/buf0flu.c b/storage/innodb_plugin/buf/buf0flu.c --- a/storage/innodb_plugin/buf/buf0flu.c 2010-08-27 15:54:59.022021357 +0900 +++ b/storage/innodb_plugin/buf/buf0flu.c 2010-08-27 16:11:40.607020890 +0900 @@ -102,7 +102,8 @@ const ib_rbt_node_t* c_node; const ib_rbt_node_t* p_node; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&flush_list_mutex)); /* Insert this buffer into the rbt. */ c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage); @@ -132,7 +133,8 @@ ibool ret = FALSE; #endif /* UNIV_DEBUG */ - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&flush_list_mutex)); #ifdef UNIV_DEBUG ret = #endif /* UNIV_DEBUG */ @@ -199,12 +201,14 @@ buf_flush_init_flush_rbt(void) /*==========================*/ { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&flush_list_mutex); /* Create red black tree for speedy insertions in flush list. 
*/ buf_pool->flush_rbt = rbt_create(sizeof(buf_page_t*), buf_flush_block_cmp); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&flush_list_mutex); } /********************************************************************//** @@ -214,7 +218,8 @@ buf_flush_free_flush_rbt(void) /*==========================*/ { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&flush_list_mutex); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG ut_a(buf_flush_validate_low()); @@ -223,7 +228,8 @@ rbt_free(buf_pool->flush_rbt); buf_pool->flush_rbt = NULL; - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&flush_list_mutex); } /********************************************************************//** @@ -234,7 +240,9 @@ /*=============================*/ buf_block_t* block) /*!< in/out: block which is modified */ { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&block->mutex)); + ut_ad(mutex_own(&flush_list_mutex)); ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL) || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification <= block->page.oldest_modification)); @@ -252,7 +260,7 @@ ut_ad(!block->page.in_zip_hash); ut_ad(!block->page.in_flush_list); ut_d(block->page.in_flush_list = TRUE); - UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page); + UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page); #ifdef UNIV_DEBUG_VALGRIND { @@ -283,7 +291,9 @@ buf_page_t* prev_b; buf_page_t* b; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&block->mutex)); + ut_ad(mutex_own(&flush_list_mutex)); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->page.in_LRU_list); @@ -324,14 +334,14 @@ > block->page.oldest_modification) { ut_ad(b->in_flush_list); prev_b = b; - b = UT_LIST_GET_NEXT(list, b); + b = UT_LIST_GET_NEXT(flush_list, b); } } if (prev_b == NULL) { - UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page); + 
UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page); } else { - UT_LIST_INSERT_AFTER(list, buf_pool->flush_list, + UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list, prev_b, &block->page); } @@ -352,7 +362,7 @@ buf_page_in_file(bpage) and in the LRU list */ { //ut_ad(buf_pool_mutex_own()); - //ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(mutex_own(buf_page_get_mutex(bpage))); //ut_ad(bpage->in_LRU_list); /* optimistic use */ if (UNIV_LIKELY(bpage->in_LRU_list && buf_page_in_file(bpage))) { @@ -387,12 +397,12 @@ buf_page_in_file(bpage) */ enum buf_flush flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ { - ut_a(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); + //ut_a(buf_page_in_file(bpage)); + //ut_ad(buf_pool_mutex_own()); /*optimistic...*/ ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(flush_type == BUF_FLUSH_LRU || BUF_FLUSH_LIST); - if (bpage->oldest_modification != 0 + if (buf_page_in_file(bpage) && bpage->oldest_modification != 0 && buf_page_get_io_fix(bpage) == BUF_IO_NONE) { ut_ad(bpage->in_flush_list); @@ -421,8 +431,11 @@ /*=============*/ buf_page_t* bpage) /*!< in: pointer to the block in question */ { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(buf_page_get_mutex(bpage))); + + mutex_enter(&flush_list_mutex); + ut_ad(bpage->in_flush_list); switch (buf_page_get_state(bpage)) { @@ -433,15 +446,16 @@ case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: + mutex_exit(&flush_list_mutex); ut_error; return; case BUF_BLOCK_ZIP_DIRTY: buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE); - UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); + UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage); buf_LRU_insert_zip_clean(bpage); break; case BUF_BLOCK_FILE_PAGE: - UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); + UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage); break; } @@ -456,8 +470,9 @@ bpage->oldest_modification = 0; - ut_d(UT_LIST_VALIDATE(list, 
buf_page_t, buf_pool->flush_list, + ut_d(UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list, ut_ad(ut_list_node_313->in_flush_list))); + mutex_exit(&flush_list_mutex); } /********************************************************************//** @@ -474,7 +489,8 @@ buf_page_t* prev; buf_page_t* prev_b = NULL; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&flush_list_mutex)); ut_ad(mutex_own(buf_page_get_mutex(bpage))); @@ -492,18 +508,18 @@ because we assert on in_flush_list in comparison function. */ ut_d(bpage->in_flush_list = FALSE); - prev = UT_LIST_GET_PREV(list, bpage); - UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); + prev = UT_LIST_GET_PREV(flush_list, bpage); + UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage); if (prev) { ut_ad(prev->in_flush_list); UT_LIST_INSERT_AFTER( - list, + flush_list, buf_pool->flush_list, prev, dpage); } else { UT_LIST_ADD_FIRST( - list, + flush_list, buf_pool->flush_list, dpage); } @@ -977,7 +993,9 @@ io_fixed and oldest_modification != 0. Thus, it cannot be relocated in the buffer pool or removed from flush_list or LRU_list. 
*/ - ut_ad(!buf_pool_mutex_own()); + //ut_ad(!buf_pool_mutex_own()); + ut_ad(!mutex_own(&LRU_list_mutex)); + ut_ad(!mutex_own(&flush_list_mutex)); ut_ad(!mutex_own(buf_page_get_mutex(bpage))); ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE); ut_ad(bpage->oldest_modification != 0); @@ -1137,12 +1155,19 @@ ibool is_uncompressed; ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX) + || rw_lock_own(&page_hash_latch, RW_LOCK_SHARED)); +#endif ut_ad(buf_page_in_file(bpage)); block_mutex = buf_page_get_mutex(bpage); ut_ad(mutex_own(block_mutex)); + mutex_enter(&buf_pool_mutex); + rw_lock_s_unlock(&page_hash_latch); + ut_ad(buf_flush_ready_for_flush(bpage, flush_type)); buf_page_set_io_fix(bpage, BUF_IO_WRITE); @@ -1173,7 +1198,8 @@ } mutex_exit(block_mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); /* Even though bpage is not protected by any mutex at this point, it is safe to access bpage, because it is @@ -1210,7 +1236,8 @@ immediately. 
*/ mutex_exit(block_mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); break; default: @@ -1275,7 +1302,8 @@ high = fil_space_get_size(space); } - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); for (i = low; i < high; i++) { @@ -1294,11 +1322,9 @@ if (flush_type != BUF_FLUSH_LRU || i == offset || buf_page_is_old(bpage)) { - mutex_t* block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); + mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); - if (buf_flush_ready_for_flush(bpage, flush_type) + if (block_mutex && buf_flush_ready_for_flush(bpage, flush_type) && (i == offset || !bpage->buf_fix_count)) { /* We only try to flush those neighbors != offset where the buf fix count is @@ -1312,14 +1338,16 @@ ut_ad(!mutex_own(block_mutex)); count++; - buf_pool_mutex_enter(); - } else { + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); + } else if (block_mutex) { mutex_exit(block_mutex); } } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(count); } @@ -1350,9 +1378,11 @@ min_n), otherwise ignored */ { buf_page_t* bpage; + buf_page_t* prev_bpage = NULL; ulint page_count = 0; ulint space; ulint offset; + ulint remaining = 0; ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST)); @@ -1360,20 +1390,28 @@ ut_ad((flush_type != BUF_FLUSH_LIST) || sync_thread_levels_empty_gen(TRUE)); #endif /* UNIV_SYNC_DEBUG */ - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); if ((buf_pool->n_flush[flush_type] > 0) || (buf_pool->init_flush[flush_type] == TRUE)) { /* There is already a flush batch of the same type running */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); return(ULINT_UNDEFINED); } buf_pool->init_flush[flush_type] = TRUE; + mutex_exit(&buf_pool_mutex); + + if (flush_type == BUF_FLUSH_LRU) { + mutex_enter(&LRU_list_mutex); + } + for 
(;;) { flush_next: /* If we have flushed enough, leave the loop */ @@ -1390,7 +1428,13 @@ } else { ut_ad(flush_type == BUF_FLUSH_LIST); + mutex_enter(&flush_list_mutex); + remaining = UT_LIST_GET_LEN(buf_pool->flush_list); bpage = UT_LIST_GET_LAST(buf_pool->flush_list); + if (bpage) { + prev_bpage = UT_LIST_GET_PREV(flush_list, bpage); + } + mutex_exit(&flush_list_mutex); if (!bpage || bpage->oldest_modification >= lsn_limit) { /* We have flushed enough */ @@ -1407,26 +1451,35 @@ function a pointer to a block in the list! */ do { - mutex_t*block_mutex = buf_page_get_mutex(bpage); + mutex_t*block_mutex = buf_page_get_mutex_enter(bpage); ibool ready; - ut_a(buf_page_in_file(bpage)); + //ut_a(buf_page_in_file(bpage)); - mutex_enter(block_mutex); - ready = buf_flush_ready_for_flush(bpage, flush_type); - mutex_exit(block_mutex); + if (block_mutex) { + ready = buf_flush_ready_for_flush(bpage, flush_type); + mutex_exit(block_mutex); + } else { + ready = FALSE; + } if (ready) { space = buf_page_get_space(bpage); offset = buf_page_get_page_no(bpage); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(&LRU_list_mutex); + } /* Try to flush also all the neighbors */ page_count += buf_flush_try_neighbors( space, offset, flush_type, srv_flush_neighbor_pages); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + if (flush_type == BUF_FLUSH_LRU) { + mutex_enter(&LRU_list_mutex); + } goto flush_next; } else if (flush_type == BUF_FLUSH_LRU) { @@ -1434,16 +1487,35 @@ } else { ut_ad(flush_type == BUF_FLUSH_LIST); - bpage = UT_LIST_GET_PREV(list, bpage); - ut_ad(!bpage || bpage->in_flush_list); + mutex_enter(&flush_list_mutex); + bpage = UT_LIST_GET_PREV(flush_list, bpage); + //ut_ad(!bpage || bpage->in_flush_list); /* optimistic */ + if (bpage != prev_bpage) { + /* the search may warp.. 
retrying */ + bpage = NULL; + } + if (bpage) { + prev_bpage = UT_LIST_GET_PREV(flush_list, bpage); + } + mutex_exit(&flush_list_mutex); + remaining--; } } while (bpage != NULL); + if (remaining) + goto flush_next; + /* If we could not find anything to flush, leave the loop */ break; } + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(&LRU_list_mutex); + } + + mutex_enter(&buf_pool_mutex); + buf_pool->init_flush[flush_type] = FALSE; if (buf_pool->n_flush[flush_type] == 0) { @@ -1453,7 +1525,8 @@ os_event_set(buf_pool->no_flush[flush_type]); } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); buf_flush_buffered_writes(); @@ -1514,7 +1587,7 @@ retry: //buf_pool_mutex_enter(); if (have_LRU_mutex) - buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); n_replaceable = UT_LIST_GET_LEN(buf_pool->free); @@ -1531,15 +1604,15 @@ bpage = UT_LIST_GET_LAST(buf_pool->LRU); continue; } - block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); + block_mutex = buf_page_get_mutex_enter(bpage); - if (buf_flush_ready_for_replace(bpage)) { + if (block_mutex && buf_flush_ready_for_replace(bpage)) { n_replaceable++; } - mutex_exit(block_mutex); + if (block_mutex) { + mutex_exit(block_mutex); + } distance++; @@ -1548,7 +1621,7 @@ //buf_pool_mutex_exit(); if (have_LRU_mutex) - buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) { @@ -1715,7 +1788,7 @@ buf_page_t* bpage; const ib_rbt_node_t* rnode = NULL; - UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, + UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list, ut_ad(ut_list_node_313->in_flush_list)); bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); @@ -1730,7 +1803,7 @@ while (bpage != NULL) { const ib_uint64_t om = bpage->oldest_modification; ut_ad(bpage->in_flush_list); - ut_a(buf_page_in_file(bpage)); + //ut_a(buf_page_in_file(bpage)); /* optimistic */ ut_a(om > 0); if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { @@ -1742,7 
+1815,7 @@ rnode = rbt_next(buf_pool->flush_rbt, rnode); } - bpage = UT_LIST_GET_NEXT(list, bpage); + bpage = UT_LIST_GET_NEXT(flush_list, bpage); ut_a(!bpage || om >= bpage->oldest_modification); } @@ -1764,11 +1837,13 @@ { ibool ret; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&flush_list_mutex); ret = buf_flush_validate_low(); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&flush_list_mutex); return(ret); } diff -ruN a/storage/innodb_plugin/buf/buf0lru.c b/storage/innodb_plugin/buf/buf0lru.c --- a/storage/innodb_plugin/buf/buf0lru.c 2010-08-27 15:54:59.025058614 +0900 +++ b/storage/innodb_plugin/buf/buf0lru.c 2010-08-27 16:11:40.611021077 +0900 @@ -145,8 +145,9 @@ void buf_LRU_block_free_hashed_page( /*===========================*/ - buf_block_t* block); /*!< in: block, must contain a file page and + buf_block_t* block, /*!< in: block, must contain a file page and be in a state where it can be freed */ + ibool have_page_hash_mutex); /******************************************************************//** Determines if the unzip_LRU list should be used for evicting a victim @@ -154,16 +155,21 @@ @return TRUE if should use unzip_LRU */ UNIV_INLINE ibool -buf_LRU_evict_from_unzip_LRU(void) +buf_LRU_evict_from_unzip_LRU( + ibool have_LRU_mutex) /*==============================*/ { ulint io_avg; ulint unzip_avg; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + if (!have_LRU_mutex) + mutex_enter(&LRU_list_mutex); /* If the unzip_LRU list is empty, we can only use the LRU. */ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) { + if (!have_LRU_mutex) + mutex_exit(&LRU_list_mutex); return(FALSE); } @@ -172,14 +178,20 @@ decompressed pages in the buffer pool. */ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) <= UT_LIST_GET_LEN(buf_pool->LRU) / 10) { + if (!have_LRU_mutex) + mutex_exit(&LRU_list_mutex); return(FALSE); } /* If eviction hasn't started yet, we assume by default that a workload is disk bound. 
*/ if (buf_pool->freed_page_clock == 0) { + if (!have_LRU_mutex) + mutex_exit(&LRU_list_mutex); return(TRUE); } + if (!have_LRU_mutex) + mutex_exit(&LRU_list_mutex); /* Calculate the average over past intervals, and add the values of the current interval. */ @@ -245,19 +257,23 @@ page_arr = ut_malloc(sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); scan_again: num_entries = 0; bpage = UT_LIST_GET_LAST(buf_pool->LRU); while (bpage != NULL) { - mutex_t* block_mutex = buf_page_get_mutex(bpage); + mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); buf_page_t* prev_bpage; - mutex_enter(block_mutex); prev_bpage = UT_LIST_GET_PREV(LRU, bpage); + if (!block_mutex) { + goto next_page; + } + ut_a(buf_page_in_file(bpage)); if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE @@ -285,12 +301,14 @@ } /* Array full. We release the buf_pool_mutex to obey the latching order. */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries); num_entries = 0; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); } else { mutex_exit(block_mutex); } @@ -315,7 +333,8 @@ } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); /* Drop any remaining batch of search hashed pages. */ buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries); @@ -343,7 +362,9 @@ buf_LRU_drop_page_hash_for_tablespace(id); scan_again: - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); all_freed = TRUE; @@ -371,8 +392,16 @@ all_freed = FALSE; } else { - mutex_t* block_mutex = buf_page_get_mutex(bpage); - mutex_enter(block_mutex); + mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); + + if (!block_mutex) { + /* It may be impossible case... 
+ Something wrong, so will be scan_again */ + + all_freed = FALSE; + + goto next_page_no_mutex; + } if (bpage->buf_fix_count > 0) { @@ -431,7 +460,9 @@ ulint page_no; ulint zip_size; - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); zip_size = buf_page_get_zip_size(bpage); page_no = buf_page_get_page_no(bpage); @@ -456,7 +487,7 @@ if (buf_LRU_block_remove_hashed_page(bpage, TRUE) != BUF_BLOCK_ZIP_FREE) { buf_LRU_block_free_hashed_page((buf_block_t*) - bpage); + bpage, TRUE); } else { /* The block_mutex should have been released by buf_LRU_block_remove_hashed_page() @@ -488,7 +519,9 @@ bpage = prev_bpage; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); if (!all_freed) { os_thread_sleep(20000); @@ -507,7 +540,9 @@ { buf_page_t* b; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); + ut_ad(mutex_own(&flush_list_mutex)); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE); /* Find the first successor of bpage in the LRU list @@ -515,17 +550,17 @@ b = bpage; do { b = UT_LIST_GET_NEXT(LRU, b); - } while (b && buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE); + } while (b && (buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE || !b->in_LRU_list)); /* Insert bpage before b, i.e., after the predecessor of b. 
*/ if (b) { - b = UT_LIST_GET_PREV(list, b); + b = UT_LIST_GET_PREV(zip_list, b); } if (b) { - UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, bpage); + UT_LIST_INSERT_AFTER(zip_list, buf_pool->zip_clean, b, bpage); } else { - UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, bpage); + UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, bpage); } } @@ -537,16 +572,17 @@ ibool buf_LRU_free_from_unzip_LRU_list( /*=============================*/ - ulint n_iterations) /*!< in: how many times this has been called + ulint n_iterations, /*!< in: how many times this has been called repeatedly without result: a high value means that we should search farther; we will search n_iterations / 5 of the unzip_LRU list, or nothing if n_iterations >= 5 */ + ibool have_LRU_mutex) { buf_block_t* block; ulint distance; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); /* optimistic */ /* Theoratically it should be much easier to find a victim from unzip_LRU as we can choose even a dirty block (as we'll @@ -556,7 +592,7 @@ if we have done five iterations so far. 
*/ if (UNIV_UNLIKELY(n_iterations >= 5) - || !buf_LRU_evict_from_unzip_LRU()) { + || !buf_LRU_evict_from_unzip_LRU(have_LRU_mutex)) { return(FALSE); } @@ -564,18 +600,25 @@ distance = 100 + (n_iterations * UT_LIST_GET_LEN(buf_pool->unzip_LRU)) / 5; +restart: for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU); UNIV_LIKELY(block != NULL) && UNIV_LIKELY(distance > 0); block = UT_LIST_GET_PREV(unzip_LRU, block), distance--) { enum buf_lru_free_block_status freed; + mutex_enter(&block->mutex); + if (!block->in_unzip_LRU_list || !block->page.in_LRU_list + || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { + mutex_exit(&block->mutex); + goto restart; + } + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->in_unzip_LRU_list); ut_ad(block->page.in_LRU_list); - mutex_enter(&block->mutex); - freed = buf_LRU_free_block(&block->page, FALSE, NULL); + freed = buf_LRU_free_block(&block->page, FALSE, NULL, have_LRU_mutex); mutex_exit(&block->mutex); switch (freed) { @@ -608,20 +651,22 @@ ibool buf_LRU_free_from_common_LRU_list( /*==============================*/ - ulint n_iterations) /*!< in: how many times this has been called + ulint n_iterations, /*!< in: how many times this has been called repeatedly without result: a high value means that we should search farther; if n_iterations < 10, then we search n_iterations / 10 * buf_pool->curr_size pages from the end of the LRU list */ + ibool have_LRU_mutex) { buf_page_t* bpage; ulint distance; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); /* optimistic */ distance = 100 + (n_iterations * buf_pool->curr_size) / 10; +restart: for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); UNIV_LIKELY(bpage != NULL) && UNIV_LIKELY(distance > 0); bpage = UT_LIST_GET_PREV(LRU, bpage), distance--) { @@ -629,14 +674,23 @@ enum buf_lru_free_block_status freed; unsigned accessed; mutex_t* block_mutex - = buf_page_get_mutex(bpage); + = buf_page_get_mutex_enter(bpage); + + if (!block_mutex) { + goto restart; + } + + if 
(!bpage->in_LRU_list + || !buf_page_in_file(bpage)) { + mutex_exit(block_mutex); + goto restart; + } ut_ad(buf_page_in_file(bpage)); ut_ad(bpage->in_LRU_list); - mutex_enter(block_mutex); accessed = buf_page_is_accessed(bpage); - freed = buf_LRU_free_block(bpage, TRUE, NULL); + freed = buf_LRU_free_block(bpage, TRUE, NULL, have_LRU_mutex); mutex_exit(block_mutex); switch (freed) { @@ -685,22 +739,33 @@ n_iterations / 5 of the unzip_LRU list. */ { ibool freed = FALSE; + ibool have_LRU_mutex = FALSE; + + if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)) + have_LRU_mutex = TRUE; - buf_pool_mutex_enter(); + /* optimistic search... */ + //buf_pool_mutex_enter(); + if (have_LRU_mutex) + mutex_enter(&LRU_list_mutex); - freed = buf_LRU_free_from_unzip_LRU_list(n_iterations); + freed = buf_LRU_free_from_unzip_LRU_list(n_iterations, have_LRU_mutex); if (!freed) { - freed = buf_LRU_free_from_common_LRU_list(n_iterations); + freed = buf_LRU_free_from_common_LRU_list(n_iterations, have_LRU_mutex); } + mutex_enter(&buf_pool_mutex); if (!freed) { buf_pool->LRU_flush_ended = 0; } else if (buf_pool->LRU_flush_ended > 0) { buf_pool->LRU_flush_ended--; } + mutex_exit(&buf_pool_mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + if (have_LRU_mutex) + mutex_exit(&LRU_list_mutex); return(freed); } @@ -718,18 +783,22 @@ buf_LRU_try_free_flushed_blocks(void) /*=================================*/ { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); while (buf_pool->LRU_flush_ended > 0) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); buf_LRU_search_and_free_block(1); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); } /******************************************************************//** @@ -744,7 +813,9 @@ { ibool ret = FALSE; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + 
mutex_enter(&LRU_list_mutex); + mutex_enter(&free_list_mutex); if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 4) { @@ -752,7 +823,9 @@ ret = TRUE; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + mutex_exit(&free_list_mutex); return(ret); } @@ -768,9 +841,10 @@ { buf_block_t* block; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); - block = (buf_block_t*) UT_LIST_GET_FIRST(buf_pool->free); + mutex_enter(&free_list_mutex); + block = (buf_block_t*) UT_LIST_GET_LAST(buf_pool->free); if (block) { ut_ad(block->page.in_free_list); @@ -778,7 +852,9 @@ ut_ad(!block->page.in_flush_list); ut_ad(!block->page.in_LRU_list); ut_a(!buf_page_in_file(&block->page)); - UT_LIST_REMOVE(list, buf_pool->free, (&block->page)); + UT_LIST_REMOVE(free, buf_pool->free, (&block->page)); + + mutex_exit(&free_list_mutex); mutex_enter(&block->mutex); @@ -786,6 +862,8 @@ UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE); mutex_exit(&block->mutex); + } else { + mutex_exit(&free_list_mutex); } return(block); @@ -809,7 +887,7 @@ ibool mon_value_was = FALSE; ibool started_monitor = FALSE; loop: - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) { @@ -889,14 +967,16 @@ if (UNIV_UNLIKELY(zip_size)) { ibool lru; page_zip_set_size(&block->page.zip, zip_size); - block->page.zip.data = buf_buddy_alloc(zip_size, &lru); + mutex_enter(&LRU_list_mutex); + block->page.zip.data = buf_buddy_alloc(zip_size, &lru, FALSE); + mutex_exit(&LRU_list_mutex); UNIV_MEM_DESC(block->page.zip.data, zip_size, block); } else { page_zip_set_size(&block->page.zip, 0); block->page.zip.data = NULL; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); if (started_monitor) { srv_print_innodb_monitor = mon_value_was; @@ -908,7 +988,7 @@ /* If no block was in the free list, search from the end of the 
LRU list and try to free a block there */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); freed = buf_LRU_search_and_free_block(n_iterations); @@ -957,18 +1037,21 @@ os_aio_simulated_wake_handler_threads(); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); if (buf_pool->LRU_flush_ended > 0) { /* We have written pages in an LRU flush. To make the insert buffer more efficient, we try to move these pages to the free list. */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); buf_LRU_try_free_flushed_blocks(); } else { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); } if (n_iterations > 10) { @@ -993,7 +1076,8 @@ ulint new_len; ut_a(buf_pool->LRU_old); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); ut_ad(buf_LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN); ut_ad(buf_LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX); #if BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5) @@ -1058,7 +1142,8 @@ { buf_page_t* bpage; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN); /* We first initialize all blocks in the LRU list as old and then use @@ -1091,13 +1176,14 @@ ut_ad(buf_pool); ut_ad(bpage); ut_ad(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); if (buf_page_belongs_to_unzip_LRU(bpage)) { buf_block_t* block = (buf_block_t*) bpage; ut_ad(block->in_unzip_LRU_list); - ut_d(block->in_unzip_LRU_list = FALSE); + block->in_unzip_LRU_list = FALSE; UT_LIST_REMOVE(unzip_LRU, buf_pool->unzip_LRU, block); } @@ -1113,7 +1199,8 @@ { ut_ad(buf_pool); ut_ad(bpage); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); ut_a(buf_page_in_file(bpage)); @@ -1188,12 +1275,13 @@ 
{ ut_ad(buf_pool); ut_ad(block); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); ut_ad(!block->in_unzip_LRU_list); - ut_d(block->in_unzip_LRU_list = TRUE); + block->in_unzip_LRU_list = TRUE; if (old) { UT_LIST_ADD_LAST(unzip_LRU, buf_pool->unzip_LRU, block); @@ -1212,7 +1300,8 @@ { ut_ad(buf_pool); ut_ad(bpage); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); ut_a(buf_page_in_file(bpage)); @@ -1261,7 +1350,8 @@ { ut_ad(buf_pool); ut_ad(bpage); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); ut_a(buf_page_in_file(bpage)); ut_ad(!bpage->in_LRU_list); @@ -1338,7 +1428,8 @@ /*=====================*/ buf_page_t* bpage) /*!< in: control block */ { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); if (bpage->old) { buf_pool->stat.n_pages_made_young++; @@ -1380,18 +1471,19 @@ buf_page_t* bpage, /*!< in: block to be freed */ ibool zip, /*!< in: TRUE if should remove also the compressed page of an uncompressed page */ - ibool* buf_pool_mutex_released) + ibool* buf_pool_mutex_released, /*!< in: pointer to a variable that will be assigned TRUE if buf_pool_mutex was temporarily released, or NULL */ + ibool have_LRU_mutex) { buf_page_t* b = NULL; mutex_t* block_mutex = buf_page_get_mutex(bpage); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(block_mutex)); ut_ad(buf_page_in_file(bpage)); - ut_ad(bpage->in_LRU_list); + //ut_ad(bpage->in_LRU_list); ut_ad(!bpage->in_flush_list == !bpage->oldest_modification); #if UNIV_WORD_SIZE == 4 /* On 32-bit systems, there is no padding in buf_page_t. 
On @@ -1400,7 +1492,7 @@ UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage); #endif - if (!buf_page_can_relocate(bpage)) { + if (!bpage->in_LRU_list || !block_mutex || !buf_page_can_relocate(bpage)) { /* Do not free buffer-fixed or I/O-fixed blocks. */ return(BUF_LRU_NOT_FREED); @@ -1432,15 +1524,15 @@ If it cannot be allocated (without freeing a block from the LRU list), refuse to free bpage. */ alloc: - buf_pool_mutex_exit_forbid(); - b = buf_buddy_alloc(sizeof *b, NULL); - buf_pool_mutex_exit_allow(); + //buf_pool_mutex_exit_forbid(); + b = buf_buddy_alloc(sizeof *b, NULL, FALSE); + //buf_pool_mutex_exit_allow(); if (UNIV_UNLIKELY(!b)) { return(BUF_LRU_CANNOT_RELOCATE); } - memcpy(b, bpage, sizeof *b); + //memcpy(b, bpage, sizeof *b); } #ifdef UNIV_DEBUG @@ -1451,6 +1543,39 @@ } #endif /* UNIV_DEBUG */ + /* not to break latch order, must re-enter block_mutex */ + mutex_exit(block_mutex); + + if (!have_LRU_mutex) + mutex_enter(&LRU_list_mutex); /* optimistic */ + rw_lock_x_lock(&page_hash_latch); + mutex_enter(block_mutex); + + /* recheck states of block */ + if (!bpage->in_LRU_list || block_mutex != buf_page_get_mutex(bpage) + || !buf_page_can_relocate(bpage)) { +not_freed: + if (b) { + buf_buddy_free(b, sizeof *b, TRUE); + } + if (!have_LRU_mutex) + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + return(BUF_LRU_NOT_FREED); + } else if (zip || !bpage->zip.data) { + if (bpage->oldest_modification) + goto not_freed; + } else if (bpage->oldest_modification) { + if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { + ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY); + goto not_freed; + } + } + + if (b) { + memcpy(b, bpage, sizeof *b); + } + if (buf_LRU_block_remove_hashed_page(bpage, zip) != BUF_BLOCK_ZIP_FREE) { ut_a(bpage->buf_fix_count == 0); @@ -1462,6 +1587,10 @@ ut_a(!buf_page_hash_get(bpage->space, bpage->offset)); + while (prev_b && !prev_b->in_LRU_list) { + prev_b = UT_LIST_GET_PREV(LRU, prev_b); + } + b->state = b->oldest_modification ? 
BUF_BLOCK_ZIP_DIRTY : BUF_BLOCK_ZIP_PAGE; @@ -1537,12 +1666,14 @@ buf_LRU_add_block_low(b, buf_page_is_old(b)); } + mutex_enter(&flush_list_mutex); if (b->state == BUF_BLOCK_ZIP_PAGE) { buf_LRU_insert_zip_clean(b); } else { /* Relocate on buf_pool->flush_list. */ buf_flush_relocate_on_flush_list(bpage, b); } + mutex_exit(&flush_list_mutex); bpage->zip.data = NULL; page_zip_set_size(&bpage->zip, 0); @@ -1558,7 +1689,9 @@ *buf_pool_mutex_released = TRUE; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); mutex_exit(block_mutex); /* Remove possible adaptive hash index on the page. @@ -1590,7 +1723,9 @@ : BUF_NO_CHECKSUM_MAGIC); } - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + if (have_LRU_mutex) + mutex_enter(&LRU_list_mutex); mutex_enter(block_mutex); if (b) { @@ -1600,13 +1735,17 @@ mutex_exit(&buf_pool_zip_mutex); } - buf_LRU_block_free_hashed_page((buf_block_t*) bpage); + buf_LRU_block_free_hashed_page((buf_block_t*) bpage, FALSE); } else { /* The block_mutex should have been released by buf_LRU_block_remove_hashed_page() when it returns BUF_BLOCK_ZIP_FREE. 
*/ ut_ad(block_mutex == &buf_pool_zip_mutex); mutex_enter(block_mutex); + + if (!have_LRU_mutex) + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); } return(BUF_LRU_FREED); @@ -1618,12 +1757,13 @@ void buf_LRU_block_free_non_file_page( /*=============================*/ - buf_block_t* block) /*!< in: block, must not contain a file page */ + buf_block_t* block, /*!< in: block, must not contain a file page */ + ibool have_page_hash_mutex) { void* data; ut_ad(block); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(&block->mutex)); switch (buf_block_get_state(block)) { @@ -1657,15 +1797,17 @@ if (data) { block->page.zip.data = NULL; mutex_exit(&block->mutex); - buf_pool_mutex_exit_forbid(); - buf_buddy_free(data, page_zip_get_size(&block->page.zip)); - buf_pool_mutex_exit_allow(); + //buf_pool_mutex_exit_forbid(); + buf_buddy_free(data, page_zip_get_size(&block->page.zip), have_page_hash_mutex); + //buf_pool_mutex_exit_allow(); mutex_enter(&block->mutex); page_zip_set_size(&block->page.zip, 0); } - UT_LIST_ADD_FIRST(list, buf_pool->free, (&block->page)); + mutex_enter(&free_list_mutex); + UT_LIST_ADD_FIRST(free, buf_pool->free, (&block->page)); ut_d(block->page.in_free_list = TRUE); + mutex_exit(&free_list_mutex); UNIV_MEM_ASSERT_AND_FREE(block->frame, UNIV_PAGE_SIZE); } @@ -1692,7 +1834,11 @@ { const buf_page_t* hashed_bpage; ut_ad(bpage); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX)); +#endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); @@ -1798,7 +1944,9 @@ #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG mutex_exit(buf_page_get_mutex(bpage)); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); buf_print(); buf_LRU_print(); buf_validate(); @@ -1821,14 +1969,14 @@ 
ut_a(bpage->zip.data); ut_a(buf_page_get_zip_size(bpage)); - UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage); + UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, bpage); mutex_exit(&buf_pool_zip_mutex); - buf_pool_mutex_exit_forbid(); + //buf_pool_mutex_exit_forbid(); buf_buddy_free(bpage->zip.data, - page_zip_get_size(&bpage->zip)); - buf_buddy_free(bpage, sizeof(*bpage)); - buf_pool_mutex_exit_allow(); + page_zip_get_size(&bpage->zip), TRUE); + buf_buddy_free(bpage, sizeof(*bpage), TRUE); + //buf_pool_mutex_exit_allow(); UNIV_MEM_UNDESC(bpage); return(BUF_BLOCK_ZIP_FREE); @@ -1850,9 +1998,9 @@ ut_ad(!bpage->in_flush_list); ut_ad(!bpage->in_LRU_list); mutex_exit(&((buf_block_t*) bpage)->mutex); - buf_pool_mutex_exit_forbid(); - buf_buddy_free(data, page_zip_get_size(&bpage->zip)); - buf_pool_mutex_exit_allow(); + //buf_pool_mutex_exit_forbid(); + buf_buddy_free(data, page_zip_get_size(&bpage->zip), TRUE); + //buf_pool_mutex_exit_allow(); mutex_enter(&((buf_block_t*) bpage)->mutex); page_zip_set_size(&bpage->zip, 0); } @@ -1878,15 +2026,16 @@ void buf_LRU_block_free_hashed_page( /*===========================*/ - buf_block_t* block) /*!< in: block, must contain a file page and + buf_block_t* block, /*!< in: block, must contain a file page and be in a state where it can be freed */ + ibool have_page_hash_mutex) { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(&block->mutex)); buf_block_set_state(block, BUF_BLOCK_MEMORY); - buf_LRU_block_free_non_file_page(block); + buf_LRU_block_free_non_file_page(block, have_page_hash_mutex); } /**********************************************************************//** @@ -1912,7 +2061,8 @@ } if (adjust) { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); if (ratio != buf_LRU_old_ratio) { buf_LRU_old_ratio = ratio; @@ -1923,7 +2073,8 @@ } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); } else { buf_LRU_old_ratio = ratio; } @@ 
-1948,7 +2099,8 @@ goto func_exit; } - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); /* Update the index. */ item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind]; @@ -1962,7 +2114,8 @@ /* Put current entry in the array. */ memcpy(item, &buf_LRU_stat_cur, sizeof *item); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); func_exit: /* Clear the current entry. */ @@ -1984,7 +2137,8 @@ ulint new_len; ut_ad(buf_pool); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { @@ -2044,16 +2198,22 @@ ut_a(buf_pool->LRU_old_len == old_len); - UT_LIST_VALIDATE(list, buf_page_t, buf_pool->free, + mutex_exit(&LRU_list_mutex); + mutex_enter(&free_list_mutex); + + UT_LIST_VALIDATE(free, buf_page_t, buf_pool->free, ut_ad(ut_list_node_313->in_free_list)); for (bpage = UT_LIST_GET_FIRST(buf_pool->free); bpage != NULL; - bpage = UT_LIST_GET_NEXT(list, bpage)) { + bpage = UT_LIST_GET_NEXT(free, bpage)) { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED); } + mutex_exit(&free_list_mutex); + mutex_enter(&LRU_list_mutex); + UT_LIST_VALIDATE(unzip_LRU, buf_block_t, buf_pool->unzip_LRU, ut_ad(ut_list_node_313->in_unzip_LRU_list && ut_list_node_313->page.in_LRU_list)); @@ -2067,7 +2227,8 @@ ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); return(TRUE); } #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ @@ -2083,7 +2244,8 @@ const buf_page_t* bpage; ut_ad(buf_pool); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); bpage = UT_LIST_GET_FIRST(buf_pool->LRU); @@ -2140,6 +2302,7 @@ bpage = UT_LIST_GET_NEXT(LRU, bpage); } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); } #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ diff -ruN a/storage/innodb_plugin/buf/buf0rea.c 
b/storage/innodb_plugin/buf/buf0rea.c --- a/storage/innodb_plugin/buf/buf0rea.c 2010-08-27 15:54:59.027059378 +0900 +++ b/storage/innodb_plugin/buf/buf0rea.c 2010-08-27 16:11:40.614021339 +0900 @@ -290,10 +290,12 @@ tablespace_version = fil_space_get_version(space); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); if (high > fil_space_get_size(space)) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); /* The area is not whole, return */ return(0); @@ -301,10 +303,12 @@ if (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); return(0); } + mutex_exit(&buf_pool_mutex); /* Check that almost all pages in the area have been accessed; if offset == low, the accesses must be in a descending order, otherwise, @@ -323,6 +327,7 @@ fail_count = 0; + rw_lock_s_lock(&page_hash_latch); for (i = low; i < high; i++) { bpage = buf_page_hash_get(space, i); @@ -350,7 +355,8 @@ if (fail_count > threshold) { /* Too many failures: return */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(0); } @@ -365,7 +371,8 @@ bpage = buf_page_hash_get(space, offset); if (bpage == NULL) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(0); } @@ -391,7 +398,8 @@ pred_offset = fil_page_get_prev(frame); succ_offset = fil_page_get_next(frame); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); if ((offset == low) && (succ_offset == offset + 1)) { diff -ruN a/storage/innodb_plugin/handler/i_s.cc b/storage/innodb_plugin/handler/i_s.cc --- a/storage/innodb_plugin/handler/i_s.cc 2010-08-27 15:59:21.753412068 +0900 +++ b/storage/innodb_plugin/handler/i_s.cc 2010-08-27 16:11:40.617020805 +0900 @@ -2230,7 +2230,8 @@ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); - buf_pool_mutex_enter(); + 
//buf_pool_mutex_enter(); + mutex_enter(&zip_free_mutex); for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) { buf_buddy_stat_t* buddy_stat = &buf_buddy_stat[x]; @@ -2256,7 +2257,8 @@ } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&zip_free_mutex); DBUG_RETURN(status); } diff -ruN a/storage/innodb_plugin/handler/innodb_patch_info.h b/storage/innodb_plugin/handler/innodb_patch_info.h --- a/storage/innodb_plugin/handler/innodb_patch_info.h 2010-08-27 16:11:12.167183642 +0900 +++ b/storage/innodb_plugin/handler/innodb_patch_info.h 2010-08-27 16:11:40.614990183 +0900 @@ -33,5 +33,6 @@ {"innodb_overwrite_relay_log_info","overwrite relay-log.info when slave recovery","Building as plugin, it is not used.","http://www.percona.com/docs/wiki/percona-xtradb:innodb_overwrite_relay_log_info"}, {"innodb_thread_concurrency_timer_based","use InnoDB timer based concurrency throttling (backport from MySQL 5.4.0)","",""}, {"innodb_dict_size_limit","Limit dictionary cache size","Variable innodb_dict_size_limit in bytes","http://www.percona.com/docs/wiki/percona-xtradb"}, +{"innodb_split_buf_pool_mutex","More fix of buffer_pool mutex","Spliting buf_pool_mutex and optimizing based on innodb_opt_lru_count","http://www.percona.com/docs/wiki/percona-xtradb"}, {NULL, NULL, NULL, NULL} }; diff -ruN a/storage/innodb_plugin/include/buf0buddy.h b/storage/innodb_plugin/include/buf0buddy.h --- a/storage/innodb_plugin/include/buf0buddy.h 2010-08-04 02:24:19.000000000 +0900 +++ b/storage/innodb_plugin/include/buf0buddy.h 2010-08-27 16:11:40.618988049 +0900 @@ -49,10 +49,11 @@ buf_buddy_alloc( /*============*/ ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */ - ibool* lru) /*!< in: pointer to a variable that will be assigned + ibool* lru, /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list and buf_pool_mutex was temporarily released, or NULL if the LRU list should not be used */ + ibool have_page_hash_mutex) 
__attribute__((malloc)); /**********************************************************************//** @@ -63,7 +64,8 @@ /*===========*/ void* buf, /*!< in: block to be freed, must not be pointed to by the buffer pool */ - ulint size) /*!< in: block size, up to UNIV_PAGE_SIZE */ + ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */ + ibool have_page_hash_mutex) __attribute__((nonnull)); /** Statistics of buddy blocks of a given size. */ diff -ruN a/storage/innodb_plugin/include/buf0buddy.ic b/storage/innodb_plugin/include/buf0buddy.ic --- a/storage/innodb_plugin/include/buf0buddy.ic 2010-08-04 02:24:19.000000000 +0900 +++ b/storage/innodb_plugin/include/buf0buddy.ic 2010-08-27 16:11:40.619989772 +0900 @@ -44,10 +44,11 @@ /*================*/ ulint i, /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ - ibool* lru) /*!< in: pointer to a variable that will be assigned + ibool* lru, /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list and buf_pool_mutex was temporarily released, or NULL if the LRU list should not be used */ + ibool have_page_hash_mutex) __attribute__((malloc)); /**********************************************************************//** @@ -58,8 +59,9 @@ /*===============*/ void* buf, /*!< in: block to be freed, must not be pointed to by the buffer pool */ - ulint i) /*!< in: index of buf_pool->zip_free[], + ulint i, /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ + ibool have_page_hash_mutex) __attribute__((nonnull)); /**********************************************************************//** @@ -96,14 +98,15 @@ buf_buddy_alloc( /*============*/ ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */ - ibool* lru) /*!< in: pointer to a variable that will be assigned + ibool* lru, /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list and buf_pool_mutex was temporarily released, or NULL if the LRU list should not be used */ + ibool 
have_page_hash_mutex) { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); - return(buf_buddy_alloc_low(buf_buddy_get_slot(size), lru)); + return(buf_buddy_alloc_low(buf_buddy_get_slot(size), lru, have_page_hash_mutex)); } /**********************************************************************//** @@ -114,11 +117,24 @@ /*===========*/ void* buf, /*!< in: block to be freed, must not be pointed to by the buffer pool */ - ulint size) /*!< in: block size, up to UNIV_PAGE_SIZE */ + ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */ + ibool have_page_hash_mutex) { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); - buf_buddy_free_low(buf, buf_buddy_get_slot(size)); + if (!have_page_hash_mutex) { + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); + } + + mutex_enter(&zip_free_mutex); + buf_buddy_free_low(buf, buf_buddy_get_slot(size), TRUE); + mutex_exit(&zip_free_mutex); + + if (!have_page_hash_mutex) { + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } } #ifdef UNIV_MATERIALIZE diff -ruN a/storage/innodb_plugin/include/buf0buf.h b/storage/innodb_plugin/include/buf0buf.h --- a/storage/innodb_plugin/include/buf0buf.h 2010-08-27 15:55:39.399063353 +0900 +++ b/storage/innodb_plugin/include/buf0buf.h 2010-08-27 16:11:40.622020552 +0900 @@ -713,6 +713,15 @@ const buf_page_t* bpage) /*!< in: pointer to control block */ __attribute__((pure)); +/************************************************************************* +Gets the mutex of a block and enters it, retrying until the entered mutex is still the one the block reports; returns NULL (holding nothing) if the block reports no mutex. It acquires a mutex, i.e. has side effects, so it must not be declared pure. */ +UNIV_INLINE +mutex_t* +buf_page_get_mutex_enter( +/*=========================*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ + __attribute__((nonnull)); + /*********************************************************************//** Get the flush type of a page. @return flush type */ @@ -1066,7 +1075,7 @@ All these are protected by buf_pool_mutex.
*/ /* @{ */ - UT_LIST_NODE_T(buf_page_t) list; + /* UT_LIST_NODE_T(buf_page_t) list; */ /*!< based on state, this is a list node, protected only by buf_pool_mutex, in one of the @@ -1086,6 +1095,10 @@ BUF_BLOCK_REMOVE_HASH or BUF_BLOCK_READY_IN_USE. */ + /* resplit for optimistic use */ + UT_LIST_NODE_T(buf_page_t) free; + UT_LIST_NODE_T(buf_page_t) flush_list; + UT_LIST_NODE_T(buf_page_t) zip_list; /* zip_clean or zip_free[] */ #ifdef UNIV_DEBUG ibool in_flush_list; /*!< TRUE if in buf_pool->flush_list; when buf_pool_mutex is free, the @@ -1166,11 +1179,11 @@ a block is in the unzip_LRU list if page.state == BUF_BLOCK_FILE_PAGE and page.zip.data != NULL */ -#ifdef UNIV_DEBUG +//#ifdef UNIV_DEBUG ibool in_unzip_LRU_list;/*!< TRUE if the page is in the decompressed LRU list; used in debugging */ -#endif /* UNIV_DEBUG */ +//#endif /* UNIV_DEBUG */ mutex_t mutex; /*!< mutex protecting this block: state (also protected by the buffer pool mutex), io_fix, buf_fix_count, @@ -1446,6 +1459,12 @@ /** mutex protecting the buffer pool struct and control blocks, except the read-write lock in them */ extern mutex_t buf_pool_mutex; +extern mutex_t LRU_list_mutex; +extern mutex_t flush_list_mutex; +extern rw_lock_t page_hash_latch; +extern mutex_t free_list_mutex; +extern mutex_t zip_free_mutex; +extern mutex_t zip_hash_mutex; /** mutex protecting the control blocks of compressed-only pages (of type buf_page_t, not buf_block_t) */ extern mutex_t buf_pool_zip_mutex; diff -ruN a/storage/innodb_plugin/include/buf0buf.ic b/storage/innodb_plugin/include/buf0buf.ic --- a/storage/innodb_plugin/include/buf0buf.ic 2010-08-04 02:24:19.000000000 +0900 +++ b/storage/innodb_plugin/include/buf0buf.ic 2010-08-27 16:11:40.624990413 +0900 @@ -121,7 +121,9 @@ buf_page_t* bpage; ib_uint64_t lsn; - buf_pool_mutex_enter(); +try_again: + //buf_pool_mutex_enter(); + mutex_enter(&flush_list_mutex); bpage = UT_LIST_GET_LAST(buf_pool->flush_list); @@ -130,9 +132,14 @@ } else { ut_ad(bpage->in_flush_list); 
lsn = bpage->oldest_modification; + if (lsn == 0) { + mutex_exit(&flush_list_mutex); + goto try_again; + } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&flush_list_mutex); /* The returned answer may be out of date: the flush_list can change after the mutex has been released. */ @@ -252,7 +259,7 @@ case BUF_BLOCK_ZIP_FREE: /* This is a free page in buf_pool->zip_free[]. Such pages should only be accessed by the buddy allocator. */ - ut_error; + /* ut_error; */ /* optimistic */ break; case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: @@ -295,7 +302,7 @@ { switch (buf_page_get_state(bpage)) { case BUF_BLOCK_ZIP_FREE: - ut_error; + /* ut_error; */ /* optimistic */ return(NULL); case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: @@ -305,6 +312,28 @@ } } +/************************************************************************* +Gets the mutex of a block and enters it. Loops until the mutex just entered is still the one buf_page_get_mutex() returns for bpage; returns that mutex (held by the caller), or NULL without entering anything when buf_page_get_mutex() returns NULL. */ +UNIV_INLINE +mutex_t* +buf_page_get_mutex_enter( +/*=========================*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ +{ + mutex_t* block_mutex; + + while(1) { + block_mutex = buf_page_get_mutex(bpage); + if (!block_mutex) + return block_mutex; /* no mutex reported: nothing to enter */ + + mutex_enter(block_mutex); + if (block_mutex == buf_page_get_mutex(bpage)) + return block_mutex; /* still the block's mutex: return it held */ + mutex_exit(block_mutex); /* bpage now reports a different mutex: release and retry */ + } +} + /*********************************************************************//** Get the flush type of a page.
@return flush type */ @@ -400,7 +429,7 @@ buf_page_t* bpage, /*!< in/out: control block */ enum buf_io_fix io_fix) /*!< in: io_fix state */ { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(buf_page_get_mutex(bpage))); bpage->io_fix = io_fix; @@ -428,12 +457,13 @@ /*==================*/ const buf_page_t* bpage) /*!< control block being relocated */ { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(buf_page_in_file(bpage)); - ut_ad(bpage->in_LRU_list); + /* optimistic */ + //ut_ad(bpage->in_LRU_list); - return(buf_page_get_io_fix(bpage) == BUF_IO_NONE + return(bpage->in_LRU_list && bpage->io_fix == BUF_IO_NONE && bpage->buf_fix_count == 0); } @@ -447,7 +477,7 @@ const buf_page_t* bpage) /*!< in: control block */ { ut_ad(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); /* This is used in optimistic */ return(bpage->old); } @@ -462,7 +492,8 @@ ibool old) /*!< in: old */ { ut_a(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); ut_ad(bpage->in_LRU_list); #ifdef UNIV_LRU_DEBUG @@ -509,7 +540,8 @@ ulint time_ms) /*!< in: ut_time_ms() */ { ut_a(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(buf_page_get_mutex(bpage))); if (!bpage->access_time) { /* Make this the time of the first access. 
*/ @@ -741,17 +773,17 @@ /*===========*/ buf_block_t* block) /*!< in, own: block to be freed */ { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); mutex_enter(&block->mutex); ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); - buf_LRU_block_free_non_file_page(block); + buf_LRU_block_free_non_file_page(block, FALSE); mutex_exit(&block->mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); } #endif /* !UNIV_HOTBACKUP */ @@ -799,17 +831,17 @@ page frame */ { ib_uint64_t lsn; - mutex_t* block_mutex = buf_page_get_mutex(bpage); + mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); - mutex_enter(block_mutex); - - if (buf_page_in_file(bpage)) { + if (block_mutex && buf_page_in_file(bpage)) { lsn = bpage->newest_modification; } else { lsn = 0; } - mutex_exit(block_mutex); + if (block_mutex) { + mutex_exit(block_mutex); + } return(lsn); } @@ -825,7 +857,7 @@ buf_block_t* block) /*!< in: block */ { #ifdef UNIV_SYNC_DEBUG - ut_ad((buf_pool_mutex_own() + ut_ad((mutex_own(&LRU_list_mutex) && (block->page.buf_fix_count == 0)) || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); #endif /* UNIV_SYNC_DEBUG */ @@ -917,7 +949,11 @@ ulint fold; ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX) + || rw_lock_own(&page_hash_latch, RW_LOCK_SHARED)); +#endif /* Look for the page in the hash table */ @@ -972,11 +1008,13 @@ { const buf_page_t* bpage; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); bpage = buf_page_hash_get(space, offset); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(bpage != NULL); } @@ -1038,11 +1076,14 @@ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_a(block->page.buf_fix_count > 0); + /* buf_flush_note_modification() should be called before this function. 
*/ +/* if (rw_latch == RW_X_LATCH && mtr->modifications) { buf_pool_mutex_enter(); buf_flush_note_modification(block, mtr); buf_pool_mutex_exit(); } +*/ mutex_enter(&block->mutex); diff -ruN a/storage/innodb_plugin/include/buf0flu.ic b/storage/innodb_plugin/include/buf0flu.ic --- a/storage/innodb_plugin/include/buf0flu.ic 2010-08-04 02:24:19.000000000 +0900 +++ b/storage/innodb_plugin/include/buf0flu.ic 2010-08-27 16:11:40.625993554 +0900 @@ -55,13 +55,23 @@ buf_block_t* block, /*!< in: block which is modified */ mtr_t* mtr) /*!< in: mtr */ { + ibool use_LRU_mutex = FALSE; + + if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)) + use_LRU_mutex = TRUE; + + if (use_LRU_mutex) + mutex_enter(&LRU_list_mutex); + + mutex_enter(&block->mutex); + ut_ad(block); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->page.buf_fix_count > 0); #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mtr->start_lsn != 0); ut_ad(mtr->modifications); @@ -70,16 +80,23 @@ block->page.newest_modification = mtr->end_lsn; if (!block->page.oldest_modification) { + mutex_enter(&flush_list_mutex); block->page.oldest_modification = mtr->start_lsn; ut_ad(block->page.oldest_modification != 0); buf_flush_insert_into_flush_list(block); + mutex_exit(&flush_list_mutex); } else { ut_ad(block->page.oldest_modification <= mtr->start_lsn); } + mutex_exit(&block->mutex); + ++srv_buf_pool_write_requests; + + if (use_LRU_mutex) + mutex_exit(&LRU_list_mutex); } /********************************************************************//** @@ -94,6 +111,16 @@ ib_uint64_t end_lsn) /*!< in: end lsn of the last mtr in the set of mtr's */ { + ibool use_LRU_mutex = FALSE; + + if(UT_LIST_GET_LEN(buf_pool->unzip_LRU)) + use_LRU_mutex = TRUE; + + if (use_LRU_mutex) + mutex_enter(&LRU_list_mutex); + + mutex_enter(&(block->mutex)); + ut_ad(block); ut_ad(buf_block_get_state(block) == 
BUF_BLOCK_FILE_PAGE); ut_ad(block->page.buf_fix_count > 0); @@ -101,23 +128,28 @@ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); ut_ad(block->page.newest_modification <= end_lsn); block->page.newest_modification = end_lsn; if (!block->page.oldest_modification) { + mutex_enter(&flush_list_mutex); block->page.oldest_modification = start_lsn; ut_ad(block->page.oldest_modification != 0); buf_flush_insert_sorted_into_flush_list(block); + mutex_exit(&flush_list_mutex); } else { ut_ad(block->page.oldest_modification <= start_lsn); } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + if (use_LRU_mutex) + mutex_exit(&LRU_list_mutex); + mutex_exit(&(block->mutex)); } #endif /* !UNIV_HOTBACKUP */ diff -ruN a/storage/innodb_plugin/include/buf0lru.h b/storage/innodb_plugin/include/buf0lru.h --- a/storage/innodb_plugin/include/buf0lru.h 2010-08-04 02:24:19.000000000 +0900 +++ b/storage/innodb_plugin/include/buf0lru.h 2010-08-27 16:11:40.627990038 +0900 @@ -112,10 +112,11 @@ buf_page_t* bpage, /*!< in: block to be freed */ ibool zip, /*!< in: TRUE if should remove also the compressed page of an uncompressed page */ - ibool* buf_pool_mutex_released); + ibool* buf_pool_mutex_released, /*!< in: pointer to a variable that will be assigned TRUE if buf_pool_mutex was temporarily released, or NULL */ + ibool have_LRU_mutex); /******************************************************************//** Try to free a replaceable block. @return TRUE if found and freed */ @@ -157,7 +158,8 @@ void buf_LRU_block_free_non_file_page( /*=============================*/ - buf_block_t* block); /*!< in: block, must not contain a file page */ + buf_block_t* block, /*!< in: block, must not contain a file page */ + ibool have_page_hash_mutex); /******************************************************************//** Adds a block to the LRU list. 
*/ UNIV_INTERN diff -ruN a/storage/innodb_plugin/include/sync0sync.h b/storage/innodb_plugin/include/sync0sync.h --- a/storage/innodb_plugin/include/sync0sync.h 2010-08-04 02:24:19.000000000 +0900 +++ b/storage/innodb_plugin/include/sync0sync.h 2010-08-27 16:11:40.628990180 +0900 @@ -487,8 +487,14 @@ SYNC_SEARCH_SYS, as memory allocation can call routines there! Otherwise the level is SYNC_MEM_HASH. */ +#define SYNC_BUF_LRU_LIST 157 +#define SYNC_BUF_PAGE_HASH 156 +#define SYNC_BUF_BLOCK 155 +#define SYNC_BUF_FREE_LIST 153 +#define SYNC_BUF_ZIP_FREE 152 +#define SYNC_BUF_ZIP_HASH 151 #define SYNC_BUF_POOL 150 -#define SYNC_BUF_BLOCK 149 +#define SYNC_BUF_FLUSH_LIST 149 #define SYNC_DOUBLEWRITE 140 #define SYNC_ANY_LATCH 135 #define SYNC_THR_LOCAL 133 @@ -519,7 +525,7 @@ os_fast_mutex; /*!< We use this OS mutex in place of lock_word when atomic operations are not enabled */ #endif - ulint waiters; /*!< This ulint is set to 1 if there are (or + volatile ulint waiters; /*!< This ulint is set to 1 if there are (or may be) threads waiting in the global wait array for this mutex to be released. Otherwise, this is 0. 
*/ diff -ruN a/storage/innodb_plugin/mtr/mtr0mtr.c b/storage/innodb_plugin/mtr/mtr0mtr.c --- a/storage/innodb_plugin/mtr/mtr0mtr.c 2010-08-04 02:24:20.000000000 +0900 +++ b/storage/innodb_plugin/mtr/mtr0mtr.c 2010-08-27 16:11:40.631020912 +0900 @@ -105,6 +105,38 @@ } } +UNIV_INLINE +void +mtr_memo_note_modification_all( +/*===========================*/ + mtr_t* mtr) /* in: mtr */ +{ + mtr_memo_slot_t* slot; + dyn_array_t* memo; + ulint offset; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in + commit */ + ut_ad(mtr->modifications); + + memo = &(mtr->memo); + + offset = dyn_array_get_data_size(memo); + + while (offset > 0) { + offset -= sizeof(mtr_memo_slot_t); + slot = dyn_array_get_element(memo, offset); + + if (UNIV_LIKELY(slot->object != NULL) && + slot->type == MTR_MEMO_PAGE_X_FIX) { + buf_flush_note_modification( + (buf_block_t*)slot->object, mtr); + } + } +} + /************************************************************//** Writes the contents of a mini-transaction log, if any, to the database log. */ static @@ -188,6 +220,8 @@ if (write_log) { mtr_log_reserve_and_write(mtr); + + mtr_memo_note_modification_all(mtr); } /* We first update the modification info to buffer pages, and only @@ -198,11 +232,13 @@ required when we insert modified buffer pages in to the flush list which must be sorted on oldest_modification. */ - mtr_memo_pop_all(mtr); - if (write_log) { log_release(); } + + /* All unlocking has been moved here, after log_sys mutex release. 
*/ + mtr_memo_pop_all(mtr); + #endif /* !UNIV_HOTBACKUP */ ut_d(mtr->state = MTR_COMMITTED); @@ -273,6 +309,12 @@ slot = dyn_array_get_element(memo, offset); if ((object == slot->object) && (type == slot->type)) { + if (mtr->modifications && + UNIV_LIKELY(slot->object != NULL) && + slot->type == MTR_MEMO_PAGE_X_FIX) { + buf_flush_note_modification( + (buf_block_t*)slot->object, mtr); + } mtr_memo_slot_release(mtr, slot); diff -ruN a/storage/innodb_plugin/srv/srv0srv.c b/storage/innodb_plugin/srv/srv0srv.c --- a/storage/innodb_plugin/srv/srv0srv.c 2010-08-27 16:11:12.194989878 +0900 +++ b/storage/innodb_plugin/srv/srv0srv.c 2010-08-27 16:11:40.634022489 +0900 @@ -2829,7 +2829,7 @@ mutex_exit(&(log_sys->mutex)); - buf_pool_mutex_enter(); + mutex_enter(&flush_list_mutex); level = 0; bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); @@ -2851,7 +2851,7 @@ bpl = 0; } - buf_pool_mutex_exit(); + mutex_exit(&flush_list_mutex); if (!srv_use_doublewrite_buf) { /* flush is faster than when doublewrite */ diff -ruN a/storage/innodb_plugin/sync/sync0sync.c b/storage/innodb_plugin/sync/sync0sync.c --- a/storage/innodb_plugin/sync/sync0sync.c 2010-08-04 02:24:20.000000000 +0900 +++ b/storage/innodb_plugin/sync/sync0sync.c 2010-08-27 16:11:40.636021261 +0900 @@ -254,7 +254,7 @@ mutex->lock_word = 0; #endif mutex->event = os_event_create(NULL); - mutex_set_waiters(mutex, 0); + mutex->waiters = 0; #ifdef UNIV_DEBUG mutex->magic_n = MUTEX_MAGIC_N; #endif /* UNIV_DEBUG */ @@ -432,6 +432,15 @@ mutex_t* mutex, /*!< in: mutex */ ulint n) /*!< in: value to set */ { +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + ut_ad(mutex); + + if (n) { + os_compare_and_swap_ulint(&mutex->waiters, 0, 1); + } else { + os_compare_and_swap_ulint(&mutex->waiters, 1, 0); + } +#else volatile ulint* ptr; /* declared volatile to ensure that the value is stored to memory */ ut_ad(mutex); @@ -440,6 +449,7 @@ *ptr = n; /* Here we assume that the write of a single word in memory is atomic */ +#endif } 
/******************************************************************//** @@ -1153,6 +1163,12 @@ case SYNC_TRX_SYS_HEADER: case SYNC_FILE_FORMAT_TAG: case SYNC_DOUBLEWRITE: + case SYNC_BUF_LRU_LIST: + case SYNC_BUF_FLUSH_LIST: + case SYNC_BUF_PAGE_HASH: + case SYNC_BUF_FREE_LIST: + case SYNC_BUF_ZIP_FREE: + case SYNC_BUF_ZIP_HASH: case SYNC_BUF_POOL: case SYNC_SEARCH_SYS: case SYNC_SEARCH_SYS_CONF: @@ -1181,7 +1197,7 @@ buffer block (block->mutex or buf_pool_zip_mutex). */ if (!sync_thread_levels_g(array, level, FALSE)) { ut_a(sync_thread_levels_g(array, level - 1, TRUE)); - ut_a(sync_thread_levels_contain(array, SYNC_BUF_POOL)); + ut_a(sync_thread_levels_contain(array, SYNC_BUF_LRU_LIST)); } break; case SYNC_REC_LOCK: