# name : innodb_split_buf_pool_mutex.patch # introduced : 11 or before # maintainer : Yasufumi # #!!! notice !!! # Any small change to this file in the main branch # should be done or reviewed by the maintainer! --- a/storage/innobase/btr/btr0cur.c +++ b/storage/innobase/btr/btr0cur.c @@ -4070,7 +4070,8 @@ mtr_commit(mtr); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); mutex_enter(&block->mutex); /* Only free the block if it is still allocated to @@ -4081,16 +4082,21 @@ && buf_block_get_space(block) == space && buf_block_get_page_no(block) == page_no) { - if (!buf_LRU_free_block(&block->page, all) - && all && block->page.zip.data) { + if (!buf_LRU_free_block(&block->page, all, TRUE) + && all && block->page.zip.data + /* Now, buf_LRU_free_block() may release mutex temporarily */ + && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE + && buf_block_get_space(block) == space + && buf_block_get_page_no(block) == page_no) { /* Attempt to deallocate the uncompressed page if the whole block cannot be deallocted. */ - buf_LRU_free_block(&block->page, FALSE); + buf_LRU_free_block(&block->page, FALSE, TRUE); } } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); mutex_exit(&block->mutex); } --- a/storage/innobase/btr/btr0sea.c +++ b/storage/innobase/btr/btr0sea.c @@ -1972,7 +1972,7 @@ rec_offs_init(offsets_); rw_lock_x_lock(&btr_search_latch); - buf_pool_mutex_enter_all(); + buf_pool_page_hash_x_lock_all(); cell_count = hash_get_n_cells(btr_search_sys->hash_index); @@ -1980,11 +1980,11 @@ /* We release btr_search_latch every once in a while to give other queries a chance to run. 
*/ if ((i != 0) && ((i % chunk_size) == 0)) { - buf_pool_mutex_exit_all(); + buf_pool_page_hash_x_unlock_all(); rw_lock_x_unlock(&btr_search_latch); os_thread_yield(); rw_lock_x_lock(&btr_search_latch); - buf_pool_mutex_enter_all(); + buf_pool_page_hash_x_lock_all(); } node = hash_get_nth_cell(btr_search_sys->hash_index, i)->node; @@ -2093,11 +2093,11 @@ /* We release btr_search_latch every once in a while to give other queries a chance to run. */ if (i != 0) { - buf_pool_mutex_exit_all(); + buf_pool_page_hash_x_unlock_all(); rw_lock_x_unlock(&btr_search_latch); os_thread_yield(); rw_lock_x_lock(&btr_search_latch); - buf_pool_mutex_enter_all(); + buf_pool_page_hash_x_lock_all(); } if (!ha_validate(btr_search_sys->hash_index, i, end_index)) { @@ -2105,7 +2105,7 @@ } } - buf_pool_mutex_exit_all(); + buf_pool_page_hash_x_unlock_all(); rw_lock_x_unlock(&btr_search_latch); if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); --- a/storage/innobase/buf/buf0buddy.c +++ b/storage/innobase/buf/buf0buddy.c @@ -58,7 +58,7 @@ /** Validate a given zip_free list. 
*/ #define BUF_BUDDY_LIST_VALIDATE(b, i) \ - UT_LIST_VALIDATE(list, buf_page_t, \ + UT_LIST_VALIDATE(zip_list, buf_page_t, \ b->zip_free[i], \ ut_ad(buf_page_get_state( \ ut_list_node_313) \ @@ -75,10 +75,11 @@ ulint i) /*!< in: index of buf_pool->zip_free[] */ { - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); ut_ad(buf_pool->zip_free[i].start != bpage); - UT_LIST_ADD_FIRST(list, buf_pool->zip_free[i], bpage); + UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_free[i], bpage); } /**********************************************************************//** @@ -93,16 +94,17 @@ buf_pool->zip_free[] */ { #ifdef UNIV_DEBUG - buf_page_t* prev = UT_LIST_GET_PREV(list, bpage); - buf_page_t* next = UT_LIST_GET_NEXT(list, bpage); + buf_page_t* prev = UT_LIST_GET_PREV(zip_list, bpage); + buf_page_t* next = UT_LIST_GET_NEXT(zip_list, bpage); ut_ad(!prev || buf_page_get_state(prev) == BUF_BLOCK_ZIP_FREE); ut_ad(!next || buf_page_get_state(next) == BUF_BLOCK_ZIP_FREE); #endif /* UNIV_DEBUG */ - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); - UT_LIST_REMOVE(list, buf_pool->zip_free[i], bpage); + UT_LIST_REMOVE(zip_list, buf_pool->zip_free[i], bpage); } /**********************************************************************//** @@ -117,7 +119,8 @@ { buf_page_t* bpage; - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); ut_a(i < BUF_BUDDY_SIZES); ut_a(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); @@ -159,16 +162,19 @@ buf_buddy_block_free( /*=================*/ buf_pool_t* buf_pool, /*!< in: buffer pool instance */ - void* buf) /*!< in: buffer frame to deallocate */ + void* buf, /*!< in: buffer frame to deallocate */ + ibool have_page_hash_mutex) { 
const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf); buf_page_t* bpage; buf_block_t* block; - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE)); + mutex_enter(&buf_pool->zip_hash_mutex); + HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage, ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY && bpage->in_zip_hash && !bpage->in_page_hash), @@ -180,12 +186,14 @@ ut_d(bpage->in_zip_hash = FALSE); HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage); + mutex_exit(&buf_pool->zip_hash_mutex); + ut_d(memset(buf, 0, UNIV_PAGE_SIZE)); UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE); block = (buf_block_t*) bpage; mutex_enter(&block->mutex); - buf_LRU_block_free_non_file_page(block); + buf_LRU_block_free_non_file_page(block, have_page_hash_mutex); mutex_exit(&block->mutex); ut_ad(buf_pool->buddy_n_frames > 0); @@ -202,7 +210,7 @@ { buf_pool_t* buf_pool = buf_pool_from_block(block); const ulint fold = BUF_POOL_ZIP_FOLD(block); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE); @@ -214,7 +222,10 @@ ut_ad(!block->page.in_page_hash); ut_ad(!block->page.in_zip_hash); ut_d(block->page.in_zip_hash = TRUE); + + mutex_enter(&buf_pool->zip_hash_mutex); HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page); + mutex_exit(&buf_pool->zip_hash_mutex); ut_d(buf_pool->buddy_n_frames++); } @@ -268,26 +279,30 @@ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ ulint i, /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ - ibool* lru) /*!< in: pointer to a variable that + ibool* lru, /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list and buf_pool->mutex was temporarily released */ + ibool have_page_hash_mutex) { buf_block_t* block; ut_ad(lru); - 
ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); if (i < BUF_BUDDY_SIZES) { /* Try to allocate from the buddy system. */ + mutex_enter(&buf_pool->zip_free_mutex); block = buf_buddy_alloc_zip(buf_pool, i); if (block) { goto func_exit; } + mutex_exit(&buf_pool->zip_free_mutex); } /* Try allocating from the buf_pool->free list. */ @@ -299,19 +314,30 @@ } /* Try replacing an uncompressed page in the buffer pool. */ - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); + if (have_page_hash_mutex) { + rw_lock_x_unlock(&buf_pool->page_hash_latch); + } block = buf_LRU_get_free_block(buf_pool); *lru = TRUE; - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + if (have_page_hash_mutex) { + rw_lock_x_lock(&buf_pool->page_hash_latch); + } alloc_big: buf_buddy_block_register(block); + mutex_enter(&buf_pool->zip_free_mutex); block = buf_buddy_alloc_from( buf_pool, block->frame, i, BUF_BUDDY_SIZES); func_exit: buf_pool->buddy_stat[i].used++; + mutex_exit(&buf_pool->zip_free_mutex); + return(block); } @@ -325,8 +351,9 @@ buf_pool_t* buf_pool, /*!< in: buffer pool instance */ void* src, /*!< in: block to relocate */ void* dst, /*!< in: free block to relocate to */ - ulint i) /*!< in: index of + ulint i, /*!< in: index of buf_pool->zip_free[] */ + ibool have_page_hash_mutex) { buf_page_t* bpage; const ulint size = BUF_BUDDY_LOW << i; @@ -334,13 +361,20 @@ ulint space; ulint page_no; - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_ad(!ut_align_offset(src, size)); ut_ad(!ut_align_offset(dst, size)); ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); UNIV_MEM_ASSERT_W(dst, size); + if 
(!have_page_hash_mutex) { + mutex_exit(&buf_pool->zip_free_mutex); + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(&buf_pool->page_hash_latch); + } + /* We assume that all memory from buf_buddy_alloc() is used for compressed page frames. */ @@ -374,6 +408,11 @@ added to buf_pool->page_hash yet. Obviously, it cannot be relocated. */ + if (!have_page_hash_mutex) { + mutex_enter(&buf_pool->zip_free_mutex); + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); + } return(FALSE); } @@ -383,18 +422,27 @@ For the sake of simplicity, give up. */ ut_ad(page_zip_get_size(&bpage->zip) < size); + if (!have_page_hash_mutex) { + mutex_enter(&buf_pool->zip_free_mutex); + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); + } return(FALSE); } + /* To keep latch order */ + if (have_page_hash_mutex) + mutex_exit(&buf_pool->zip_free_mutex); + /* The block must have been allocated, but it may contain uninitialized data. */ UNIV_MEM_ASSERT_W(src, size); - mutex = buf_page_get_mutex(bpage); + mutex = buf_page_get_mutex_enter(bpage); - mutex_enter(mutex); + mutex_enter(&buf_pool->zip_free_mutex); - if (buf_page_can_relocate(bpage)) { + if (mutex && buf_page_can_relocate(bpage)) { /* Relocate the compressed page. 
*/ ullint usec = ut_time_us(NULL); ut_a(bpage->zip.data == src); @@ -409,10 +457,22 @@ buddy_stat->relocated_usec += ut_time_us(NULL) - usec; } + + if (!have_page_hash_mutex) { + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); + } return(TRUE); } - mutex_exit(mutex); + if (!have_page_hash_mutex) { + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); + } + + if (mutex) { + mutex_exit(mutex); + } return(FALSE); } @@ -425,13 +485,15 @@ buf_pool_t* buf_pool, /*!< in: buffer pool instance */ void* buf, /*!< in: block to be freed, must not be pointed to by the buffer pool */ - ulint i) /*!< in: index of buf_pool->zip_free[], + ulint i, /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ + ibool have_page_hash_mutex) { buf_page_t* bpage; buf_page_t* buddy; - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_ad(i <= BUF_BUDDY_SIZES); ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); @@ -443,7 +505,9 @@ ((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE; if (i == BUF_BUDDY_SIZES) { - buf_buddy_block_free(buf_pool, buf); + mutex_exit(&buf_pool->zip_free_mutex); + buf_buddy_block_free(buf_pool, buf, have_page_hash_mutex); + mutex_enter(&buf_pool->zip_free_mutex); return; } @@ -491,7 +555,7 @@ ut_a(bpage != buf); UNIV_MEM_ASSERT_W(bpage, BUF_BUDDY_LOW << i); - bpage = UT_LIST_GET_NEXT(list, bpage); + bpage = UT_LIST_GET_NEXT(zip_list, bpage); } #ifndef UNIV_DEBUG_VALGRIND @@ -501,7 +565,7 @@ ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i)); /* The buddy is not free. Is there a free block of this size? */ - bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); + bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]); if (bpage) { @@ -510,7 +574,7 @@ buf_buddy_remove_from_free(buf_pool, bpage, i); /* Try to relocate the buddy of buf to the free block. 
*/ - if (buf_buddy_relocate(buf_pool, buddy, bpage, i)) { + if (buf_buddy_relocate(buf_pool, buddy, bpage, i, have_page_hash_mutex)) { buddy->state = BUF_BLOCK_ZIP_FREE; goto buddy_is_free; --- a/storage/innobase/buf/buf0buf.c +++ b/storage/innobase/buf/buf0buf.c @@ -263,6 +263,7 @@ #ifdef UNIV_PFS_RWLOCK /* Keys to register buffer block related rwlocks and mutexes with performance schema */ +UNIV_INTERN mysql_pfs_key_t buf_pool_page_hash_key; UNIV_INTERN mysql_pfs_key_t buf_block_lock_key; # ifdef UNIV_SYNC_DEBUG UNIV_INTERN mysql_pfs_key_t buf_block_debug_latch_key; @@ -273,6 +274,10 @@ UNIV_INTERN mysql_pfs_key_t buffer_block_mutex_key; UNIV_INTERN mysql_pfs_key_t buf_pool_mutex_key; UNIV_INTERN mysql_pfs_key_t buf_pool_zip_mutex_key; +UNIV_INTERN mysql_pfs_key_t buf_pool_LRU_list_mutex_key; +UNIV_INTERN mysql_pfs_key_t buf_pool_free_list_mutex_key; +UNIV_INTERN mysql_pfs_key_t buf_pool_zip_free_mutex_key; +UNIV_INTERN mysql_pfs_key_t buf_pool_zip_hash_mutex_key; UNIV_INTERN mysql_pfs_key_t flush_list_mutex_key; #endif /* UNIV_PFS_MUTEX */ @@ -890,9 +895,13 @@ block->page.in_zip_hash = FALSE; block->page.in_flush_list = FALSE; block->page.in_free_list = FALSE; - block->in_unzip_LRU_list = FALSE; #endif /* UNIV_DEBUG */ + block->page.flush_list.prev = NULL; + block->page.flush_list.next = NULL; + block->page.zip_list.prev = NULL; + block->page.zip_list.next = NULL; block->page.in_LRU_list = FALSE; + block->in_unzip_LRU_list = FALSE; #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG block->n_pointers = 0; #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ @@ -997,9 +1006,11 @@ memset(block->frame, '\0', UNIV_PAGE_SIZE); #endif /* Add the block to the free list */ - UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page)); + mutex_enter(&buf_pool->free_list_mutex); + UT_LIST_ADD_LAST(free, buf_pool->free, (&block->page)); ut_d(block->page.in_free_list = TRUE); + mutex_exit(&buf_pool->free_list_mutex); ut_ad(buf_pool_from_block(block) == buf_pool); block++; @@ -1054,7 +1065,8 
@@ buf_chunk_t* chunk = buf_pool->chunks; ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); for (n = buf_pool->n_chunks; n--; chunk++) { buf_block_t* block = buf_chunk_contains_zip(chunk, data); @@ -1160,9 +1172,21 @@ ------------------------------- */ mutex_create(buf_pool_mutex_key, &buf_pool->mutex, SYNC_BUF_POOL); + mutex_create(buf_pool_LRU_list_mutex_key, + &buf_pool->LRU_list_mutex, SYNC_BUF_LRU_LIST); + rw_lock_create(buf_pool_page_hash_key, + &buf_pool->page_hash_latch, SYNC_BUF_PAGE_HASH); + mutex_create(buf_pool_free_list_mutex_key, + &buf_pool->free_list_mutex, SYNC_BUF_FREE_LIST); + mutex_create(buf_pool_zip_free_mutex_key, + &buf_pool->zip_free_mutex, SYNC_BUF_ZIP_FREE); + mutex_create(buf_pool_zip_hash_mutex_key, + &buf_pool->zip_hash_mutex, SYNC_BUF_ZIP_HASH); mutex_create(buf_pool_zip_mutex_key, &buf_pool->zip_mutex, SYNC_BUF_BLOCK); + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(&buf_pool->page_hash_latch); buf_pool_mutex_enter(buf_pool); if (buf_pool_size > 0) { @@ -1175,6 +1199,8 @@ mem_free(chunk); mem_free(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); buf_pool_mutex_exit(buf_pool); return(DB_ERROR); @@ -1205,6 +1231,8 @@ /* All fields are initialized by mem_zalloc(). 
*/ + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); buf_pool_mutex_exit(buf_pool); return(DB_SUCCESS); @@ -1376,7 +1404,11 @@ ulint fold; buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX)); +#endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); ut_a(bpage->buf_fix_count == 0); @@ -1487,21 +1519,32 @@ buf_page_t* bpage; ulint i; buf_pool_t* buf_pool = buf_pool_get(space, offset); + mutex_t* block_mutex; - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + rw_lock_x_lock(&buf_pool->page_hash_latch); bpage = buf_page_hash_get_low(buf_pool, space, offset, fold); if (UNIV_LIKELY_NULL(bpage)) { + + block_mutex = buf_page_get_mutex_enter(bpage); + ut_a(block_mutex); + if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) { /* The page was loaded meanwhile. */ + rw_lock_x_unlock(&buf_pool->page_hash_latch); return(bpage); } /* Add to an existing watch. 
*/ bpage->buf_fix_count++; + rw_lock_x_unlock(&buf_pool->page_hash_latch); + mutex_exit(block_mutex); return(NULL); } + /* buf_pool->watch is protected by zip_mutex for now */ + mutex_enter(&buf_pool->zip_mutex); for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) { bpage = &buf_pool->watch[i]; @@ -1525,10 +1568,12 @@ bpage->space = space; bpage->offset = offset; bpage->buf_fix_count = 1; - + bpage->buf_pool_index = buf_pool_index(buf_pool); ut_d(bpage->in_page_hash = TRUE); HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, bpage); + rw_lock_x_unlock(&buf_pool->page_hash_latch); + mutex_exit(&buf_pool->zip_mutex); return(NULL); case BUF_BLOCK_ZIP_PAGE: ut_ad(bpage->in_page_hash); @@ -1546,6 +1591,8 @@ ut_error; /* Fix compiler warning */ + rw_lock_x_unlock(&buf_pool->page_hash_latch); + mutex_exit(&buf_pool->zip_mutex); return(NULL); } @@ -1563,7 +1610,11 @@ space, offset) */ buf_page_t* watch) /*!< in/out: sentinel for watch */ { - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX)); +#endif + ut_ad(mutex_own(&buf_pool->zip_mutex)); /* for now */ HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, watch); ut_d(watch->in_page_hash = FALSE); @@ -1585,28 +1636,31 @@ buf_pool_t* buf_pool = buf_pool_get(space, offset); ulint fold = buf_page_address_fold(space, offset); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + rw_lock_x_lock(&buf_pool->page_hash_latch); bpage = buf_page_hash_get_low(buf_pool, space, offset, fold); /* The page must exist because buf_pool_watch_set() increments buf_fix_count. 
*/ ut_a(bpage); if (UNIV_UNLIKELY(!buf_pool_watch_is_sentinel(buf_pool, bpage))) { - mutex_t* mutex = buf_page_get_mutex(bpage); + mutex_t* mutex = buf_page_get_mutex_enter(bpage); - mutex_enter(mutex); ut_a(bpage->buf_fix_count > 0); bpage->buf_fix_count--; mutex_exit(mutex); } else { + mutex_enter(&buf_pool->zip_mutex); ut_a(bpage->buf_fix_count > 0); if (UNIV_LIKELY(!--bpage->buf_fix_count)) { buf_pool_watch_remove(buf_pool, fold, bpage); } + mutex_exit(&buf_pool->zip_mutex); } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_x_unlock(&buf_pool->page_hash_latch); } /****************************************************************//** @@ -1626,14 +1680,16 @@ buf_pool_t* buf_pool = buf_pool_get(space, offset); ulint fold = buf_page_address_fold(space, offset); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + rw_lock_s_lock(&buf_pool->page_hash_latch); bpage = buf_page_hash_get_low(buf_pool, space, offset, fold); /* The page must exist because buf_pool_watch_set() increments buf_fix_count. 
*/ ut_a(bpage); ret = !buf_pool_watch_is_sentinel(buf_pool, bpage); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); return(ret); } @@ -1650,13 +1706,15 @@ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); ut_a(buf_page_in_file(bpage)); buf_LRU_make_block_young(bpage); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } /********************************************************************//** @@ -1680,14 +1738,20 @@ ut_a(buf_page_in_file(bpage)); if (buf_page_peek_if_too_old(bpage)) { - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); buf_LRU_make_block_young(bpage); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } else if (!access_time) { ulint time_ms = ut_time_ms(); - buf_pool_mutex_enter(buf_pool); + mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); + //buf_pool_mutex_enter(buf_pool); + if (block_mutex) { buf_page_set_accessed(bpage, time_ms); - buf_pool_mutex_exit(buf_pool); + mutex_exit(block_mutex); + } + //buf_pool_mutex_exit(buf_pool); } } @@ -1704,7 +1768,8 @@ buf_block_t* block; buf_pool_t* buf_pool = buf_pool_get(space, offset); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + rw_lock_s_lock(&buf_pool->page_hash_latch); block = (buf_block_t*) buf_page_hash_get(buf_pool, space, offset); @@ -1713,7 +1778,8 @@ block->check_index_page_at_flush = FALSE; } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); } #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG @@ -1733,7 +1799,8 @@ buf_page_t* bpage; buf_pool_t* buf_pool = buf_pool_get(space, offset); - buf_pool_mutex_enter(buf_pool); + 
//buf_pool_mutex_enter(buf_pool); + rw_lock_s_lock(&buf_pool->page_hash_latch); bpage = buf_page_hash_get(buf_pool, space, offset); @@ -1744,7 +1811,8 @@ bpage->file_page_was_freed = TRUE; } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); return(bpage); } @@ -1765,7 +1833,8 @@ buf_page_t* bpage; buf_pool_t* buf_pool = buf_pool_get(space, offset); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + rw_lock_s_lock(&buf_pool->page_hash_latch); bpage = buf_page_hash_get(buf_pool, space, offset); @@ -1774,7 +1843,8 @@ bpage->file_page_was_freed = FALSE; } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); return(bpage); } @@ -1806,8 +1876,9 @@ buf_pool->stat.n_page_gets++; for (;;) { - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); lookup: + rw_lock_s_lock(&buf_pool->page_hash_latch); bpage = buf_page_hash_get(buf_pool, space, offset); if (bpage) { ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage)); @@ -1816,7 +1887,8 @@ /* Page not in buf_pool: needs to be read from file */ - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); buf_read_page(space, zip_size, offset); @@ -1828,10 +1900,15 @@ if (UNIV_UNLIKELY(!bpage->zip.data)) { /* There is no compressed page. 
*/ err_exit: - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); return(NULL); } + block_mutex = buf_page_get_mutex_enter(bpage); + + rw_lock_s_unlock(&buf_pool->page_hash_latch); + ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage)); switch (buf_page_get_state(bpage)) { @@ -1840,24 +1917,43 @@ case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: case BUF_BLOCK_ZIP_FREE: + if (block_mutex) + mutex_exit(block_mutex); break; case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: - block_mutex = &buf_pool->zip_mutex; - mutex_enter(block_mutex); + ut_a(block_mutex == &buf_pool->zip_mutex); bpage->buf_fix_count++; goto got_block; case BUF_BLOCK_FILE_PAGE: - block_mutex = &((buf_block_t*) bpage)->mutex; + ut_a(block_mutex == &((buf_block_t*) bpage)->mutex); + + /* release mutex to obey to latch-order */ + mutex_exit(block_mutex); + + /* get LRU_list_mutex for buf_LRU_free_block() */ + mutex_enter(&buf_pool->LRU_list_mutex); mutex_enter(block_mutex); - /* Discard the uncompressed page frame if possible. */ - if (buf_LRU_free_block(bpage, FALSE)) { + if (UNIV_UNLIKELY(bpage->space != space + || bpage->offset != offset + || !bpage->in_LRU_list + || !bpage->zip.data)) { + /* someone should interrupt, retry */ + mutex_exit(&buf_pool->LRU_list_mutex); + mutex_exit(block_mutex); + goto lookup; + } + /* Discard the uncompressed page frame if possible. 
*/ + if (buf_LRU_free_block(bpage, FALSE, TRUE)) { + mutex_exit(&buf_pool->LRU_list_mutex); mutex_exit(block_mutex); goto lookup; } + mutex_exit(&buf_pool->LRU_list_mutex); + buf_block_buf_fix_inc((buf_block_t*) bpage, __FILE__, __LINE__); goto got_block; @@ -1870,7 +1966,7 @@ must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ; access_time = buf_page_is_accessed(bpage); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); mutex_exit(block_mutex); @@ -2181,7 +2277,7 @@ const buf_block_t* block) /*!< in: pointer to block, not dereferenced */ { - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); if (UNIV_UNLIKELY((((ulint) block) % sizeof *block) != 0)) { /* The pointer should be aligned. */ @@ -2217,6 +2313,7 @@ ulint fix_type; ibool must_read; ulint retries = 0; + mutex_t* block_mutex = NULL; buf_pool_t* buf_pool = buf_pool_get(space, offset); ut_ad(mtr); @@ -2250,18 +2347,24 @@ fold = buf_page_address_fold(space, offset); loop: block = guess; - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); if (block) { + block_mutex = buf_page_get_mutex_enter((buf_page_t*)block); + /* If the guess is a compressed page descriptor that has been allocated by buf_page_alloc_descriptor(), it may have been freed by buf_relocate(). 
*/ - if (!buf_block_is_uncompressed(buf_pool, block) + if (!block_mutex) { + block = guess = NULL; + } else if (!buf_block_is_uncompressed(buf_pool, block) || offset != block->page.offset || space != block->page.space || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { + mutex_exit(block_mutex); + block = guess = NULL; } else { ut_ad(!block->page.in_zip_hash); @@ -2270,12 +2373,19 @@ } if (block == NULL) { + rw_lock_s_lock(&buf_pool->page_hash_latch); block = (buf_block_t*) buf_page_hash_get_low( buf_pool, space, offset, fold); + if (block) { + block_mutex = buf_page_get_mutex_enter((buf_page_t*)block); + ut_a(block_mutex); + } + rw_lock_s_unlock(&buf_pool->page_hash_latch); } loop2: if (block && buf_pool_watch_is_sentinel(buf_pool, &block->page)) { + mutex_exit(block_mutex); block = NULL; } @@ -2287,12 +2397,14 @@ space, offset, fold); if (UNIV_LIKELY_NULL(block)) { - + block_mutex = buf_page_get_mutex((buf_page_t*)block); + ut_a(block_mutex); + ut_ad(mutex_own(block_mutex)); goto got_block; } } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); if (mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL @@ -2345,7 +2457,8 @@ /* The page is being read to buffer pool, but we cannot wait around for the read to complete. */ - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(block_mutex); return(NULL); } @@ -2355,38 +2468,49 @@ ibool success; case BUF_BLOCK_FILE_PAGE: + if (block_mutex == &buf_pool->zip_mutex) { + /* it is wrong mutex... */ + mutex_exit(block_mutex); + goto loop; + } break; case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: + ut_ad(block_mutex == &buf_pool->zip_mutex); bpage = &block->page; /* Protect bpage->buf_fix_count. */ - mutex_enter(&buf_pool->zip_mutex); + //mutex_enter(&buf_pool->zip_mutex); if (bpage->buf_fix_count || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { /* This condition often occurs when the buffer is not buffer-fixed, but I/O-fixed by buf_page_init_for_read(). 
*/ - mutex_exit(&buf_pool->zip_mutex); + //mutex_exit(&buf_pool->zip_mutex); wait_until_unfixed: /* The block is buffer-fixed or I/O-fixed. Try again later. */ - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(block_mutex); os_thread_sleep(WAIT_FOR_READ); goto loop; } /* Allocate an uncompressed page. */ - buf_pool_mutex_exit(buf_pool); - mutex_exit(&buf_pool->zip_mutex); + //buf_pool_mutex_exit(buf_pool); + //mutex_exit(&buf_pool->zip_mutex); + mutex_exit(block_mutex); block = buf_LRU_get_free_block(buf_pool); ut_a(block); + block_mutex = &block->mutex; - buf_pool_mutex_enter(buf_pool); - mutex_enter(&block->mutex); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(&buf_pool->page_hash_latch); + mutex_enter(block_mutex); { buf_page_t* hash_bpage; @@ -2399,35 +2523,47 @@ while buf_pool->mutex was released. Free the block that was allocated. */ - buf_LRU_block_free_non_file_page(block); - mutex_exit(&block->mutex); + buf_LRU_block_free_non_file_page(block, TRUE); + mutex_exit(block_mutex); block = (buf_block_t*) hash_bpage; + if (block) { + block_mutex = buf_page_get_mutex_enter((buf_page_t*)block); + ut_a(block_mutex); + } + rw_lock_x_unlock(&buf_pool->page_hash_latch); + mutex_exit(&buf_pool->LRU_list_mutex); goto loop2; } } + mutex_enter(&buf_pool->zip_mutex); + if (UNIV_UNLIKELY (bpage->buf_fix_count || buf_page_get_io_fix(bpage) != BUF_IO_NONE)) { + mutex_exit(&buf_pool->zip_mutex); /* The block was buffer-fixed or I/O-fixed while buf_pool->mutex was not held by this thread. Free the block that was allocated and try again. This should be extremely unlikely. 
*/ - buf_LRU_block_free_non_file_page(block); - mutex_exit(&block->mutex); + buf_LRU_block_free_non_file_page(block, TRUE); + //mutex_exit(&block->mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); + mutex_exit(&buf_pool->LRU_list_mutex); goto wait_until_unfixed; } /* Move the compressed page from bpage to block, and uncompress it. */ - mutex_enter(&buf_pool->zip_mutex); - buf_relocate(bpage, &block->page); + + rw_lock_x_unlock(&buf_pool->page_hash_latch); + buf_block_init_low(block); block->lock_hash_val = lock_rec_hash(space, offset); @@ -2437,7 +2573,7 @@ if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) { #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - UT_LIST_REMOVE(list, buf_pool->zip_clean, + UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, &block->page); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ ut_ad(!block->page.in_flush_list); @@ -2455,18 +2591,23 @@ /* Insert at the front of unzip_LRU list */ buf_unzip_LRU_add_block(block, FALSE); + mutex_exit(&buf_pool->LRU_list_mutex); + block->page.buf_fix_count = 1; buf_block_set_io_fix(block, BUF_IO_READ); rw_lock_x_lock_func(&block->lock, 0, file, line); UNIV_MEM_INVALID(bpage, sizeof *bpage); - mutex_exit(&block->mutex); + mutex_exit(block_mutex); mutex_exit(&buf_pool->zip_mutex); - buf_pool->n_pend_unzip++; + buf_pool_mutex_enter(buf_pool); + buf_pool->n_pend_unzip++; buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + buf_page_free_descriptor(bpage); /* Decompress the page and apply buffered operations @@ -2480,12 +2621,15 @@ } /* Unfix and unlatch the block. 
*/ - buf_pool_mutex_enter(buf_pool); - mutex_enter(&block->mutex); + //buf_pool_mutex_enter(buf_pool); + block_mutex = &block->mutex; + mutex_enter(block_mutex); block->page.buf_fix_count--; buf_block_set_io_fix(block, BUF_IO_NONE); - mutex_exit(&block->mutex); + + buf_pool_mutex_enter(buf_pool); buf_pool->n_pend_unzip--; + buf_pool_mutex_exit(buf_pool); rw_lock_x_unlock(&block->lock); break; @@ -2501,7 +2645,7 @@ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - mutex_enter(&block->mutex); + //mutex_enter(&block->mutex); #if UNIV_WORD_SIZE == 4 /* On 32-bit systems, there is no padding in buf_page_t. On other systems, Valgrind could complain about uninitialized pad @@ -2514,8 +2658,8 @@ /* Try to evict the block from the buffer pool, to use the insert buffer (change buffer) as much as possible. */ - if (buf_LRU_free_block(&block->page, TRUE)) { - mutex_exit(&block->mutex); + if (buf_LRU_free_block(&block->page, TRUE, FALSE)) { + mutex_exit(block_mutex); if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { /* Set the watch, as it would have been set if the page were not in the @@ -2524,6 +2668,9 @@ space, offset, fold); if (UNIV_LIKELY_NULL(block)) { + block_mutex = buf_page_get_mutex((buf_page_t*)block); + ut_a(block_mutex); + ut_ad(mutex_own(block_mutex)); /* The page entered the buffer pool for some reason. 
Try to @@ -2531,7 +2678,7 @@ goto got_block; } } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); fprintf(stderr, "innodb_change_buffering_debug evict %u %u\n", (unsigned) space, (unsigned) offset); @@ -2553,13 +2700,14 @@ ut_a(mode == BUF_GET_POSSIBLY_FREED || !block->page.file_page_was_freed); #endif - mutex_exit(&block->mutex); + //mutex_exit(&block->mutex); /* Check if this is the first access to the page */ access_time = buf_page_is_accessed(&block->page); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(block_mutex); if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL)) { buf_page_set_accessed_make_young(&block->page, access_time); @@ -2792,9 +2940,11 @@ buf_pool = buf_pool_from_block(block); if (mode == BUF_MAKE_YOUNG && buf_page_peek_if_too_old(&block->page)) { - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); buf_LRU_make_block_young(&block->page); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } else if (!buf_page_is_accessed(&block->page)) { /* Above, we do a dirty read on purpose, to avoid mutex contention. The field buf_page_t::access_time @@ -2802,9 +2952,11 @@ field must be protected by mutex, however. 
*/ ulint time_ms = ut_time_ms(); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&block->mutex); buf_page_set_accessed(&block->page, time_ms); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&block->mutex); } ut_ad(!ibuf_inside(mtr) || mode == BUF_KEEP_OLD); @@ -2871,18 +3023,21 @@ ut_ad(mtr); ut_ad(mtr->state == MTR_ACTIVE); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + rw_lock_s_lock(&buf_pool->page_hash_latch); block = buf_block_hash_get(buf_pool, space_id, page_no); if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); return(NULL); } ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page)); mutex_enter(&block->mutex); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); @@ -2972,7 +3127,10 @@ buf_page_t* hash_page; ut_ad(buf_pool == buf_pool_get(space, offset)); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX)); +#endif ut_ad(mutex_own(&(block->mutex))); ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); @@ -3001,11 +3159,14 @@ if (UNIV_LIKELY(!hash_page)) { } else if (buf_pool_watch_is_sentinel(buf_pool, hash_page)) { /* Preserve the reference count. 
*/ - ulint buf_fix_count = hash_page->buf_fix_count; + ulint buf_fix_count; + mutex_enter(&buf_pool->zip_mutex); + buf_fix_count = hash_page->buf_fix_count; ut_a(buf_fix_count > 0); block->page.buf_fix_count += buf_fix_count; buf_pool_watch_remove(buf_pool, fold, hash_page); + mutex_exit(&buf_pool->zip_mutex); } else { fprintf(stderr, "InnoDB: Error: page %lu %lu already found" @@ -3015,7 +3176,8 @@ (const void*) hash_page, (const void*) block); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG mutex_exit(&block->mutex); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_x_unlock(&buf_pool->page_hash_latch); buf_print(); buf_LRU_print(); buf_validate(); @@ -3098,7 +3260,9 @@ fold = buf_page_address_fold(space, offset); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(&buf_pool->page_hash_latch); watch_page = buf_page_hash_get_low(buf_pool, space, offset, fold); if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) { @@ -3107,9 +3271,15 @@ err_exit: if (block) { mutex_enter(&block->mutex); - buf_LRU_block_free_non_file_page(block); + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); + buf_LRU_block_free_non_file_page(block, FALSE); mutex_exit(&block->mutex); } + else { + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); + } bpage = NULL; goto func_exit; @@ -3132,6 +3302,8 @@ buf_page_init(buf_pool, space, offset, fold, block); + rw_lock_x_unlock(&buf_pool->page_hash_latch); + /* The block must be put to the LRU list, to the old blocks */ buf_LRU_add_block(bpage, TRUE/* to old blocks */); @@ -3159,7 +3331,7 @@ been added to buf_pool->LRU and buf_pool->page_hash. 
*/ mutex_exit(&block->mutex); - data = buf_buddy_alloc(buf_pool, zip_size, &lru); + data = buf_buddy_alloc(buf_pool, zip_size, &lru, FALSE); mutex_enter(&block->mutex); block->page.zip.data = data; @@ -3172,13 +3344,14 @@ buf_unzip_LRU_add_block(block, TRUE); } + mutex_exit(&buf_pool->LRU_list_mutex); mutex_exit(&block->mutex); } else { /* The compressed page must be allocated before the control block (bpage), in order to avoid the invocation of buf_buddy_relocate_block() on uninitialized data. */ - data = buf_buddy_alloc(buf_pool, zip_size, &lru); + data = buf_buddy_alloc(buf_pool, zip_size, &lru, TRUE); /* If buf_buddy_alloc() allocated storage from the LRU list, it released and reacquired buf_pool->mutex. Thus, we must @@ -3194,7 +3367,10 @@ /* The block was added by some other thread. */ watch_page = NULL; - buf_buddy_free(buf_pool, data, zip_size); + buf_buddy_free(buf_pool, data, zip_size, TRUE); + + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); bpage = NULL; goto func_exit; @@ -3242,20 +3418,26 @@ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, bpage); + rw_lock_x_unlock(&buf_pool->page_hash_latch); + /* The block must be put to the LRU list, to the old blocks */ buf_LRU_add_block(bpage, TRUE/* to old blocks */); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG buf_LRU_insert_zip_clean(bpage); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + mutex_exit(&buf_pool->LRU_list_mutex); + buf_page_set_io_fix(bpage, BUF_IO_READ); mutex_exit(&buf_pool->zip_mutex); } + buf_pool_mutex_enter(buf_pool); buf_pool->n_pend_reads++; -func_exit: buf_pool_mutex_exit(buf_pool); +func_exit: + //buf_pool_mutex_exit(buf_pool); if (mode == BUF_READ_IBUF_PAGES_ONLY) { @@ -3297,7 +3479,9 @@ fold = buf_page_address_fold(space, offset); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(&buf_pool->page_hash_latch); block = (buf_block_t*) buf_page_hash_get_low( 
buf_pool, space, offset, fold); @@ -3313,7 +3497,9 @@ #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ /* Page can be found in buf_pool */ - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); buf_block_free(free_block); @@ -3335,6 +3521,7 @@ mutex_enter(&block->mutex); buf_page_init(buf_pool, space, offset, fold, block); + rw_lock_x_unlock(&buf_pool->page_hash_latch); /* The block must be put to the LRU list */ buf_LRU_add_block(&block->page, FALSE); @@ -3361,7 +3548,7 @@ the reacquisition of buf_pool->mutex. We also must defer this operation until after the block descriptor has been added to buf_pool->LRU and buf_pool->page_hash. */ - data = buf_buddy_alloc(buf_pool, zip_size, &lru); + data = buf_buddy_alloc(buf_pool, zip_size, &lru, FALSE); mutex_enter(&block->mutex); block->page.zip.data = data; @@ -3379,7 +3566,8 @@ buf_page_set_accessed(&block->page, time_ms); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); @@ -3434,7 +3622,9 @@ ibool ret = TRUE; /* First unfix and release lock on the bpage */ - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(&buf_pool->page_hash_latch); mutex_enter(buf_page_get_mutex(bpage)); ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ); ut_ad(bpage->buf_fix_count == 0); @@ -3455,11 +3645,15 @@ ret = FALSE; } + buf_pool_mutex_enter(buf_pool); ut_ad(buf_pool->n_pend_reads > 0); buf_pool->n_pend_reads--; + buf_pool_mutex_exit(buf_pool); mutex_exit(buf_page_get_mutex(bpage)); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); return(ret); } @@ -3477,6 +3671,8 @@ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); const ibool uncompressed = 
(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + ibool have_LRU_mutex = FALSE; + mutex_t* block_mutex; ut_a(buf_page_in_file(bpage)); @@ -3619,8 +3815,26 @@ } } + if (io_type == BUF_IO_WRITE + && (buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY + || buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU)) { + /* to keep consistency at buf_LRU_insert_zip_clean() */ + have_LRU_mutex = TRUE; /* optimistic */ + } +retry_mutex: + if (have_LRU_mutex) + mutex_enter(&buf_pool->LRU_list_mutex); + block_mutex = buf_page_get_mutex_enter(bpage); + ut_a(block_mutex); + if (io_type == BUF_IO_WRITE + && (buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY + || buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU) + && !have_LRU_mutex) { + mutex_exit(block_mutex); + have_LRU_mutex = TRUE; + goto retry_mutex; + } buf_pool_mutex_enter(buf_pool); - mutex_enter(buf_page_get_mutex(bpage)); #ifdef UNIV_IBUF_COUNT_DEBUG if (io_type == BUF_IO_WRITE || uncompressed) { @@ -3643,6 +3857,7 @@ the x-latch to this OS thread: do not let this confuse you in debugging! 
*/ + ut_a(!have_LRU_mutex); ut_ad(buf_pool->n_pend_reads > 0); buf_pool->n_pend_reads--; buf_pool->stat.n_pages_read++; @@ -3660,6 +3875,9 @@ buf_flush_write_complete(bpage); + if (have_LRU_mutex) + mutex_exit(&buf_pool->LRU_list_mutex); + if (uncompressed) { rw_lock_s_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_WRITE); @@ -3682,8 +3900,8 @@ } #endif /* UNIV_DEBUG */ - mutex_exit(buf_page_get_mutex(bpage)); buf_pool_mutex_exit(buf_pool); + mutex_exit(block_mutex); } /*********************************************************************//** @@ -3700,7 +3918,9 @@ ut_ad(buf_pool); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(&buf_pool->page_hash_latch); chunk = buf_pool->chunks; @@ -3717,7 +3937,9 @@ } } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); return(TRUE); } @@ -3765,7 +3987,8 @@ freed = buf_LRU_search_and_free_block(buf_pool, 100); } - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0); ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0); @@ -3778,7 +4001,8 @@ memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat)); buf_refresh_io_stats(buf_pool); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } /*********************************************************************//** @@ -3820,7 +4044,10 @@ ut_ad(buf_pool); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(&buf_pool->page_hash_latch); + /* for keep the new latch order, it cannot validate correctly... */ chunk = buf_pool->chunks; @@ -3918,7 +4145,7 @@ /* Check clean compressed-only blocks. 
*/ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(zip_list, b)) { ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); switch (buf_page_get_io_fix(b)) { case BUF_IO_NONE: @@ -3950,7 +4177,7 @@ buf_flush_list_mutex_enter(buf_pool); for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(flush_list, b)) { ut_ad(b->in_flush_list); ut_a(b->oldest_modification); n_flush++; @@ -4010,6 +4237,8 @@ } ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru); + /* because of latching order with block->mutex, we cannot get needed mutexes before that */ +/* if (UT_LIST_GET_LEN(buf_pool->free) != n_free) { fprintf(stderr, "Free list len %lu, free blocks %lu\n", (ulong) UT_LIST_GET_LEN(buf_pool->free), @@ -4020,8 +4249,11 @@ ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); +*/ - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); ut_a(buf_LRU_validate()); ut_a(buf_flush_validate(buf_pool)); @@ -4077,7 +4309,9 @@ index_ids = mem_alloc(size * sizeof *index_ids); counts = mem_alloc(sizeof(ulint) * size); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + mutex_enter(&buf_pool->free_list_mutex); buf_flush_list_mutex_enter(buf_pool); fprintf(stderr, @@ -4146,7 +4380,9 @@ } } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); + mutex_exit(&buf_pool->free_list_mutex); for (i = 0; i < n_found; i++) { index = dict_index_get_if_in_cache(index_ids[i]); @@ -4203,7 +4439,7 @@ buf_chunk_t* chunk; ulint fixed_pages_number = 0; - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); chunk = buf_pool->chunks; @@ -4237,7 +4473,7 @@ /* 
Traverse the lists of clean and dirty compressed-only blocks. */ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(zip_list, b)) { ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE); @@ -4249,7 +4485,7 @@ buf_flush_list_mutex_enter(buf_pool); for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(flush_list, b)) { ut_ad(b->in_flush_list); switch (buf_page_get_state(b)) { @@ -4275,7 +4511,7 @@ buf_flush_list_mutex_exit(buf_pool); mutex_exit(&buf_pool->zip_mutex); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); return(fixed_pages_number); } @@ -4433,6 +4669,8 @@ /* Find appropriate pool_info to store stats for this buffer pool */ pool_info = &all_pool_info[pool_id]; + mutex_enter(&buf_pool->LRU_list_mutex); + mutex_enter(&buf_pool->free_list_mutex); buf_pool_mutex_enter(buf_pool); buf_flush_list_mutex_enter(buf_pool); @@ -4548,6 +4786,8 @@ pool_info->unzip_cur = buf_LRU_stat_cur.unzip; buf_refresh_io_stats(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); + mutex_exit(&buf_pool->free_list_mutex); buf_pool_mutex_exit(buf_pool); } @@ -4792,11 +5032,13 @@ { ulint len; - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->free_list_mutex); len = UT_LIST_GET_LEN(buf_pool->free); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->free_list_mutex); return(len); } --- a/storage/innobase/buf/buf0flu.c +++ b/storage/innobase/buf/buf0flu.c @@ -307,7 +307,7 @@ ut_d(block->page.in_flush_list = TRUE); block->page.oldest_modification = lsn; - UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page); + UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page); #ifdef UNIV_DEBUG_VALGRIND { @@ -401,14 +401,14 @@ > block->page.oldest_modification) { ut_ad(b->in_flush_list); prev_b = b; - b = UT_LIST_GET_NEXT(list, 
b); + b = UT_LIST_GET_NEXT(flush_list, b); } } if (prev_b == NULL) { - UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page); + UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page); } else { - UT_LIST_INSERT_AFTER(list, buf_pool->flush_list, + UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list, prev_b, &block->page); } @@ -434,7 +434,7 @@ //buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); //ut_ad(buf_pool_mutex_own(buf_pool)); #endif - //ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(mutex_own(buf_page_get_mutex(bpage))); //ut_ad(bpage->in_LRU_list); if (UNIV_LIKELY(bpage->in_LRU_list && buf_page_in_file(bpage))) { @@ -470,14 +470,14 @@ enum buf_flush flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ { #ifdef UNIV_DEBUG - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + //ut_ad(buf_pool_mutex_own(buf_pool)); #endif - ut_a(buf_page_in_file(bpage)); + //ut_a(buf_page_in_file(bpage)); ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(flush_type == BUF_FLUSH_LRU || BUF_FLUSH_LIST); - if (bpage->oldest_modification != 0 + if (buf_page_in_file(bpage) && bpage->oldest_modification != 0 && buf_page_get_io_fix(bpage) == BUF_IO_NONE) { ut_ad(bpage->in_flush_list); @@ -508,7 +508,7 @@ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(bpage->in_flush_list); @@ -526,13 +526,13 @@ return; case BUF_BLOCK_ZIP_DIRTY: buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE); - UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); + UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG buf_LRU_insert_zip_clean(bpage); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ break; case BUF_BLOCK_FILE_PAGE: - UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); + UT_LIST_REMOVE(flush_list, 
buf_pool->flush_list, bpage); break; } @@ -576,7 +576,7 @@ buf_page_t* prev_b = NULL; buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); /* Must reside in the same buffer pool. */ ut_ad(buf_pool == buf_pool_from_bpage(dpage)); @@ -605,18 +605,18 @@ because we assert on in_flush_list in comparison function. */ ut_d(bpage->in_flush_list = FALSE); - prev = UT_LIST_GET_PREV(list, bpage); - UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); + prev = UT_LIST_GET_PREV(flush_list, bpage); + UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage); if (prev) { ut_ad(prev->in_flush_list); UT_LIST_INSERT_AFTER( - list, + flush_list, buf_pool->flush_list, prev, dpage); } else { UT_LIST_ADD_FIRST( - list, + flush_list, buf_pool->flush_list, dpage); } @@ -1085,7 +1085,7 @@ #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(!buf_pool_mutex_own(buf_pool)); + //ut_ad(!buf_pool_mutex_own(buf_pool)); #endif #ifdef UNIV_LOG_DEBUG @@ -1099,7 +1099,8 @@ io_fixed and oldest_modification != 0. Thus, it cannot be relocated in the buffer pool or removed from flush_list or LRU_list. 
*/ - ut_ad(!buf_pool_mutex_own(buf_pool)); + //ut_ad(!buf_pool_mutex_own(buf_pool)); + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); ut_ad(!buf_flush_list_mutex_own(buf_pool)); ut_ad(!mutex_own(buf_page_get_mutex(bpage))); ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE); @@ -1179,7 +1180,7 @@ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ buf_block_t* block) /*!< in/out: buffer control block */ { - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(mutex_own(&block->mutex)); @@ -1187,8 +1188,11 @@ return(FALSE); } + buf_pool_mutex_enter(buf_pool); + if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0 || buf_pool->init_flush[BUF_FLUSH_LRU]) { + buf_pool_mutex_exit(buf_pool); /* There is already a flush batch of the same type running */ return(FALSE); } @@ -1262,12 +1266,18 @@ ibool is_uncompressed; ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_SHARED)); +#endif ut_ad(buf_page_in_file(bpage)); block_mutex = buf_page_get_mutex(bpage); ut_ad(mutex_own(block_mutex)); + buf_pool_mutex_enter(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); + ut_ad(buf_flush_ready_for_flush(bpage, flush_type)); buf_page_set_io_fix(bpage, BUF_IO_WRITE); @@ -1455,14 +1465,16 @@ buf_pool = buf_pool_get(space, i); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + rw_lock_s_lock(&buf_pool->page_hash_latch); /* We only want to flush pages from this buffer pool. 
*/ bpage = buf_page_hash_get(buf_pool, space, i); if (!bpage) { - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); if (srv_flush_neighbor_pages == 2) { /* This is contiguous neighbor page flush and @@ -1480,11 +1492,9 @@ if (flush_type != BUF_FLUSH_LRU || i == offset || buf_page_is_old(bpage)) { - mutex_t* block_mutex = buf_page_get_mutex(bpage); + mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); - mutex_enter(block_mutex); - - if (buf_flush_ready_for_flush(bpage, flush_type) + if (block_mutex && buf_flush_ready_for_flush(bpage, flush_type) && (i == offset || !bpage->buf_fix_count)) { /* We only try to flush those neighbors != offset where the buf fix @@ -1500,11 +1510,12 @@ ut_ad(!buf_pool_mutex_own(buf_pool)); count++; continue; - } else { + } else if (block_mutex) { mutex_exit(block_mutex); } } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); if (srv_flush_neighbor_pages == 2) { @@ -1553,21 +1564,25 @@ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); #endif /* UNIV_DEBUG */ - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(flush_type != BUF_FLUSH_LRU + || mutex_own(&buf_pool->LRU_list_mutex)); - block_mutex = buf_page_get_mutex(bpage); - mutex_enter(block_mutex); + block_mutex = buf_page_get_mutex_enter(bpage); - ut_a(buf_page_in_file(bpage)); + //ut_a(buf_page_in_file(bpage)); - if (buf_flush_ready_for_flush(bpage, flush_type)) { + if (block_mutex && buf_flush_ready_for_flush(bpage, flush_type)) { ulint space; ulint offset; buf_pool_t* buf_pool; buf_pool = buf_pool_from_bpage(bpage); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(&buf_pool->LRU_list_mutex); + } /* These fields are protected by both the buffer pool mutex and block mutex. 
*/ @@ -1583,13 +1598,18 @@ *count, n_to_flush); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + if (flush_type == BUF_FLUSH_LRU) { + mutex_enter(&buf_pool->LRU_list_mutex); + } flushed = TRUE; - } else { + } else if (block_mutex) { mutex_exit(block_mutex); } - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(flush_type != BUF_FLUSH_LRU + || mutex_own(&buf_pool->LRU_list_mutex)); return(flushed); } @@ -1610,7 +1630,8 @@ buf_page_t* bpage; ulint count = 0; - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); do { /* Start from the end of the list looking for a @@ -1632,7 +1653,8 @@ should be flushed, we factor in this value. */ buf_lru_flush_page_count += count; - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); return(count); } @@ -1660,9 +1682,10 @@ { ulint len; buf_page_t* bpage; + buf_page_t* prev_bpage = NULL; ulint count = 0; - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); /* If we have flushed enough, leave the loop */ do { @@ -1681,6 +1704,7 @@ if (bpage) { ut_a(bpage->oldest_modification > 0); + prev_bpage = UT_LIST_GET_PREV(flush_list, bpage); } if (!bpage || bpage->oldest_modification >= lsn_limit) { @@ -1722,9 +1746,17 @@ break; } - bpage = UT_LIST_GET_PREV(list, bpage); + bpage = UT_LIST_GET_PREV(flush_list, bpage); - ut_ad(!bpage || bpage->in_flush_list); + //ut_ad(!bpage || bpage->in_flush_list); + if (bpage != prev_bpage) { + /* the search might warp.. 
retrying */ + buf_flush_list_mutex_exit(buf_pool); + break; + } + if (bpage) { + prev_bpage = UT_LIST_GET_PREV(flush_list, bpage); + } buf_flush_list_mutex_exit(buf_pool); @@ -1733,7 +1765,7 @@ } while (count < min_n && bpage != NULL && len > 0); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); return(count); } @@ -1772,13 +1804,15 @@ || sync_thread_levels_empty_except_dict()); #endif /* UNIV_SYNC_DEBUG */ - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); /* Note: The buffer pool mutex is released and reacquired within the flush functions. */ switch(flush_type) { case BUF_FLUSH_LRU: + mutex_enter(&buf_pool->LRU_list_mutex); count = buf_flush_LRU_list_batch(buf_pool, min_n); + mutex_exit(&buf_pool->LRU_list_mutex); break; case BUF_FLUSH_LIST: count = buf_flush_flush_list_batch(buf_pool, min_n, lsn_limit); @@ -1787,7 +1821,7 @@ ut_error; } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); buf_flush_buffered_writes(); @@ -2059,7 +2093,7 @@ retry: //buf_pool_mutex_enter(buf_pool); if (have_LRU_mutex) - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); n_replaceable = UT_LIST_GET_LEN(buf_pool->free); @@ -2076,15 +2110,15 @@ bpage = UT_LIST_GET_LAST(buf_pool->LRU); continue; } - block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); + block_mutex = buf_page_get_mutex_enter(bpage); - if (buf_flush_ready_for_replace(bpage)) { + if (block_mutex && buf_flush_ready_for_replace(bpage)) { n_replaceable++; } - mutex_exit(block_mutex); + if (block_mutex) { + mutex_exit(block_mutex); + } distance++; @@ -2093,7 +2127,7 @@ //buf_pool_mutex_exit(buf_pool); if (have_LRU_mutex) - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)) { @@ -2292,7 +2326,7 @@ ut_ad(buf_flush_list_mutex_own(buf_pool)); - UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, + UT_LIST_VALIDATE(flush_list, buf_page_t, 
buf_pool->flush_list, ut_ad(ut_list_node_313->in_flush_list)); bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); @@ -2332,7 +2366,7 @@ rnode = rbt_next(buf_pool->flush_rbt, rnode); } - bpage = UT_LIST_GET_NEXT(list, bpage); + bpage = UT_LIST_GET_NEXT(flush_list, bpage); ut_a(!bpage || om >= bpage->oldest_modification); } --- a/storage/innobase/buf/buf0lru.c +++ b/storage/innobase/buf/buf0lru.c @@ -147,8 +147,9 @@ void buf_LRU_block_free_hashed_page( /*===========================*/ - buf_block_t* block); /*!< in: block, must contain a file page and + buf_block_t* block, /*!< in: block, must contain a file page and be in a state where it can be freed */ + ibool have_page_hash_mutex); /******************************************************************//** Determines if the unzip_LRU list should be used for evicting a victim @@ -158,15 +159,20 @@ ibool buf_LRU_evict_from_unzip_LRU( /*=========================*/ - buf_pool_t* buf_pool) + buf_pool_t* buf_pool, + ibool have_LRU_mutex) { ulint io_avg; ulint unzip_avg; - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + if (!have_LRU_mutex) + mutex_enter(&buf_pool->LRU_list_mutex); /* If the unzip_LRU list is empty, we can only use the LRU. */ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) { + if (!have_LRU_mutex) + mutex_exit(&buf_pool->LRU_list_mutex); return(FALSE); } @@ -175,14 +181,20 @@ decompressed pages in the buffer pool. */ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) <= UT_LIST_GET_LEN(buf_pool->LRU) / 10) { + if (!have_LRU_mutex) + mutex_exit(&buf_pool->LRU_list_mutex); return(FALSE); } /* If eviction hasn't started yet, we assume by default that a workload is disk bound. */ if (buf_pool->freed_page_clock == 0) { + if (!have_LRU_mutex) + mutex_exit(&buf_pool->LRU_list_mutex); return(TRUE); } + if (!have_LRU_mutex) + mutex_exit(&buf_pool->LRU_list_mutex); /* Calculate the average over past intervals, and add the values of the current interval. 
*/ @@ -250,18 +262,25 @@ page_arr = ut_malloc( sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); num_entries = 0; scan_again: bpage = UT_LIST_GET_LAST(buf_pool->LRU); while (bpage != NULL) { + /* bpage->state,space,io_fix,buf_fix_count are protected by block_mutex at XtraDB */ + mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); buf_page_t* prev_bpage; ibool is_fixed; prev_bpage = UT_LIST_GET_PREV(LRU, bpage); + if (UNIV_UNLIKELY(!block_mutex)) { + goto next_page; + } + ut_a(buf_page_in_file(bpage)); if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE @@ -270,24 +289,30 @@ /* Compressed pages are never hashed. Skip blocks of other tablespaces. Skip I/O-fixed blocks (to be dealt with later). */ + mutex_exit(block_mutex); next_page: bpage = prev_bpage; continue; } - mutex_enter(&((buf_block_t*) bpage)->mutex); + //mutex_enter(&((buf_block_t*) bpage)->mutex); is_fixed = bpage->buf_fix_count > 0 || !((buf_block_t*) bpage)->index; - mutex_exit(&((buf_block_t*) bpage)->mutex); + //mutex_exit(&((buf_block_t*) bpage)->mutex); if (is_fixed) { + mutex_exit(block_mutex); goto next_page; } /* Store the page number so that we can drop the hash index in a batch later. */ page_arr[num_entries] = bpage->offset; + + mutex_exit(block_mutex); + ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE); + ++num_entries; if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) { @@ -296,14 +321,16 @@ /* Array full. We release the buf_pool->mutex to obey the latching order. 
*/ - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); buf_LRU_drop_page_hash_batch( id, zip_size, page_arr, num_entries); num_entries = 0; - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); /* Note that we released the buf_pool mutex above after reading the prev_bpage during processing of a @@ -321,13 +348,23 @@ /* If, however, bpage has been removed from LRU list to the free list then we should restart the scan. bpage->state is protected by buf_pool mutex. */ + + /* obtain block_mutex again to avoid race condition of bpage->state */ + block_mutex = buf_page_get_mutex_enter(bpage); + if (!block_mutex) { + goto scan_again; + } + if (bpage && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { + mutex_exit(block_mutex); goto scan_again; } + mutex_exit(block_mutex); } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); /* Drop any remaining batch of search hashed pages. */ buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries); @@ -351,7 +388,9 @@ ulint i; scan_again: - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(&buf_pool->page_hash_latch); buf_flush_list_mutex_enter(buf_pool); all_freed = TRUE; @@ -364,7 +403,7 @@ ut_a(buf_page_in_file(bpage)); - prev_bpage = UT_LIST_GET_PREV(list, bpage); + prev_bpage = UT_LIST_GET_PREV(flush_list, bpage); /* bpage->space and bpage->io_fix are protected by buf_pool->mutex and block_mutex. It is safe to check @@ -388,8 +427,14 @@ will stay in the flush_list because buf_flush_remove() needs buf_pool->mutex as well. */ buf_flush_list_mutex_exit(buf_pool); - block_mutex = buf_page_get_mutex(bpage); - mutex_enter(block_mutex); + block_mutex = buf_page_get_mutex_enter(bpage); + + if (!block_mutex) { + /* It may be impossible case... 
+ Something wrong, so will be scan_again */ + all_freed = FALSE; + goto next_page; + } if (bpage->buf_fix_count > 0) { mutex_exit(block_mutex); @@ -440,9 +485,15 @@ mutex_exit(block_mutex); /* Now it is safe to release the buf_pool->mutex. */ - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); + os_thread_yield(); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(&buf_pool->page_hash_latch); + mutex_enter(block_mutex); buf_page_unset_sticky(bpage); @@ -454,7 +505,9 @@ i = 0; } - buf_pool_mutex_exit(buf_pool); +// buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); buf_flush_list_mutex_exit(buf_pool); ut_ad(buf_flush_validate(buf_pool)); @@ -504,7 +557,9 @@ buf_page_t* b; buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(mutex_own(&buf_pool->zip_mutex)); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE); /* Find the first successor of bpage in the LRU list @@ -512,17 +567,17 @@ b = bpage; do { b = UT_LIST_GET_NEXT(LRU, b); - } while (b && buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE); + } while (b && (buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE || !b->in_LRU_list)); /* Insert bpage before b, i.e., after the predecessor of b. 
*/ if (b) { - b = UT_LIST_GET_PREV(list, b); + b = UT_LIST_GET_PREV(zip_list, b); } if (b) { - UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, bpage); + UT_LIST_INSERT_AFTER(zip_list, buf_pool->zip_clean, b, bpage); } else { - UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, bpage); + UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, bpage); } } #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ @@ -536,18 +591,19 @@ buf_LRU_free_from_unzip_LRU_list( /*=============================*/ buf_pool_t* buf_pool, /*!< in: buffer pool instance */ - ulint n_iterations) /*!< in: how many times this has + ulint n_iterations, /*!< in: how many times this has been called repeatedly without result: a high value means that we should search farther; we will search n_iterations / 5 of the unzip_LRU list, or nothing if n_iterations >= 5 */ + ibool have_LRU_mutex) { buf_block_t* block; ulint distance; - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); /* Theoratically it should be much easier to find a victim from unzip_LRU as we can choose even a dirty block (as we'll @@ -557,7 +613,7 @@ if we have done five iterations so far. 
*/ if (UNIV_UNLIKELY(n_iterations >= 5) - || !buf_LRU_evict_from_unzip_LRU(buf_pool)) { + || !buf_LRU_evict_from_unzip_LRU(buf_pool, have_LRU_mutex)) { return(FALSE); } @@ -565,18 +621,25 @@ distance = 100 + (n_iterations * UT_LIST_GET_LEN(buf_pool->unzip_LRU)) / 5; +restart: for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU); UNIV_LIKELY(block != NULL) && UNIV_LIKELY(distance > 0); block = UT_LIST_GET_PREV(unzip_LRU, block), distance--) { ibool freed; + mutex_enter(&block->mutex); + if (!block->in_unzip_LRU_list || !block->page.in_LRU_list + || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { + mutex_exit(&block->mutex); + goto restart; + } + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->in_unzip_LRU_list); ut_ad(block->page.in_LRU_list); - mutex_enter(&block->mutex); - freed = buf_LRU_free_block(&block->page, FALSE); + freed = buf_LRU_free_block(&block->page, FALSE, have_LRU_mutex); mutex_exit(&block->mutex); if (freed) { @@ -595,35 +658,46 @@ buf_LRU_free_from_common_LRU_list( /*==============================*/ buf_pool_t* buf_pool, - ulint n_iterations) + ulint n_iterations, /*!< in: how many times this has been called repeatedly without result: a high value means that we should search farther; if n_iterations < 10, then we search n_iterations / 10 * buf_pool->curr_size pages from the end of the LRU list */ + ibool have_LRU_mutex) { buf_page_t* bpage; ulint distance; - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); distance = 100 + (n_iterations * buf_pool->curr_size) / 10; +restart: for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); UNIV_LIKELY(bpage != NULL) && UNIV_LIKELY(distance > 0); bpage = UT_LIST_GET_PREV(LRU, bpage), distance--) { ibool freed; unsigned accessed; - mutex_t* block_mutex = buf_page_get_mutex(bpage); + mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); + + if (!block_mutex) { + goto restart; + } + + if (!bpage->in_LRU_list + || !buf_page_in_file(bpage)) { + 
mutex_exit(block_mutex); + goto restart; + } ut_ad(buf_page_in_file(bpage)); ut_ad(bpage->in_LRU_list); - mutex_enter(block_mutex); accessed = buf_page_is_accessed(bpage); - freed = buf_LRU_free_block(bpage, TRUE); + freed = buf_LRU_free_block(bpage, TRUE, have_LRU_mutex); mutex_exit(block_mutex); if (freed) { @@ -660,16 +734,23 @@ n_iterations / 5 of the unzip_LRU list. */ { ibool freed = FALSE; + ibool have_LRU_mutex = FALSE; - buf_pool_mutex_enter(buf_pool); + if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)) + have_LRU_mutex = TRUE; + + //buf_pool_mutex_enter(buf_pool); + if (have_LRU_mutex) + mutex_enter(&buf_pool->LRU_list_mutex); - freed = buf_LRU_free_from_unzip_LRU_list(buf_pool, n_iterations); + freed = buf_LRU_free_from_unzip_LRU_list(buf_pool, n_iterations, have_LRU_mutex); if (!freed) { freed = buf_LRU_free_from_common_LRU_list( - buf_pool, n_iterations); + buf_pool, n_iterations, have_LRU_mutex); } + buf_pool_mutex_enter(buf_pool); if (!freed) { buf_pool->LRU_flush_ended = 0; } else if (buf_pool->LRU_flush_ended > 0) { @@ -677,6 +758,8 @@ } buf_pool_mutex_exit(buf_pool); + if (have_LRU_mutex) + mutex_exit(&buf_pool->LRU_list_mutex); return(freed); } @@ -737,7 +820,9 @@ buf_pool = buf_pool_from_array(i); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + mutex_enter(&buf_pool->free_list_mutex); if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) @@ -747,7 +832,9 @@ ret = TRUE; } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); + mutex_exit(&buf_pool->free_list_mutex); } return(ret); @@ -765,9 +852,10 @@ { buf_block_t* block; - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); - block = (buf_block_t*) UT_LIST_GET_FIRST(buf_pool->free); + mutex_enter(&buf_pool->free_list_mutex); + block = (buf_block_t*) UT_LIST_GET_LAST(buf_pool->free); if (block) { @@ -776,7 +864,9 @@ ut_ad(!block->page.in_flush_list); 
ut_ad(!block->page.in_LRU_list); ut_a(!buf_page_in_file(&block->page)); - UT_LIST_REMOVE(list, buf_pool->free, (&block->page)); + UT_LIST_REMOVE(free, buf_pool->free, (&block->page)); + + mutex_exit(&buf_pool->free_list_mutex); mutex_enter(&block->mutex); @@ -786,6 +876,8 @@ ut_ad(buf_pool_from_block(block) == buf_pool); mutex_exit(&block->mutex); + } else { + mutex_exit(&buf_pool->free_list_mutex); } return(block); @@ -808,7 +900,7 @@ ibool mon_value_was = FALSE; ibool started_monitor = FALSE; loop: - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) { @@ -876,7 +968,7 @@ /* If there is a block in the free list, take it */ block = buf_LRU_get_free_only(buf_pool); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); if (block) { ut_ad(buf_pool_from_block(block) == buf_pool); @@ -976,7 +1068,8 @@ ulint new_len; ut_a(buf_pool->LRU_old); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_ad(buf_pool->LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN); ut_ad(buf_pool->LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX); #if BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5) @@ -1042,7 +1135,8 @@ { buf_page_t* bpage; - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN); /* We first initialize all blocks in the LRU list as old and then use @@ -1077,13 +1171,14 @@ ut_ad(buf_pool); ut_ad(bpage); ut_ad(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); if (buf_page_belongs_to_unzip_LRU(bpage)) { buf_block_t* block = (buf_block_t*) bpage; ut_ad(block->in_unzip_LRU_list); - 
ut_d(block->in_unzip_LRU_list = FALSE); + block->in_unzip_LRU_list = FALSE; UT_LIST_REMOVE(unzip_LRU, buf_pool->unzip_LRU, block); } @@ -1101,7 +1196,8 @@ ut_ad(buf_pool); ut_ad(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_a(buf_page_in_file(bpage)); @@ -1178,12 +1274,13 @@ ut_ad(buf_pool); ut_ad(block); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); ut_ad(!block->in_unzip_LRU_list); - ut_d(block->in_unzip_LRU_list = TRUE); + block->in_unzip_LRU_list = TRUE; if (old) { UT_LIST_ADD_LAST(unzip_LRU, buf_pool->unzip_LRU, block); @@ -1204,7 +1301,8 @@ ut_ad(buf_pool); ut_ad(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_a(buf_page_in_file(bpage)); @@ -1255,7 +1353,8 @@ ut_ad(buf_pool); ut_ad(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_a(buf_page_in_file(bpage)); ut_ad(!bpage->in_LRU_list); @@ -1334,7 +1433,8 @@ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); if (bpage->old) { buf_pool->stat.n_pages_made_young++; @@ -1373,17 +1473,18 @@ buf_LRU_free_block( /*===============*/ buf_page_t* bpage, /*!< in: block to be freed */ - ibool zip) /*!< in: TRUE if should remove also the + ibool zip, /*!< in: TRUE if should remove also the compressed page of an uncompressed page */ + ibool have_LRU_mutex) { buf_page_t* b = NULL; buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); mutex_t* block_mutex = buf_page_get_mutex(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(mutex_own(block_mutex)); 
 	ut_ad(buf_page_in_file(bpage));
-	ut_ad(bpage->in_LRU_list);
+	//ut_ad(bpage->in_LRU_list);
 	ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
 #if UNIV_WORD_SIZE == 4
 	/* On 32-bit systems, there is no padding in buf_page_t.  On
@@ -1392,7 +1493,7 @@
 	UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
 #endif

-	if (!buf_page_can_relocate(bpage)) {
+	if (!bpage->in_LRU_list || !block_mutex || !buf_page_can_relocate(bpage)) {

 		/* Do not free buffer-fixed or I/O-fixed blocks. */
 		return(FALSE);
@@ -1426,7 +1527,7 @@
 alloc:
 		b = buf_page_alloc_descriptor();
 		ut_a(b);
-		memcpy(b, bpage, sizeof *b);
+		//memcpy(b, bpage, sizeof *b);
 	}

 #ifdef UNIV_DEBUG
@@ -1437,6 +1538,39 @@
 	}
 #endif /* UNIV_DEBUG */

+	/* not to break latch order, must re-enter block_mutex */
+	mutex_exit(block_mutex);
+
+	if (!have_LRU_mutex)
+		mutex_enter(&buf_pool->LRU_list_mutex); /* optimistic */
+	rw_lock_x_lock(&buf_pool->page_hash_latch);
+	mutex_enter(block_mutex);
+
+	/* recheck states of block */
+	if (!bpage->in_LRU_list || block_mutex != buf_page_get_mutex(bpage)
+	    || !buf_page_can_relocate(bpage)) {
+not_freed:
+		if (b) {
+			buf_page_free_descriptor(b);
+		}
+		if (!have_LRU_mutex)
+			mutex_exit(&buf_pool->LRU_list_mutex);
+		rw_lock_x_unlock(&buf_pool->page_hash_latch);
+		return(FALSE);
+	} else if (zip || !bpage->zip.data) {
+		if (bpage->oldest_modification)
+			goto not_freed;
+	} else if (bpage->oldest_modification) {
+		if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+			ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY);
+			goto not_freed;
+		}
+	}
+
+	if (b) {
+		memcpy(b, bpage, sizeof *b);
+	}
+
 	if (buf_LRU_block_remove_hashed_page(bpage, zip)
 	    != BUF_BLOCK_ZIP_FREE) {
 		ut_a(bpage->buf_fix_count == 0);
@@ -1453,6 +1587,10 @@

 		ut_a(!hash_b);

+		while (prev_b && !prev_b->in_LRU_list) {
+			prev_b = UT_LIST_GET_PREV(LRU, prev_b);
+		}
+
 		b->state = b->oldest_modification
 			? 
BUF_BLOCK_ZIP_DIRTY : BUF_BLOCK_ZIP_PAGE; @@ -1528,6 +1666,7 @@ buf_LRU_add_block_low(b, buf_page_is_old(b)); } + mutex_enter(&buf_pool->zip_mutex); if (b->state == BUF_BLOCK_ZIP_PAGE) { #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG buf_LRU_insert_zip_clean(b); @@ -1543,12 +1682,13 @@ /* Prevent buf_page_get_gen() from decompressing the block while we release buf_pool->mutex and block_mutex. */ - mutex_enter(&buf_pool->zip_mutex); buf_page_set_sticky(b); mutex_exit(&buf_pool->zip_mutex); } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); mutex_exit(block_mutex); /* Remove possible adaptive hash index on the page. @@ -1580,7 +1720,9 @@ : BUF_NO_CHECKSUM_MAGIC); } - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + if (have_LRU_mutex) + mutex_enter(&buf_pool->LRU_list_mutex); mutex_enter(block_mutex); if (b) { @@ -1589,13 +1731,17 @@ mutex_exit(&buf_pool->zip_mutex); } - buf_LRU_block_free_hashed_page((buf_block_t*) bpage); + buf_LRU_block_free_hashed_page((buf_block_t*) bpage, FALSE); } else { /* The block_mutex should have been released by buf_LRU_block_remove_hashed_page() when it returns BUF_BLOCK_ZIP_FREE. 
*/ ut_ad(block_mutex == &buf_pool->zip_mutex); mutex_enter(block_mutex); + + if (!have_LRU_mutex) + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); } return(TRUE); @@ -1607,13 +1753,14 @@ void buf_LRU_block_free_non_file_page( /*=============================*/ - buf_block_t* block) /*!< in: block, must not contain a file page */ + buf_block_t* block, /*!< in: block, must not contain a file page */ + ibool have_page_hash_mutex) { void* data; buf_pool_t* buf_pool = buf_pool_from_block(block); ut_ad(block); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(mutex_own(&block->mutex)); switch (buf_block_get_state(block)) { @@ -1647,18 +1794,21 @@ if (data) { block->page.zip.data = NULL; mutex_exit(&block->mutex); - buf_pool_mutex_exit_forbid(buf_pool); + //buf_pool_mutex_exit_forbid(buf_pool); buf_buddy_free( - buf_pool, data, page_zip_get_size(&block->page.zip)); + buf_pool, data, page_zip_get_size(&block->page.zip), + have_page_hash_mutex); - buf_pool_mutex_exit_allow(buf_pool); + //buf_pool_mutex_exit_allow(buf_pool); mutex_enter(&block->mutex); page_zip_set_size(&block->page.zip, 0); } - UT_LIST_ADD_FIRST(list, buf_pool->free, (&block->page)); + mutex_enter(&buf_pool->free_list_mutex); + UT_LIST_ADD_FIRST(free, buf_pool->free, (&block->page)); ut_d(block->page.in_free_list = TRUE); + mutex_exit(&buf_pool->free_list_mutex); UNIV_MEM_ASSERT_AND_FREE(block->frame, UNIV_PAGE_SIZE); } @@ -1688,7 +1838,11 @@ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); ut_ad(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX)); +#endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); @@ -1796,7 +1950,9 @@ #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG mutex_exit(buf_page_get_mutex(bpage)); - 
buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); buf_print(); buf_LRU_print(); buf_validate(); @@ -1818,17 +1974,17 @@ ut_a(buf_page_get_zip_size(bpage)); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage); + UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, bpage); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ mutex_exit(&buf_pool->zip_mutex); - buf_pool_mutex_exit_forbid(buf_pool); + //buf_pool_mutex_exit_forbid(buf_pool); buf_buddy_free( buf_pool, bpage->zip.data, - page_zip_get_size(&bpage->zip)); + page_zip_get_size(&bpage->zip), TRUE); - buf_pool_mutex_exit_allow(buf_pool); + //buf_pool_mutex_exit_allow(buf_pool); buf_page_free_descriptor(bpage); return(BUF_BLOCK_ZIP_FREE); @@ -1850,13 +2006,13 @@ ut_ad(!bpage->in_flush_list); ut_ad(!bpage->in_LRU_list); mutex_exit(&((buf_block_t*) bpage)->mutex); - buf_pool_mutex_exit_forbid(buf_pool); + //buf_pool_mutex_exit_forbid(buf_pool); buf_buddy_free( buf_pool, data, - page_zip_get_size(&bpage->zip)); + page_zip_get_size(&bpage->zip), TRUE); - buf_pool_mutex_exit_allow(buf_pool); + //buf_pool_mutex_exit_allow(buf_pool); mutex_enter(&((buf_block_t*) bpage)->mutex); page_zip_set_size(&bpage->zip, 0); } @@ -1882,18 +2038,19 @@ void buf_LRU_block_free_hashed_page( /*===========================*/ - buf_block_t* block) /*!< in: block, must contain a file page and + buf_block_t* block, /*!< in: block, must contain a file page and be in a state where it can be freed */ + ibool have_page_hash_mutex) { #ifdef UNIV_DEBUG - buf_pool_t* buf_pool = buf_pool_from_block(block); - ut_ad(buf_pool_mutex_own(buf_pool)); + //buf_pool_t* buf_pool = buf_pool_from_block(block); + //ut_ad(buf_pool_mutex_own(buf_pool)); #endif ut_ad(mutex_own(&block->mutex)); buf_block_set_state(block, BUF_BLOCK_MEMORY); - buf_LRU_block_free_non_file_page(block); + buf_LRU_block_free_non_file_page(block, 
have_page_hash_mutex); } /******************************************************************//** @@ -1908,7 +2065,7 @@ { if (buf_LRU_block_remove_hashed_page(bpage, TRUE) != BUF_BLOCK_ZIP_FREE) { - buf_LRU_block_free_hashed_page((buf_block_t*) bpage); + buf_LRU_block_free_hashed_page((buf_block_t*) bpage, TRUE); } } @@ -1936,7 +2093,8 @@ } if (adjust) { - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); if (ratio != buf_pool->LRU_old_ratio) { buf_pool->LRU_old_ratio = ratio; @@ -1948,7 +2106,8 @@ } } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } else { buf_pool->LRU_old_ratio = ratio; } @@ -2053,7 +2212,8 @@ ulint new_len; ut_ad(buf_pool); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { @@ -2114,16 +2274,22 @@ ut_a(buf_pool->LRU_old_len == old_len); - UT_LIST_VALIDATE(list, buf_page_t, buf_pool->free, + mutex_exit(&buf_pool->LRU_list_mutex); + mutex_enter(&buf_pool->free_list_mutex); + + UT_LIST_VALIDATE(free, buf_page_t, buf_pool->free, ut_ad(ut_list_node_313->in_free_list)); for (bpage = UT_LIST_GET_FIRST(buf_pool->free); bpage != NULL; - bpage = UT_LIST_GET_NEXT(list, bpage)) { + bpage = UT_LIST_GET_NEXT(free, bpage)) { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED); } + mutex_exit(&buf_pool->free_list_mutex); + mutex_enter(&buf_pool->LRU_list_mutex); + UT_LIST_VALIDATE(unzip_LRU, buf_block_t, buf_pool->unzip_LRU, ut_ad(ut_list_node_313->in_unzip_LRU_list && ut_list_node_313->page.in_LRU_list)); @@ -2137,7 +2303,8 @@ ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } /**********************************************************************//** @@ -2173,7 +2340,8 @@ const buf_page_t* bpage; 
ut_ad(buf_pool); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); bpage = UT_LIST_GET_FIRST(buf_pool->LRU); @@ -2230,7 +2398,8 @@ bpage = UT_LIST_GET_NEXT(LRU, bpage); } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } /**********************************************************************//** --- a/storage/innobase/buf/buf0rea.c +++ b/storage/innobase/buf/buf0rea.c @@ -478,6 +478,7 @@ return(0); } + buf_pool_mutex_exit(buf_pool); /* Check that almost all pages in the area have been accessed; if offset == low, the accesses must be in a descending order, otherwise, @@ -496,6 +497,7 @@ fail_count = 0; + rw_lock_s_lock(&buf_pool->page_hash_latch); for (i = low; i < high; i++) { bpage = buf_page_hash_get(buf_pool, space, i); @@ -523,7 +525,8 @@ if (fail_count > threshold) { /* Too many failures: return */ - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); return(0); } @@ -538,7 +541,8 @@ bpage = buf_page_hash_get(buf_pool, space, offset); if (bpage == NULL) { - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); return(0); } @@ -564,7 +568,8 @@ pred_offset = fil_page_get_prev(frame); succ_offset = fil_page_get_next(frame); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); if ((offset == low) && (succ_offset == offset + 1)) { --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -265,6 +265,10 @@ # endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */ {&buf_pool_mutex_key, "buf_pool_mutex", 0}, {&buf_pool_zip_mutex_key, "buf_pool_zip_mutex", 0}, + {&buf_pool_LRU_list_mutex_key, "buf_pool_LRU_list_mutex", 0}, + {&buf_pool_free_list_mutex_key, "buf_pool_free_list_mutex", 0}, + {&buf_pool_zip_free_mutex_key, "buf_pool_zip_free_mutex", 
0}, + {&buf_pool_zip_hash_mutex_key, "buf_pool_zip_hash_mutex", 0}, {&cache_last_read_mutex_key, "cache_last_read_mutex", 0}, {&dict_foreign_err_mutex_key, "dict_foreign_err_mutex", 0}, {&dict_sys_mutex_key, "dict_sys_mutex", 0}, @@ -314,6 +318,7 @@ {&archive_lock_key, "archive_lock", 0}, # endif /* UNIV_LOG_ARCHIVE */ {&btr_search_latch_key, "btr_search_latch", 0}, + {&buf_pool_page_hash_key, "buf_pool_page_hash_latch", 0}, # ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK {&buf_block_lock_key, "buf_block_lock", 0}, # endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */ --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -1583,7 +1583,8 @@ buf_pool = buf_pool_from_array(i); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->zip_free_mutex); for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) { buf_buddy_stat_t* buddy_stat; @@ -1613,7 +1614,8 @@ } } - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->zip_free_mutex); if (status) { break; --- a/storage/innobase/ibuf/ibuf0ibuf.c +++ b/storage/innobase/ibuf/ibuf0ibuf.c @@ -3760,9 +3760,11 @@ ulint fold = buf_page_address_fold(space, page_no); buf_pool_t* buf_pool = buf_pool_get(space, page_no); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + rw_lock_s_lock(&buf_pool->page_hash_latch); bpage = buf_page_hash_get_low(buf_pool, space, page_no, fold); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); if (UNIV_LIKELY_NULL(bpage)) { /* A buffer pool watch has been set or the --- a/storage/innobase/include/buf0buddy.h +++ b/storage/innobase/include/buf0buddy.h @@ -49,11 +49,12 @@ ulint size, /*!< in: compressed page size (between PAGE_ZIP_MIN_SIZE and UNIV_PAGE_SIZE) */ - ibool* lru) /*!< in: pointer to a variable + ibool* lru, /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list and buf_pool->mutex was 
temporarily released */ + ibool have_page_hash_mutex) __attribute__((malloc, nonnull)); /**********************************************************************//** @@ -66,8 +67,9 @@ the block resides */ void* buf, /*!< in: block to be freed, must not be pointed to by the buffer pool */ - ulint size) /*!< in: block size, + ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */ + ibool have_page_hash_mutex) __attribute__((nonnull)); #ifndef UNIV_NONINL --- a/storage/innobase/include/buf0buddy.ic +++ b/storage/innobase/include/buf0buddy.ic @@ -45,11 +45,12 @@ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ ulint i, /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ - ibool* lru) /*!< in: pointer to a variable that + ibool* lru, /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list and buf_pool->mutex was temporarily released */ + ibool have_page_hash_mutex) __attribute__((malloc, nonnull)); /**********************************************************************//** @@ -61,8 +62,9 @@ buf_pool_t* buf_pool, /*!< in: buffer pool instance */ void* buf, /*!< in: block to be freed, must not be pointed to by the buffer pool */ - ulint i) /*!< in: index of buf_pool->zip_free[], + ulint i, /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ + ibool have_page_hash_mutex) __attribute__((nonnull)); /**********************************************************************//** @@ -101,19 +103,20 @@ ulint size, /*!< in: compressed page size (between PAGE_ZIP_MIN_SIZE and UNIV_PAGE_SIZE) */ - ibool* lru) /*!< in: pointer to a variable + ibool* lru, /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list and buf_pool->mutex was temporarily released */ + ibool have_page_hash_mutex) { - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(ut_is_2pow(size)); ut_ad(size >= PAGE_ZIP_MIN_SIZE); ut_ad(size <= UNIV_PAGE_SIZE); return((byte*) 
buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size), - lru)); + lru, have_page_hash_mutex)); } /**********************************************************************//** @@ -126,15 +129,28 @@ the block resides */ void* buf, /*!< in: block to be freed, must not be pointed to by the buffer pool */ - ulint size) /*!< in: block size, + ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */ + ibool have_page_hash_mutex) { - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(ut_is_2pow(size)); ut_ad(size >= PAGE_ZIP_MIN_SIZE); ut_ad(size <= UNIV_PAGE_SIZE); - buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size)); + if (!have_page_hash_mutex) { + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(&buf_pool->page_hash_latch); + } + + mutex_enter(&buf_pool->zip_free_mutex); + buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size), TRUE); + mutex_exit(&buf_pool->zip_free_mutex); + + if (!have_page_hash_mutex) { + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(&buf_pool->page_hash_latch); + } } #ifdef UNIV_MATERIALIZE --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -212,6 +212,20 @@ /*==========================*/ /********************************************************************//** +*/ +UNIV_INLINE +void +buf_pool_page_hash_x_lock_all(void); +/*================================*/ + +/********************************************************************//** +*/ +UNIV_INLINE +void +buf_pool_page_hash_x_unlock_all(void); +/*==================================*/ + +/********************************************************************//** Creates the buffer pool. 
@return own: buf_pool object, NULL if not enough memory or error */ UNIV_INTERN @@ -851,6 +865,15 @@ const buf_page_t* bpage) /*!< in: pointer to control block */ __attribute__((pure)); +/************************************************************************* +Gets the mutex of a block and enter the mutex with consistency. */ +UNIV_INLINE +mutex_t* +buf_page_get_mutex_enter( +/*=========================*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ + __attribute__((pure)); + /*********************************************************************//** Get the flush type of a page. @return flush type */ @@ -1352,7 +1375,7 @@ All these are protected by buf_pool->mutex. */ /* @{ */ - UT_LIST_NODE_T(buf_page_t) list; + /* UT_LIST_NODE_T(buf_page_t) list; */ /*!< based on state, this is a list node, protected either by buf_pool->mutex or by @@ -1380,6 +1403,10 @@ BUF_BLOCK_REMOVE_HASH or BUF_BLOCK_READY_IN_USE. */ + /* resplit for optimistic use */ + UT_LIST_NODE_T(buf_page_t) free; + UT_LIST_NODE_T(buf_page_t) flush_list; + UT_LIST_NODE_T(buf_page_t) zip_list; /* zip_clean or zip_free[] */ #ifdef UNIV_DEBUG ibool in_flush_list; /*!< TRUE if in buf_pool->flush_list; when buf_pool->flush_list_mutex is @@ -1472,11 +1499,11 @@ a block is in the unzip_LRU list if page.state == BUF_BLOCK_FILE_PAGE and page.zip.data != NULL */ -#ifdef UNIV_DEBUG +//#ifdef UNIV_DEBUG ibool in_unzip_LRU_list;/*!< TRUE if the page is in the decompressed LRU list; used in debugging */ -#endif /* UNIV_DEBUG */ +//#endif /* UNIV_DEBUG */ mutex_t mutex; /*!< mutex protecting this block: state (also protected by the buffer pool mutex), io_fix, buf_fix_count, @@ -1656,6 +1683,11 @@ pool instance, protects compressed only pages (of type buf_page_t, not buf_block_t */ + mutex_t LRU_list_mutex; + rw_lock_t page_hash_latch; + mutex_t free_list_mutex; + mutex_t zip_free_mutex; + mutex_t zip_hash_mutex; ulint instance_no; /*!< Array index of this buffer pool instance */ ulint 
old_pool_size; /*!< Old pool size in bytes */ @@ -1809,8 +1841,8 @@ /** Test if a buffer pool mutex is owned. */ #define buf_pool_mutex_own(b) mutex_own(&b->mutex) /** Acquire a buffer pool mutex. */ +/* the buf_pool_mutex is changed the latch order */ #define buf_pool_mutex_enter(b) do { \ - ut_ad(!mutex_own(&b->zip_mutex)); \ mutex_enter(&b->mutex); \ } while (0) --- a/storage/innobase/include/buf0buf.ic +++ b/storage/innobase/include/buf0buf.ic @@ -292,7 +292,7 @@ case BUF_BLOCK_ZIP_FREE: /* This is a free page in buf_pool->zip_free[]. Such pages should only be accessed by the buddy allocator. */ - ut_error; + /* ut_error; */ /* optimistic */ break; case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: @@ -335,9 +335,16 @@ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + if (/*equivalent to buf_pool_watch_is_sentinel(buf_pool, bpage)*/ + bpage >= &buf_pool->watch[0] + && bpage < &buf_pool->watch[BUF_POOL_WATCH_SIZE]) { + /* TODO: this code is the interim. should be confirmed later. */ + return(&buf_pool->zip_mutex); + } + switch (buf_page_get_state(bpage)) { case BUF_BLOCK_ZIP_FREE: - ut_error; + /* ut_error; */ /* optimistic */ return(NULL); case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: @@ -347,6 +354,28 @@ } } +/************************************************************************* +Gets the mutex of a block and enter the mutex with consistency. */ +UNIV_INLINE +mutex_t* +buf_page_get_mutex_enter( +/*=========================*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ +{ + mutex_t* block_mutex; + + while(1) { + block_mutex = buf_page_get_mutex(bpage); + if (!block_mutex) + return block_mutex; + + mutex_enter(block_mutex); + if (block_mutex == buf_page_get_mutex(bpage)) + return block_mutex; + mutex_exit(block_mutex); + } +} + /*********************************************************************//** Get the flush type of a page. 
@return flush type */ @@ -444,8 +473,8 @@ enum buf_io_fix io_fix) /*!< in: io_fix state */ { #ifdef UNIV_DEBUG - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + //ut_ad(buf_pool_mutex_own(buf_pool)); #endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); @@ -482,7 +511,7 @@ { #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); #endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE); @@ -500,7 +529,7 @@ { #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); #endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN); @@ -518,14 +547,14 @@ const buf_page_t* bpage) /*!< control block being relocated */ { #ifdef UNIV_DEBUG - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + //ut_ad(buf_pool_mutex_own(buf_pool)); #endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(buf_page_in_file(bpage)); - ut_ad(bpage->in_LRU_list); + //ut_ad(bpage->in_LRU_list); - return(buf_page_get_io_fix(bpage) == BUF_IO_NONE + return(bpage->in_LRU_list && bpage->io_fix == BUF_IO_NONE && bpage->buf_fix_count == 0); } @@ -539,8 +568,8 @@ const buf_page_t* bpage) /*!< in: control block */ { #ifdef UNIV_DEBUG - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + //ut_ad(buf_pool_mutex_own(buf_pool)); #endif ut_ad(buf_page_in_file(bpage)); @@ -560,7 +589,8 @@ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); #endif /* UNIV_DEBUG */ ut_a(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own(buf_pool)); + 
//ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_ad(bpage->in_LRU_list); #ifdef UNIV_LRU_DEBUG @@ -607,9 +637,10 @@ ulint time_ms) /*!< in: ut_time_ms() */ { #ifdef UNIV_DEBUG - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + //buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + //ut_ad(buf_pool_mutex_own(buf_pool)); #endif + ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_a(buf_page_in_file(bpage)); if (!bpage->access_time) { @@ -852,19 +883,19 @@ /*===========*/ buf_block_t* block) /*!< in, own: block to be freed */ { - buf_pool_t* buf_pool = buf_pool_from_bpage((buf_page_t*)block); + //buf_pool_t* buf_pool = buf_pool_from_bpage((buf_page_t*)block); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); mutex_enter(&block->mutex); ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); - buf_LRU_block_free_non_file_page(block); + buf_LRU_block_free_non_file_page(block, FALSE); mutex_exit(&block->mutex); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); } #endif /* !UNIV_HOTBACKUP */ @@ -912,17 +943,17 @@ page frame */ { ib_uint64_t lsn; - mutex_t* block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); + mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); - if (buf_page_in_file(bpage)) { + if (block_mutex && buf_page_in_file(bpage)) { lsn = bpage->newest_modification; } else { lsn = 0; } - mutex_exit(block_mutex); + if (block_mutex) { + mutex_exit(block_mutex); + } return(lsn); } @@ -940,7 +971,7 @@ #ifdef UNIV_SYNC_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage((buf_page_t*)block); - ut_ad((buf_pool_mutex_own(buf_pool) + ut_ad((mutex_own(&buf_pool->LRU_list_mutex) && (block->page.buf_fix_count == 0)) || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); #endif /* UNIV_SYNC_DEBUG */ @@ -1070,7 +1101,11 @@ buf_page_t* bpage; ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own(buf_pool)); + //ut_ad(buf_pool_mutex_own(buf_pool)); +#ifdef 
UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX) + || rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_SHARED)); +#endif ut_ad(fold == buf_page_address_fold(space, offset)); /* Look for the page in the hash table */ @@ -1155,11 +1190,13 @@ const buf_page_t* bpage; buf_pool_t* buf_pool = buf_pool_get(space, offset); - buf_pool_mutex_enter(buf_pool); + //buf_pool_mutex_enter(buf_pool); + rw_lock_s_lock(&buf_pool->page_hash_latch); bpage = buf_page_hash_get(buf_pool, space, offset); - buf_pool_mutex_exit(buf_pool); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); return(bpage != NULL); } @@ -1287,4 +1324,38 @@ buf_pool_mutex_exit(buf_pool); } } + +/********************************************************************//** +Acquire the page_hash latch of all buffer pool instances in X mode. */ +UNIV_INLINE +void +buf_pool_page_hash_x_lock_all(void) +/*===============================*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + rw_lock_x_lock(&buf_pool->page_hash_latch); + } +} + +/********************************************************************//** +Release the page_hash latch of all buffer pool instances. */ +UNIV_INLINE +void +buf_pool_page_hash_x_unlock_all(void) +/*=================================*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + rw_lock_x_unlock(&buf_pool->page_hash_latch); + } +} #endif /* !UNIV_HOTBACKUP */ --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -100,8 +100,9 @@ buf_LRU_free_block( /*===============*/ buf_page_t* bpage, /*!< in: block to be freed */ - ibool zip) /*!< in: TRUE if should remove also the + ibool zip, /*!< in: TRUE if should remove also the compressed page of an uncompressed page */ + ibool have_LRU_mutex) /*!< in: TRUE if the caller holds buf_pool->LRU_list_mutex */ __attribute__((nonnull)); /******************************************************************//** Try to free a replaceable block.
@@ -148,7 +149,8 @@ void buf_LRU_block_free_non_file_page( /*=============================*/ - buf_block_t* block); /*!< in: block, must not contain a file page */ + buf_block_t* block, /*!< in: block, must not contain a file page */ + ibool have_page_hash_mutex); /******************************************************************//** Adds a block to the LRU list. */ UNIV_INTERN --- a/storage/innobase/include/sync0rw.h +++ b/storage/innobase/include/sync0rw.h @@ -112,6 +112,7 @@ extern mysql_pfs_key_t archive_lock_key; # endif /* UNIV_LOG_ARCHIVE */ extern mysql_pfs_key_t btr_search_latch_key; +extern mysql_pfs_key_t buf_pool_page_hash_key; extern mysql_pfs_key_t buf_block_lock_key; # ifdef UNIV_SYNC_DEBUG extern mysql_pfs_key_t buf_block_debug_latch_key; --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -75,6 +75,10 @@ extern mysql_pfs_key_t buffer_block_mutex_key; extern mysql_pfs_key_t buf_pool_mutex_key; extern mysql_pfs_key_t buf_pool_zip_mutex_key; +extern mysql_pfs_key_t buf_pool_LRU_list_mutex_key; +extern mysql_pfs_key_t buf_pool_free_list_mutex_key; +extern mysql_pfs_key_t buf_pool_zip_free_mutex_key; +extern mysql_pfs_key_t buf_pool_zip_hash_mutex_key; extern mysql_pfs_key_t cache_last_read_mutex_key; extern mysql_pfs_key_t dict_foreign_err_mutex_key; extern mysql_pfs_key_t dict_sys_mutex_key; @@ -667,7 +671,7 @@ #define SYNC_TRX_SYS_HEADER 290 #define SYNC_PURGE_QUEUE 200 #define SYNC_LOG 170 -#define SYNC_LOG_FLUSH_ORDER 147 +#define SYNC_LOG_FLUSH_ORDER 156 #define SYNC_RECV 168 #define SYNC_WORK_QUEUE 162 #define SYNC_SEARCH_SYS 160 /* NOTE that if we have a memory @@ -676,8 +680,13 @@ SYNC_SEARCH_SYS, as memory allocation can call routines there! Otherwise the level is SYNC_MEM_HASH. 
*/ +#define SYNC_BUF_LRU_LIST 158 +#define SYNC_BUF_PAGE_HASH 157 +#define SYNC_BUF_BLOCK 155 /* Block mutex */ +#define SYNC_BUF_FREE_LIST 153 +#define SYNC_BUF_ZIP_FREE 152 +#define SYNC_BUF_ZIP_HASH 151 #define SYNC_BUF_POOL 150 /* Buffer pool mutex */ -#define SYNC_BUF_BLOCK 146 /* Block mutex */ #define SYNC_BUF_FLUSH_LIST 145 /* Buffer flush list mutex */ #define SYNC_DOUBLEWRITE 140 #define SYNC_ANY_LATCH 135 @@ -708,7 +717,7 @@ os_fast_mutex; /*!< We use this OS mutex in place of lock_word when atomic operations are not enabled */ #endif - ulint waiters; /*!< This ulint is set to 1 if there are (or + volatile ulint waiters; /*!< This ulint is set to 1 if there are (or may be) threads waiting in the global wait array for this mutex to be released. Otherwise, this is 0. */ --- a/storage/innobase/srv/srv0srv.c +++ b/storage/innobase/srv/srv0srv.c @@ -3105,7 +3105,7 @@ level += log_sys->max_checkpoint_age - (lsn - oldest_modification); } - bpage = UT_LIST_GET_NEXT(list, bpage); + bpage = UT_LIST_GET_NEXT(flush_list, bpage); n_blocks++; } @@ -3191,7 +3191,7 @@ found = TRUE; break; } - bpage = UT_LIST_GET_NEXT(list, bpage); + bpage = UT_LIST_GET_NEXT(flush_list, bpage); new_blocks_num++; } if (!found) { --- a/storage/innobase/sync/sync0sync.c +++ b/storage/innobase/sync/sync0sync.c @@ -285,7 +285,7 @@ mutex->lock_word = 0; #endif mutex->event = os_event_create(NULL); - mutex_set_waiters(mutex, 0); + mutex->waiters = 0; #ifdef UNIV_DEBUG mutex->magic_n = MUTEX_MAGIC_N; #endif /* UNIV_DEBUG */ @@ -464,6 +464,15 @@ mutex_t* mutex, /*!< in: mutex */ ulint n) /*!< in: value to set */ { +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + ut_ad(mutex); + + if (n) { + os_compare_and_swap_ulint(&mutex->waiters, 0, 1); + } else { + os_compare_and_swap_ulint(&mutex->waiters, 1, 0); + } +#else volatile ulint* ptr; /* declared volatile to ensure that the value is stored to memory */ ut_ad(mutex); @@ -472,6 +481,7 @@ *ptr = n; /* Here we assume that the write of a single word in memory is 
atomic */ +#endif } /******************************************************************//** @@ -1233,7 +1243,12 @@ ut_error; } break; + case SYNC_BUF_LRU_LIST: case SYNC_BUF_FLUSH_LIST: + case SYNC_BUF_PAGE_HASH: + case SYNC_BUF_FREE_LIST: + case SYNC_BUF_ZIP_FREE: + case SYNC_BUF_ZIP_HASH: case SYNC_BUF_POOL: /* We can have multiple mutexes of this type therefore we can only check whether the greater than condition holds. */ @@ -1251,7 +1266,8 @@ buffer block (block->mutex or buf_pool->zip_mutex). */ if (!sync_thread_levels_g(array, level, FALSE)) { ut_a(sync_thread_levels_g(array, level - 1, TRUE)); - ut_a(sync_thread_levels_contain(array, SYNC_BUF_POOL)); + /* the exact rule is not fixed yet, for now */ + //ut_a(sync_thread_levels_contain(array, SYNC_BUF_LRU_LIST)); } break; case SYNC_REC_LOCK: