diff -r 2e0c46e78b50 innobase/buf/buf0buf.c
--- a/innobase/buf/buf0buf.c Mon Dec 22 00:33:53 2008 -0800
+++ b/innobase/buf/buf0buf.c Mon Dec 22 00:33:59 2008 -0800
@@ -548,6 +548,19 @@
 mutex_create(&(buf_pool->mutex));
 mutex_set_level(&(buf_pool->mutex), SYNC_BUF_POOL);
+ mutex_create(&(buf_pool->flush_list_mutex));
+ mutex_create(&(buf_pool->LRU_mutex));
+ mutex_create(&(buf_pool->free_mutex));
+ rw_lock_create(&(buf_pool->hash_latch));
+ mutex_set_level(&(buf_pool->flush_list_mutex), SYNC_NO_ORDER_CHECK);
+ mutex_set_level(&(buf_pool->LRU_mutex), SYNC_NO_ORDER_CHECK);
+ mutex_set_level(&(buf_pool->free_mutex), SYNC_NO_ORDER_CHECK);
+ rw_lock_set_level(&(buf_pool->hash_latch), SYNC_NO_ORDER_CHECK);
+
+ mutex_enter(&(buf_pool->LRU_mutex));
+ mutex_enter(&(buf_pool->flush_list_mutex));
+ mutex_enter(&(buf_pool->free_mutex));
+ rw_lock_x_lock(&(buf_pool->hash_latch));
 mutex_enter(&(buf_pool->mutex));
 if (srv_use_awe) {
@@ -723,6 +736,10 @@
 block->in_free_list = TRUE;
 }
+ mutex_exit(&(buf_pool->LRU_mutex));
+ mutex_exit(&(buf_pool->flush_list_mutex));
+ mutex_exit(&(buf_pool->free_mutex));
+ rw_lock_x_unlock(&(buf_pool->hash_latch));
 mutex_exit(&(buf_pool->mutex));
 if (srv_use_adaptive_hash_indexes) {
@@ -859,12 +876,12 @@
 if (buf_pool->freed_page_clock >= block->freed_page_clock + 1 + (buf_pool->curr_size / 4)) {
- mutex_enter(&buf_pool->mutex);
+ mutex_enter(&(buf_pool->LRU_mutex));
 /* There has been freeing activity in the LRU list: best to move to the head of the LRU list */
 buf_LRU_make_block_young(block);
- mutex_exit(&buf_pool->mutex);
+ mutex_exit(&(buf_pool->LRU_mutex));
 }
 }
@@ -880,7 +897,7 @@
 {
 buf_block_t* block;
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->LRU_mutex));
 block = buf_block_align(frame);
@@ -888,7 +905,7 @@
 buf_LRU_make_block_young(block);
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
 }
 /************************************************************************
@@ -899,7 +916,7 @@
 /*===========*/
 buf_block_t* block) /* in, own: block to be freed */
 {
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->free_mutex));
 mutex_enter(&block->mutex);
@@ -909,7 +926,7 @@
 mutex_exit(&block->mutex);
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->free_mutex));
 }
 /*************************************************************************
@@ -950,11 +967,11 @@
 {
 buf_block_t* block;
- mutex_enter_fast(&(buf_pool->mutex));
+ rw_lock_s_lock(&(buf_pool->hash_latch));
 block = buf_page_hash_get(space, offset);
- mutex_exit(&(buf_pool->mutex));
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 return(block);
 }
@@ -971,7 +988,7 @@
 {
 buf_block_t* block;
- mutex_enter_fast(&(buf_pool->mutex));
+ rw_lock_s_lock(&(buf_pool->hash_latch));
 block = buf_page_hash_get(space, offset);
@@ -979,7 +996,7 @@
 block->check_index_page_at_flush = FALSE;
 }
- mutex_exit(&(buf_pool->mutex));
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 }
 /************************************************************************
@@ -998,7 +1015,7 @@
 buf_block_t* block;
 ibool is_hashed;
- mutex_enter_fast(&(buf_pool->mutex));
+ rw_lock_s_lock(&(buf_pool->hash_latch));
 block = buf_page_hash_get(space, offset);
@@ -1008,7 +1025,7 @@
 is_hashed = block->is_hashed;
 }
- mutex_exit(&(buf_pool->mutex));
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 return(is_hashed);
 }
@@ -1050,7 +1067,7 @@
 {
 buf_block_t* block;
- mutex_enter_fast(&(buf_pool->mutex));
+ rw_lock_s_lock(&(buf_pool->hash_latch));
 block = buf_page_hash_get(space, offset);
@@ -1058,7 +1075,7 @@
 block->file_page_was_freed = TRUE;
 }
- mutex_exit(&(buf_pool->mutex));
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 return(block);
 }
@@ -1079,7 +1096,7 @@
 {
 buf_block_t* block;
- mutex_enter_fast(&(buf_pool->mutex));
+ rw_lock_s_lock(&(buf_pool->hash_latch));
 block = buf_page_hash_get(space, offset);
@@ -1087,7 +1104,7 @@
 block->file_page_was_freed = FALSE;
 }
- mutex_exit(&(buf_pool->mutex));
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 return(block);
 }
@@ -1166,26 +1183,33 @@
 buf_pool->n_page_gets++;
 loop:
 block = NULL;
- mutex_enter_fast(&(buf_pool->mutex));
+ // mutex_enter_fast(&(buf_pool->mutex));
 if (guess) {
 block = buf_block_align(guess);
+ mutex_enter(&block->mutex);
 if ((offset != block->offset) || (space != block->space) || (block->state != BUF_BLOCK_FILE_PAGE)) {
+ mutex_exit(&block->mutex);
 block = NULL;
 }
 }
 if (block == NULL) {
+ rw_lock_s_lock(&(buf_pool->hash_latch));
 block = buf_page_hash_get(space, offset);
+ if(block) {
+ mutex_enter(&block->mutex);
+ }
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 }
 if (block == NULL) {
 /* Page not in buf_pool: needs to be read from file */
- mutex_exit(&(buf_pool->mutex));
+ // mutex_exit(&(buf_pool->mutex));
 if (mode == BUF_GET_IF_IN_POOL) {
@@ -1204,7 +1228,7 @@
 goto loop;
 }
- mutex_enter(&block->mutex);
+ // mutex_enter(&block->mutex);
 ut_a(block->state == BUF_BLOCK_FILE_PAGE);
@@ -1216,7 +1240,7 @@
 if (mode == BUF_GET_IF_IN_POOL) {
 /* The page is only being read to buffer */
- mutex_exit(&buf_pool->mutex);
+ // mutex_exit(&buf_pool->mutex);
 mutex_exit(&block->mutex);
 return(NULL);
@@ -1233,7 +1257,9 @@
 LRU list and we must put it to awe_LRU_free_mapped list once mapped to a frame */
+ mutex_enter_fast(&(buf_pool->mutex));
 buf_awe_map_page_to_frame(block, TRUE);
+ mutex_exit(&buf_pool->mutex);
 }
 #ifdef UNIV_SYNC_DEBUG
@@ -1241,7 +1267,7 @@
 #else
 buf_block_buf_fix_inc(block);
 #endif
- mutex_exit(&buf_pool->mutex);
+ // mutex_exit(&buf_pool->mutex);
 /* Check if this is the first access to the page */
@@ -1791,7 +1817,8 @@
 ut_a(block);
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->LRU_mutex));
+ rw_lock_x_lock(&(buf_pool->hash_latch));
 mutex_enter(&block->mutex);
 if (fil_tablespace_deleted_or_being_deleted_in_mem(space,
@@ -1806,7 +1833,8 @@
 being deleted, or the page is already in buf_pool, return */
 mutex_exit(&block->mutex);
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
+ rw_lock_x_unlock(&(buf_pool->hash_latch));
 buf_block_free(block);
@@ -1821,10 +1849,14 @@
 ut_ad(block);
 buf_page_init(space, offset, block);
+ rw_lock_x_unlock(&(buf_pool->hash_latch));
 /* The block must be put to the LRU list, to the old blocks */
 buf_LRU_add_block(block, TRUE); /* TRUE == to old blocks */
+ mutex_exit(&(buf_pool->LRU_mutex));
+
+ mutex_enter(&(buf_pool->mutex)); /* for consistency about aio */
 block->io_fix = BUF_IO_READ;
@@ -1873,7 +1905,8 @@
 free_block = buf_LRU_get_free_block();
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->LRU_mutex));
+ rw_lock_x_lock(&(buf_pool->hash_latch));
 block = buf_page_hash_get(space, offset);
@@ -1884,7 +1917,8 @@
 block->file_page_was_freed = FALSE;
 /* Page can be found in buf_pool */
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
+ rw_lock_x_unlock(&(buf_pool->hash_latch));
 buf_block_free(free_block);
@@ -1907,6 +1941,7 @@
 mutex_enter(&block->mutex);
 buf_page_init(space, offset, block);
+ rw_lock_x_unlock(&(buf_pool->hash_latch));
 /* The block must be put to the LRU list */
 buf_LRU_add_block(block, FALSE);
@@ -1918,7 +1953,7 @@
 #endif
 buf_pool->n_pages_created++;
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
 mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);
@@ -1932,7 +1967,7 @@
 ibuf_merge_or_delete_for_page(NULL, space, offset, TRUE);
 /* Flush pages from the end of the LRU list if necessary */
- buf_flush_free_margin();
+ buf_flush_free_margin(FALSE);
 frame = block->frame;
@@ -1968,6 +2003,7 @@
 {
 ulint io_type;
 ulint read_page_no;
+ ulint flush_type;
 buf_io_counter_t* io_counter;
 ulint fold;
@@ -2050,9 +2086,6 @@
 }
 }
- mutex_enter(&(buf_pool->mutex));
- mutex_enter(&block->mutex);
-
 #ifdef UNIV_IBUF_DEBUG
 ut_a(ibuf_count_get(block->space, block->offset) == 0);
 #endif
@@ -2061,9 +2094,12 @@
 removes the newest lock debug record, without checking the thread id. */
- block->io_fix = 0;
-
 if (io_type == BUF_IO_READ) {
+ mutex_enter(&block->mutex);
+ mutex_enter(&(buf_pool->mutex));
+
+ block->io_fix = 0;
+
 /* NOTE that the call to ibuf may have moved the ownership of the x-latch to this OS thread: do not let this confuse you in debugging! */
@@ -2094,6 +2130,8 @@
 }
 }
+ mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&block->mutex);
 #ifdef UNIV_DEBUG
 if (buf_debug_prints) {
 fputs("Has read ", stderr);
@@ -2102,10 +2140,25 @@
 } else {
 ut_ad(io_type == BUF_IO_WRITE);
+ flush_type = block->flush_type;
+ if (flush_type == BUF_FLUSH_LRU) { /* optimistic! */
+ mutex_enter(&(buf_pool->LRU_mutex));
+ }
+ mutex_enter(&(buf_pool->flush_list_mutex));
+ mutex_enter(&block->mutex);
+ mutex_enter(&(buf_pool->mutex));
+
+ block->io_fix = 0;
+
 /* Write means a flush operation: call the completion routine in the flush system */
 buf_flush_write_complete(block);
+
+ mutex_exit(&(buf_pool->flush_list_mutex));
+ if (flush_type == BUF_FLUSH_LRU) { /* optimistic! */
+ mutex_exit(&(buf_pool->LRU_mutex));
+ }
 rw_lock_s_unlock_gen(&(block->lock), BUF_IO_WRITE);
 /* io_counter here */
@@ -2131,6 +2184,9 @@
 buf_pool->n_pages_written++;
+ mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&block->mutex);
+
 #ifdef UNIV_DEBUG
 if (buf_debug_prints) {
 fputs("Has written ", stderr);
@@ -2138,9 +2194,6 @@
 #endif /* UNIV_DEBUG */
 }
- mutex_exit(&block->mutex);
- mutex_exit(&(buf_pool->mutex));
-
 #ifdef UNIV_DEBUG
 if (buf_debug_prints) {
 fprintf(stderr, "page space %lu page no %lu\n",
@@ -2168,11 +2221,11 @@
 freed = buf_LRU_search_and_free_block(100);
 }
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->LRU_mutex));
 ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
 }
 /*************************************************************************
@@ -2191,10 +2244,22 @@
 ulint n_flush = 0;
 ulint n_free = 0;
 ulint n_page = 0;
+ ulint n_single_flush_tmp = 0;
+ ulint n_lru_flush_tmp = 0;
+ ulint n_list_flush_tmp = 0;
 ut_ad(buf_pool);
+ mutex_enter(&(buf_pool->LRU_mutex));
+ mutex_enter(&(buf_pool->flush_list_mutex));
+ mutex_enter(&(buf_pool->free_mutex));
+ rw_lock_x_lock(&(buf_pool->hash_latch));
+ mutex_enter(&(buf_pool->mutex));
+ n_single_flush_tmp = buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE];
+ n_list_flush_tmp = buf_pool->n_flush[BUF_FLUSH_LIST];
+ n_lru_flush_tmp = buf_pool->n_flush[BUF_FLUSH_LRU];
+ mutex_exit(&(buf_pool->mutex));
 for (i = 0; i < buf_pool->curr_size; i++) {
@@ -2262,11 +2327,14 @@
 }
 ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
- ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush);
- ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
- ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
+ ut_a(n_single_flush_tmp == n_single_flush);
+ ut_a(n_list_flush_tmp == n_list_flush);
+ ut_a(n_lru_flush_tmp == n_lru_flush);
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
+ mutex_exit(&(buf_pool->flush_list_mutex));
+ mutex_exit(&(buf_pool->free_mutex));
+ rw_lock_x_unlock(&(buf_pool->hash_latch));
 ut_a(buf_LRU_validate());
 ut_a(buf_flush_validate());
@@ -2298,7 +2366,9 @@
 index_ids = mem_alloc(sizeof(dulint) * size);
 counts = mem_alloc(sizeof(ulint) * size);
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->LRU_mutex));
+ mutex_enter(&(buf_pool->flush_list_mutex));
+ mutex_enter(&(buf_pool->free_mutex));
 fprintf(stderr,
 "buf_pool size %lu\n"
@@ -2351,7 +2421,9 @@
 }
 }
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
+ mutex_exit(&(buf_pool->flush_list_mutex));
+ mutex_exit(&(buf_pool->free_mutex));
 for (i = 0; i < n_found; i++) {
 index = dict_index_get_if_in_cache(index_ids[i]);
@@ -2386,8 +2458,6 @@
 ulint i;
 ulint fixed_pages_number = 0;
- mutex_enter(&(buf_pool->mutex));
-
 for (i = 0; i < buf_pool->curr_size; i++) {
 block = buf_pool_get_nth_block(buf_pool, i);
@@ -2403,7 +2473,6 @@
 }
 }
- mutex_exit(&(buf_pool->mutex));
 return fixed_pages_number;
 }
 #endif /* UNIV_DEBUG */
@@ -2431,7 +2500,9 @@
 {
 ulint ratio;
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->LRU_mutex));
+ mutex_enter(&(buf_pool->flush_list_mutex));
+ mutex_enter(&(buf_pool->free_mutex));
 ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list)) / (1 + UT_LIST_GET_LEN(buf_pool->LRU)
@@ -2439,7 +2510,9 @@
 /* 1 + is there to avoid division by zero */
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
+ mutex_exit(&(buf_pool->flush_list_mutex));
+ mutex_exit(&(buf_pool->free_mutex));
 return(ratio);
 }
@@ -2459,6 +2532,9 @@
 ut_ad(buf_pool);
 size = buf_pool->curr_size;
+ mutex_enter(&(buf_pool->LRU_mutex));
+ mutex_enter(&(buf_pool->flush_list_mutex));
+ mutex_enter(&(buf_pool->free_mutex));
 mutex_enter(&(buf_pool->mutex));
 if (srv_use_awe) {
@@ -2532,6 +2608,9 @@
 buf_pool->n_pages_written_old = buf_pool->n_pages_written;
 buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped;
+ mutex_exit(&(buf_pool->LRU_mutex));
+ mutex_exit(&(buf_pool->flush_list_mutex));
+ mutex_exit(&(buf_pool->free_mutex));
 mutex_exit(&(buf_pool->mutex));
 }
@@ -2562,8 +2641,6 @@
 ut_ad(buf_pool);
- mutex_enter(&(buf_pool->mutex));
-
 for (i = 0; i < buf_pool->curr_size; i++) {
 block = buf_pool_get_nth_block(buf_pool, i);
@@ -2584,8 +2661,6 @@
 mutex_exit(&block->mutex);
 }
-
- mutex_exit(&(buf_pool->mutex));
 return(TRUE);
 }
@@ -2625,11 +2700,11 @@
 {
 ulint len;
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->free_mutex));
 len = UT_LIST_GET_LEN(buf_pool->free);
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->free_mutex));
 return(len);
 }
diff -r 2e0c46e78b50 innobase/buf/buf0flu.c
--- a/innobase/buf/buf0flu.c Mon Dec 22 00:33:53 2008 -0800
+++ b/innobase/buf/buf0flu.c Mon Dec 22 00:33:59 2008 -0800
@@ -117,12 +117,14 @@
 ut_ad(mutex_own(&block->mutex));
 #endif /* UNIV_SYNC_DEBUG */
 if (block->state != BUF_BLOCK_FILE_PAGE) {
+ /* It is permitted not to own the LRU_mutex here. */
+/*
 ut_print_timestamp(stderr);
 fprintf(stderr,
 " InnoDB: Error: buffer block state %lu in the LRU list!\n",
 (ulong)block->state);
 ut_print_buf(stderr, (byte*)block, sizeof(buf_block_t));
-
+*/
 return(FALSE);
 }
@@ -536,18 +538,20 @@
 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST || flush_type == BUF_FLUSH_SINGLE_PAGE);
- mutex_enter(&(buf_pool->mutex));
+ rw_lock_s_lock(&(buf_pool->hash_latch));
 block = buf_page_hash_get(space, offset);
 ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
 if (!block) {
- mutex_exit(&(buf_pool->mutex));
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 return(0);
 }
 mutex_enter(&block->mutex);
+ mutex_enter(&(buf_pool->mutex));
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 if (flush_type == BUF_FLUSH_LIST && buf_flush_ready_for_flush(block, flush_type)) {
@@ -744,7 +748,7 @@
 high = fil_space_get_size(space);
 }
- mutex_enter(&(buf_pool->mutex));
+ rw_lock_s_lock(&(buf_pool->hash_latch));
 for (i = low; i < high; i++) {
@@ -778,7 +782,7 @@
 mutex_exit(&block->mutex);
- mutex_exit(&(buf_pool->mutex));
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 /* Note: as we release the buf_pool mutex above, in buf_flush_try_page we cannot be sure
@@ -789,14 +793,14 @@
 count += buf_flush_try_page(space, i, flush_type);
- mutex_enter(&(buf_pool->mutex));
+ rw_lock_s_lock(&(buf_pool->hash_latch));
 } else {
 mutex_exit(&block->mutex);
 }
 }
 }
- mutex_exit(&(buf_pool->mutex));
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 return(count);
 }
@@ -849,7 +853,14 @@
 }
 (buf_pool->init_flush)[flush_type] = TRUE;
+
+ mutex_exit(&(buf_pool->mutex));
+ if (flush_type == BUF_FLUSH_LRU) {
+ mutex_enter(&(buf_pool->LRU_mutex));
+ }
+ mutex_enter(&(buf_pool->flush_list_mutex));
+
 for (;;) {
 /* If we have flushed enough, leave the loop */
 if (page_count >= min_n) {
@@ -895,7 +906,10 @@
 offset = block->offset;
 mutex_exit(&block->mutex);
- mutex_exit(&(buf_pool->mutex));
+ if (flush_type == BUF_FLUSH_LRU) {
+ mutex_exit(&(buf_pool->LRU_mutex));
+ }
+ mutex_exit(&(buf_pool->flush_list_mutex));
 old_page_count = page_count;
@@ -908,7 +922,10 @@
 flush_type, offset, page_count - old_page_count); */
- mutex_enter(&(buf_pool->mutex));
+ if (flush_type == BUF_FLUSH_LRU) {
+ mutex_enter(&(buf_pool->LRU_mutex));
+ }
+ mutex_enter(&(buf_pool->flush_list_mutex));
 } else if (flush_type == BUF_FLUSH_LRU) {
@@ -930,6 +947,13 @@
 break;
 }
 }
+
+ if (flush_type == BUF_FLUSH_LRU) {
+ mutex_exit(&(buf_pool->LRU_mutex));
+ }
+ mutex_exit(&(buf_pool->flush_list_mutex));
+
+ mutex_enter(&(buf_pool->mutex));
 (buf_pool->init_flush)[flush_type] = FALSE;
@@ -989,10 +1013,14 @@
 buf_block_t* block;
 ulint n_replaceable;
 ulint distance = 0;
-
- mutex_enter(&(buf_pool->mutex));
+
+ /* optimistic search... */
+ //mutex_enter(&(buf_pool->LRU_mutex));
+ //mutex_enter(&(buf_pool->free_mutex));
 n_replaceable = UT_LIST_GET_LEN(buf_pool->free);
+
+ //mutex_exit(&(buf_pool->free_mutex));
 block = UT_LIST_GET_LAST(buf_pool->LRU);
@@ -1014,7 +1042,7 @@
 block = UT_LIST_GET_PREV(LRU, block);
 }
- mutex_exit(&(buf_pool->mutex));
+ //mutex_exit(&(buf_pool->LRU_mutex));
 if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {
@@ -1033,8 +1061,9 @@
 immediately, without waiting. */
 void
-buf_flush_free_margin(void)
+buf_flush_free_margin(
 /*=======================*/
+ ibool wait)
 {
 ulint n_to_flush;
 ulint n_flushed;
@@ -1044,7 +1073,7 @@
 if (n_to_flush > 0) {
 n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, ut_dulint_zero);
- if (n_flushed == ULINT_UNDEFINED) {
+ if (wait && n_flushed == ULINT_UNDEFINED) {
 /* There was an LRU type flush batch already running; let us wait for it to end */
@@ -1094,11 +1123,11 @@
 {
 ibool ret;
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->flush_list_mutex));
 ret = buf_flush_validate_low();
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->flush_list_mutex));
 return(ret);
 }
diff -r 2e0c46e78b50 innobase/buf/buf0lru.c
--- a/innobase/buf/buf0lru.c Mon Dec 22 00:33:53 2008 -0800
+++ b/innobase/buf/buf0lru.c Mon Dec 22 00:33:59 2008 -0800
@@ -79,7 +79,10 @@
 ibool all_freed;
 scan_again:
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->LRU_mutex));
+ mutex_enter(&(buf_pool->flush_list_mutex));
+ mutex_enter(&(buf_pool->free_mutex));
+ rw_lock_x_lock(&(buf_pool->hash_latch));
 all_freed = TRUE;
@@ -117,7 +120,10 @@
 mutex_exit(&block->mutex);
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
+ mutex_exit(&(buf_pool->flush_list_mutex));
+ mutex_exit(&(buf_pool->free_mutex));
+ rw_lock_x_unlock(&(buf_pool->hash_latch));
 /* Note that the following call will acquire an S-latch on the page */
@@ -147,7 +153,10 @@
 block = UT_LIST_GET_PREV(LRU, block);
 }
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
+ mutex_exit(&(buf_pool->flush_list_mutex));
+ mutex_exit(&(buf_pool->free_mutex));
+ rw_lock_x_unlock(&(buf_pool->hash_latch));
 if (!all_freed) {
 os_thread_sleep(20000);
@@ -170,14 +179,14 @@
 ulint len;
 ulint limit;
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->LRU_mutex));
 len = UT_LIST_GET_LEN(buf_pool->LRU);
 if (len < BUF_LRU_OLD_MIN_LEN) {
 /* The LRU list is too short to do read-ahead */
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
 return(0);
 }
@@ -186,7 +195,7 @@
 limit = block->LRU_position - len / BUF_LRU_INITIAL_RATIO;
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
 return(limit);
 }
@@ -210,13 +219,15 @@
 ulint distance = 0;
 ibool freed;
- mutex_enter(&(buf_pool->mutex));
+ /* optimistic search... */
+ //mutex_enter(&(buf_pool->LRU_mutex));
+retry:
 freed = FALSE;
 block = UT_LIST_GET_LAST(buf_pool->LRU);
 while (block != NULL) {
- ut_a(block->in_LRU_list);
+ //ut_a(block->in_LRU_list); /* optimistic */
 mutex_enter(&block->mutex);
@@ -231,9 +242,17 @@
 }
 #endif /* UNIV_DEBUG */
+ mutex_exit(&block->mutex);
+
+ mutex_enter(&(buf_pool->LRU_mutex));/* optimistic */
+
+ rw_lock_x_lock(&(buf_pool->hash_latch));
+ mutex_enter(&block->mutex);
+ if(block->in_LRU_list && buf_flush_ready_for_replace(block)) {
 buf_LRU_block_remove_hashed_page(block);
+ rw_lock_x_unlock(&(buf_pool->hash_latch));
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
 mutex_exit(&block->mutex);
 /* Remove possible adaptive hash index built on the
@@ -246,14 +265,25 @@
 ut_a(block->buf_fix_count == 0);
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->free_mutex));
 mutex_enter(&block->mutex);
 buf_LRU_block_free_hashed_page(block);
 freed = TRUE;
+ mutex_exit(&(buf_pool->free_mutex));
 mutex_exit(&block->mutex);
 break;
+ } else { /* someone may interrupt...??? */
+ mutex_exit(&(buf_pool->LRU_mutex));/* optimistic */
+
+ rw_lock_x_unlock(&(buf_pool->hash_latch));
+
+ if (!(block->in_LRU_list)) {
+ mutex_exit(&block->mutex);
+ goto retry;
+ }
+ }
 }
 mutex_exit(&block->mutex);
@@ -264,13 +294,21 @@
 if (!freed && n_iterations <= 10 && distance > 100 + (n_iterations * buf_pool->curr_size) / 10) {
+
+ mutex_enter(&(buf_pool->mutex));
 buf_pool->LRU_flush_ended = 0;
+ mutex_exit(&(buf_pool->mutex));
- mutex_exit(&(buf_pool->mutex));
+ //mutex_exit(&(buf_pool->LRU_mutex));
 return(FALSE);
 }
 }
+ if (!freed) {
+ //mutex_exit(&(buf_pool->LRU_mutex));
+ }
+
+ mutex_enter(&(buf_pool->mutex));
 if (buf_pool->LRU_flush_ended > 0) {
 buf_pool->LRU_flush_ended--;
 }
@@ -322,7 +360,8 @@
 {
 ibool ret = FALSE;
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->LRU_mutex));
+ mutex_enter(&(buf_pool->free_mutex));
 if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 4) {
@@ -330,7 +369,8 @@
 ret = TRUE;
 }
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
+ mutex_exit(&(buf_pool->free_mutex));
 return(ret);
 }
@@ -353,7 +393,7 @@
 ibool mon_value_was = FALSE;
 ibool started_monitor = FALSE;
 loop:
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->free_mutex)); /* LRU info:optimistic */
 if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 20) {
@@ -409,7 +449,7 @@
 /* If there is a block in the free list, take it */
 if (UT_LIST_GET_LEN(buf_pool->free) > 0) {
- block = UT_LIST_GET_FIRST(buf_pool->free);
+ block = UT_LIST_GET_LAST(buf_pool->free);
 ut_a(block->in_free_list);
 UT_LIST_REMOVE(free, buf_pool->free, block);
 block->in_free_list = FALSE;
@@ -437,7 +477,7 @@
 mutex_exit(&block->mutex);
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->free_mutex));
 if (started_monitor) {
 srv_print_innodb_monitor = mon_value_was;
@@ -449,7 +489,7 @@
 /* If no block was in the free list, search from the end of the LRU list and try to free a block there */
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->free_mutex));
 freed = buf_LRU_search_and_free_block(n_iterations);
@@ -486,7 +526,7 @@
 /* No free block was found: try to flush the LRU list */
- buf_flush_free_margin();
+ buf_flush_free_margin(TRUE);
 ++srv_buf_pool_wait_free;
 os_aio_simulated_wake_handler_threads();
@@ -958,7 +998,7 @@
 ulint LRU_pos;
 ut_ad(buf_pool);
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->LRU_mutex));
 if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) {
@@ -1001,7 +1041,10 @@
 if (buf_pool->LRU_old) {
 ut_a(buf_pool->LRU_old_len == old_len);
- }
+ }
+
+ mutex_exit(&(buf_pool->LRU_mutex));
+ mutex_enter(&(buf_pool->free_mutex));
 UT_LIST_VALIDATE(free, buf_block_t, buf_pool->free);
@@ -1013,7 +1056,7 @@
 block = UT_LIST_GET_NEXT(free, block);
 }
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->free_mutex));
 return(TRUE);
 }
@@ -1029,7 +1072,7 @@
 ulint len;
 ut_ad(buf_pool);
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->LRU_mutex));
 fprintf(stderr, "Pool ulint clock %lu\n", (ulong) buf_pool->ulint_clock);
@@ -1073,5 +1116,5 @@
 }
 }
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->LRU_mutex));
 }
diff -r 2e0c46e78b50 innobase/buf/buf0rea.c
--- a/innobase/buf/buf0rea.c Mon Dec 22 00:33:53 2008 -0800
+++ b/innobase/buf/buf0rea.c Mon Dec 22 00:33:59 2008 -0800
@@ -236,10 +236,12 @@
 return(0);
 }
+ mutex_exit(&(buf_pool->mutex));
 /* Count how many blocks in the area have been recently accessed,
 that is, reside near the start of the LRU list. */
+ rw_lock_s_lock(&(buf_pool->hash_latch));
 for (i = low; i < high; i++) {
 block = buf_page_hash_get(space, i);
@@ -250,8 +252,9 @@
 recent_blocks++;
 }
 }
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
- mutex_exit(&(buf_pool->mutex));
+ // mutex_exit(&(buf_pool->mutex));
 if (recent_blocks < BUF_READ_AHEAD_RANDOM_THRESHOLD) {
 /* Do nothing */
@@ -347,7 +350,7 @@
 }
 /* Flush pages from the end of the LRU list if necessary */
- buf_flush_free_margin();
+ buf_flush_free_margin(FALSE);
 return(count + count2);
 }
@@ -450,6 +453,7 @@
 return(0);
 }
+ mutex_exit(&(buf_pool->mutex));
 /* Check that almost all pages in the area have been accessed; if offset == low, the accesses must be in a descending order, otherwise,
@@ -463,6 +467,7 @@
 fail_count = 0;
+ rw_lock_s_lock(&(buf_pool->hash_latch));
 for (i = low; i < high; i++) {
 block = buf_page_hash_get(space, i);
@@ -479,12 +484,13 @@
 pred_block = block;
 }
 }
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 if (fail_count > BUF_READ_AHEAD_LINEAR_AREA - BUF_READ_AHEAD_LINEAR_THRESHOLD) {
 /* Too many failures: return */
- mutex_exit(&(buf_pool->mutex));
+ //mutex_exit(&(buf_pool->mutex));
 return(0);
 }
@@ -492,10 +498,11 @@
 /* If we got this far, we know that enough pages in the area have been accessed in the right order: linear read-ahead can be sensible */
+ rw_lock_s_lock(&(buf_pool->hash_latch));
 block = buf_page_hash_get(space, offset);
 if (block == NULL) {
- mutex_exit(&(buf_pool->mutex));
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 return(0);
 }
@@ -511,7 +518,7 @@
 pred_offset = fil_page_get_prev(frame);
 succ_offset = fil_page_get_next(frame);
- mutex_exit(&(buf_pool->mutex));
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 if ((offset == low) && (succ_offset == offset + 1)) {
@@ -587,7 +594,7 @@
 os_aio_simulated_wake_handler_threads();
 /* Flush pages from the end of the LRU list if necessary */
- buf_flush_free_margin();
+ buf_flush_free_margin(FALSE);
 #ifdef UNIV_DEBUG
 if (buf_debug_prints && (count > 0)) {
@@ -655,7 +662,7 @@
 os_aio_simulated_wake_handler_threads();
 /* Flush pages from the end of the LRU list if necessary */
- buf_flush_free_margin();
+ buf_flush_free_margin(FALSE);
 #ifdef UNIV_DEBUG
 if (buf_debug_prints) {
@@ -727,7 +734,7 @@
 os_aio_simulated_wake_handler_threads();
 /* Flush pages from the end of the LRU list if necessary */
- buf_flush_free_margin();
+ buf_flush_free_margin(FALSE);
 #ifdef UNIV_DEBUG
 if (buf_debug_prints) {
diff -r 2e0c46e78b50 innobase/include/buf0buf.h
--- a/innobase/include/buf0buf.h Mon Dec 22 00:33:53 2008 -0800
+++ b/innobase/include/buf0buf.h Mon Dec 22 00:33:59 2008 -0800
@@ -946,6 +946,7 @@
 mem_heap_t* io_counter_heap;
 ulint io_counters;
 hash_table_t* page_hash; /* hash table of the file pages */
+ rw_lock_t hash_latch;
 ulint n_pend_reads; /* number of pending read operations */
@@ -978,6 +979,7 @@
 UT_LIST_BASE_NODE_T(buf_block_t) flush_list;
 /* base node of the modified block list */
+ mutex_t flush_list_mutex;
 ibool init_flush[BUF_FLUSH_LIST + 1];
 /* this is TRUE when a flush of the given type is being initialized */
@@ -1011,8 +1013,10 @@
 in the case of AWE, at the start are always free blocks for which the physical memory is mapped to a frame */
+ mutex_t free_mutex;
 UT_LIST_BASE_NODE_T(buf_block_t) LRU;
 /* base node of the LRU list */
+ mutex_t LRU_mutex;
 buf_block_t* LRU_old; /* pointer to the about 3/8 oldest blocks in the LRU list; NULL if LRU length less than BUF_LRU_OLD_MIN_LEN */
diff -r 2e0c46e78b50 innobase/include/buf0buf.ic
--- a/innobase/include/buf0buf.ic Mon Dec 22 00:33:53 2008 -0800
+++ b/innobase/include/buf0buf.ic Mon Dec 22 00:33:59 2008 -0800
@@ -112,7 +112,7 @@
 buf_block_t* block;
 dulint lsn;
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->flush_list_mutex));
 block = UT_LIST_GET_LAST(buf_pool->flush_list);
@@ -122,7 +122,7 @@
 lsn = block->oldest_modification;
 }
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->flush_list_mutex));
 return(lsn);
 }
@@ -392,18 +392,18 @@
 /* out: TRUE if io going on */
 buf_block_t* block) /* in: buf_pool block, must be bufferfixed */
 {
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
 ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
 ut_ad(block->buf_fix_count > 0);
 if (block->io_fix != 0) {
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&block->mutex);
 return(TRUE);
 }
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&block->mutex);
 return(FALSE);
 }
@@ -425,7 +425,7 @@
 block = buf_block_align(frame);
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
 if (block->state == BUF_BLOCK_FILE_PAGE) {
 lsn = block->newest_modification;
@@ -433,7 +433,7 @@
 lsn = ut_dulint_zero;
 }
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&block->mutex);
 return(lsn);
 }
@@ -632,9 +632,9 @@
 ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 if (rw_latch == RW_X_LATCH && mtr->modifications) {
- mutex_enter(&buf_pool->mutex);
+ mutex_enter(&buf_pool->flush_list_mutex);
 buf_flush_note_modification(block, mtr);
- mutex_exit(&buf_pool->mutex);
+ mutex_exit(&buf_pool->flush_list_mutex);
 }
 mutex_enter(&block->mutex);
diff -r 2e0c46e78b50 innobase/include/buf0flu.h
--- a/innobase/include/buf0flu.h Mon Dec 22 00:33:53 2008 -0800
+++ b/innobase/include/buf0flu.h Mon Dec 22 00:33:59 2008 -0800
@@ -26,8 +26,9 @@
 a margin of replaceable pages there. */
 void
-buf_flush_free_margin(void);
+buf_flush_free_margin(
 /*=======================*/
+ ibool wait);
 /************************************************************************
 Initializes a page for writing to the tablespace. */
diff -r 2e0c46e78b50 innobase/include/buf0flu.ic
--- a/innobase/include/buf0flu.ic Mon Dec 22 00:33:53 2008 -0800
+++ b/innobase/include/buf0flu.ic Mon Dec 22 00:33:59 2008 -0800
@@ -84,7 +84,7 @@
 ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->flush_list_mutex));
 ut_ad(ut_dulint_cmp(block->newest_modification, end_lsn) <= 0);
@@ -102,5 +102,5 @@
 start_lsn) <= 0);
 }
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->flush_list_mutex));
 }
diff -r 2e0c46e78b50 innobase/log/log0recv.c
--- a/innobase/log/log0recv.c Mon Dec 22 00:33:53 2008 -0800
+++ b/innobase/log/log0recv.c Mon Dec 22 00:33:59 2008 -0800
@@ -1693,11 +1693,11 @@
 mtr_start(&mtr);
- mutex_enter(&(buf_pool->mutex));
+ rw_lock_s_lock(&(buf_pool->hash_latch));
 page = buf_page_hash_get(space, page_no)->frame;
- mutex_exit(&(buf_pool->mutex));
+ rw_lock_s_unlock(&(buf_pool->hash_latch));
 replica = buf_page_get(space + RECV_REPLICA_SPACE_ADD, page_no, RW_X_LATCH, &mtr);
diff -r 2e0c46e78b50 patch_info/split_buf_pool_mutex_fixed_optimistic_safe.info
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/patch_info/split_buf_pool_mutex_fixed_optimistic_safe.info Mon Dec 22 00:33:59 2008 -0800
@@ -0,0 +1,6 @@
+File=split_buf_pool_mutex_fixed_optimistic_safe.patch
+Name=InnoDB patch to fix buffer pool scalability
+Version=1.0
+Author=Yasufumi Kinoshita
+License=BSD
+Comment=