diff -ruN a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c --- a/innobase/btr/btr0sea.c 2009-08-28 11:08:16.000000000 +0900 +++ b/innobase/btr/btr0sea.c 2009-08-28 11:06:20.000000000 +0900 @@ -1101,7 +1101,7 @@ ulint* offsets; rw_lock_x_lock(&btr_search_latch); - mutex_enter(&buf_pool->mutex); + mutex_enter(&buf_pool->LRU_mutex); table = btr_search_sys->hash_index; @@ -1186,7 +1186,7 @@ block = UT_LIST_GET_PREV(LRU, block); } - mutex_exit(&buf_pool->mutex); + mutex_exit(&buf_pool->LRU_mutex); rw_lock_x_unlock(&btr_search_latch); if (UNIV_LIKELY_NULL(heap)) { diff -ruN a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c --- a/innobase/buf/buf0buf.c 2009-08-28 11:08:16.000000000 +0900 +++ b/innobase/buf/buf0buf.c 2009-08-28 11:06:30.000000000 +0900 @@ -549,6 +549,17 @@ mutex_create(&(buf_pool->mutex)); mutex_set_level(&(buf_pool->mutex), SYNC_BUF_POOL); + mutex_create(&(buf_pool->LRU_mutex)); + mutex_set_level(&(buf_pool->LRU_mutex), SYNC_BUF_LRU_LIST); + rw_lock_create(&(buf_pool->hash_latch)); + rw_lock_set_level(&(buf_pool->hash_latch), SYNC_BUF_PAGE_HASH); + mutex_create(&(buf_pool->free_mutex)); + mutex_set_level(&(buf_pool->free_mutex), SYNC_BUF_FREE_LIST); + mutex_create(&(buf_pool->flush_list_mutex)); + mutex_set_level(&(buf_pool->flush_list_mutex), SYNC_BUF_FLUSH_LIST); + + mutex_enter(&(buf_pool->LRU_mutex)); + rw_lock_x_lock(&(buf_pool->hash_latch)); mutex_enter(&(buf_pool->mutex)); if (srv_use_awe) { @@ -724,6 +735,8 @@ block->in_free_list = TRUE; } + mutex_exit(&(buf_pool->LRU_mutex)); + rw_lock_x_unlock(&(buf_pool->hash_latch)); mutex_exit(&(buf_pool->mutex)); if (srv_use_adaptive_hash_indexes) { @@ -753,6 +766,7 @@ { buf_block_t* bck; + ut_error; /* don't support AWE */ #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); #endif /* UNIV_SYNC_DEBUG */ @@ -851,7 +865,7 @@ buf_block_t* block) /* in: block to make younger */ { #ifdef UNIV_SYNC_DEBUG - ut_ad(!mutex_own(&(buf_pool->mutex))); + ut_ad(!mutex_own(&(buf_pool->LRU_mutex))); #endif /* 
UNIV_SYNC_DEBUG */ /* Note that we read freed_page_clock's without holding any mutex: @@ -860,12 +874,12 @@ if (buf_pool->freed_page_clock >= block->freed_page_clock + 1 + (buf_pool->curr_size / 4)) { - mutex_enter(&buf_pool->mutex); + mutex_enter(&buf_pool->LRU_mutex); /* There has been freeing activity in the LRU list: best to move to the head of the LRU list */ buf_LRU_make_block_young(block); - mutex_exit(&buf_pool->mutex); + mutex_exit(&buf_pool->LRU_mutex); } } @@ -881,7 +895,7 @@ { buf_block_t* block; - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); block = buf_block_align(frame); @@ -889,7 +903,7 @@ buf_LRU_make_block_young(block); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); } /************************************************************************ @@ -900,7 +914,7 @@ /*===========*/ buf_block_t* block) /* in, own: block to be freed */ { - mutex_enter(&(buf_pool->mutex)); + //mutex_enter(&(buf_pool->mutex)); mutex_enter(&block->mutex); @@ -910,7 +924,7 @@ mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); + //mutex_exit(&(buf_pool->mutex)); } /************************************************************************* @@ -951,11 +965,11 @@ { buf_block_t* block; - mutex_enter_fast(&(buf_pool->mutex)); + rw_lock_s_lock(&(buf_pool->hash_latch)); block = buf_page_hash_get(space, offset); - mutex_exit(&(buf_pool->mutex)); + rw_lock_s_unlock(&(buf_pool->hash_latch)); return(block); } @@ -972,7 +986,7 @@ { buf_block_t* block; - mutex_enter_fast(&(buf_pool->mutex)); + rw_lock_s_lock(&(buf_pool->hash_latch)); block = buf_page_hash_get(space, offset); @@ -980,7 +994,7 @@ block->check_index_page_at_flush = FALSE; } - mutex_exit(&(buf_pool->mutex)); + rw_lock_s_unlock(&(buf_pool->hash_latch)); } /************************************************************************ @@ -999,7 +1013,7 @@ buf_block_t* block; ibool is_hashed; - mutex_enter_fast(&(buf_pool->mutex)); + rw_lock_s_lock(&(buf_pool->hash_latch)); 
block = buf_page_hash_get(space, offset); @@ -1009,7 +1023,7 @@ is_hashed = block->is_hashed; } - mutex_exit(&(buf_pool->mutex)); + rw_lock_s_unlock(&(buf_pool->hash_latch)); return(is_hashed); } @@ -1051,7 +1065,7 @@ { buf_block_t* block; - mutex_enter_fast(&(buf_pool->mutex)); + rw_lock_s_lock(&(buf_pool->hash_latch)); block = buf_page_hash_get(space, offset); @@ -1059,7 +1073,7 @@ block->file_page_was_freed = TRUE; } - mutex_exit(&(buf_pool->mutex)); + rw_lock_s_unlock(&(buf_pool->hash_latch)); return(block); } @@ -1080,7 +1094,7 @@ { buf_block_t* block; - mutex_enter_fast(&(buf_pool->mutex)); + rw_lock_s_lock(&(buf_pool->hash_latch)); block = buf_page_hash_get(space, offset); @@ -1088,7 +1102,7 @@ block->file_page_was_freed = FALSE; } - mutex_exit(&(buf_pool->mutex)); + rw_lock_s_unlock(&(buf_pool->hash_latch)); return(block); } @@ -1167,26 +1181,33 @@ buf_pool->n_page_gets++; loop: block = NULL; - mutex_enter_fast(&(buf_pool->mutex)); + //mutex_enter_fast(&(buf_pool->mutex)); if (guess) { block = buf_block_align(guess); + mutex_enter(&block->mutex); if ((offset != block->offset) || (space != block->space) || (block->state != BUF_BLOCK_FILE_PAGE)) { + mutex_exit(&block->mutex); block = NULL; } } if (block == NULL) { + rw_lock_s_lock(&(buf_pool->hash_latch)); block = buf_page_hash_get(space, offset); + if(block) { + mutex_enter(&block->mutex); + } + rw_lock_s_unlock(&(buf_pool->hash_latch)); } if (block == NULL) { /* Page not in buf_pool: needs to be read from file */ - mutex_exit(&(buf_pool->mutex)); + //mutex_exit(&(buf_pool->mutex)); if (mode == BUF_GET_IF_IN_POOL) { @@ -1205,7 +1226,7 @@ goto loop; } - mutex_enter(&block->mutex); + //mutex_enter(&block->mutex); ut_a(block->state == BUF_BLOCK_FILE_PAGE); @@ -1217,7 +1238,7 @@ if (mode == BUF_GET_IF_IN_POOL) { /* The page is only being read to buffer */ - mutex_exit(&buf_pool->mutex); + //mutex_exit(&buf_pool->mutex); mutex_exit(&block->mutex); return(NULL); @@ -1242,7 +1263,7 @@ #else 
buf_block_buf_fix_inc(block); #endif - mutex_exit(&buf_pool->mutex); + //mutex_exit(&buf_pool->mutex); /* Check if this is the first access to the page */ @@ -1685,7 +1706,7 @@ buf_block_t* block) /* in: block to init */ { #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&(buf_pool->LRU_mutex))); ut_ad(mutex_own(&(block->mutex))); #endif /* UNIV_SYNC_DEBUG */ ut_a(block->state != BUF_BLOCK_FILE_PAGE); @@ -1792,7 +1813,8 @@ ut_a(block); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); + rw_lock_x_lock(&(buf_pool->hash_latch)); mutex_enter(&block->mutex); if (fil_tablespace_deleted_or_being_deleted_in_mem(space, @@ -1807,7 +1829,8 @@ being deleted, or the page is already in buf_pool, return */ mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + rw_lock_x_unlock(&(buf_pool->hash_latch)); buf_block_free(block); @@ -1822,10 +1845,14 @@ ut_ad(block); buf_page_init(space, offset, block); + rw_lock_x_unlock(&(buf_pool->hash_latch)); /* The block must be put to the LRU list, to the old blocks */ buf_LRU_add_block(block, TRUE); /* TRUE == to old blocks */ + mutex_exit(&(buf_pool->LRU_mutex)); + + mutex_enter(&(buf_pool->mutex)); /* for consistency about aio */ block->io_fix = BUF_IO_READ; @@ -1874,7 +1901,8 @@ free_block = buf_LRU_get_free_block(); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); + rw_lock_x_lock(&(buf_pool->hash_latch)); block = buf_page_hash_get(space, offset); @@ -1885,7 +1913,8 @@ block->file_page_was_freed = FALSE; /* Page can be found in buf_pool */ - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + rw_lock_x_unlock(&(buf_pool->hash_latch)); buf_block_free(free_block); @@ -1908,6 +1937,7 @@ mutex_enter(&block->mutex); buf_page_init(space, offset, block); + rw_lock_x_unlock(&(buf_pool->hash_latch)); /* The block must be put to the LRU list */ buf_LRU_add_block(block, FALSE); @@ -1919,7 +1949,7 @@ 
#endif buf_pool->n_pages_created++; - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); @@ -1933,7 +1963,7 @@ ibuf_merge_or_delete_for_page(NULL, space, offset, TRUE); /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + buf_flush_free_margin(FALSE); frame = block->frame; @@ -1969,6 +1999,7 @@ { ulint io_type; ulint read_page_no; + ulint flush_type; buf_io_counter_t* io_counter; ulint fold; @@ -2051,9 +2082,6 @@ } } - mutex_enter(&(buf_pool->mutex)); - mutex_enter(&block->mutex); - #ifdef UNIV_IBUF_DEBUG ut_a(ibuf_count_get(block->space, block->offset) == 0); #endif @@ -2062,9 +2090,12 @@ removes the newest lock debug record, without checking the thread id. */ - block->io_fix = 0; - if (io_type == BUF_IO_READ) { + mutex_enter(&block->mutex); + mutex_enter(&(buf_pool->mutex)); + + block->io_fix = 0; + /* NOTE that the call to ibuf may have moved the ownership of the x-latch to this OS thread: do not let this confuse you in debugging! 
*/ @@ -2095,6 +2126,8 @@ } } + mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); #ifdef UNIV_DEBUG if (buf_debug_prints) { fputs("Has read ", stderr); @@ -2103,11 +2136,24 @@ } else { ut_ad(io_type == BUF_IO_WRITE); + flush_type = block->flush_type; + if (flush_type == BUF_FLUSH_LRU) { + mutex_enter(&(buf_pool->LRU_mutex)); + } + mutex_enter(&block->mutex); + mutex_enter(&(buf_pool->mutex)); + + block->io_fix = 0; + /* Write means a flush operation: call the completion routine in the flush system */ buf_flush_write_complete(block); + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(&(buf_pool->LRU_mutex)); + } + rw_lock_s_unlock_gen(&(block->lock), BUF_IO_WRITE); /* io_counter here */ if (srv_io_pattern && srv_io_pattern_trace_running) { @@ -2132,6 +2178,9 @@ buf_pool->n_pages_written++; + mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); + #ifdef UNIV_DEBUG if (buf_debug_prints) { fputs("Has written ", stderr); @@ -2139,9 +2188,6 @@ #endif /* UNIV_DEBUG */ } - mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); - #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "page space %lu page no %lu\n", @@ -2169,11 +2215,11 @@ freed = buf_LRU_search_and_free_block(100); } - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); } /************************************************************************* @@ -2195,7 +2241,10 @@ ut_ad(buf_pool); - mutex_enter(&(buf_pool->mutex)); + //mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); + rw_lock_x_lock(&(buf_pool->hash_latch)); + /* to keep the new latch order, it cannot validate correctly... 
*/ for (i = 0; i < buf_pool->curr_size; i++) { @@ -2256,18 +2305,26 @@ } ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru); + /* because of latching order with block->mutex, we cannot get free_mutex before that */ +/* if (UT_LIST_GET_LEN(buf_pool->free) != n_free) { fprintf(stderr, "Free list len %lu, free blocks %lu\n", (ulong) UT_LIST_GET_LEN(buf_pool->free), (ulong) n_free); ut_error; } +*/ + /* because of latching order with block->mutex, we cannot get flush_list_mutex before that */ +/* ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); +*/ - mutex_exit(&(buf_pool->mutex)); + //mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + rw_lock_x_unlock(&(buf_pool->hash_latch)); ut_a(buf_LRU_validate()); ut_a(buf_flush_validate()); @@ -2299,7 +2356,9 @@ index_ids = mem_alloc(sizeof(dulint) * size); counts = mem_alloc(sizeof(ulint) * size); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->free_mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); fprintf(stderr, "buf_pool size %lu\n" @@ -2352,7 +2411,9 @@ } } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&(buf_pool->free_mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); for (i = 0; i < n_found; i++) { index = dict_index_get_if_in_cache(index_ids[i]); @@ -2387,7 +2448,7 @@ ulint i; ulint fixed_pages_number = 0; - mutex_enter(&(buf_pool->mutex)); + //mutex_enter(&(buf_pool->mutex)); for (i = 0; i < buf_pool->curr_size; i++) { @@ -2404,7 +2465,7 @@ } } - mutex_exit(&(buf_pool->mutex)); + //mutex_exit(&(buf_pool->mutex)); return fixed_pages_number; } #endif /* UNIV_DEBUG */ @@ -2432,7 +2493,7 @@ { ulint ratio; - mutex_enter(&(buf_pool->mutex)); + //mutex_enter(&(buf_pool->mutex)); /* optimistic */ ratio = (100 * 
UT_LIST_GET_LEN(buf_pool->flush_list)) / (1 + UT_LIST_GET_LEN(buf_pool->LRU) @@ -2440,7 +2501,7 @@ /* 1 + is there to avoid division by zero */ - mutex_exit(&(buf_pool->mutex)); + //mutex_exit(&(buf_pool->mutex)); /* optimistic */ return(ratio); } @@ -2460,7 +2521,10 @@ ut_ad(buf_pool); size = buf_pool->curr_size; + mutex_enter(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->free_mutex)); mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); if (srv_use_awe) { fprintf(stderr, @@ -2533,7 +2597,10 @@ buf_pool->n_pages_written_old = buf_pool->n_pages_written; buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped; + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&(buf_pool->free_mutex)); mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); } /************************************************************************** @@ -2563,7 +2630,7 @@ ut_ad(buf_pool); - mutex_enter(&(buf_pool->mutex)); + //mutex_enter(&(buf_pool->mutex)); /* optimistic */ for (i = 0; i < buf_pool->curr_size; i++) { @@ -2586,7 +2653,7 @@ mutex_exit(&block->mutex); } - mutex_exit(&(buf_pool->mutex)); + //mutex_exit(&(buf_pool->mutex)); /* optimistic */ return(TRUE); } @@ -2626,11 +2693,11 @@ { ulint len; - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->free_mutex)); len = UT_LIST_GET_LEN(buf_pool->free); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->free_mutex)); return(len); } diff -ruN a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c --- a/innobase/buf/buf0flu.c 2009-08-28 11:08:17.000000000 +0900 +++ b/innobase/buf/buf0flu.c 2009-08-28 11:06:30.000000000 +0900 @@ -49,7 +49,9 @@ buf_block_t* block) /* in: block which is modified */ { #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + //ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&block->mutex)); + ut_ad(mutex_own(&(buf_pool->flush_list_mutex))); #endif /* UNIV_SYNC_DEBUG */ ut_a(block->state == BUF_BLOCK_FILE_PAGE); @@ -79,7 +81,9 
@@ buf_block_t* b; #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + //ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&block->mutex)); + ut_ad(mutex_own(&(buf_pool->flush_list_mutex))); #endif /* UNIV_SYNC_DEBUG */ prev_b = NULL; @@ -130,16 +134,18 @@ BUF_BLOCK_FILE_PAGE and in the LRU list */ { #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + //ut_ad(mutex_own(&(buf_pool->mutex))); ut_ad(mutex_own(&block->mutex)); #endif /* UNIV_SYNC_DEBUG */ - if (block->state != BUF_BLOCK_FILE_PAGE) { + if (!block->in_LRU_list || block->state != BUF_BLOCK_FILE_PAGE) { + /* permitted not to own LRU_mutex.. */ +/* ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Error: buffer block state %lu in the LRU list!\n", (ulong)block->state); ut_print_buf(stderr, (byte*)block, sizeof(buf_block_t)); - +*/ return(FALSE); } @@ -165,12 +171,13 @@ ulint flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ { #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + //ut_ad(mutex_own(&(buf_pool->mutex))); ut_ad(mutex_own(&(block->mutex))); #endif /* UNIV_SYNC_DEBUG */ - ut_a(block->state == BUF_BLOCK_FILE_PAGE); + //ut_a(block->state == BUF_BLOCK_FILE_PAGE); - if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0) + if (block->state == BUF_BLOCK_FILE_PAGE + && (ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0) && (block->io_fix == 0)) { if (flush_type != BUF_FLUSH_LRU) { @@ -199,15 +206,17 @@ { ut_ad(block); #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + //ut_ad(mutex_own(&(buf_pool->mutex))); #endif /* UNIV_SYNC_DEBUG */ ut_a(block->state == BUF_BLOCK_FILE_PAGE); + mutex_enter(&(buf_pool->flush_list_mutex)); block->oldest_modification = ut_dulint_zero; UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block); ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list)); + mutex_exit(&(buf_pool->flush_list_mutex)); (buf_pool->n_flush[block->flush_type])--; @@ -553,18 +562,20 @@ ut_ad(flush_type 
== BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST || flush_type == BUF_FLUSH_SINGLE_PAGE); - mutex_enter(&(buf_pool->mutex)); + rw_lock_s_lock(&(buf_pool->hash_latch)); block = buf_page_hash_get(space, offset); ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE); if (!block) { - mutex_exit(&(buf_pool->mutex)); + rw_lock_s_unlock(&(buf_pool->hash_latch)); return(0); } mutex_enter(&block->mutex); + mutex_enter(&(buf_pool->mutex)); + rw_lock_s_unlock(&(buf_pool->hash_latch)); if (flush_type == BUF_FLUSH_LIST && buf_flush_ready_for_flush(block, flush_type)) { @@ -761,7 +772,7 @@ high = fil_space_get_size(space); } - mutex_enter(&(buf_pool->mutex)); + rw_lock_s_lock(&(buf_pool->hash_latch)); for (i = low; i < high; i++) { @@ -795,7 +806,7 @@ mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); + rw_lock_s_unlock(&(buf_pool->hash_latch)); /* Note: as we release the buf_pool mutex above, in buf_flush_try_page we cannot be sure @@ -806,14 +817,14 @@ count += buf_flush_try_page(space, i, flush_type); - mutex_enter(&(buf_pool->mutex)); + rw_lock_s_lock(&(buf_pool->hash_latch)); } else { mutex_exit(&block->mutex); } } } - mutex_exit(&(buf_pool->mutex)); + rw_lock_s_unlock(&(buf_pool->hash_latch)); return(count); } @@ -848,6 +859,7 @@ ulint space; ulint offset; ibool found; + ulint remaining = 0; ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST)); @@ -866,6 +878,12 @@ } (buf_pool->init_flush)[flush_type] = TRUE; + + mutex_exit(&(buf_pool->mutex)); + + if (flush_type == BUF_FLUSH_LRU) { + mutex_enter(&(buf_pool->LRU_mutex)); + } for (;;) { /* If we have flushed enough, leave the loop */ @@ -882,7 +900,10 @@ } else { ut_ad(flush_type == BUF_FLUSH_LIST); + mutex_enter(&(buf_pool->flush_list_mutex)); + remaining = UT_LIST_GET_LEN(buf_pool->flush_list); block = UT_LIST_GET_LAST(buf_pool->flush_list); + mutex_exit(&(buf_pool->flush_list_mutex)); if (!block || (ut_dulint_cmp(block->oldest_modification, lsn_limit) >= 0)) { @@ -912,7 +933,9 @@ offset = 
block->offset; mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(&(buf_pool->LRU_mutex)); + } old_page_count = page_count; @@ -932,7 +955,9 @@ flush_type, offset, page_count - old_page_count); */ - mutex_enter(&(buf_pool->mutex)); + if (flush_type == BUF_FLUSH_LRU) { + mutex_enter(&(buf_pool->LRU_mutex)); + } } else if (flush_type == BUF_FLUSH_LRU) { @@ -944,17 +969,26 @@ mutex_exit(&block->mutex); + mutex_enter(&(buf_pool->flush_list_mutex)); block = UT_LIST_GET_PREV(flush_list, block); + mutex_exit(&(buf_pool->flush_list_mutex)); + remaining--; } } /* If we could not find anything to flush, leave the loop */ - if (!found) { + if (!found && !remaining) { break; } } + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(&(buf_pool->LRU_mutex)); + } + + mutex_enter(&(buf_pool->mutex)); + (buf_pool->init_flush)[flush_type] = FALSE; if ((buf_pool->n_flush[flush_type] == 0) @@ -1013,11 +1047,15 @@ buf_block_t* block; ulint n_replaceable; ulint distance = 0; + ibool optimistic = TRUE; - mutex_enter(&(buf_pool->mutex)); - + //mutex_enter(&(buf_pool->mutex)); +retry: n_replaceable = UT_LIST_GET_LEN(buf_pool->free); + if (!optimistic) + mutex_enter(&(buf_pool->LRU_mutex)); + block = UT_LIST_GET_LAST(buf_pool->LRU); while ((block != NULL) @@ -1025,6 +1063,12 @@ + BUF_FLUSH_EXTRA_MARGIN) && (distance < BUF_LRU_FREE_SEARCH_LEN)) { + if (!block->in_LRU_list) { + /* restart. 
but it is very optimistic */ + block = UT_LIST_GET_LAST(buf_pool->LRU); + continue; + } + mutex_enter(&block->mutex); if (buf_flush_ready_for_replace(block)) { @@ -1038,11 +1082,17 @@ block = UT_LIST_GET_PREV(LRU, block); } - mutex_exit(&(buf_pool->mutex)); + //mutex_exit(&(buf_pool->mutex)); + if (!optimistic) + mutex_exit(&(buf_pool->LRU_mutex)); if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) { return(0); + } else if (optimistic) { + /* confirm it again with LRU_mutex for exactness */ + optimistic = FALSE; + goto retry; } return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN @@ -1057,8 +1107,9 @@ immediately, without waiting. */ void -buf_flush_free_margin(void) +buf_flush_free_margin( /*=======================*/ + ibool wait) { ulint n_to_flush; ulint n_flushed; @@ -1068,7 +1119,7 @@ if (n_to_flush > 0) { n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, ut_dulint_zero); - if (n_flushed == ULINT_UNDEFINED) { + if (wait && n_flushed == ULINT_UNDEFINED) { /* There was an LRU type flush batch already running; let us wait for it to end */ @@ -1118,11 +1169,11 @@ { ibool ret; - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); ret = buf_flush_validate_low(); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); return(ret); } diff -ruN a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c --- a/innobase/buf/buf0lru.c 2009-07-07 21:53:57.000000000 +0900 +++ b/innobase/buf/buf0lru.c 2009-08-28 11:06:30.000000000 +0900 @@ -108,7 +108,7 @@ page_arr = ut_malloc(sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE); - mutex_enter(&buf_pool->mutex); + mutex_enter(&buf_pool->LRU_mutex); scan_again: num_entries = 0; @@ -147,12 +147,12 @@ } /* Array full. We release the buf_pool->mutex to obey the latching order. 
*/ - mutex_exit(&buf_pool->mutex); + mutex_exit(&buf_pool->LRU_mutex); buf_LRU_drop_page_hash_batch(id, page_arr, num_entries); num_entries = 0; - mutex_enter(&buf_pool->mutex); + mutex_enter(&buf_pool->LRU_mutex); } else { mutex_exit(&block->mutex); } @@ -177,7 +177,7 @@ } } - mutex_exit(&buf_pool->mutex); + mutex_exit(&buf_pool->LRU_mutex); /* Drop any remaining batch of search hashed pages. */ buf_LRU_drop_page_hash_batch(id, page_arr, num_entries); @@ -206,7 +206,8 @@ buf_LRU_drop_page_hash_for_tablespace(id); scan_again: - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); + rw_lock_x_lock(&(buf_pool->hash_latch)); all_freed = TRUE; @@ -244,7 +245,8 @@ mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + rw_lock_x_unlock(&(buf_pool->hash_latch)); /* Note that the following call will acquire an S-latch on the page */ @@ -274,7 +276,8 @@ block = UT_LIST_GET_PREV(LRU, block); } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + rw_lock_x_unlock(&(buf_pool->hash_latch)); if (!all_freed) { os_thread_sleep(20000); @@ -297,14 +300,14 @@ ulint len; ulint limit; - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); len = UT_LIST_GET_LEN(buf_pool->LRU); if (len < BUF_LRU_OLD_MIN_LEN) { /* The LRU list is too short to do read-ahead */ - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); return(0); } @@ -313,7 +316,7 @@ limit = block->LRU_position - len / BUF_LRU_INITIAL_RATIO; - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); return(limit); } @@ -337,13 +340,15 @@ ulint distance = 0; ibool freed; - mutex_enter(&(buf_pool->mutex)); + /* optimistic search... 
*/ + //mutex_enter(&(buf_pool->mutex)); +retry: freed = FALSE; block = UT_LIST_GET_LAST(buf_pool->LRU); while (block != NULL) { - ut_a(block->in_LRU_list); + //ut_a(block->in_LRU_list); /* optimistic */ mutex_enter(&block->mutex); @@ -358,9 +363,17 @@ } #endif /* UNIV_DEBUG */ + mutex_exit(&block->mutex); + + mutex_enter(&(buf_pool->LRU_mutex));/* optimistic */ + + rw_lock_x_lock(&(buf_pool->hash_latch)); + mutex_enter(&block->mutex); + if(block->in_LRU_list && buf_flush_ready_for_replace(block)) { buf_LRU_block_remove_hashed_page(block); + rw_lock_x_unlock(&(buf_pool->hash_latch)); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); mutex_exit(&block->mutex); /* Remove possible adaptive hash index built on the @@ -373,7 +386,6 @@ ut_a(block->buf_fix_count == 0); - mutex_enter(&(buf_pool->mutex)); mutex_enter(&block->mutex); buf_LRU_block_free_hashed_page(block); @@ -381,6 +393,16 @@ mutex_exit(&block->mutex); break; + } else { /* someone may interrupt...??? */ + mutex_exit(&(buf_pool->LRU_mutex));/* optimistic */ + + rw_lock_x_unlock(&(buf_pool->hash_latch)); + + if (!(block->in_LRU_list)) { + mutex_exit(&block->mutex); + goto retry; + } + } } mutex_exit(&block->mutex); @@ -391,6 +413,7 @@ if (!freed && n_iterations <= 10 && distance > 100 + (n_iterations * buf_pool->curr_size) / 10) { + mutex_enter(&(buf_pool->mutex)); buf_pool->LRU_flush_ended = 0; mutex_exit(&(buf_pool->mutex)); @@ -398,6 +421,8 @@ return(FALSE); } } + + mutex_enter(&(buf_pool->mutex)); if (buf_pool->LRU_flush_ended > 0) { buf_pool->LRU_flush_ended--; } @@ -449,7 +474,8 @@ { ibool ret = FALSE; - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->free_mutex)); if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 4) { @@ -457,7 +483,8 @@ ret = TRUE; } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&(buf_pool->free_mutex)); 
return(ret); } @@ -480,7 +507,7 @@ ibool mon_value_was = FALSE; ibool started_monitor = FALSE; loop: - mutex_enter(&(buf_pool->mutex)); + //mutex_enter(&(buf_pool->mutex)); /* optimistic */ if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 20) { @@ -536,10 +563,16 @@ /* If there is a block in the free list, take it */ if (UT_LIST_GET_LEN(buf_pool->free) > 0) { - block = UT_LIST_GET_FIRST(buf_pool->free); + mutex_enter(&(buf_pool->free_mutex)); + block = UT_LIST_GET_LAST(buf_pool->free); + if (!block) { + mutex_exit(&(buf_pool->free_mutex)); + goto no_block; + } ut_a(block->in_free_list); UT_LIST_REMOVE(free, buf_pool->free, block); block->in_free_list = FALSE; + mutex_exit(&(buf_pool->free_mutex)); ut_a(block->state != BUF_BLOCK_FILE_PAGE); ut_a(!block->in_LRU_list); @@ -564,7 +597,7 @@ mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); + //mutex_exit(&(buf_pool->mutex)); if (started_monitor) { srv_print_innodb_monitor = mon_value_was; @@ -572,11 +605,12 @@ return(block); } +no_block: /* If no block was in the free list, search from the end of the LRU list and try to free a block there */ - mutex_exit(&(buf_pool->mutex)); + //mutex_exit(&(buf_pool->mutex)); freed = buf_LRU_search_and_free_block(n_iterations); @@ -613,7 +647,7 @@ /* No free block was found: try to flush the LRU list */ - buf_flush_free_margin(); + buf_flush_free_margin(TRUE); ++srv_buf_pool_wait_free; os_aio_simulated_wake_handler_threads(); @@ -655,7 +689,7 @@ ut_a(buf_pool->LRU_old); #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&(buf_pool->LRU_mutex))); #endif /* UNIV_SYNC_DEBUG */ ut_ad(3 * (BUF_LRU_OLD_MIN_LEN / 8) > BUF_LRU_OLD_TOLERANCE + 5); @@ -730,7 +764,7 @@ ut_ad(buf_pool); ut_ad(block); #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&(buf_pool->LRU_mutex))); #endif /* UNIV_SYNC_DEBUG */ ut_a(block->state == BUF_BLOCK_FILE_PAGE); @@ -796,7 +830,7 
@@ ut_ad(buf_pool); ut_ad(block); #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&(buf_pool->LRU_mutex))); #endif /* UNIV_SYNC_DEBUG */ ut_a(block->state == BUF_BLOCK_FILE_PAGE); @@ -861,7 +895,7 @@ ut_ad(buf_pool); ut_ad(block); #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&(buf_pool->LRU_mutex))); #endif /* UNIV_SYNC_DEBUG */ ut_a(block->state == BUF_BLOCK_FILE_PAGE); @@ -964,7 +998,7 @@ buf_block_t* block) /* in: block, must not contain a file page */ { #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + //ut_ad(mutex_own(&(buf_pool->mutex))); ut_ad(mutex_own(&block->mutex)); #endif /* UNIV_SYNC_DEBUG */ ut_ad(block); @@ -981,8 +1015,10 @@ /* Wipe contents of page to reveal possible stale pointers to it */ memset(block->frame, '\0', UNIV_PAGE_SIZE); #endif + mutex_enter(&(buf_pool->free_mutex)); UT_LIST_ADD_FIRST(free, buf_pool->free, block); block->in_free_list = TRUE; + mutex_exit(&(buf_pool->free_mutex)); if (srv_use_awe && block->frame) { /* Add to the list of mapped pages */ @@ -1004,7 +1040,7 @@ may or may not be a hash index to the page */ { #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&(buf_pool->LRU_mutex))); ut_ad(mutex_own(&block->mutex)); #endif /* UNIV_SYNC_DEBUG */ ut_ad(block); @@ -1062,7 +1098,7 @@ be in a state where it can be freed */ { #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + //ut_ad(mutex_own(&(buf_pool->mutex))); ut_ad(mutex_own(&block->mutex)); #endif /* UNIV_SYNC_DEBUG */ ut_a(block->state == BUF_BLOCK_REMOVE_HASH); @@ -1085,7 +1121,7 @@ ulint LRU_pos; ut_ad(buf_pool); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { @@ -1130,6 +1166,9 @@ ut_a(buf_pool->LRU_old_len == old_len); } + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->free_mutex)); + UT_LIST_VALIDATE(free, buf_block_t, buf_pool->free); 
block = UT_LIST_GET_FIRST(buf_pool->free); @@ -1140,7 +1179,7 @@ block = UT_LIST_GET_NEXT(free, block); } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->free_mutex)); return(TRUE); } @@ -1156,7 +1195,7 @@ ulint len; ut_ad(buf_pool); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); fprintf(stderr, "Pool ulint clock %lu\n", (ulong) buf_pool->ulint_clock); @@ -1200,5 +1239,5 @@ } } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); } diff -ruN a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c --- a/innobase/buf/buf0rea.c 2009-08-28 11:08:17.000000000 +0900 +++ b/innobase/buf/buf0rea.c 2009-08-28 11:06:30.000000000 +0900 @@ -277,10 +277,12 @@ return(0); } + mutex_exit(&(buf_pool->mutex)); /* Count how many blocks in the area have been recently accessed, that is, reside near the start of the LRU list. */ + rw_lock_s_lock(&(buf_pool->hash_latch)); for (i = low; i < high; i++) { block = buf_page_hash_get(space, i); @@ -292,7 +294,7 @@ } } - mutex_exit(&(buf_pool->mutex)); + rw_lock_s_unlock(&(buf_pool->hash_latch)); if (recent_blocks < BUF_READ_AHEAD_RANDOM_THRESHOLD) { /* Do nothing */ @@ -388,7 +390,7 @@ } /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + buf_flush_free_margin(FALSE); return(count + count2); } @@ -491,6 +493,7 @@ return(0); } + mutex_exit(&(buf_pool->mutex)); /* Check that almost all pages in the area have been accessed; if offset == low, the accesses must be in a descending order, otherwise, @@ -504,6 +507,7 @@ fail_count = 0; + rw_lock_s_lock(&(buf_pool->hash_latch)); for (i = low; i < high; i++) { block = buf_page_hash_get(space, i); @@ -520,23 +524,23 @@ pred_block = block; } } + rw_lock_s_unlock(&(buf_pool->hash_latch)); if (fail_count > BUF_READ_AHEAD_LINEAR_AREA - BUF_READ_AHEAD_LINEAR_THRESHOLD) { /* Too many failures: return */ - mutex_exit(&(buf_pool->mutex)); - return(0); } /* If we got this far, we know that enough pages in the area have been 
accessed in the right order: linear read-ahead can be sensible */ + rw_lock_s_lock(&(buf_pool->hash_latch)); block = buf_page_hash_get(space, offset); if (block == NULL) { - mutex_exit(&(buf_pool->mutex)); + rw_lock_s_unlock(&(buf_pool->hash_latch)); return(0); } @@ -552,7 +556,7 @@ pred_offset = fil_page_get_prev(frame); succ_offset = fil_page_get_next(frame); - mutex_exit(&(buf_pool->mutex)); + rw_lock_s_unlock(&(buf_pool->hash_latch)); if ((offset == low) && (succ_offset == offset + 1)) { @@ -628,7 +632,7 @@ os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + buf_flush_free_margin(FALSE); #ifdef UNIV_DEBUG if (buf_debug_prints && (count > 0)) { @@ -696,7 +700,7 @@ os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + buf_flush_free_margin(FALSE); #ifdef UNIV_DEBUG if (buf_debug_prints) { @@ -768,7 +772,7 @@ os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + buf_flush_free_margin(FALSE); #ifdef UNIV_DEBUG if (buf_debug_prints) { diff -ruN a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h --- a/innobase/include/buf0buf.h 2009-08-28 11:08:16.000000000 +0900 +++ b/innobase/include/buf0buf.h 2009-08-28 11:06:30.000000000 +0900 @@ -946,6 +946,7 @@ mem_heap_t* io_counter_heap; ulint io_counters; hash_table_t* page_hash; /* hash table of the file pages */ + rw_lock_t hash_latch; ulint n_pend_reads; /* number of pending read operations */ @@ -978,6 +979,7 @@ UT_LIST_BASE_NODE_T(buf_block_t) flush_list; /* base node of the modified block list */ + mutex_t flush_list_mutex; ibool init_flush[BUF_FLUSH_LIST + 1]; /* this is TRUE when a flush of the given type is being initialized */ @@ -1011,8 +1013,10 @@ in the case of AWE, at the start are always free blocks for which the physical memory is mapped to a frame */ + mutex_t free_mutex; 
UT_LIST_BASE_NODE_T(buf_block_t) LRU; /* base node of the LRU list */ + mutex_t LRU_mutex; buf_block_t* LRU_old; /* pointer to the about 3/8 oldest blocks in the LRU list; NULL if LRU length less than BUF_LRU_OLD_MIN_LEN */ diff -ruN a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic --- a/innobase/include/buf0buf.ic 2009-07-07 21:54:00.000000000 +0900 +++ b/innobase/include/buf0buf.ic 2009-08-28 11:06:30.000000000 +0900 @@ -112,7 +112,8 @@ buf_block_t* block; dulint lsn; - mutex_enter(&(buf_pool->mutex)); +try_again: + mutex_enter(&(buf_pool->flush_list_mutex)); block = UT_LIST_GET_LAST(buf_pool->flush_list); @@ -120,9 +121,13 @@ lsn = ut_dulint_zero; } else { lsn = block->oldest_modification; + if (ut_dulint_cmp(lsn, ut_dulint_zero) == 0) { + mutex_exit(&(buf_pool->flush_list_mutex)); + goto try_again; + } } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); return(lsn); } @@ -137,7 +142,7 @@ /* out: new clock value */ { #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&(buf_pool->LRU_mutex))); #endif /* UNIV_SYNC_DEBUG */ buf_pool->ulint_clock++; @@ -392,18 +397,18 @@ /* out: TRUE if io going on */ buf_block_t* block) /* in: buf_pool block, must be bufferfixed */ { - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); ut_ad(block->state == BUF_BLOCK_FILE_PAGE); ut_ad(block->buf_fix_count > 0); if (block->io_fix != 0) { - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); return(TRUE); } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); return(FALSE); } @@ -425,7 +430,7 @@ block = buf_block_align(frame); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); if (block->state == BUF_BLOCK_FILE_PAGE) { lsn = block->newest_modification; @@ -433,7 +438,7 @@ lsn = ut_dulint_zero; } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); return(lsn); } @@ -456,7 +461,7 @@ block = buf_block_align(frame); #ifdef UNIV_SYNC_DEBUG - 
ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0)) + ut_ad((mutex_own(&(buf_pool->LRU_mutex)) && (block->buf_fix_count == 0)) || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); #endif /*UNIV_SYNC_DEBUG */ @@ -477,7 +482,7 @@ buf_block_t* block) /* in: block */ { #ifdef UNIV_SYNC_DEBUG - ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0)) + ut_ad((mutex_own(&(buf_pool->LRU_mutex)) && (block->buf_fix_count == 0)) || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); #endif /* UNIV_SYNC_DEBUG */ @@ -555,7 +560,8 @@ ut_ad(buf_pool); #ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(rw_lock_own(&(buf_pool->hash_latch), RW_LOCK_EX) + || rw_lock_own(&(buf_pool->hash_latch), RW_LOCK_SHARED)); #endif /* UNIV_SYNC_DEBUG */ /* Look for the page in the hash table */ @@ -631,11 +637,14 @@ ut_a(block->state == BUF_BLOCK_FILE_PAGE); + /* buf_flush_note_modification() should be called before this function. */ +/* if (rw_latch == RW_X_LATCH && mtr->modifications) { mutex_enter(&buf_pool->mutex); buf_flush_note_modification(block, mtr); mutex_exit(&buf_pool->mutex); } +*/ mutex_enter(&block->mutex); diff -ruN a/innobase/include/buf0flu.h b/innobase/include/buf0flu.h --- a/innobase/include/buf0flu.h 2009-07-07 21:54:00.000000000 +0900 +++ b/innobase/include/buf0flu.h 2009-08-28 11:06:30.000000000 +0900 @@ -26,8 +26,9 @@ a margin of replaceable pages there. */ void -buf_flush_free_margin(void); +buf_flush_free_margin( /*=======================*/ + ibool wait); /************************************************************************ Initializes a page for writing to the tablespace. 
*/ diff -ruN a/innobase/include/buf0flu.ic b/innobase/include/buf0flu.ic --- a/innobase/include/buf0flu.ic 2009-07-07 21:54:00.000000000 +0900 +++ b/innobase/include/buf0flu.ic 2009-08-28 11:06:30.000000000 +0900 @@ -38,11 +38,14 @@ mtr_t* mtr) /* in: mtr */ { ut_ad(block); + + mutex_enter(&block->mutex); + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); ut_ad(block->buf_fix_count > 0); #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); - ut_ad(mutex_own(&(buf_pool->mutex))); + //ut_ad(mutex_own(&(buf_pool->mutex))); #endif /* UNIV_SYNC_DEBUG */ ut_ad(ut_dulint_cmp(mtr->start_lsn, ut_dulint_zero) != 0); @@ -52,16 +55,20 @@ block->newest_modification = mtr->end_lsn; if (ut_dulint_is_zero(block->oldest_modification)) { + mutex_enter(&(buf_pool->flush_list_mutex)); block->oldest_modification = mtr->start_lsn; ut_ad(!ut_dulint_is_zero(block->oldest_modification)); buf_flush_insert_into_flush_list(block); + mutex_exit(&(buf_pool->flush_list_mutex)); } else { ut_ad(ut_dulint_cmp(block->oldest_modification, mtr->start_lsn) <= 0); } + mutex_exit(&block->mutex); + ++srv_buf_pool_write_requests; } @@ -78,29 +85,32 @@ set of mtr's */ { ut_ad(block); + + mutex_enter(&(block->mutex)); + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); ut_ad(block->buf_fix_count > 0); #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - mutex_enter(&(buf_pool->mutex)); - ut_ad(ut_dulint_cmp(block->newest_modification, end_lsn) <= 0); block->newest_modification = end_lsn; if (ut_dulint_is_zero(block->oldest_modification)) { + mutex_enter(&(buf_pool->flush_list_mutex)); block->oldest_modification = start_lsn; ut_ad(!ut_dulint_is_zero(block->oldest_modification)); buf_flush_insert_sorted_into_flush_list(block); + mutex_exit(&(buf_pool->flush_list_mutex)); } else { ut_ad(ut_dulint_cmp(block->oldest_modification, start_lsn) <= 0); } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(block->mutex)); } diff -ruN a/innobase/include/sync0sync.h 
b/innobase/include/sync0sync.h --- a/innobase/include/sync0sync.h 2009-07-07 21:54:06.000000000 +0900 +++ b/innobase/include/sync0sync.h 2009-08-28 11:06:30.000000000 +0900 @@ -438,8 +438,12 @@ SYNC_SEARCH_SYS, as memory allocation can call routines there! Otherwise the level is SYNC_MEM_HASH. */ +#define SYNC_BUF_LRU_LIST 157 +#define SYNC_BUF_PAGE_HASH 156 +#define SYNC_BUF_BLOCK 155 +#define SYNC_BUF_FREE_LIST 153 #define SYNC_BUF_POOL 150 -#define SYNC_BUF_BLOCK 149 +#define SYNC_BUF_FLUSH_LIST 149 #define SYNC_DOUBLEWRITE 140 #define SYNC_ANY_LATCH 135 #define SYNC_THR_LOCAL 133 diff -ruN a/innobase/log/log0recv.c b/innobase/log/log0recv.c --- a/innobase/log/log0recv.c 2009-08-28 11:08:17.000000000 +0900 +++ b/innobase/log/log0recv.c 2009-08-28 11:06:30.000000000 +0900 @@ -1695,11 +1695,11 @@ mtr_start(&mtr); - mutex_enter(&(buf_pool->mutex)); + rw_lock_s_lock(&(buf_pool->hash_latch)); page = buf_page_hash_get(space, page_no)->frame; - mutex_exit(&(buf_pool->mutex)); + rw_lock_s_unlock(&(buf_pool->hash_latch)); replica = buf_page_get(space + RECV_REPLICA_SPACE_ADD, page_no, RW_X_LATCH, &mtr); diff -ruN a/innobase/mtr/mtr0mtr.c b/innobase/mtr/mtr0mtr.c --- a/innobase/mtr/mtr0mtr.c 2009-07-07 21:54:08.000000000 +0900 +++ b/innobase/mtr/mtr0mtr.c 2009-08-28 11:06:30.000000000 +0900 @@ -103,6 +103,38 @@ } } +UNIV_INLINE +void +mtr_memo_note_modification_all( +/*===========================*/ + mtr_t* mtr) /* in: mtr */ +{ + mtr_memo_slot_t* slot; + dyn_array_t* memo; + ulint offset; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in + commit */ + ut_ad(mtr->modifications); + + memo = &(mtr->memo); + + offset = dyn_array_get_data_size(memo); + + while (offset > 0) { + offset -= sizeof(mtr_memo_slot_t); + slot = dyn_array_get_element(memo, offset); + + if (UNIV_LIKELY(slot->object != NULL) && + slot->type == MTR_MEMO_PAGE_X_FIX) { + buf_flush_note_modification( + (buf_block_t*)slot->object, mtr); + } 
+ } +} + /**************************************************************** Writes the contents of a mini-transaction log, if any, to the database log. */ static @@ -177,6 +209,8 @@ #endif if (mtr->modifications) { mtr_log_reserve_and_write(mtr); + + mtr_memo_note_modification_all(mtr); } /* We first update the modification info to buffer pages, and only @@ -187,12 +221,13 @@ required when we insert modified buffer pages in to the flush list which must be sorted on oldest_modification. */ - mtr_memo_pop_all(mtr); - if (mtr->modifications) { log_release(); } + /* All unlocking has been moved here, after log_sys mutex release. */ + mtr_memo_pop_all(mtr); + #ifdef UNIV_DEBUG mtr->state = MTR_COMMITTED; #endif @@ -262,6 +297,12 @@ slot = dyn_array_get_element(memo, offset); if ((object == slot->object) && (type == slot->type)) { + if (mtr->modifications && + UNIV_LIKELY(slot->object != NULL) && + slot->type == MTR_MEMO_PAGE_X_FIX) { + buf_flush_note_modification( + (buf_block_t*)slot->object, mtr); + } mtr_memo_slot_release(mtr, slot); diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c --- a/innobase/srv/srv0srv.c 2009-08-28 11:08:17.000000000 +0900 +++ b/innobase/srv/srv0srv.c 2009-08-28 11:06:30.000000000 +0900 @@ -370,6 +370,7 @@ ulong srv_n_free_tickets_to_enter = 500; ulong srv_thread_sleep_delay = 10000; ulint srv_spin_wait_delay = 5; +ulint srv_spins_microsec = 50; ibool srv_priority_boost = TRUE; ibool srv_print_thread_releases = FALSE; @@ -676,6 +677,47 @@ ulint srv_n_threads_active[SRV_MASTER + 1]; ulint srv_n_threads[SRV_MASTER + 1]; +static +void +srv_align_spins_microsec(void) +{ + ulint start_sec, end_sec; + ulint start_usec, end_usec; + ib_longlong usecs; + + /* change temporary */ + srv_spins_microsec = 1; + + if (ut_usectime(&start_sec, &start_usec)) { + srv_spins_microsec = 50; + goto end; + } + + ut_delay(100000); + + if (ut_usectime(&end_sec, &end_usec)) { + srv_spins_microsec = 50; + goto end; + } + + usecs = (end_sec - start_sec) * 
1000000LL + (end_usec - start_usec); + + if (usecs) { + srv_spins_microsec = 100000 / usecs; + if (srv_spins_microsec == 0) + srv_spins_microsec = 1; + if (srv_spins_microsec > 50) + srv_spins_microsec = 50; + } else { + srv_spins_microsec = 50; + } +end: + if (srv_spins_microsec != 50) + fprintf(stderr, + "InnoDB: unit of spin count at ut_delay() is aligned to %lu\n", + srv_spins_microsec); +} + /************************************************************************* Sets the info describing an i/o thread current state. */ @@ -909,6 +951,8 @@ dict_table_t* table; ulint i; + srv_align_spins_microsec(); + srv_sys = mem_alloc(sizeof(srv_sys_t)); kernel_mutex_temp = mem_alloc(sizeof(mutex_t)); @@ -2665,7 +2709,7 @@ ib_longlong level, bpl; buf_block_t* bpage; - mutex_enter(&buf_pool->mutex); + mutex_enter(&(buf_pool->flush_list_mutex)); level = 0; bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); @@ -2687,7 +2731,7 @@ bpl = 0; } - mutex_exit(&buf_pool->mutex); + mutex_exit(&(buf_pool->flush_list_mutex)); if (!srv_use_doublewrite_buf) { /* flush is faster than when doublewrite */ diff -ruN a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c --- a/innobase/sync/sync0sync.c 2009-07-07 21:54:10.000000000 +0900 +++ b/innobase/sync/sync0sync.c 2009-08-28 11:06:30.000000000 +0900 @@ -1105,11 +1105,19 @@ } else if (level == SYNC_DOUBLEWRITE) { ut_a(sync_thread_levels_g(array, SYNC_DOUBLEWRITE)); } else if (level == SYNC_BUF_BLOCK) { - ut_a((sync_thread_levels_contain(array, SYNC_BUF_POOL) + ut_a((sync_thread_levels_contain(array, SYNC_BUF_LRU_LIST) && sync_thread_levels_g(array, SYNC_BUF_BLOCK - 1)) || sync_thread_levels_g(array, SYNC_BUF_BLOCK)); } else if (level == SYNC_BUF_POOL) { ut_a(sync_thread_levels_g(array, SYNC_BUF_POOL)); + } else if (level == SYNC_BUF_FLUSH_LIST) { + ut_a(sync_thread_levels_g(array, SYNC_BUF_FLUSH_LIST)); + } else if (level == SYNC_BUF_FREE_LIST) { + ut_a(sync_thread_levels_g(array, SYNC_BUF_FREE_LIST)); + } else if (level == 
SYNC_BUF_PAGE_HASH) { + ut_a(sync_thread_levels_g(array, SYNC_BUF_PAGE_HASH)); + } else if (level == SYNC_BUF_LRU_LIST) { + ut_a(sync_thread_levels_g(array, SYNC_BUF_LRU_LIST)); } else if (level == SYNC_SEARCH_SYS) { ut_a(sync_thread_levels_g(array, SYNC_SEARCH_SYS)); } else if (level == SYNC_TRX_LOCK_HEAP) { diff -ruN a/innobase/ut/ut0ut.c b/innobase/ut/ut0ut.c --- a/innobase/ut/ut0ut.c 2009-07-07 21:54:12.000000000 +0900 +++ b/innobase/ut/ut0ut.c 2009-08-28 11:06:30.000000000 +0900 @@ -347,6 +347,7 @@ /***************************************************************** Runs an idle loop on CPU. The argument gives the desired delay in microseconds on 100 MHz Pentium + Visual C++. */ +extern ulint srv_spins_microsec; ulint ut_delay( @@ -358,7 +359,11 @@ j = 0; - for (i = 0; i < delay * 50; i++) { + for (i = 0; i < delay * srv_spins_microsec; i++) { +#if (defined (__i386__) || defined (__x86_64__)) && defined (__GNUC__) + /* it is equal to the instruction 'pause' */ + __asm__ __volatile__ ("rep; nop"); +#endif j += i; } diff -ruN a/patch_info/innodb_split_buf_pool_mutex.info b/patch_info/innodb_split_buf_pool_mutex.info --- /dev/null 1970-01-01 09:00:00.000000000 +0900 +++ b/patch_info/innodb_split_buf_pool_mutex.info 2009-08-28 11:06:30.000000000 +0900 @@ -0,0 +1,6 @@ +File=innodb_split_buf_pool_mutex.patch +Name=InnoDB patch to fix buffer pool scalability +Version=1.0 +Author=Yasufumi Kinoshita +License=BSD +Comment=Backport from XtraDB diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc --- a/sql/ha_innodb.cc 2009-08-28 11:08:17.000000000 +0900 +++ b/sql/ha_innodb.cc 2009-08-28 11:06:30.000000000 +0900 @@ -1507,6 +1507,13 @@ /* We set srv_pool_size here in units of 1 kB. InnoDB internally changes the value so that it becomes the number of database pages. */ + if (innobase_buffer_pool_awe_mem_mb) { + /* split_buf_pool_mutex.patch don't support AWE */ + fputs("InnoDB: Warning: split_buf_pool_mutex.patch don't support AWE. 
Disabled.\n", + stderr); + innobase_buffer_pool_awe_mem_mb = 0; + } + if (innobase_buffer_pool_awe_mem_mb == 0) { /* Careful here: we first convert the signed long int to ulint and only after that divide */