diff -ruN mysql-5.1.29-rc_orig/storage/innobase/buf/buf0buf.c mysql-5.1.29-rc/storage/innobase/buf/buf0buf.c --- mysql-5.1.29-rc_orig/storage/innobase/buf/buf0buf.c 2008-10-12 06:54:12.000000000 +0900 +++ mysql-5.1.29-rc/storage/innobase/buf/buf0buf.c 2008-11-18 15:44:00.000000000 +0900 @@ -596,6 +596,15 @@ ---------------------------- */ mutex_create(&buf_pool->mutex, SYNC_BUF_POOL); + mutex_create(&(buf_pool->flush_list_mutex), SYNC_NO_ORDER_CHECK); + mutex_create(&(buf_pool->LRU_mutex), SYNC_NO_ORDER_CHECK); + mutex_create(&(buf_pool->free_mutex), SYNC_NO_ORDER_CHECK); + mutex_create(&(buf_pool->hash_mutex), SYNC_NO_ORDER_CHECK); + + mutex_enter(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); + mutex_enter(&(buf_pool->free_mutex)); + mutex_enter(&(buf_pool->hash_mutex)); mutex_enter(&(buf_pool->mutex)); if (srv_use_awe) { @@ -773,6 +782,10 @@ block->in_free_list = TRUE; } + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); + mutex_exit(&(buf_pool->free_mutex)); + mutex_exit(&(buf_pool->hash_mutex)); mutex_exit(&(buf_pool->mutex)); if (srv_use_adaptive_hash_indexes) { @@ -905,12 +918,12 @@ if (buf_block_peek_if_too_old(block)) { - mutex_enter(&buf_pool->mutex); + mutex_enter(&(buf_pool->LRU_mutex)); /* There has been freeing activity in the LRU list: best to move to the head of the LRU list */ buf_LRU_make_block_young(block); - mutex_exit(&buf_pool->mutex); + mutex_exit(&(buf_pool->LRU_mutex)); } } @@ -926,7 +939,7 @@ { buf_block_t* block; - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); block = buf_block_align(frame); @@ -934,7 +947,7 @@ buf_LRU_make_block_young(block); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); } /************************************************************************ @@ -945,7 +958,7 @@ /*===========*/ buf_block_t* block) /* in, own: block to be freed */ { - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->free_mutex)); 
mutex_enter(&block->mutex); @@ -955,7 +968,7 @@ mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->free_mutex)); } /************************************************************************* @@ -996,11 +1009,11 @@ { buf_block_t* block; - mutex_enter_fast(&(buf_pool->mutex)); + mutex_enter_fast(&(buf_pool->hash_mutex)); block = buf_page_hash_get(space, offset); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->hash_mutex)); return(block); } @@ -1017,7 +1030,7 @@ { buf_block_t* block; - mutex_enter_fast(&(buf_pool->mutex)); + mutex_enter_fast(&(buf_pool->hash_mutex)); block = buf_page_hash_get(space, offset); @@ -1025,7 +1038,7 @@ block->check_index_page_at_flush = FALSE; } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->hash_mutex)); } /************************************************************************ @@ -1044,7 +1057,7 @@ buf_block_t* block; ibool is_hashed; - mutex_enter_fast(&(buf_pool->mutex)); + mutex_enter_fast(&(buf_pool->hash_mutex)); block = buf_page_hash_get(space, offset); @@ -1054,7 +1067,7 @@ is_hashed = block->is_hashed; } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->hash_mutex)); return(is_hashed); } @@ -1096,7 +1109,7 @@ { buf_block_t* block; - mutex_enter_fast(&(buf_pool->mutex)); + mutex_enter_fast(&(buf_pool->hash_mutex)); block = buf_page_hash_get(space, offset); @@ -1104,7 +1117,7 @@ block->file_page_was_freed = TRUE; } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->hash_mutex)); return(block); } @@ -1125,7 +1138,7 @@ { buf_block_t* block; - mutex_enter_fast(&(buf_pool->mutex)); + mutex_enter_fast(&(buf_pool->hash_mutex)); block = buf_page_hash_get(space, offset); @@ -1133,7 +1146,7 @@ block->file_page_was_freed = FALSE; } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->hash_mutex)); return(block); } @@ -1174,26 +1187,33 @@ buf_pool->n_page_gets++; loop: block = NULL; - mutex_enter_fast(&(buf_pool->mutex)); + // mutex_enter_fast(&(buf_pool->mutex)); 
if (guess) { block = buf_block_align(guess); + mutex_enter(&block->mutex); if ((offset != block->offset) || (space != block->space) || (block->state != BUF_BLOCK_FILE_PAGE)) { + mutex_exit(&block->mutex); block = NULL; } } if (block == NULL) { + mutex_enter_fast(&(buf_pool->hash_mutex)); block = buf_page_hash_get(space, offset); + if(block) { + mutex_enter(&block->mutex); + } + mutex_exit(&(buf_pool->hash_mutex)); } if (block == NULL) { /* Page not in buf_pool: needs to be read from file */ - mutex_exit(&(buf_pool->mutex)); + // mutex_exit(&(buf_pool->mutex)); if (mode == BUF_GET_IF_IN_POOL) { @@ -1212,7 +1232,7 @@ goto loop; } - mutex_enter(&block->mutex); + // mutex_enter(&block->mutex); ut_a(block->state == BUF_BLOCK_FILE_PAGE); @@ -1224,7 +1244,7 @@ if (mode == BUF_GET_IF_IN_POOL) { /* The page is only being read to buffer */ - mutex_exit(&buf_pool->mutex); + // mutex_exit(&buf_pool->mutex); mutex_exit(&block->mutex); return(NULL); @@ -1241,7 +1261,9 @@ LRU list and we must put it to awe_LRU_free_mapped list once mapped to a frame */ + mutex_enter_fast(&(buf_pool->mutex)); buf_awe_map_page_to_frame(block, TRUE); + mutex_exit(&buf_pool->mutex); } #ifdef UNIV_SYNC_DEBUG @@ -1249,7 +1271,7 @@ #else buf_block_buf_fix_inc(block); #endif - mutex_exit(&buf_pool->mutex); + // mutex_exit(&buf_pool->mutex); /* Check if this is the first access to the page */ @@ -1747,7 +1769,8 @@ ut_a(block); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->hash_mutex)); mutex_enter(&block->mutex); if (fil_tablespace_deleted_or_being_deleted_in_mem( @@ -1763,7 +1786,8 @@ already in buf_pool, return */ mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&(buf_pool->hash_mutex)); buf_block_free(block); @@ -1778,10 +1802,14 @@ ut_ad(block); buf_page_init(space, offset, block); + mutex_exit(&(buf_pool->hash_mutex)); /* The block must be put to the LRU list, to the old blocks */ 
buf_LRU_add_block(block, TRUE); /* TRUE == to old blocks */ + mutex_exit(&(buf_pool->LRU_mutex)); + + mutex_enter(&(buf_pool->mutex)); /* for consistency about aio */ block->io_fix = BUF_IO_READ; @@ -1830,7 +1858,8 @@ free_block = buf_LRU_get_free_block(); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->hash_mutex)); block = buf_page_hash_get(space, offset); @@ -1841,7 +1870,8 @@ block->file_page_was_freed = FALSE; /* Page can be found in buf_pool */ - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&(buf_pool->hash_mutex)); buf_block_free(free_block); @@ -1864,6 +1894,7 @@ mutex_enter(&block->mutex); buf_page_init(space, offset, block); + mutex_exit(&(buf_pool->hash_mutex)); /* The block must be put to the LRU list */ buf_LRU_add_block(block, FALSE); @@ -1875,7 +1906,7 @@ #endif buf_pool->n_pages_created++; - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); @@ -1889,7 +1920,7 @@ ibuf_merge_or_delete_for_page(NULL, space, offset, TRUE); /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + buf_flush_free_margin(FALSE); frame = block->frame; @@ -1928,6 +1959,7 @@ buf_block_t* block) /* in: pointer to the block in question */ { ulint io_type; + ulint flush_type; ut_ad(block); @@ -2040,9 +2072,6 @@ } } - mutex_enter(&(buf_pool->mutex)); - mutex_enter(&block->mutex); - #ifdef UNIV_IBUF_DEBUG ut_a(ibuf_count_get(block->space, block->offset) == 0); #endif @@ -2051,9 +2080,12 @@ removes the newest lock debug record, without checking the thread id. */ - block->io_fix = 0; - if (io_type == BUF_IO_READ) { + mutex_enter(&block->mutex); + mutex_enter(&(buf_pool->mutex)); + + block->io_fix = 0; + /* NOTE that the call to ibuf may have moved the ownership of the x-latch to this OS thread: do not let this confuse you in debugging! 
*/ @@ -2064,6 +2096,8 @@ rw_lock_x_unlock_gen(&(block->lock), BUF_IO_READ); + mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); #ifdef UNIV_DEBUG if (buf_debug_prints) { fputs("Has read ", stderr); @@ -2072,15 +2106,33 @@ } else { ut_ad(io_type == BUF_IO_WRITE); + flush_type = block->flush_type; + if (flush_type == BUF_FLUSH_LRU) { /* optimistic! */ + mutex_enter(&(buf_pool->LRU_mutex)); + } + mutex_enter(&(buf_pool->flush_list_mutex)); + mutex_enter(&block->mutex); + mutex_enter(&(buf_pool->mutex)); + + block->io_fix = 0; + /* Write means a flush operation: call the completion routine in the flush system */ buf_flush_write_complete(block); + mutex_exit(&(buf_pool->flush_list_mutex)); + if (flush_type == BUF_FLUSH_LRU) { /* optimistic! */ + mutex_exit(&(buf_pool->LRU_mutex)); + } + rw_lock_s_unlock_gen(&(block->lock), BUF_IO_WRITE); buf_pool->n_pages_written++; + mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); + #ifdef UNIV_DEBUG if (buf_debug_prints) { fputs("Has written ", stderr); @@ -2088,9 +2140,6 @@ #endif /* UNIV_DEBUG */ } - mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); - #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "page space %lu page no %lu\n", @@ -2118,11 +2167,11 @@ freed = buf_LRU_search_and_free_block(100); } - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); } #ifdef UNIV_DEBUG @@ -2142,10 +2191,22 @@ ulint n_flush = 0; ulint n_free = 0; ulint n_page = 0; + ulint n_single_flush_tmp = 0; + ulint n_lru_flush_tmp = 0; + ulint n_list_flush_tmp = 0; ut_ad(buf_pool); + mutex_enter(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); + mutex_enter(&(buf_pool->free_mutex)); + mutex_enter(&(buf_pool->hash_mutex)); + mutex_enter(&(buf_pool->mutex)); + n_single_flush_tmp = buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]; + n_list_flush_tmp = 
buf_pool->n_flush[BUF_FLUSH_LIST]; + n_lru_flush_tmp = buf_pool->n_flush[BUF_FLUSH_LRU]; + mutex_exit(&(buf_pool->mutex)); for (i = 0; i < buf_pool->curr_size; i++) { @@ -2216,11 +2277,14 @@ } ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush); - ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush); - ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush); - ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); - - mutex_exit(&(buf_pool->mutex)); + ut_a(n_single_flush_tmp == n_single_flush); + ut_a(n_list_flush_tmp == n_list_flush); + ut_a(n_lru_flush_tmp == n_lru_flush); + + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); + mutex_exit(&(buf_pool->free_mutex)); + mutex_exit(&(buf_pool->hash_mutex)); ut_a(buf_LRU_validate()); ut_a(buf_flush_validate()); @@ -2252,7 +2316,9 @@ index_ids = mem_alloc(sizeof(dulint) * size); counts = mem_alloc(sizeof(ulint) * size); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); + mutex_enter(&(buf_pool->free_mutex)); fprintf(stderr, "buf_pool size %lu\n" @@ -2305,7 +2371,9 @@ } } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); + mutex_exit(&(buf_pool->free_mutex)); for (i = 0; i < n_found; i++) { index = dict_index_get_if_in_cache(index_ids[i]); @@ -2339,8 +2407,6 @@ ulint i; ulint fixed_pages_number = 0; - mutex_enter(&(buf_pool->mutex)); - for (i = 0; i < buf_pool->curr_size; i++) { block = buf_pool_get_nth_block(buf_pool, i); @@ -2356,7 +2422,6 @@ } } - mutex_exit(&(buf_pool->mutex)); return(fixed_pages_number); } @@ -2385,7 +2450,9 @@ { ulint ratio; - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); + mutex_enter(&(buf_pool->free_mutex)); ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list)) / (1 + UT_LIST_GET_LEN(buf_pool->LRU) @@ -2393,7 +2460,9 @@ /* 1 + is there to avoid 
division by zero */ - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); + mutex_exit(&(buf_pool->free_mutex)); return(ratio); } @@ -2413,6 +2482,9 @@ ut_ad(buf_pool); size = buf_pool->curr_size; + mutex_enter(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); + mutex_enter(&(buf_pool->free_mutex)); mutex_enter(&(buf_pool->mutex)); if (srv_use_awe) { @@ -2487,6 +2559,9 @@ buf_pool->n_pages_written_old = buf_pool->n_pages_written; buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped; + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); + mutex_exit(&(buf_pool->free_mutex)); mutex_exit(&(buf_pool->mutex)); } @@ -2517,8 +2592,6 @@ ut_ad(buf_pool); - mutex_enter(&(buf_pool->mutex)); - for (i = 0; i < buf_pool->curr_size; i++) { block = buf_pool_get_nth_block(buf_pool, i); @@ -2540,8 +2613,6 @@ mutex_exit(&block->mutex); } - mutex_exit(&(buf_pool->mutex)); - return(TRUE); } @@ -2580,11 +2651,11 @@ { ulint len; - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->free_mutex)); len = UT_LIST_GET_LEN(buf_pool->free); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->free_mutex)); return(len); } diff -ruN mysql-5.1.29-rc_orig/storage/innobase/buf/buf0flu.c mysql-5.1.29-rc/storage/innobase/buf/buf0flu.c --- mysql-5.1.29-rc_orig/storage/innobase/buf/buf0flu.c 2008-10-12 06:54:12.000000000 +0900 +++ mysql-5.1.29-rc/storage/innobase/buf/buf0flu.c 2008-11-18 15:26:07.000000000 +0900 @@ -109,13 +109,15 @@ ut_ad(mutex_own(&(buf_pool->mutex))); ut_ad(mutex_own(&block->mutex)); if (block->state != BUF_BLOCK_FILE_PAGE) { + /* It is permitted not to own LRU_mutex.. 
*/ +/* ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Error: buffer block state %lu" " in the LRU list!\n", (ulong)block->state); ut_print_buf(stderr, block, sizeof(buf_block_t)); - +*/ return(FALSE); } @@ -546,18 +548,20 @@ ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST || flush_type == BUF_FLUSH_SINGLE_PAGE); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->hash_mutex)); block = buf_page_hash_get(space, offset); ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE); if (!block) { - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->hash_mutex)); return(0); } mutex_enter(&block->mutex); + mutex_enter(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->hash_mutex)); if (flush_type == BUF_FLUSH_LIST && buf_flush_ready_for_flush(block, flush_type)) { @@ -755,7 +759,7 @@ high = fil_space_get_size(space); } - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->hash_mutex)); for (i = low; i < high; i++) { @@ -789,7 +793,7 @@ mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->hash_mutex)); /* Note: as we release the buf_pool mutex above, in buf_flush_try_page we cannot be sure @@ -800,14 +804,14 @@ count += buf_flush_try_page(space, i, flush_type); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->hash_mutex)); } else { mutex_exit(&block->mutex); } } } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->hash_mutex)); return(count); } @@ -863,6 +867,13 @@ (buf_pool->init_flush)[flush_type] = TRUE; + mutex_exit(&(buf_pool->mutex)); + + if (flush_type == BUF_FLUSH_LRU) { + mutex_enter(&(buf_pool->LRU_mutex)); + } + mutex_enter(&(buf_pool->flush_list_mutex)); + for (;;) { /* If we have flushed enough, leave the loop */ if (page_count >= min_n) { @@ -908,7 +919,10 @@ offset = block->offset; mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(&(buf_pool->LRU_mutex)); + } + mutex_exit(&(buf_pool->flush_list_mutex)); 
old_page_count = page_count; @@ -920,7 +934,10 @@ flush_type, offset, page_count - old_page_count); */ - mutex_enter(&(buf_pool->mutex)); + if (flush_type == BUF_FLUSH_LRU) { + mutex_enter(&(buf_pool->LRU_mutex)); + } + mutex_enter(&(buf_pool->flush_list_mutex)); } else if (flush_type == BUF_FLUSH_LRU) { @@ -943,6 +960,13 @@ } } + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(&(buf_pool->LRU_mutex)); + } + mutex_exit(&(buf_pool->flush_list_mutex)); + + mutex_enter(&(buf_pool->mutex)); + (buf_pool->init_flush)[flush_type] = FALSE; if ((buf_pool->n_flush[flush_type] == 0) @@ -1001,10 +1025,14 @@ ulint n_replaceable; ulint distance = 0; - mutex_enter(&(buf_pool->mutex)); + /* optimistic search... */ + //mutex_enter(&(buf_pool->LRU_mutex)); + //mutex_enter(&(buf_pool->free_mutex)); n_replaceable = UT_LIST_GET_LEN(buf_pool->free); + //mutex_exit(&(buf_pool->free_mutex)); + block = UT_LIST_GET_LAST(buf_pool->LRU); while ((block != NULL) @@ -1025,7 +1053,7 @@ block = UT_LIST_GET_PREV(LRU, block); } - mutex_exit(&(buf_pool->mutex)); + //mutex_exit(&(buf_pool->LRU_mutex)); if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) { @@ -1044,8 +1072,9 @@ immediately, without waiting. 
*/ void -buf_flush_free_margin(void) +buf_flush_free_margin( /*=======================*/ + ibool wait) { ulint n_to_flush; ulint n_flushed; @@ -1055,7 +1084,7 @@ if (n_to_flush > 0) { n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, ut_dulint_zero); - if (n_flushed == ULINT_UNDEFINED) { + if (wait && n_flushed == ULINT_UNDEFINED) { /* There was an LRU type flush batch already running; let us wait for it to end */ @@ -1105,11 +1134,11 @@ { ibool ret; - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); ret = buf_flush_validate_low(); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); return(ret); } diff -ruN mysql-5.1.29-rc_orig/storage/innobase/buf/buf0lru.c mysql-5.1.29-rc/storage/innobase/buf/buf0lru.c --- mysql-5.1.29-rc_orig/storage/innobase/buf/buf0lru.c 2008-10-12 06:54:12.000000000 +0900 +++ mysql-5.1.29-rc/storage/innobase/buf/buf0lru.c 2008-11-18 15:09:58.000000000 +0900 @@ -79,7 +79,10 @@ ibool all_freed; scan_again: - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); + mutex_enter(&(buf_pool->free_mutex)); + mutex_enter(&(buf_pool->hash_mutex)); all_freed = TRUE; @@ -119,7 +122,10 @@ mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); + mutex_exit(&(buf_pool->free_mutex)); + mutex_exit(&(buf_pool->hash_mutex)); /* Note that the following call will acquire an S-latch on the page */ @@ -149,7 +155,10 @@ block = prev_block; } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); + mutex_exit(&(buf_pool->free_mutex)); + mutex_exit(&(buf_pool->hash_mutex)); if (!all_freed) { os_thread_sleep(20000); @@ -172,14 +181,14 @@ ulint len; ulint limit; - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); len = UT_LIST_GET_LEN(buf_pool->LRU); if (len < 
BUF_LRU_OLD_MIN_LEN) { /* The LRU list is too short to do read-ahead */ - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); return(0); } @@ -188,7 +197,7 @@ limit = block->LRU_position - len / BUF_LRU_INITIAL_RATIO; - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); return(limit); } @@ -212,13 +221,15 @@ ulint distance = 0; ibool freed; - mutex_enter(&(buf_pool->mutex)); + /* optimistic search... */ + //mutex_enter(&(buf_pool->LRU_mutex)); +retry: freed = FALSE; block = UT_LIST_GET_LAST(buf_pool->LRU); while (block != NULL) { - ut_a(block->in_LRU_list); + //ut_a(block->in_LRU_list); /* optimistic */ mutex_enter(&block->mutex); @@ -234,9 +245,17 @@ } #endif /* UNIV_DEBUG */ + mutex_exit(&block->mutex); + + mutex_enter(&(buf_pool->LRU_mutex));/* optimistic */ + + mutex_enter(&(buf_pool->hash_mutex)); + mutex_enter(&block->mutex); + if(block->in_LRU_list && buf_flush_ready_for_replace(block)) { buf_LRU_block_remove_hashed_page(block); + mutex_exit(&(buf_pool->hash_mutex)); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); mutex_exit(&block->mutex); /* Remove possible adaptive hash index built on the @@ -257,14 +276,25 @@ ut_a(block->buf_fix_count == 0); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->free_mutex)); mutex_enter(&block->mutex); buf_LRU_block_free_hashed_page(block); freed = TRUE; + mutex_exit(&(buf_pool->free_mutex)); mutex_exit(&block->mutex); break; + } else { /* someone may interrupt...??? 
*/ + mutex_exit(&(buf_pool->LRU_mutex));/* optimistic */ + + mutex_exit(&(buf_pool->hash_mutex)); + + if (!(block->in_LRU_list)) { + mutex_exit(&block->mutex); + goto retry; + } + } } mutex_exit(&block->mutex); @@ -275,13 +305,21 @@ if (!freed && n_iterations <= 10 && distance > 100 + (n_iterations * buf_pool->curr_size) / 10) { - buf_pool->LRU_flush_ended = 0; + mutex_enter(&(buf_pool->mutex)); + buf_pool->LRU_flush_ended = 0; mutex_exit(&(buf_pool->mutex)); + //mutex_exit(&(buf_pool->LRU_mutex)); + return(FALSE); } } + if (!freed) { + //mutex_exit(&(buf_pool->LRU_mutex)); + } + + mutex_enter(&(buf_pool->mutex)); if (buf_pool->LRU_flush_ended > 0) { buf_pool->LRU_flush_ended--; } @@ -333,7 +371,8 @@ { ibool ret = FALSE; - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->free_mutex)); if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 4) { @@ -341,7 +380,8 @@ ret = TRUE; } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&(buf_pool->free_mutex)); return(ret); } @@ -364,7 +404,7 @@ ibool mon_value_was = FALSE; ibool started_monitor = FALSE; loop: - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->free_mutex)); /* LRU info:optimistic */ if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 20) { @@ -461,7 +501,7 @@ mutex_exit(&block->mutex); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->free_mutex)); if (started_monitor) { srv_print_innodb_monitor = mon_value_was; @@ -473,7 +513,7 @@ /* If no block was in the free list, search from the end of the LRU list and try to free a block there */ - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->free_mutex)); freed = buf_LRU_search_and_free_block(n_iterations); @@ -517,7 +557,7 @@ /* No free block was found: try to flush the LRU list */ - buf_flush_free_margin(); + 
buf_flush_free_margin(TRUE); ++srv_buf_pool_wait_free; os_aio_simulated_wake_handler_threads(); @@ -988,7 +1028,7 @@ ulint LRU_pos; ut_ad(buf_pool); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { @@ -1033,6 +1073,9 @@ ut_a(buf_pool->LRU_old_len == old_len); } + mutex_exit(&(buf_pool->LRU_mutex)); + mutex_enter(&(buf_pool->free_mutex)); + UT_LIST_VALIDATE(free, buf_block_t, buf_pool->free); block = UT_LIST_GET_FIRST(buf_pool->free); @@ -1043,7 +1086,7 @@ block = UT_LIST_GET_NEXT(free, block); } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->free_mutex)); return(TRUE); } @@ -1059,7 +1102,7 @@ ulint len; ut_ad(buf_pool); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->LRU_mutex)); fprintf(stderr, "Pool ulint clock %lu\n", (ulong) buf_pool->ulint_clock); @@ -1105,6 +1148,6 @@ } } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->LRU_mutex)); } #endif /* UNIV_DEBUG */ diff -ruN mysql-5.1.29-rc_orig/storage/innobase/buf/buf0rea.c mysql-5.1.29-rc/storage/innobase/buf/buf0rea.c --- mysql-5.1.29-rc_orig/storage/innobase/buf/buf0rea.c 2008-10-12 06:54:12.000000000 +0900 +++ mysql-5.1.29-rc/storage/innobase/buf/buf0rea.c 2008-11-18 15:28:13.000000000 +0900 @@ -219,10 +219,12 @@ return(0); } + mutex_exit(&(buf_pool->mutex)); /* Count how many blocks in the area have been recently accessed, that is, reside near the start of the LRU list. 
*/ + mutex_enter(&(buf_pool->hash_mutex)); for (i = low; i < high; i++) { block = buf_page_hash_get(space, i); @@ -233,8 +235,9 @@ recent_blocks++; } } + mutex_exit(&(buf_pool->hash_mutex)); - mutex_exit(&(buf_pool->mutex)); + // mutex_exit(&(buf_pool->mutex)); if (recent_blocks < BUF_READ_AHEAD_RANDOM_THRESHOLD) { /* Do nothing */ @@ -334,7 +337,7 @@ } /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + buf_flush_free_margin(FALSE); return(count + count2); } @@ -432,6 +435,7 @@ return(0); } + mutex_exit(&(buf_pool->mutex)); /* Check that almost all pages in the area have been accessed; if offset == low, the accesses must be in a descending order, otherwise, @@ -445,6 +449,7 @@ fail_count = 0; + mutex_enter(&(buf_pool->hash_mutex)); for (i = low; i < high; i++) { block = buf_page_hash_get(space, i); @@ -462,12 +467,13 @@ pred_block = block; } } + mutex_exit(&(buf_pool->hash_mutex)); if (fail_count > BUF_READ_AHEAD_LINEAR_AREA - BUF_READ_AHEAD_LINEAR_THRESHOLD) { /* Too many failures: return */ - mutex_exit(&(buf_pool->mutex)); + //mutex_exit(&(buf_pool->mutex)); return(0); } @@ -475,10 +481,11 @@ /* If we got this far, we know that enough pages in the area have been accessed in the right order: linear read-ahead can be sensible */ + mutex_enter(&(buf_pool->hash_mutex)); block = buf_page_hash_get(space, offset); if (block == NULL) { - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->hash_mutex)); return(0); } @@ -494,7 +501,7 @@ pred_offset = fil_page_get_prev(frame); succ_offset = fil_page_get_next(frame); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->hash_mutex)); if ((offset == low) && (succ_offset == offset + 1)) { @@ -573,7 +580,7 @@ os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + buf_flush_free_margin(FALSE); #ifdef UNIV_DEBUG if (buf_debug_prints && (count > 0)) { @@ -639,7 +646,7 @@ 
os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + buf_flush_free_margin(FALSE); #ifdef UNIV_DEBUG if (buf_debug_prints) { @@ -716,7 +723,7 @@ os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + buf_flush_free_margin(FALSE); #ifdef UNIV_DEBUG if (buf_debug_prints) { diff -ruN mysql-5.1.29-rc_orig/storage/innobase/include/buf0buf.h mysql-5.1.29-rc/storage/innobase/include/buf0buf.h --- mysql-5.1.29-rc_orig/storage/innobase/include/buf0buf.h 2008-10-12 06:54:13.000000000 +0900 +++ mysql-5.1.29-rc/storage/innobase/include/buf0buf.h 2008-11-18 15:09:58.000000000 +0900 @@ -926,6 +926,7 @@ currently always the same as max_size */ hash_table_t* page_hash; /* hash table of the file pages */ + mutex_t hash_mutex; ulint n_pend_reads; /* number of pending read operations */ @@ -958,6 +959,7 @@ UT_LIST_BASE_NODE_T(buf_block_t) flush_list; /* base node of the modified block list */ + mutex_t flush_list_mutex; ibool init_flush[BUF_FLUSH_LIST + 1]; /* this is TRUE when a flush of the given type is being initialized */ @@ -991,8 +993,10 @@ in the case of AWE, at the start are always free blocks for which the physical memory is mapped to a frame */ + mutex_t free_mutex; UT_LIST_BASE_NODE_T(buf_block_t) LRU; /* base node of the LRU list */ + mutex_t LRU_mutex; buf_block_t* LRU_old; /* pointer to the about 3/8 oldest blocks in the LRU list; NULL if LRU length less than BUF_LRU_OLD_MIN_LEN */ diff -ruN mysql-5.1.29-rc_orig/storage/innobase/include/buf0buf.ic mysql-5.1.29-rc/storage/innobase/include/buf0buf.ic --- mysql-5.1.29-rc_orig/storage/innobase/include/buf0buf.ic 2008-10-12 06:54:13.000000000 +0900 +++ mysql-5.1.29-rc/storage/innobase/include/buf0buf.ic 2008-11-18 15:09:58.000000000 +0900 @@ -104,7 +104,7 @@ buf_block_t* block; dulint lsn; - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); block 
= UT_LIST_GET_LAST(buf_pool->flush_list); @@ -114,7 +114,7 @@ lsn = block->oldest_modification; } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); return(lsn); } @@ -388,18 +388,18 @@ /* out: TRUE if io going on */ buf_block_t* block) /* in: buf_pool block, must be bufferfixed */ { - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); ut_ad(block->state == BUF_BLOCK_FILE_PAGE); ut_ad(block->buf_fix_count > 0); if (block->io_fix != 0) { - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); return(TRUE); } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); return(FALSE); } @@ -421,7 +421,7 @@ block = buf_block_align(frame); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); if (block->state == BUF_BLOCK_FILE_PAGE) { lsn = block->newest_modification; @@ -429,7 +429,7 @@ lsn = ut_dulint_zero; } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); return(lsn); } @@ -624,9 +624,9 @@ ut_a(block->buf_fix_count > 0); if (rw_latch == RW_X_LATCH && mtr->modifications) { - mutex_enter(&buf_pool->mutex); + mutex_enter(&buf_pool->flush_list_mutex); buf_flush_note_modification(block, mtr); - mutex_exit(&buf_pool->mutex); + mutex_exit(&buf_pool->flush_list_mutex); } mutex_enter(&block->mutex); diff -ruN mysql-5.1.29-rc_orig/storage/innobase/include/buf0flu.h mysql-5.1.29-rc/storage/innobase/include/buf0flu.h --- mysql-5.1.29-rc_orig/storage/innobase/include/buf0flu.h 2008-10-12 06:54:13.000000000 +0900 +++ mysql-5.1.29-rc/storage/innobase/include/buf0flu.h 2008-11-18 15:09:58.000000000 +0900 @@ -26,8 +26,9 @@ a margin of replaceable pages there. */ void -buf_flush_free_margin(void); +buf_flush_free_margin( /*=======================*/ + ibool wait); /************************************************************************ Initializes a page for writing to the tablespace. 
*/ diff -ruN mysql-5.1.29-rc_orig/storage/innobase/include/buf0flu.ic mysql-5.1.29-rc/storage/innobase/include/buf0flu.ic --- mysql-5.1.29-rc_orig/storage/innobase/include/buf0flu.ic 2008-10-12 06:54:13.000000000 +0900 +++ mysql-5.1.29-rc/storage/innobase/include/buf0flu.ic 2008-11-18 15:09:58.000000000 +0900 @@ -84,7 +84,7 @@ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); ut_ad(ut_dulint_cmp(block->newest_modification, end_lsn) <= 0); @@ -102,5 +102,5 @@ start_lsn) <= 0); } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); } diff -ruN mysql-5.1.29-rc_orig/patch_info/split_buf_pool_mutex_fixed_optimistic_safe.info mysql-5.1.29-rc/patch_info/split_buf_pool_mutex_fixed_optimistic_safe.info --- /dev/null 1970-01-01 09:00:00.000000000 +0900 +++ mysql-5.1.29-rc/patch_info/split_buf_pool_mutex_fixed_optimistic_safe.info 2008-11-18 15:09:58.000000000 +0900 @@ -0,0 +1,6 @@ +File=split_buf_pool_mutex_fixed_optimistic_safe.patch +Name=InnoDB patch to fix buffer pool scalability +Version=1.0 +Author=Yasufumi Kinoshita +License=BSD +Comment=