5 #define my_b_tell(info) ((info)->pos_in_file + \
6 (size_t) (*(info)->current_pos - (info)->request_pos))
7 +#define my_b_write_tell(info) ((info)->pos_in_file + \
8 + ((info)->write_pos - (info)->write_buffer))
10 #define my_b_get_buffer_start(info) (info)->request_pos
11 #define my_b_get_bytes_in_buffer(info) (char*) (info)->read_end - \
12 --- a/include/mysql/plugin.h
13 +++ b/include/mysql/plugin.h
16 #define EXTENDED_FOR_USERSTAT
18 +#define EXTENDED_FOR_COMMIT_ORDERED
21 Create a temporary file.
26 static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
27 uint known_extensions_id= 0;
29 +static int commit_one_phase_low(THD *thd, bool all, THD_TRANS *trans,
30 + bool is_real_trans);
33 static plugin_ref ha_default_plugin(THD *thd)
36 bool is_real_trans= all || thd->transaction.all.ha_list == 0;
37 Ha_trx_info *ha_info= trans->ha_list;
38 - my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
39 + bool need_commit_ordered;
41 DBUG_ENTER("ha_commit_trans");
44 @@ -1152,13 +1155,20 @@
51 + /* Free resources and perform other cleanup even for 'empty' transactions. */
53 + thd->transaction.cleanup();
60 MDL_request mdl_request;
62 - DBUG_EXECUTE_IF("crash_commit_before", DBUG_SUICIDE(););
63 + DBUG_EXECUTE_IF("crash_commit_before", abort(););
65 /* Close all cursors that can not survive COMMIT */
66 if (is_real_trans) /* not a statement commit */
67 @@ -1197,57 +1207,80 @@
70 my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
71 - ha_rollback_trans(thd, all);
77 - if (!trans->no_2pc && (rw_ha_count > 1))
78 + if (trans->no_2pc || (rw_ha_count <= 1))
80 - for (; ha_info && !error; ha_info= ha_info->next())
81 + error= ha_commit_one_phase(thd, all);
82 + DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
86 + need_commit_ordered= FALSE;
87 + xid= thd->transaction.xid_state.xid.get_my_xid();
89 + for (Ha_trx_info *hi= ha_info; hi; hi= hi->next())
92 - handlerton *ht= ha_info->ht();
93 + handlerton *ht= hi->ht();
95 Do not call two-phase commit if this particular
96 transaction is read-only. This allows for simpler
97 implementation in engines that are always read-only.
99 - if (! ha_info->is_trx_read_write())
100 + if (! hi->is_trx_read_write())
103 Sic: we know that prepare() is not NULL since otherwise
104 trans->no_2pc would have been set.
106 - if ((err= ht->prepare(ht, thd, all)))
108 - my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
111 + err= ht->prepare(ht, thd, all);
112 status_var_increment(thd->status_var.ha_prepare_count);
114 + my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
119 + need_commit_ordered|= (ht->commit_ordered != NULL);
121 - DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_SUICIDE(););
122 - if (error || (is_real_trans && xid &&
123 - (error= !(cookie= tc_log->log_xid(thd, xid)))))
124 + DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT(););
126 + if (!is_real_trans)
128 - ha_rollback_trans(thd, all);
130 + error= commit_one_phase_low(thd, all, trans, is_real_trans);
131 + DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
134 - DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_SUICIDE(););
136 - error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
137 - DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_SUICIDE(););
140 + cookie= tc_log->log_and_order(thd, xid, all, need_commit_ordered);
144 + DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT(););
146 + error= commit_one_phase_low(thd, all, trans, is_real_trans) ? 2 : 0;
147 + DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
148 + if (is_real_trans) /* userstat.patch */
149 + thd->diff_commit_trans++; /* userstat.patch */
150 + RUN_HOOK(transaction, after_commit, (thd, FALSE));
152 + DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT(););
153 if(tc_log->unlog(cookie, xid))
158 - DBUG_EXECUTE_IF("crash_commit_after", DBUG_SUICIDE(););
160 - thd->diff_commit_trans++;
161 - RUN_HOOK(transaction, after_commit, (thd, FALSE));
163 + DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
166 + /* Come here if error and we need to rollback. */
168 + error= 1; /* Transaction was rolled back */
169 + ha_rollback_trans(thd, all);
172 if (rw_trans && mdl_request.ticket)
174 @@ -1260,9 +1293,6 @@
175 thd->mdl_context.release_lock(mdl_request.ticket);
178 - /* Free resources and perform other cleanup even for 'empty' transactions. */
179 - else if (is_real_trans)
180 - thd->transaction.cleanup();
184 @@ -1279,7 +1309,6 @@
186 int ha_commit_one_phase(THD *thd, bool all)
189 THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
191 "real" is a nick name for a transaction for which a commit will
192 @@ -1295,8 +1324,16 @@
193 transaction.all.ha_list, see why in trans_register_ha()).
195 bool is_real_trans=all || thd->transaction.all.ha_list == 0;
196 - Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
197 DBUG_ENTER("ha_commit_one_phase");
198 + DBUG_RETURN(commit_one_phase_low(thd, all, trans, is_real_trans));
202 +commit_one_phase_low(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans)
205 + Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
206 + DBUG_ENTER("commit_one_phase_low");
210 @@ -1894,7 +1931,16 @@
215 + Holding the LOCK_commit_ordered mutex ensures that we get the same
216 + snapshot for all engines (including the binary log). This allows us
217 + among other things to do backups with
218 + START TRANSACTION WITH CONSISTENT SNAPSHOT and
219 + have a consistent binlog position.
221 + mysql_mutex_lock(&LOCK_commit_ordered);
222 plugin_foreach(thd, snapshot_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &warn);
223 + mysql_mutex_unlock(&LOCK_commit_ordered);
226 Same idea as when one wants to CREATE TABLE in one engine which does not
230 and 'real commit' mean the same event.
232 int (*commit)(handlerton *hton, THD *thd, bool all);
234 + The commit_ordered() method is called prior to the commit() method, after
235 + the transaction manager has decided to commit (not rollback) the
236 + transaction. Unlike commit(), commit_ordered() is called only when the
237 + full transaction is committed, not for each commit of statement
238 + transaction in a multi-statement transaction.
240 + Note that, like prepare(), commit_ordered() is only called when 2-phase
241 + commit takes place. I.e., when there is no binary log and only a single
242 + engine participates in a transaction, only commit() is called, not
243 + commit_ordered(). So engines must be prepared for this.
245 + The calls to commit_ordered() in multiple parallel transactions are
246 + guaranteed to happen in the same order in every participating
247 + handler. This can be used to ensure the same commit order among multiple
248 + handlers (eg. in table handler and binlog). So if transaction T1 calls
249 + into commit_ordered() of handler A before T2, then T1 will also call
250 + commit_ordered() of handler B before T2.
252 + Engines that implement this method should during this call make the
253 + transaction visible to other transactions, thereby making the order of
254 + transaction commits be defined by the order of commit_ordered() calls.
256 + The intention is that commit_ordered() should do the minimal amount of
257 + work that needs to happen in consistent commit order among handlers. To
258 + preserve ordering, calls need to be serialised on a global mutex, so
259 + doing any time-consuming or blocking operations in commit_ordered() will
262 + Handlers can rely on commit_ordered() calls to be serialised (no two
263 + calls can run in parallel, so no extra locking on the handler part is
264 + required to ensure this).
266 + Note that commit_ordered() can be called from a different thread than the
267 + one handling the transaction! So it can not do anything that depends on
268 + thread local storage, in particular it can not call my_error() and
269 + friends (instead it can store the error code and delay the call of
270 + my_error() to the commit() method).
272 + Similarly, since commit_ordered() returns void, any return error code
273 + must be saved and returned from the commit() method instead.
275 + The commit_ordered method is optional, and can be left unset if not
276 + needed in a particular handler (then there will be no ordering guarantees
277 + wrt. other engines and binary log).
279 + void (*commit_ordered)(handlerton *hton, THD *thd, bool all);
280 int (*rollback)(handlerton *hton, THD *thd, bool all);
281 int (*prepare)(handlerton *hton, THD *thd, bool all);
282 int (*recover)(handlerton *hton, XID *xid_list, uint len);
286 static int binlog_rollback(handlerton *hton, THD *thd, bool all);
287 static int binlog_prepare(handlerton *hton, THD *thd, bool all);
289 +static LEX_STRING const write_error_msg=
290 + { C_STRING_WITH_LEN("error writing to the binary log") };
292 +static my_bool mutexes_inited;
293 +mysql_mutex_t LOCK_group_commit_queue;
294 +mysql_mutex_t LOCK_commit_ordered;
296 +static ulonglong binlog_status_var_num_commits;
297 +static ulonglong binlog_status_var_num_group_commits;
299 +static SHOW_VAR binlog_status_vars_detail[]=
302 + (char *)&binlog_status_var_num_commits, SHOW_LONGLONG},
304 + (char *)&binlog_status_var_num_group_commits, SHOW_LONGLONG},
305 + {NullS, NullS, SHOW_LONG}
309 purge logs, master and slave sides both, related error code
315 - Helper class to hold a mutex for the duration of the
318 - Eliminates the need for explicit unlocking of mutexes on, e.g.,
319 - error returns. On passing a null pointer, the sentry will not do
325 - Mutex_sentry(mysql_mutex_t *mutex)
329 - mysql_mutex_lock(mutex);
335 - mysql_mutex_unlock(m_mutex);
342 - mysql_mutex_t *m_mutex;
344 - // It's not allowed to copy this object in any way
345 - Mutex_sentry(Mutex_sentry const&);
346 - void operator=(Mutex_sentry const&);
350 Helper classes to store non-transactional and transactional data
351 before copying it to the binary log.
354 binlog_cache_data(): m_pending(0), before_stmt_pos(MY_OFF_T_UNDEF),
355 incident(FALSE), changes_to_non_trans_temp_table_flag(FALSE),
356 saved_max_binlog_cache_size(0), ptr_binlog_cache_use(0),
357 - ptr_binlog_cache_disk_use(0)
358 + ptr_binlog_cache_disk_use(0), commit_bin_log_file_pos(0),
359 + using_xa(FALSE), xa_xid(0)
364 variable after truncating the cache.
366 cache_log.disk_writes= 0;
368 + commit_bin_log_file_pos= 0;
369 DBUG_ASSERT(empty());
374 binlog_cache_data& operator=(const binlog_cache_data& info);
375 binlog_cache_data(const binlog_cache_data& info);
379 + Binlog position after current commit, available to storage engines during
380 + commit_ordered() and commit().
382 + ulonglong commit_bin_log_file_pos;
385 + Flag set true if this transaction is committed with log_xid() as part of
392 class binlog_cache_mngr {
393 @@ -1624,7 +1625,7 @@
396 binlog_flush_cache(THD *thd, binlog_cache_data* cache_data, Log_event *end_evt,
397 - bool is_transactional)
398 + bool is_transactional, bool all)
400 DBUG_ENTER("binlog_flush_cache");
402 @@ -1643,8 +1644,8 @@
403 were, we would have to ensure that we're not ending a statement
404 inside a stored function.
406 - error= mysql_bin_log.write(thd, &cache_data->cache_log, end_evt,
407 - cache_data->has_incident());
408 + error= mysql_bin_log.write_transaction_to_binlog(thd, cache_data,
413 @@ -1663,12 +1664,12 @@
416 binlog_commit_flush_stmt_cache(THD *thd,
417 - binlog_cache_mngr *cache_mngr)
418 + binlog_cache_mngr *cache_mngr, bool all)
420 Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
421 FALSE, FALSE, TRUE, 0);
422 return (binlog_flush_cache(thd, &cache_mngr->stmt_cache, &end_evt,
428 @@ -1681,12 +1682,12 @@
429 nonzero if an error pops up when flushing the cache.
432 -binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr)
433 +binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr, bool all)
435 Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
436 TRUE, FALSE, TRUE, 0);
437 return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
443 @@ -1699,12 +1700,12 @@
444 nonzero if an error pops up when flushing the cache.
447 -binlog_rollback_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr)
448 +binlog_rollback_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr, bool all)
450 Query_log_event end_evt(thd, STRING_WITH_LEN("ROLLBACK"),
451 TRUE, FALSE, TRUE, 0);
452 return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
458 @@ -1719,11 +1720,11 @@
461 binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr,
463 + my_xid xid, bool all)
465 Xid_log_event end_evt(thd, xid);
466 return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
472 @@ -1785,7 +1786,7 @@
474 just pretend we can do 2pc, so that MySQL won't
476 - real work will be done in MYSQL_BIN_LOG::log_xid()
477 + real work will be done in MYSQL_BIN_LOG::log_and_order()
481 @@ -1818,7 +1819,7 @@
483 if (!cache_mngr->stmt_cache.empty())
485 - error= binlog_commit_flush_stmt_cache(thd, cache_mngr);
486 + error= binlog_commit_flush_stmt_cache(thd, cache_mngr, all);
489 if (cache_mngr->trx_cache.empty())
490 @@ -1837,7 +1838,7 @@
491 Otherwise, we accumulate the changes.
493 if (!error && ending_trans(thd, all))
494 - error= binlog_commit_flush_trx_cache(thd, cache_mngr);
495 + error= binlog_commit_flush_trx_cache(thd, cache_mngr, all);
498 This is part of the stmt rollback.
499 @@ -1881,7 +1882,7 @@
501 else if (!cache_mngr->stmt_cache.empty())
503 - error= binlog_commit_flush_stmt_cache(thd, cache_mngr);
504 + error= binlog_commit_flush_stmt_cache(thd, cache_mngr, all);
507 if (cache_mngr->trx_cache.empty())
508 @@ -1929,7 +1930,7 @@
509 (trans_has_updated_non_trans_table(thd) &&
510 ending_single_stmt_trans(thd,all) &&
511 thd->variables.binlog_format == BINLOG_FORMAT_MIXED)))
512 - error= binlog_rollback_flush_trx_cache(thd, cache_mngr);
513 + error= binlog_rollback_flush_trx_cache(thd, cache_mngr, all);
515 Truncate the cache if:
516 . aborting a single or multi-statement transaction or;
517 @@ -2904,6 +2905,7 @@
518 MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period)
519 :bytes_written(0), prepared_xids(0), file_id(1), open_count(1),
520 need_start_event(TRUE),
521 + group_commit_queue(0), num_commits(0), num_group_commits(0),
522 sync_period_ptr(sync_period),
523 is_relay_log(0), signal_cnt(0),
524 description_event_for_exec(0), description_event_for_queue(0)
525 @@ -5361,19 +5363,15 @@
528 cache Cache to write to the binary log
529 - lock_log True if the LOCK_log mutex should be aquired, false otherwise
530 - sync_log True if the log should be flushed and synced
533 Write the contents of the cache to the binary log. The cache will
534 be reset as a READ_CACHE to be able to read the contents from it.
537 -int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache,
538 - bool lock_log, bool sync_log)
539 +int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache)
541 - Mutex_sentry sentry(lock_log ? &LOCK_log : NULL);
543 + mysql_mutex_assert_owner(&LOCK_log);
544 if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
545 return ER_ERROR_ON_WRITE;
546 uint length= my_b_bytes_in_cache(cache), group, carry, hdr_offs;
547 @@ -5484,6 +5482,8 @@
550 /* Write data to the binary log file */
551 + DBUG_EXECUTE_IF("fail_binlog_write_1",
552 + errno= 28; return ER_ERROR_ON_WRITE;);
553 if (my_b_write(&log_file, cache->read_pos, length))
554 return ER_ERROR_ON_WRITE;
555 thd->binlog_bytes_written+= length;
556 @@ -5492,9 +5492,6 @@
558 DBUG_ASSERT(carry == 0);
561 - return flush_and_sync(0);
566 @@ -5535,8 +5532,6 @@
570 - LEX_STRING const write_error_msg=
571 - { C_STRING_WITH_LEN("error writing to the binary log") };
572 Incident incident= INCIDENT_LOST_EVENTS;
573 Incident_log_event ev(thd, incident, write_error_msg);
575 @@ -5585,112 +5580,332 @@
576 'cache' needs to be reinitialized after this functions returns.
579 -bool MYSQL_BIN_LOG::write(THD *thd, IO_CACHE *cache, Log_event *commit_event,
582 +MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd, binlog_cache_data *cache_data,
583 + Log_event *end_ev, bool all)
585 + group_commit_entry entry;
587 + DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_to_binlog");
590 + entry.cache_data= cache_data;
595 + Log "BEGIN" at the beginning of every transaction. Here, a transaction is
596 + either a BEGIN..COMMIT block or a single statement in autocommit mode.
598 + Create the necessary events here, where we have the correct THD (and
601 + Due to group commit the actual writing to binlog may happen in a different
604 + Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, FALSE, TRUE, 0);
605 + entry.begin_event= &qinfo;
606 + entry.end_event= end_ev;
607 + if (cache_data->has_incident())
609 + Incident_log_event inc_ev(thd, INCIDENT_LOST_EVENTS, write_error_msg);
610 + entry.incident_event= &inc_ev;
611 + ret = write_transaction_to_binlog_events(&entry);
615 + entry.incident_event= NULL;
616 + ret = write_transaction_to_binlog_events(&entry);
618 + if (!ret) /* userstat.patch */
619 + thd->binlog_bytes_written += qinfo.data_written; /* userstat.patch */
624 +MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
626 - DBUG_ENTER("MYSQL_BIN_LOG::write(THD *, IO_CACHE *, Log_event *)");
628 + To facilitate group commit for the binlog, we first queue up ourselves in
629 + the group commit queue. Then the first thread to enter the queue waits for
630 + the LOCK_log mutex, and commits for everyone in the queue once it gets the
631 + lock. Any other threads in the queue just wait for the first one to finish
632 + the commit and wake them up.
634 + entry->thd->clear_wakeup_ready();
635 + mysql_mutex_lock(&LOCK_group_commit_queue);
636 + group_commit_entry *orig_queue= group_commit_queue;
637 + entry->next= orig_queue;
638 + group_commit_queue= entry;
639 + DEBUG_SYNC(entry->thd, "commit_group_commit_queue");
640 + mysql_mutex_unlock(&LOCK_group_commit_queue);
643 + The first in the queue handles group commit for all; the others just wait
644 + to be signalled when group commit is done.
646 + if (orig_queue != NULL)
647 + entry->thd->wait_for_wakeup_ready();
649 + trx_group_commit_leader(entry);
651 + if (likely(!entry->error))
654 + switch (entry->error)
656 + case ER_ERROR_ON_WRITE:
657 + my_error(ER_ERROR_ON_WRITE, MYF(ME_NOREFRESH), name, entry->commit_errno);
659 + case ER_ERROR_ON_READ:
660 + my_error(ER_ERROR_ON_READ, MYF(ME_NOREFRESH),
661 + entry->cache_data->cache_log.file_name, entry->commit_errno);
665 + There are not (and should not be) any errors thrown not covered above.
666 + But just in case one is added later without updating the above switch
667 + statement, include a catch-all.
669 + my_printf_error(entry->error,
670 + "Error writing transaction to binary log: %d",
671 + MYF(ME_NOREFRESH), entry->error);
675 + Since we return error, this transaction XID will not be committed, so
676 + we need to mark it as not needed for recovery (unlog() is not called
677 + for a transaction if log_xid() fails).
679 + if (entry->cache_data->using_xa && entry->cache_data->xa_xid)
686 + Do binlog group commit as the lead thread.
688 + This must be called when this thread/transaction is queued at the start of
689 + the group_commit_queue. It will wait to obtain the LOCK_log mutex, then group
690 + commit all the transactions in the queue (more may have entered while waiting
691 + for LOCK_log). After commit is done, all other threads in the queue will be
696 +MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
698 + DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader");
700 + uint write_count= 0;
701 + bool check_purge= false;
702 + group_commit_entry *current= 0;
703 DBUG_ASSERT(is_open());
704 if (likely(is_open())) // Should always be true
709 + Lock the LOCK_log mutex, and once we get it, collect any additional writes
710 + that queued up while we were waiting.
712 mysql_mutex_lock(&LOCK_log);
714 + DEBUG_SYNC(leader->thd, "commit_after_get_LOCK_log");
715 + mysql_mutex_lock(&LOCK_group_commit_queue);
716 + current= group_commit_queue;
717 + group_commit_queue= NULL;
718 + mysql_mutex_unlock(&LOCK_group_commit_queue);
720 + /* As the queue is in reverse order of entering, reverse it. */
721 + group_commit_entry *queue= NULL;
724 + group_commit_entry *next= current->next;
725 + current->next= queue;
729 + DBUG_ASSERT(leader == queue /* the leader should be first in queue */);
731 - We only bother to write to the binary log if there is anything
734 - if (my_b_tell(cache) > 0)
735 + Now we have in queue the list of transactions to be committed in order.
737 + Commit every transaction in the queue.
739 + Note that we are doing this in a different thread than the one running
740 + the transaction! So we are limited in the operations we can do. In
741 + particular, we cannot call my_error() on behalf of a transaction, as
742 + that obtains the THD from thread local storage. Instead, we must set
743 + current->error and let the thread do the error reporting itself once
746 + for (current= queue; current != NULL; current= current->next)
748 + binlog_cache_data *cache_data= current->cache_data;
749 + IO_CACHE *cache= &cache_data->cache_log;
752 - Log "BEGIN" at the beginning of every transaction. Here, a
753 - transaction is either a BEGIN..COMMIT block or a single
754 - statement in autocommit mode.
755 + We only bother to write to the binary log if there is anything
758 - Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, FALSE, TRUE, 0);
759 - if (qinfo.write(&log_file))
761 - thd->binlog_bytes_written+= qinfo.data_written;
762 - DBUG_EXECUTE_IF("crash_before_writing_xid",
764 - if ((write_error= write_cache(thd, cache, false, true)))
765 - DBUG_PRINT("info", ("error writing binlog cache: %d",
767 - DBUG_PRINT("info", ("crashing before writing xid"));
771 - if ((write_error= write_cache(thd, cache, false, false)))
774 - if (commit_event && commit_event->write(&log_file))
777 - thd->binlog_bytes_written+= commit_event->data_written;
778 + if (my_b_tell(cache) > 0)
780 + if ((current->error= write_transaction(current)))
781 + current->commit_errno= errno;
785 - if (incident && write_incident(thd, FALSE))
787 + cache_data->commit_bin_log_file_pos= my_b_write_tell(&log_file);
788 + if (cache_data->using_xa && cache_data->xa_xid)
792 + if (write_count > 0)
795 if (flush_and_sync(&synced))
797 - DBUG_EXECUTE_IF("half_binlogged_transaction", DBUG_SUICIDE(););
798 - if (cache->error) // Error on read
800 - sql_print_error(ER(ER_ERROR_ON_READ), cache->file_name, errno);
801 - write_error=1; // Don't give more errors
803 + for (current= queue; current != NULL; current= current->next)
805 + if (!current->error)
807 + current->error= ER_ERROR_ON_WRITE;
808 + current->commit_errno= errno;
817 if (RUN_HOOK(binlog_storage, after_flush,
818 - (thd, log_file_name, log_file.pos_in_file, synced)))
819 + (leader->thd, log_file_name, log_file.pos_in_file, synced)))
821 sql_print_error("Failed to run 'after_flush' hooks");
824 + for (current= queue; current != NULL; current= current->next)
826 + if (!current->error)
828 + current->error= ER_ERROR_ON_WRITE;
829 + current->commit_errno= errno;
838 - if commit_event is Xid_log_event, increase the number of
839 - prepared_xids (it's decreasd in ::unlog()). Binlog cannot be rotated
840 + if any commit_events are Xid_log_event, increase the number of
841 + prepared_xids (it's decreased in ::unlog()). Binlog cannot be rotated
842 if there're prepared xids in it - see the comment in new_file() for
844 - If the commit_event is not Xid_log_event (then it's a Query_log_event)
845 - rotate binlog, if necessary.
846 + If no Xid_log_events (then it's all Query_log_event) rotate binlog,
849 - if (commit_event && commit_event->get_type_code() == XID_EVENT)
852 - mysql_mutex_lock(&LOCK_prep_xids);
854 - mysql_mutex_unlock(&LOCK_prep_xids);
855 - mysql_mutex_unlock(&LOCK_log);
856 + mark_xids_active(xid_count);
860 if (rotate(false, &check_purge))
862 - mysql_mutex_unlock(&LOCK_log);
866 + for (current= queue; current != NULL; current= current->next)
868 + if (!current->error)
870 + current->error= ER_ERROR_ON_WRITE;
871 + current->commit_errno= errno;
879 + DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered");
880 + mysql_mutex_lock(&LOCK_commit_ordered);
882 + We cannot unlock LOCK_log until we have locked LOCK_commit_ordered;
883 + otherwise scheduling could allow the next group commit to run ahead of us,
884 + messing up the order of commit_ordered() calls. But as soon as
885 + LOCK_commit_ordered is obtained, we can let the next group commit start.
892 - sql_print_error(ER(ER_ERROR_ON_WRITE), name, errno);
893 + mysql_mutex_unlock(&LOCK_log);
895 + if (xid_count > 0 && check_purge)
900 + DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log");
901 + ++num_group_commits;
904 + Wakeup each participant waiting for our group commit, first calling the
905 + commit_ordered() methods for any transactions doing 2-phase commit.
908 + while (current != NULL)
910 + group_commit_entry *next;
912 + DEBUG_SYNC(leader->thd, "commit_loop_entry_commit_ordered");
914 + if (current->cache_data->using_xa && !current->error)
915 + run_commit_ordered(current->thd, current->all);
918 + Careful not to access current->next after waking up the other thread! As
919 + it may change immediately after wakeup.
921 + next= current->next;
922 + if (current != leader) // Don't wake up ourself
923 + current->thd->signal_wakeup_ready();
926 + DEBUG_SYNC(leader->thd, "commit_after_group_run_commit_ordered");
927 + mysql_mutex_unlock(&LOCK_commit_ordered);
929 - mysql_mutex_unlock(&LOCK_log);
937 +MYSQL_BIN_LOG::write_transaction(group_commit_entry *entry)
939 + binlog_cache_data *cache_data= entry->cache_data;
940 + IO_CACHE *cache= &cache_data->cache_log;
942 + if (entry->begin_event->write(&log_file))
943 + return ER_ERROR_ON_WRITE;
945 + DBUG_EXECUTE_IF("crash_before_writing_xid",
947 + if ((write_cache(entry->thd, cache)))
948 + DBUG_PRINT("info", ("error writing binlog cache"));
952 + DBUG_PRINT("info", ("crashing before writing xid"));
956 + if (write_cache(entry->thd, cache))
957 + return ER_ERROR_ON_WRITE;
959 + if (entry->end_event->write(&log_file))
960 + return ER_ERROR_ON_WRITE;
962 + if (entry->incident_event && entry->incident_event->write(&log_file))
963 + return ER_ERROR_ON_WRITE;
965 + if (cache->error) // Error on read
966 + return ER_ERROR_ON_READ;
972 Wait until we get a signal that the relay log has been updated.
974 @@ -6095,6 +6310,68 @@
981 + mysql_mutex_init(key_LOCK_group_commit_queue, &LOCK_group_commit_queue, MY_MUTEX_INIT_SLOW);
982 + mysql_mutex_init(key_LOCK_commit_ordered, &LOCK_commit_ordered, MY_MUTEX_INIT_SLOW);
983 + mutexes_inited= TRUE;
990 + if (mutexes_inited)
992 + mysql_mutex_destroy(&LOCK_group_commit_queue);
993 + mysql_mutex_destroy(&LOCK_commit_ordered);
994 + mutexes_inited= FALSE;
1000 +TC_LOG::run_commit_ordered(THD *thd, bool all)
1002 + Ha_trx_info *ha_info=
1003 + all ? thd->transaction.all.ha_list : thd->transaction.stmt.ha_list;
1005 + mysql_mutex_assert_owner(&LOCK_commit_ordered);
1006 + for (; ha_info; ha_info= ha_info->next())
1008 + handlerton *ht= ha_info->ht();
1009 + if (!ht->commit_ordered)
1011 + ht->commit_ordered(ht, thd, all);
1012 + DEBUG_SYNC(thd, "commit_after_run_commit_ordered");
1016 +int TC_LOG_MMAP::log_and_order(THD *thd, my_xid xid, bool all,
1017 + bool need_commit_ordered)
1023 + cookie= log_one_transaction(xid);
1025 + if (need_commit_ordered)
1027 + /* Only run commit_ordered() if log_xid was successful. */
1030 + mysql_mutex_lock(&LOCK_commit_ordered);
1031 + run_commit_ordered(thd, all);
1032 + mysql_mutex_unlock(&LOCK_commit_ordered);
1040 /********* transaction coordinator log for 2pc - mmap() based solution *******/
1043 @@ -6231,6 +6508,7 @@
1044 mysql_mutex_init(key_LOCK_pool, &LOCK_pool, MY_MUTEX_INIT_FAST);
1045 mysql_cond_init(key_COND_active, &COND_active, 0);
1046 mysql_cond_init(key_COND_pool, &COND_pool, 0);
1047 + mysql_cond_init(key_COND_queue_busy, &COND_queue_busy, 0);
1051 @@ -6238,6 +6516,8 @@
1054 pool_last=pages+npages-1;
1055 + commit_ordered_queue= NULL;
1056 + commit_ordered_queue_busy= false;
1060 @@ -6343,7 +6623,7 @@
1061 to the position in memory where xid was logged to.
1064 -int TC_LOG_MMAP::log_xid(THD *thd, my_xid xid)
1065 +int TC_LOG_MMAP::log_one_transaction(my_xid xid)
1069 @@ -6482,7 +6762,9 @@
1070 mysql_mutex_destroy(&LOCK_sync);
1071 mysql_mutex_destroy(&LOCK_active);
1072 mysql_mutex_destroy(&LOCK_pool);
1073 + mysql_cond_destroy(&COND_active);
1074 mysql_cond_destroy(&COND_pool);
1075 + mysql_cond_destroy(&COND_queue_busy);
1077 data[0]='A'; // garble the first (signature) byte, in case mysql_file_delete fails
1079 @@ -6692,42 +6974,87 @@
1080 mysql_cond_destroy(&COND_prep_xids);
1087 + Do a binlog log_xid() for a group of transactions, linked through
1088 + thd->next_commit_ordered.
1095 -int TC_LOG_BINLOG::log_xid(THD *thd, my_xid xid)
1096 +int TC_LOG_BINLOG::log_and_order(THD *thd, my_xid xid, bool all,
1097 + bool need_commit_ordered __attribute__((unused)))
1099 - DBUG_ENTER("TC_LOG_BINLOG::log");
1100 + DBUG_ENTER("TC_LOG_BINLOG::log_and_order");
1101 binlog_cache_mngr *cache_mngr=
1102 (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
1104 + cache_mngr->trx_cache.using_xa= TRUE;
1105 + cache_mngr->trx_cache.xa_xid= xid;
1107 We always commit the entire transaction when writing an XID. Also
1108 note that the return value is inverted.
1110 - DBUG_RETURN(!binlog_commit_flush_stmt_cache(thd, cache_mngr) &&
1111 - !binlog_commit_flush_trx_cache(thd, cache_mngr, xid));
1112 + DBUG_RETURN(!binlog_commit_flush_stmt_cache(thd, cache_mngr, all) &&
1113 + !binlog_commit_flush_trx_cache(thd, cache_mngr, xid, all));
1116 -int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
1118 + After an XID is logged, we need to hold on to the current binlog file until
1119 + it is fully committed in the storage engine. The reason is that crash
1120 + recovery only looks at the latest binlog, so we must make sure there are no
1121 + outstanding prepared (but not committed) transactions before rotating the
1124 + To handle this, we keep a count of outstanding XIDs. This function is used
1125 + to increase this count when committing one or more transactions to the
1129 +TC_LOG_BINLOG::mark_xids_active(uint xid_count)
1131 - DBUG_ENTER("TC_LOG_BINLOG::unlog");
1132 + DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active");
1133 + DBUG_PRINT("info", ("xid_count=%u", xid_count));
1134 + mysql_mutex_lock(&LOCK_prep_xids);
1135 + prepared_xids+= xid_count;
1136 + mysql_mutex_unlock(&LOCK_prep_xids);
1141 + Once an XID is committed, it is safe to rotate the binary log, as it can no
1142 + longer be needed during crash recovery.
1144 + This function is called to mark an XID this way. It needs to decrease the
1145 + count of pending XIDs, and signal the log rotator thread when it reaches zero.
1148 +TC_LOG_BINLOG::mark_xid_done()
1150 + my_bool send_signal;
1152 + DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done");
1153 mysql_mutex_lock(&LOCK_prep_xids);
1154 // prepared_xids can be 0 if the transaction had ignorable errors.
1155 DBUG_ASSERT(prepared_xids >= 0);
1156 if (prepared_xids > 0)
1158 - if (prepared_xids == 0) {
1159 + send_signal= (prepared_xids == 0);
1160 + mysql_mutex_unlock(&LOCK_prep_xids);
1161 + if (send_signal) {
1162 DBUG_PRINT("info", ("prepared_xids=%lu", prepared_xids));
1163 mysql_cond_signal(&COND_prep_xids);
1165 - mysql_mutex_unlock(&LOCK_prep_xids);
1166 - DBUG_RETURN(rotate_and_purge(0)); // as ::write() did not rotate
1170 +int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
1172 + DBUG_ENTER("TC_LOG_BINLOG::unlog");
1175 + DBUG_RETURN(rotate_and_purge(0));
1178 int TC_LOG_BINLOG::recover(IO_CACHE *log, Format_description_log_event *fdle)
1179 @@ -6796,9 +7123,67 @@
1181 return (ulonglong) mysql_bin_log.get_log_file()->pos_in_file;
1184 + Get the current position of the MySQL binlog for the transaction currently being
1187 + This is valid to call from within storage engine commit_ordered() and
1188 + commit() methods only.
1190 + Since it stores the position inside THD, it is safe to call without any
1193 + Note that currently the binlog file name is not stored inside THD, but this
1194 + is still safe as it can only change when the log is rotated, and we never
1195 + rotate the binlog while commits are pending inside storage engines.
1198 +void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file)
1200 + binlog_cache_mngr *cache_mngr;
1201 + if (binlog_hton->state == SHOW_OPTION_YES
1202 + && (cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton)))
1204 + *out_pos= cache_mngr->trx_cache.commit_bin_log_file_pos;
1205 + *out_file= mysql_bin_log.get_log_fname();
1213 #endif /* INNODB_COMPATIBILITY_HOOKS */
1216 +static int show_binlog_vars(THD *thd, SHOW_VAR *var, char *buff)
1218 + mysql_bin_log.set_status_variables();
1219 + var->type= SHOW_ARRAY;
1220 + var->value= (char *)&binlog_status_vars_detail;
1224 +static SHOW_VAR binlog_status_vars_top[]= {
1225 + {"binlog", (char *) &show_binlog_vars, SHOW_FUNC},
1226 + {NullS, NullS, SHOW_LONG}
1230 + Copy out current values of status variables, for SHOW STATUS or
1231 + information_schema.global_status.
1233 + This is called only under LOCK_status, so we can fill in a static array.
1236 +TC_LOG_BINLOG::set_status_variables()
1238 + mysql_mutex_lock(&LOCK_commit_ordered);
1239 + binlog_status_var_num_commits= this->num_commits;
1240 + binlog_status_var_num_group_commits= this->num_group_commits;
1241 + mysql_mutex_unlock(&LOCK_commit_ordered);
1244 struct st_mysql_storage_engine binlog_storage_engine=
1245 { MYSQL_HANDLERTON_INTERFACE_VERSION };
1247 @@ -6813,7 +7198,7 @@
1248 binlog_init, /* Plugin Init */
1249 NULL, /* Plugin Deinit */
1251 - NULL, /* status variables */
1252 + binlog_status_vars_top, /* status variables */
1253 NULL, /* system variables */
1254 NULL, /* config options */
1260 virtual int open(const char *opt_name)=0;
1261 virtual void close()=0;
1262 - virtual int log_xid(THD *thd, my_xid xid)=0;
1263 + virtual int log_and_order(THD *thd, my_xid xid, bool all,
1264 + bool need_commit_ordered)=0;
1265 virtual int unlog(ulong cookie, my_xid xid)=0;
1268 + void run_commit_ordered(THD *thd, bool all);
1272 + Locks used to ensure serialised execution of
1273 + TC_LOG::run_commit_ordered(), or any other code that calls handler
1274 + commit_ordered() methods.
1276 +extern mysql_mutex_t LOCK_group_commit_queue;
1277 +extern mysql_mutex_t LOCK_commit_ordered;
1279 +extern void TC_init();
1280 +extern void TC_destroy();
1282 class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging
1286 int open(const char *opt_name) { return 0; }
1288 - int log_xid(THD *thd, my_xid xid) { return 1; }
1290 + TC_LOG_DUMMY is only used when there are <= 1 XA-capable engines, and we
1291 + only use internal XA during commit when >= 2 XA-capable engines
1294 + int log_and_order(THD *thd, my_xid xid, bool all,
1295 + bool need_commit_ordered)
1297 + DBUG_ASSERT(0 /* Internal error - TC_LOG_DUMMY::log_and_order() called */);
1300 int unlog(ulong cookie, my_xid xid) { return 0; }
1304 mysql_cond_t cond; // to wait for a sync
1307 + /* List of THDs for which to invoke commit_ordered(), in order. */
1308 + struct commit_entry
1310 + struct commit_entry *next;
1314 char logname[FN_REFLEN];
1316 my_off_t file_length;
1317 @@ -94,16 +126,38 @@
1319 mysql_mutex_t LOCK_active, LOCK_pool, LOCK_sync;
1320 mysql_cond_t COND_pool, COND_active;
1322 + Queue of threads that need to call commit_ordered().
1323 + Access to this queue must be protected by LOCK_group_commit_queue
1325 + commit_entry *commit_ordered_queue;
1327 + This flag and condition are used to reserve the queue while threads in it
1328 + each run the commit_ordered() methods one after the other. Only once the
1329 + last commit_ordered() in the queue is done can we start on a new queue
1332 + Since we start this process in the first thread in the queue and finish in
1333 + the last (and possibly different) thread, we need a condition variable for
1334 + this (we cannot unlock a mutex in a different thread than the one who
1337 + The condition is used together with the LOCK_group_commit_queue mutex.
1339 + my_bool commit_ordered_queue_busy;
1340 + mysql_cond_t COND_queue_busy;
1343 TC_LOG_MMAP(): inited(0) {}
1344 int open(const char *opt_name);
1346 - int log_xid(THD *thd, my_xid xid);
1347 + int log_and_order(THD *thd, my_xid xid, bool all,
1348 + bool need_commit_ordered);
1349 int unlog(ulong cookie, my_xid xid);
1353 + int log_one_transaction(my_xid xid);
1354 void get_active_from_pool();
1357 @@ -271,9 +325,31 @@
1361 +class binlog_cache_data;
1362 class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
1365 + struct group_commit_entry
1367 + struct group_commit_entry *next;
1369 + binlog_cache_data *cache_data;
1371 + Extra events (BEGIN, COMMIT/ROLLBACK/XID, and possibly INCIDENT) to be
1372 + written during group commit. The incident_event is only valid if
1373 + cache_data->has_incident() is true.
1375 + Log_event *begin_event;
1376 + Log_event *end_event;
1377 + Log_event *incident_event;
1378 + /* Set during group commit to record any per-thread error. */
1381 + /* This is the `all' parameter for ha_commit_ordered(). */
1383 + /* True if we come in through XA log_and_order(), false otherwise. */
1386 #ifdef HAVE_PSI_INTERFACE
1387 /** The instrumentation key to use for @ LOCK_index. */
1388 PSI_mutex_key m_key_LOCK_index;
1389 @@ -325,6 +401,12 @@
1390 In 5.0 it's 0 for relay logs too!
1392 bool no_auto_events;
1393 + /* Queue of transactions queued up to participate in group commit. */
1394 + group_commit_entry *group_commit_queue;
1395 + /* Total number of committed transactions. */
1396 + ulonglong num_commits;
1397 + /* Number of group commits done. */
1398 + ulonglong num_group_commits;
1400 /* pointer to the sync period variable, for binlog this will be
1401 sync_binlog_period, for relay log this will be
1402 @@ -346,6 +428,11 @@
1404 int new_file_without_locking();
1405 int new_file_impl(bool need_lock);
1406 + int write_transaction(group_commit_entry *entry);
1407 + bool write_transaction_to_binlog_events(group_commit_entry *entry);
1408 + void trx_group_commit_leader(group_commit_entry *leader);
1409 + void mark_xid_done();
1410 + void mark_xids_active(uint xid_count);
1413 MYSQL_LOG::generate_name;
1416 int open(const char *opt_name);
1418 - int log_xid(THD *thd, my_xid xid);
1419 + int log_and_order(THD *thd, my_xid xid, bool all,
1420 + bool need_commit_ordered);
1421 int unlog(ulong cookie, my_xid xid);
1422 int recover(IO_CACHE *log, Format_description_log_event *fdle);
1423 #if !defined(MYSQL_CLIENT)
1424 @@ -434,11 +522,11 @@
1427 bool write(Log_event* event_info); // binary log write
1428 - bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event, bool incident);
1429 + bool write_transaction_to_binlog(THD *thd, binlog_cache_data *cache_data,
1430 + Log_event *end_ev, bool all);
1431 bool write_incident(THD *thd, bool lock);
1433 - int write_cache(THD *thd, IO_CACHE *cache,
1434 - bool lock_log, bool flush_and_sync);
1435 + int write_cache(THD *thd, IO_CACHE *cache);
1436 void set_write_error(THD *thd, bool is_transactional);
1437 bool check_write_error(THD *thd);
1440 inline void unlock_index() { mysql_mutex_unlock(&LOCK_index);}
1441 inline IO_CACHE *get_index_file() { return &index_file;}
1442 inline uint32 get_open_count() { return open_count; }
1443 + void set_status_variables();
1446 class Log_event_handler
1449 @@ -1495,6 +1495,7 @@
1454 delegates_destroy();
1457 @@ -3911,6 +3912,8 @@
1458 query_response_time_init();
1459 #endif // HAVE_RESPONSE_TIME_DISTRIBUTION
1460 /* We have to initialize the storage engines before CSV logging */
1463 init_global_table_stats();
1464 init_global_index_stats();
1466 @@ -7872,6 +7875,7 @@
1467 key_LOCK_error_messages, key_LOG_INFO_lock, key_LOCK_thread_count,
1468 key_PARTITION_LOCK_auto_inc;
1469 PSI_mutex_key key_RELAYLOG_LOCK_index;
1470 +PSI_mutex_key key_LOCK_wakeup_ready, key_LOCK_group_commit_queue, key_LOCK_commit_ordered;
1472 static PSI_mutex_info all_server_mutexes[]=
1474 @@ -7892,6 +7896,7 @@
1475 { &key_delayed_insert_mutex, "Delayed_insert::mutex", 0},
1476 { &key_hash_filo_lock, "hash_filo::lock", 0},
1477 { &key_LOCK_active_mi, "LOCK_active_mi", PSI_FLAG_GLOBAL},
1478 + { &key_LOCK_commit_ordered, "LOCK_commit_ordered", PSI_FLAG_GLOBAL},
1479 { &key_LOCK_connection_count, "LOCK_connection_count", PSI_FLAG_GLOBAL},
1480 { &key_LOCK_crypt, "LOCK_crypt", PSI_FLAG_GLOBAL},
1481 { &key_LOCK_delayed_create, "LOCK_delayed_create", PSI_FLAG_GLOBAL},
1482 @@ -7907,6 +7912,7 @@
1483 "LOCK_global_index_stats", PSI_FLAG_GLOBAL},
1484 { &key_LOCK_gdl, "LOCK_gdl", PSI_FLAG_GLOBAL},
1485 { &key_LOCK_global_system_variables, "LOCK_global_system_variables", PSI_FLAG_GLOBAL},
1486 + { &key_LOCK_group_commit_queue, "LOCK_group_commit_queue", PSI_FLAG_GLOBAL},
1487 { &key_LOCK_manager, "LOCK_manager", PSI_FLAG_GLOBAL},
1488 { &key_LOCK_prepared_stmt_count, "LOCK_prepared_stmt_count", PSI_FLAG_GLOBAL},
1489 { &key_LOCK_rpl_status, "LOCK_rpl_status", PSI_FLAG_GLOBAL},
1490 @@ -7918,6 +7924,7 @@
1491 { &key_LOCK_temporary_tables, "THD::LOCK_temporary_tables", 0},
1492 { &key_LOCK_user_conn, "LOCK_user_conn", PSI_FLAG_GLOBAL},
1493 { &key_LOCK_uuid_generator, "LOCK_uuid_generator", PSI_FLAG_GLOBAL},
1494 + { &key_LOCK_wakeup_ready, "THD::LOCK_wakeup_ready", 0},
1495 { &key_LOG_LOCK_log, "LOG::LOCK_log", 0},
1496 { &key_master_info_data_lock, "Master_info::data_lock", 0},
1497 { &key_master_info_run_lock, "Master_info::run_lock", 0},
1498 @@ -7965,6 +7972,7 @@
1499 key_TABLE_SHARE_cond, key_user_level_lock_cond,
1500 key_COND_thread_count, key_COND_thread_cache, key_COND_flush_thread_cache;
1501 PSI_cond_key key_RELAYLOG_update_cond;
1502 +PSI_cond_key key_COND_wakeup_ready, key_COND_queue_busy;
1504 static PSI_cond_info all_server_conds[]=
1506 @@ -7981,8 +7989,10 @@
1507 { &key_RELAYLOG_update_cond, "MYSQL_RELAY_LOG::update_cond", 0},
1508 { &key_COND_cache_status_changed, "Query_cache::COND_cache_status_changed", 0},
1509 { &key_COND_manager, "COND_manager", PSI_FLAG_GLOBAL},
1510 + { &key_COND_queue_busy, "COND_queue_busy", PSI_FLAG_GLOBAL},
1511 { &key_COND_rpl_status, "COND_rpl_status", PSI_FLAG_GLOBAL},
1512 { &key_COND_server_started, "COND_server_started", PSI_FLAG_GLOBAL},
1513 + { &key_COND_wakeup_ready, "THD::COND_wakeup_ready", 0},
1514 { &key_delayed_insert_cond, "Delayed_insert::cond", 0},
1515 { &key_delayed_insert_cond_client, "Delayed_insert::cond_client", 0},
1516 { &key_item_func_sleep_cond, "Item_func_sleep::cond", 0},
1520 key_structure_guard_mutex, key_TABLE_SHARE_LOCK_ha_data,
1521 key_LOCK_error_messages, key_LOCK_thread_count, key_PARTITION_LOCK_auto_inc;
1522 extern PSI_mutex_key key_RELAYLOG_LOCK_index;
1523 +extern PSI_mutex_key key_LOCK_wakeup_ready, key_LOCK_group_commit_queue, key_LOCK_commit_ordered;
1525 extern PSI_rwlock_key key_rwlock_LOCK_grant, key_rwlock_LOCK_logger,
1526 key_rwlock_LOCK_sys_init_connect, key_rwlock_LOCK_sys_init_slave,
1528 key_TABLE_SHARE_cond, key_user_level_lock_cond,
1529 key_COND_thread_count, key_COND_thread_cache, key_COND_flush_thread_cache;
1530 extern PSI_cond_key key_RELAYLOG_update_cond;
1531 +extern PSI_cond_key key_COND_wakeup_ready, key_COND_queue_busy;
1533 extern PSI_thread_key key_thread_bootstrap, key_thread_delayed_insert,
1534 key_thread_handle_manager, key_thread_kill_server, key_thread_main,
1535 --- a/sql/sql_class.cc
1536 +++ b/sql/sql_class.cc
1537 @@ -1005,6 +1005,8 @@
1538 mysql_mutex_init(key_LOCK_thd_data, &LOCK_thd_data, MY_MUTEX_INIT_FAST);
1539 mysql_mutex_init(key_LOCK_temporary_tables, &LOCK_temporary_tables,
1540 MY_MUTEX_INIT_FAST);
1541 + mysql_mutex_init(key_LOCK_wakeup_ready, &LOCK_wakeup_ready, MY_MUTEX_INIT_FAST);
1542 + mysql_cond_init(key_COND_wakeup_ready, &COND_wakeup_ready, NULL);
1544 /* Variables with default values */
1546 @@ -1609,6 +1611,8 @@
1549 free_root(&transaction.mem_root,MYF(0));
1550 + mysql_cond_destroy(&COND_wakeup_ready);
1551 + mysql_mutex_destroy(&LOCK_wakeup_ready);
1552 mysql_mutex_destroy(&LOCK_thd_data);
1553 mysql_mutex_destroy(&LOCK_temporary_tables);
1555 @@ -5297,6 +5301,24 @@
1560 +THD::wait_for_wakeup_ready()
1562 + mysql_mutex_lock(&LOCK_wakeup_ready);
1563 + while (!wakeup_ready)
1564 + mysql_cond_wait(&COND_wakeup_ready, &LOCK_wakeup_ready);
1565 + mysql_mutex_unlock(&LOCK_wakeup_ready);
1569 +THD::signal_wakeup_ready()
1571 + mysql_mutex_lock(&LOCK_wakeup_ready);
1572 + wakeup_ready= true;
1573 + mysql_mutex_unlock(&LOCK_wakeup_ready);
1574 + mysql_cond_signal(&COND_wakeup_ready);
1577 bool Discrete_intervals_list::append(ulonglong start, ulonglong val,
1580 --- a/sql/sql_class.h
1581 +++ b/sql/sql_class.h
1582 @@ -3078,6 +3078,14 @@
1583 LEX_STRING get_invoker_user() { return invoker_user; }
1584 LEX_STRING get_invoker_host() { return invoker_host; }
1585 bool has_invoker() { return invoker_user.length > 0; }
1586 + void clear_wakeup_ready() { wakeup_ready= false; }
1588 + Sleep waiting for others to wake us up with signal_wakeup_ready().
1589 + Must call clear_wakeup_ready() before waiting.
1591 + void wait_for_wakeup_ready();
1592 + /* Wake this thread up from wait_for_wakeup_ready(). */
1593 + void signal_wakeup_ready();
1596 /** The current internal error handler for this thread, or NULL. */
1597 @@ -3120,6 +3128,16 @@
1599 LEX_STRING invoker_user;
1600 LEX_STRING invoker_host;
1602 + Flag, mutex and condition for a thread to wait for a signal from another
1605 + Currently used to wait for group commit to complete, can also be used for
1608 + bool wakeup_ready;
1609 + mysql_mutex_t LOCK_wakeup_ready;
1610 + mysql_cond_t COND_wakeup_ready;
1613 /* Returns string as 'IP' for the client-side of the connection represented by
1614 --- a/sql/sql_parse.cc
1615 +++ b/sql/sql_parse.cc
1616 @@ -889,6 +889,10 @@
1617 DBUG_ENTER("dispatch_command");
1618 DBUG_PRINT("info",("packet: '%*.s'; command: %d", packet_length, packet, command));
1620 + DBUG_EXECUTE_IF("crash_dispatch_command_before",
1621 + { DBUG_PRINT("crash_dispatch_command_before", ("now"));
1624 #if defined(ENABLED_PROFILING)
1625 thd->profiling.start_new_query();
1627 --- a/mysql-test/suite/perfschema/r/dml_setup_instruments.result
1628 +++ b/mysql-test/suite/perfschema/r/dml_setup_instruments.result
1630 wait/synch/mutex/sql/HA_DATA_PARTITION::LOCK_auto_inc YES YES
1631 wait/synch/mutex/sql/LOCK_active_mi YES YES
1632 wait/synch/mutex/sql/LOCK_audit_mask YES YES
1633 +wait/synch/mutex/sql/LOCK_commit_ordered YES YES
1634 wait/synch/mutex/sql/LOCK_connection_count YES YES
1635 wait/synch/mutex/sql/LOCK_crypt YES YES
1636 -wait/synch/mutex/sql/LOCK_delayed_create YES YES
1637 select * from performance_schema.setup_instruments
1638 where name like 'Wait/Synch/Rwlock/sql/%'
1639 and name not in ('wait/synch/rwlock/sql/CRYPTO_dynlock_value::lock')
1642 wait/synch/cond/sql/COND_flush_thread_cache YES YES
1643 wait/synch/cond/sql/COND_manager YES YES
1644 +wait/synch/cond/sql/COND_queue_busy YES YES
1645 wait/synch/cond/sql/COND_queue_state YES YES
1646 wait/synch/cond/sql/COND_rpl_status YES YES
1647 wait/synch/cond/sql/COND_server_started YES YES
1649 wait/synch/cond/sql/COND_thread_count YES YES
1650 wait/synch/cond/sql/Delayed_insert::cond YES YES
1651 wait/synch/cond/sql/Delayed_insert::cond_client YES YES
1652 -wait/synch/cond/sql/Event_scheduler::COND_state YES YES
1653 select * from performance_schema.setup_instruments
1655 select * from performance_schema.setup_instruments
1656 --- a/storage/innobase/handler/ha_innodb.cc
1657 +++ b/storage/innobase/handler/ha_innodb.cc
1659 static INNOBASE_SHARE *get_share(const char *table_name);
1660 static void free_share(INNOBASE_SHARE *share);
1661 static int innobase_close_connection(handlerton *hton, THD* thd);
1662 +#ifdef EXTENDED_FOR_COMMIT_ORDERED
1663 +static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all);
1665 static int innobase_commit(handlerton *hton, THD* thd, bool all);
1666 static int innobase_rollback(handlerton *hton, THD* thd, bool all);
1667 static int innobase_rollback_to_savepoint(handlerton *hton, THD* thd,
1668 @@ -1699,7 +1702,10 @@
1669 trx_t* trx) /*!< in/out: InnoDB transaction handle */
1671 DBUG_ENTER("innobase_trx_init");
1672 +#ifndef EXTENDED_FOR_COMMIT_ORDERED
1673 + /* used by innobase_commit_ordered */
1674 DBUG_ASSERT(EQ_CURRENT_THD(thd));
1676 DBUG_ASSERT(thd == trx->mysql_thd);
1678 trx->check_foreigns = !thd_test_options(
1679 @@ -1760,7 +1766,10 @@
1681 trx_t*& trx = thd_to_trx(thd);
1683 +#ifndef EXTENDED_FOR_COMMIT_ORDERED
1684 + /* used by innobase_commit_ordered */
1685 ut_ad(EQ_CURRENT_THD(thd));
1689 trx = innobase_trx_allocate(thd);
1690 @@ -1846,6 +1855,7 @@
1692 trx->is_registered = 0;
1693 trx->owns_prepare_mutex = 0;
1694 + trx->called_commit_ordered = 0;
1697 /*********************************************************************//**
1698 @@ -1861,6 +1871,29 @@
1701 /*********************************************************************//**
1705 +trx_called_commit_ordered_set(
1706 +/*==========================*/
1709 + ut_a(trx_is_registered_for_2pc(trx));
1710 + trx->called_commit_ordered = 1;
1713 +/*********************************************************************//**
1717 +trx_called_commit_ordered(
1718 +/*======================*/
1721 + return(trx->called_commit_ordered == 1);
1724 +/*********************************************************************//**
1725 Check if transaction is started.
1726 @reutrn true if transaction is in state started */
1728 @@ -2435,6 +2468,9 @@
1729 innobase_hton->savepoint_set=innobase_savepoint;
1730 innobase_hton->savepoint_rollback=innobase_rollback_to_savepoint;
1731 innobase_hton->savepoint_release=innobase_release_savepoint;
1732 +#ifdef EXTENDED_FOR_COMMIT_ORDERED
1733 + innobase_hton->commit_ordered=innobase_commit_ordered;
1735 innobase_hton->commit=innobase_commit;
1736 innobase_hton->rollback=innobase_rollback;
1737 innobase_hton->prepare=innobase_xa_prepare;
1738 @@ -3187,6 +3223,126 @@
1742 +#ifdef EXTENDED_FOR_COMMIT_ORDERED
1744 + InnoDB is coded on the assumption that a trx is only ever accessed by its
1745 + owner thd (it is not protected by any mutex/lock).
1746 + So, a caller of innobase_commit_ordered() running in another thd must take
1747 + care of cache coherency of the trx between CPUs.
1749 + In MariaDB's first implementation, this coherency appears to be ensured by
1750 + the pthread mutex LOCK_wakeup_ready, so it is not a problem for now.
1752 + But we should stay aware of the importance of this coherency.
1754 +/*****************************************************************//**
1755 +Low-level function for innobase_commit_ordered().*/
1758 +innobase_commit_ordered_low(
1759 +/*========================*/
1760 + trx_t* trx, /*!< in: Innodb transaction */
1761 + THD* thd) /*!< in: MySQL thread handle */
1763 + ulonglong tmp_pos;
1764 + DBUG_ENTER("innobase_commit_ordered");
1766 + /* This part was from innobase_commit() */
1768 + /* We need current binlog position for ibbackup to work.
1769 + Note, the position is current because commit_ordered is guaranteed
1770 + to be called in the same sequence as writing to binlog. */
1772 + if (innobase_commit_concurrency > 0) {
1773 + mysql_mutex_lock(&commit_cond_m);
1776 + if (commit_threads > innobase_commit_concurrency) {
1778 + mysql_cond_wait(&commit_cond,
1780 + mysql_mutex_unlock(&commit_cond_m);
1784 + mysql_mutex_unlock(&commit_cond_m);
1788 + mysql_bin_log_commit_pos(thd, &tmp_pos, &(trx->mysql_log_file_name));
1789 + trx->mysql_log_offset = (ib_int64_t) tmp_pos;
1791 + /* Don't do write + flush right now. For group commit
1792 + to work we want to do the flush in the innobase_commit()
1793 + method, which runs without holding any locks. */
1794 + trx->flush_log_later = TRUE;
1795 + innobase_commit_low(trx);
1796 + trx->flush_log_later = FALSE;
1798 + if (innobase_commit_concurrency > 0) {
1799 + mysql_mutex_lock(&commit_cond_m);
1801 + mysql_cond_signal(&commit_cond);
1802 + mysql_mutex_unlock(&commit_cond_m);
1808 +/*****************************************************************//**
1809 +Perform the first, fast part of InnoDB commit.
1811 +Doing it in this call ensures that we get the same commit order here
1812 +as in binlog and any other participating transactional storage engines.
1814 +Note that we want to do as little as really needed here, as we run
1815 +under a global mutex. The expensive fsync() is done later, in
1816 +innobase_commit(), without a lock so group commit can take place.
1818 +Note also that this method can be called from a different thread than
1819 +the one handling the rest of the transaction. */
1822 +innobase_commit_ordered(
1823 +/*====================*/
1824 + handlerton *hton, /*!< in: Innodb handlerton */
1825 + THD* thd, /*!< in: MySQL thread handle of the user for whom
1826 + the transaction should be committed */
1827 + bool all) /*!< in: TRUE - commit transaction
1828 + FALSE - the current SQL statement ended */
1831 + DBUG_ENTER("innobase_commit_ordered");
1832 + DBUG_ASSERT(hton == innodb_hton_ptr);
1834 + trx = check_trx_exists(thd);
1836 + /* Since we will reserve the kernel mutex, we have to release
1837 + the search system latch first to obey the latching order. */
1839 + if (trx->has_search_latch) {
1840 + trx_search_latch_release_if_reserved(trx);
1843 + if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
1844 + /* We cannot throw error here; instead we will catch this error
1845 + again in innobase_commit() and report it from there. */
1849 + /* commit_ordered is only called when committing the whole transaction
1850 + (or an SQL statement when autocommit is on). */
1851 + DBUG_ASSERT(all ||
1852 + (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)));
1854 + innobase_commit_ordered_low(trx, thd);
1856 + trx_called_commit_ordered_set(trx);
1860 +#endif /* EXTENDED_FOR_COMMIT_ORDERED */
1862 /*****************************************************************//**
1863 Commits a transaction in an InnoDB database or marks an SQL statement
1865 @@ -3238,6 +3394,16 @@
1866 /* We were instructed to commit the whole transaction, or
1867 this is an SQL statement end and autocommit is on */
1869 +#ifdef EXTENDED_FOR_COMMIT_ORDERED
1870 + ut_ad(!trx_has_prepare_commit_mutex(trx));
1872 + /* Run the fast part of commit if we did not already. */
1873 + if (!trx_called_commit_ordered(trx)) {
1874 + innobase_commit_ordered_low(trx, thd);
1877 + ut_ad(!trx_called_commit_ordered(trx));
1879 /* We need current binlog position for ibbackup to work.
1880 Note, the position is current because of
1881 prepare_commit_mutex */
1882 @@ -3292,6 +3458,7 @@
1884 mysql_mutex_unlock(&prepare_commit_mutex);
1886 +#endif /* EXTENDED_FOR_COMMIT_ORDERED */
1888 trx_deregister_from_2pc(trx);
1890 @@ -10981,6 +11148,7 @@
1892 srv_active_wake_master_thread();
1894 +#ifndef EXTENDED_FOR_COMMIT_ORDERED
1895 if (thd_sql_command(thd) != SQLCOM_XA_PREPARE
1897 || !thd_test_options(
1898 @@ -11007,6 +11175,7 @@
1899 mysql_mutex_lock(&prepare_commit_mutex);
1900 trx_owns_prepare_commit_mutex_set(trx);
1902 +#endif /* ifndef EXTENDED_FOR_COMMIT_ORDERED */
1906 --- a/storage/innobase/handler/ha_innodb.h
1907 +++ b/storage/innobase/handler/ha_innodb.h
1908 @@ -240,6 +240,12 @@
1909 struct charset_info_st *thd_charset(MYSQL_THD thd);
1910 LEX_STRING *thd_query_string(MYSQL_THD thd);
1912 +#ifdef EXTENDED_FOR_COMMIT_ORDERED
1913 +/** Get the file name and position of the MySQL binlog corresponding to the
1916 +void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file);
1918 /** Get the file name of the MySQL binlog.
1919 * @return the name of the binlog file
1922 * @return byte offset from the beginning of the binlog
1924 ulonglong mysql_bin_log_file_pos(void);
1928 Check if a user thread is a replication slave thread
1929 --- a/storage/innobase/include/trx0trx.h
1930 +++ b/storage/innobase/include/trx0trx.h
1932 this is set to 1 then registered should
1933 also be set to 1. This is used in the
1935 + unsigned called_commit_ordered:1;/* 1 if innobase_commit_ordered has run. */
1936 /*------------------------------*/
1937 ulint isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */
1938 ulint check_foreigns; /* normally TRUE, but if the user
1939 --- a/storage/innobase/trx/trx0trx.c
1940 +++ b/storage/innobase/trx/trx0trx.c
1943 trx->is_registered = 0;
1944 trx->owns_prepare_mutex = 0;
1945 + trx->called_commit_ordered = 0;
1947 trx->start_time = ut_time();
1950 +++ b/mysql-test/r/group_commit.result
1952 +CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
1953 +SELECT variable_value INTO @commits FROM information_schema.global_status
1954 +WHERE variable_name = 'binlog_commits';
1955 +SELECT variable_value INTO @group_commits FROM information_schema.global_status
1956 +WHERE variable_name = 'binlog_group_commits';
1957 +SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group1_running WAIT_FOR group2_queued";
1958 +INSERT INTO t1 VALUES ("con1");
1959 +set DEBUG_SYNC= "now WAIT_FOR group1_running";
1960 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con2";
1961 +SET DEBUG_SYNC= "commit_after_release_LOCK_log WAIT_FOR group3_committed";
1962 +SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
1963 +INSERT INTO t1 VALUES ("con2");
1964 +SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
1965 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con3";
1966 +INSERT INTO t1 VALUES ("con3");
1967 +SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
1968 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con4";
1969 +INSERT INTO t1 VALUES ("con4");
1970 +SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
1971 +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
1972 +SELECT * FROM t1 ORDER BY a;
1974 +SET DEBUG_SYNC= "now SIGNAL group2_queued";
1975 +SELECT * FROM t1 ORDER BY a;
1978 +SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
1979 +SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con5_leader WAIT_FOR con6_queued";
1980 +INSERT INTO t1 VALUES ("con5");
1981 +SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
1982 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con6_queued";
1983 +INSERT INTO t1 VALUES ("con6");
1984 +SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
1985 +SELECT * FROM t1 ORDER BY a;
1988 +SET DEBUG_SYNC= "now SIGNAL group3_committed";
1989 +SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
1990 +SELECT * FROM t1 ORDER BY a;
1996 +SET DEBUG_SYNC= "now SIGNAL group2_checked";
1997 +SELECT * FROM t1 ORDER BY a;
2005 +SELECT variable_value - @commits FROM information_schema.global_status
2006 +WHERE variable_name = 'binlog_commits';
2007 +variable_value - @commits
2009 +SELECT variable_value - @group_commits FROM information_schema.global_status
2010 +WHERE variable_name = 'binlog_group_commits';
2011 +variable_value - @group_commits
2013 +SET DEBUG_SYNC= 'RESET';
2016 +++ b/mysql-test/r/group_commit_binlog_pos.result
2018 +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2019 +INSERT INTO t1 VALUES (0);
2020 +SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con1_waiting WAIT_FOR con3_queued";
2021 +SET DEBUG_SYNC= "commit_loop_entry_commit_ordered SIGNAL con1_loop WAIT_FOR con1_loop_cont EXECUTE 3";
2022 +INSERT INTO t1 VALUES (1);
2023 +SET DEBUG_SYNC= "now WAIT_FOR con1_waiting";
2024 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con2_queued";
2025 +INSERT INTO t1 VALUES (2);
2026 +SET DEBUG_SYNC= "now WAIT_FOR con2_queued";
2027 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con3_queued";
2028 +INSERT INTO t1 VALUES (3);
2029 +SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2030 +SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2031 +SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2032 +SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2033 +SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2034 +SELECT * FROM t1 ORDER BY a;
2039 +SET SESSION debug="+d,crash_dispatch_command_before";
2041 +Got one of the listed errors
2042 +Got one of the listed errors
2043 +Got one of the listed errors
2044 +SELECT * FROM t1 ORDER BY a;
2050 +InnoDB: Last MySQL binlog file position 0 768, file name ./master-bin.000001
2051 +SET DEBUG_SYNC= 'RESET';
2054 +++ b/mysql-test/r/group_commit_crash.result
2056 +CREATE TABLE t1(a CHAR(255),
2060 +id INT AUTO_INCREMENT,
2061 +PRIMARY KEY(id)) ENGINE=InnoDB;
2062 +create table t2 like t1;
2063 +create procedure setcrash(IN i INT)
2066 +WHEN 1 THEN SET SESSION debug="d,crash_commit_after_prepare";
2067 +WHEN 2 THEN SET SESSION debug="d,crash_commit_after_log";
2068 +WHEN 3 THEN SET SESSION debug="d,crash_commit_before_unlog";
2069 +WHEN 4 THEN SET SESSION debug="d,crash_commit_after";
2070 +WHEN 5 THEN SET SESSION debug="d,crash_commit_before";
2075 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2076 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2077 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2078 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2079 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2080 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2081 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2082 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2083 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2084 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2087 +insert into t1 select * from t2;
2090 +Got one of the listed errors
2091 +SELECT * FROM t1 ORDER BY id;
2093 +SHOW BINLOG EVENTS LIMIT 2,1;
2094 +Log_name Pos Event_type Server_id End_log_pos Info
2098 +insert into t1 select * from t2;
2101 +Got one of the listed errors
2102 +SELECT * FROM t1 ORDER BY id;
2114 +SHOW BINLOG EVENTS LIMIT 2,1;
2115 +Log_name Pos Event_type Server_id End_log_pos Info
2116 +master-bin.000001 175 Query 1 269 use `test`; insert into t1 select * from t2
2120 +insert into t1 select * from t2;
2123 +Got one of the listed errors
2124 +SELECT * FROM t1 ORDER BY id;
2136 +SHOW BINLOG EVENTS LIMIT 2,1;
2137 +Log_name Pos Event_type Server_id End_log_pos Info
2138 +master-bin.000001 175 Query 1 269 use `test`; insert into t1 select * from t2
2142 +insert into t1 select * from t2;
2145 +Got one of the listed errors
2146 +SELECT * FROM t1 ORDER BY id;
2158 +SHOW BINLOG EVENTS LIMIT 2,1;
2159 +Log_name Pos Event_type Server_id End_log_pos Info
2160 +master-bin.000001 175 Query 1 269 use `test`; insert into t1 select * from t2
2164 +insert into t1 select * from t2;
2167 +Got one of the listed errors
2168 +SELECT * FROM t1 ORDER BY id;
2170 +SHOW BINLOG EVENTS LIMIT 2,1;
2171 +Log_name Pos Event_type Server_id End_log_pos Info
2175 +DROP PROCEDURE setcrash;
2177 +++ b/mysql-test/r/xa_binlog.result
2179 +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB;
2180 +SET binlog_format= mixed;
2183 +INSERT INTO t1 VALUES (1);
2185 +XA PREPARE 'xatest';
2186 +XA COMMIT 'xatest';
2188 +INSERT INTO t1 VALUES (2);
2190 +XA COMMIT 'xatest' ONE PHASE;
2192 +INSERT INTO t1 VALUES (3);
2194 +SELECT * FROM t1 ORDER BY a;
2199 +SHOW BINLOG EVENTS LIMIT 1,9;
2200 +Log_name Pos Event_type Server_id End_log_pos Info
2201 +master-bin.000001 # Query 1 # BEGIN
2202 +master-bin.000001 # Query 1 # use `test`; INSERT INTO t1 VALUES (1)
2203 +master-bin.000001 # Query 1 # COMMIT
2204 +master-bin.000001 # Query 1 # BEGIN
2205 +master-bin.000001 # Query 1 # use `test`; INSERT INTO t1 VALUES (2)
2206 +master-bin.000001 # Xid 1 # COMMIT /* xid=XX */
2207 +master-bin.000001 # Query 1 # BEGIN
2208 +master-bin.000001 # Query 1 # use `test`; INSERT INTO t1 VALUES (3)
2209 +master-bin.000001 # Xid 1 # COMMIT /* xid=XX */
2212 +++ b/mysql-test/suite/binlog/r/binlog_ioerr.result
2214 +CALL mtr.add_suppression("Error writing file 'master-bin'");
2216 +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2217 +INSERT INTO t1 VALUES(0);
2218 +SET SESSION debug='+d,fail_binlog_write_1';
2219 +INSERT INTO t1 VALUES(1);
2220 +ERROR HY000: Error writing file 'master-bin' (errno: 28)
2221 +INSERT INTO t1 VALUES(2);
2222 +ERROR HY000: Error writing file 'master-bin' (errno: 28)
2223 +SET SESSION debug='';
2224 +INSERT INTO t1 VALUES(3);
2229 +SHOW BINLOG EVENTS;
2230 +Log_name Pos Event_type Server_id End_log_pos Info
2231 +BINLOG POS Format_desc 1 ENDPOS Server ver: #, Binlog ver: #
2232 +BINLOG POS Query 1 ENDPOS use `test`; CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb
2233 +BINLOG POS Query 1 ENDPOS BEGIN
2234 +BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(0)
2235 +BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
2236 +BINLOG POS Query 1 ENDPOS BEGIN
2237 +BINLOG POS Query 1 ENDPOS BEGIN
2238 +BINLOG POS Query 1 ENDPOS BEGIN
2239 +BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(3)
2240 +BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
2243 +++ b/mysql-test/suite/binlog/t/binlog_ioerr.test
2245 +source include/have_debug.inc;
2246 +source include/have_innodb.inc;
2247 +source include/have_log_bin.inc;
2248 +source include/have_binlog_format_mixed_or_statement.inc;
2250 +CALL mtr.add_suppression("Error writing file 'master-bin'");
2254 +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2255 +INSERT INTO t1 VALUES(0);
2256 +SET SESSION debug='+d,fail_binlog_write_1';
2257 +--error ER_ERROR_ON_WRITE
2258 +INSERT INTO t1 VALUES(1);
2259 +--error ER_ERROR_ON_WRITE
2260 +INSERT INTO t1 VALUES(2);
2261 +SET SESSION debug='';
2262 +INSERT INTO t1 VALUES(3);
2265 +# Actually the output from this currently shows a bug.
2266 +# The injected IO error leaves partially written transactions in the binlog in
2267 +# the form of stray "BEGIN" events.
2268 +# These should disappear from the output if binlog error handling is improved
2269 +# (see MySQL Bug#37148 and WL#1790).
2270 +--replace_regex /\/\* xid=.* \*\//\/* XID *\// /Server ver: .*, Binlog ver: .*/Server ver: #, Binlog ver: #/ /table_id: [0-9]+/table_id: #/
2271 +--replace_column 1 BINLOG 2 POS 5 ENDPOS
2272 +SHOW BINLOG EVENTS;
2276 +++ b/mysql-test/t/group_commit.test
2278 +--source include/have_debug_sync.inc
2279 +--source include/have_innodb.inc
2280 +--source include/have_log_bin.inc
2282 +# Test some group commit code paths by using debug_sync to do controlled
2283 +# commits of 6 transactions: first 1 alone, then 3 as a group, then 2 as a
2286 +# Group 3 is allowed to race as far as possible ahead before group 2 finishes
2287 +# to check some edge case for concurrency control.
2289 +CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
2291 +SELECT variable_value INTO @commits FROM information_schema.global_status
2292 + WHERE variable_name = 'binlog_commits';
2293 +SELECT variable_value INTO @group_commits FROM information_schema.global_status
2294 + WHERE variable_name = 'binlog_group_commits';
2296 +connect(con1,localhost,root,,);
2297 +connect(con2,localhost,root,,);
2298 +connect(con3,localhost,root,,);
2299 +connect(con4,localhost,root,,);
2300 +connect(con5,localhost,root,,);
2301 +connect(con6,localhost,root,,);
2303 +# Start group1 (with one thread) doing commit, waiting for
2304 +# group2 to queue up before finishing.
2307 +SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group1_running WAIT_FOR group2_queued";
2308 +send INSERT INTO t1 VALUES ("con1");
2310 +# Make group2 (with three threads) queue up.
2311 +# Make sure con2 is the group commit leader for group2.
2312 +# Make group2 wait with running commit_ordered() until group3 has committed.
2315 +set DEBUG_SYNC= "now WAIT_FOR group1_running";
2316 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con2";
2317 +SET DEBUG_SYNC= "commit_after_release_LOCK_log WAIT_FOR group3_committed";
2318 +SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
2319 +send INSERT INTO t1 VALUES ("con2");
2321 +SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
2322 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con3";
2323 +send INSERT INTO t1 VALUES ("con3");
2325 +SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
2326 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con4";
2327 +send INSERT INTO t1 VALUES ("con4");
2329 +# When group2 is queued, let group1 continue and queue group3.
2331 +connection default;
2332 +SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
2334 +# At this point, transaction 1 is still not visible as commit_ordered() has not
2336 +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
2337 +SELECT * FROM t1 ORDER BY a;
2339 +SET DEBUG_SYNC= "now SIGNAL group2_queued";
2343 +# Now transaction 1 is visible.
2344 +connection default;
2345 +SELECT * FROM t1 ORDER BY a;
2348 +SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
2349 +SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con5_leader WAIT_FOR con6_queued";
2350 +send INSERT INTO t1 VALUES ("con5");
2353 +SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
2354 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con6_queued";
2355 +send INSERT INTO t1 VALUES ("con6");
2357 +connection default;
2358 +SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
2359 +# Still only transaction 1 visible, as group2 has not yet run commit_ordered().
2360 +SELECT * FROM t1 ORDER BY a;
2361 +SET DEBUG_SYNC= "now SIGNAL group3_committed";
2362 +SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
2363 +# Now transactions 1-4 visible.
2364 +SELECT * FROM t1 ORDER BY a;
2365 +SET DEBUG_SYNC= "now SIGNAL group2_checked";
2382 +connection default;
2383 +# Check all transactions finally visible.
2384 +SELECT * FROM t1 ORDER BY a;
2386 +SELECT variable_value - @commits FROM information_schema.global_status
2387 + WHERE variable_name = 'binlog_commits';
2388 +SELECT variable_value - @group_commits FROM information_schema.global_status
2389 + WHERE variable_name = 'binlog_group_commits';
2391 +SET DEBUG_SYNC= 'RESET';
2394 +++ b/mysql-test/t/group_commit_binlog_pos-master.opt
2396 +--skip-stack-trace --skip-core-file
2398 +++ b/mysql-test/t/group_commit_binlog_pos.test
2400 +--source include/have_debug_sync.inc
2401 +--source include/have_innodb.inc
2402 +--source include/have_log_bin.inc
2403 +--source include/have_binlog_format_mixed_or_statement.inc
2405 +# Need DBUG to crash the server intentionally
2406 +--source include/have_debug.inc
2407 +# Don't test this under valgrind, memory leaks will occur as we crash
2408 +--source include/not_valgrind.inc
2410 +# The test case currently uses grep and tail, which may be unavailable on
2411 +# some Windows systems. But see MWL#191 for how to remove the need for grep.
2412 +--source include/not_windows.inc
2414 +# XtraDB stores the binlog position corresponding to the last commit, and
2415 +# prints it during crash recovery.
2416 +# Test that we get the correct position when we group commit several
2417 +# transactions together.
2419 +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2420 +INSERT INTO t1 VALUES (0);
2422 +connect(con1,localhost,root,,);
2423 +connect(con2,localhost,root,,);
2424 +connect(con3,localhost,root,,);
2426 +# Queue up three commits for group commit.
2429 +SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con1_waiting WAIT_FOR con3_queued";
2430 +SET DEBUG_SYNC= "commit_loop_entry_commit_ordered SIGNAL con1_loop WAIT_FOR con1_loop_cont EXECUTE 3";
2431 +send INSERT INTO t1 VALUES (1);
2434 +SET DEBUG_SYNC= "now WAIT_FOR con1_waiting";
2435 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con2_queued";
2436 +send INSERT INTO t1 VALUES (2);
2439 +SET DEBUG_SYNC= "now WAIT_FOR con2_queued";
2440 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con3_queued";
2441 +send INSERT INTO t1 VALUES (3);
2443 +connection default;
2444 +SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2445 +# At this point, no transactions are committed.
2446 +SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2447 +SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2448 +# At this point, 1 transaction is committed.
2449 +SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2450 +SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2452 +# At this point, 2 transactions are committed.
2453 +SELECT * FROM t1 ORDER BY a;
2458 +# Now crash the server with 1+2 in-memory committed, 3 only prepared.
2459 +connection default;
2460 +system echo wait-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
2461 +SET SESSION debug="+d,crash_dispatch_command_before";
2472 +system echo restart-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
2474 +connection default;
2476 +--source include/wait_until_connected_again.inc
2478 +# Crash recovery should recover all three transactions.
2479 +SELECT * FROM t1 ORDER BY a;
2481 +# Check that the binlog position reported by InnoDB is the correct one
2482 +# for the end of the second transaction (as can be checked with
2484 +let $MYSQLD_DATADIR= `SELECT @@datadir`;
2485 +--exec grep 'InnoDB: Last MySQL binlog file position' $MYSQLD_DATADIR/../../log/mysqld.1.err | tail -1
2487 +SET DEBUG_SYNC= 'RESET';
2490 +++ b/mysql-test/t/group_commit_crash-master.opt
2492 +--skip-stack-trace --skip-core-file
2494 +++ b/mysql-test/t/group_commit_crash.test
2496 +# Testing group commit by crashing a few times.
2497 +# Test adapted from the Facebook patch: lp:mysqlatfacebook
2498 +--source include/not_embedded.inc
2499 +# Don't test this under valgrind, memory leaks will occur
2500 +--source include/not_valgrind.inc
2502 +# Binary must be compiled with debug for crash to occur
2503 +--source include/have_debug.inc
2504 +--source include/have_innodb.inc
2505 +--source include/have_log_bin.inc
2507 +let $innodb_file_format_max_orig=`select @@innodb_file_format_max`;
2508 +CREATE TABLE t1(a CHAR(255),
2512 + id INT AUTO_INCREMENT,
2513 + PRIMARY KEY(id)) ENGINE=InnoDB;
2514 +create table t2 like t1;
2516 +create procedure setcrash(IN i INT)
2519 + WHEN 1 THEN SET SESSION debug="d,crash_commit_after_prepare";
2520 + WHEN 2 THEN SET SESSION debug="d,crash_commit_after_log";
2521 + WHEN 3 THEN SET SESSION debug="d,crash_commit_before_unlog";
2522 + WHEN 4 THEN SET SESSION debug="d,crash_commit_after";
2523 + WHEN 5 THEN SET SESSION debug="d,crash_commit_before";
2528 +# Avoid getting a crashed mysql.proc table.
2533 +let $numinserts = 10;
2534 +while ($numinserts)
2537 + INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2546 + START TRANSACTION;
2547 + insert into t1 select * from t2;
2548 + # Write file to make mysql-test-run.pl expect crash
2549 + --exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
2551 + eval call setcrash($numtests);
2553 + # Run the crashing query
2557 + # Poll the server waiting for it to be back online again.
2558 + --source include/wait_until_connected_again.inc
2560 + # table and binlog should be in sync.
2561 + SELECT * FROM t1 ORDER BY id;
2562 + SHOW BINLOG EVENTS LIMIT 2,1;
2572 +DROP PROCEDURE setcrash;
2573 +--disable_query_log
2574 +eval SET GLOBAL innodb_file_format_max=$innodb_file_format_max_orig;
2577 +++ b/mysql-test/t/xa_binlog.test
2579 +--source include/have_innodb.inc
2580 +--source include/have_log_bin.inc
2582 +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB;
2584 +# Fix binlog format (otherwise SHOW BINLOG EVENTS will fluctuate).
2585 +SET binlog_format= mixed;
2590 +INSERT INTO t1 VALUES (1);
2592 +XA PREPARE 'xatest';
2593 +XA COMMIT 'xatest';
2596 +INSERT INTO t1 VALUES (2);
2598 +XA COMMIT 'xatest' ONE PHASE;
2601 +INSERT INTO t1 VALUES (3);
2604 +SELECT * FROM t1 ORDER BY a;
2606 +--replace_column 2 # 5 #
2607 +--replace_regex /xid=[0-9]+/xid=XX/
2608 +SHOW BINLOG EVENTS LIMIT 1,9;