5 #define my_b_tell(info) ((info)->pos_in_file + \
6 (size_t) (*(info)->current_pos - (info)->request_pos))
7 +#define my_b_write_tell(info) ((info)->pos_in_file + \
8 + ((info)->write_pos - (info)->write_buffer))
10 #define my_b_get_buffer_start(info) (info)->request_pos
11 #define my_b_get_bytes_in_buffer(info) (char*) (info)->read_end - \
12 --- a/include/mysql/plugin.h
13 +++ b/include/mysql/plugin.h
16 #define EXTENDED_FOR_USERSTAT
18 +#define EXTENDED_FOR_COMMIT_ORDERED
21 Create a temporary file.
26 static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
27 uint known_extensions_id= 0;
29 +static int commit_one_phase_low(THD *thd, bool all, THD_TRANS *trans,
30 + bool is_real_trans);
33 static plugin_ref ha_default_plugin(THD *thd)
36 bool is_real_trans= all || thd->transaction.all.ha_list == 0;
37 Ha_trx_info *ha_info= trans->ha_list;
38 - my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
39 + bool need_commit_ordered;
41 DBUG_ENTER("ha_commit_trans");
44 @@ -1152,13 +1155,20 @@
51 + /* Free resources and perform other cleanup even for 'empty' transactions. */
53 + thd->transaction.cleanup();
60 MDL_request mdl_request;
62 - DBUG_EXECUTE_IF("crash_commit_before", DBUG_SUICIDE(););
63 + DBUG_EXECUTE_IF("crash_commit_before", abort(););
65 /* Close all cursors that can not survive COMMIT */
66 if (is_real_trans) /* not a statement commit */
67 @@ -1197,57 +1207,80 @@
70 my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
71 - ha_rollback_trans(thd, all);
77 - if (!trans->no_2pc && (rw_ha_count > 1))
78 + if (trans->no_2pc || (rw_ha_count <= 1))
80 - for (; ha_info && !error; ha_info= ha_info->next())
81 + error= ha_commit_one_phase(thd, all);
82 + DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
86 + need_commit_ordered= FALSE;
87 + xid= thd->transaction.xid_state.xid.get_my_xid();
89 + for (Ha_trx_info *hi= ha_info; hi; hi= hi->next())
92 - handlerton *ht= ha_info->ht();
93 + handlerton *ht= hi->ht();
95 Do not call two-phase commit if this particular
96 transaction is read-only. This allows for simpler
97 implementation in engines that are always read-only.
99 - if (! ha_info->is_trx_read_write())
100 + if (! hi->is_trx_read_write())
103 Sic: we know that prepare() is not NULL since otherwise
104 trans->no_2pc would have been set.
106 - if ((err= ht->prepare(ht, thd, all)))
108 - my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
111 + err= ht->prepare(ht, thd, all);
112 status_var_increment(thd->status_var.ha_prepare_count);
114 + my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
119 + need_commit_ordered|= (ht->commit_ordered != NULL);
121 - DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_SUICIDE(););
122 - if (error || (is_real_trans && xid &&
123 - (error= !(cookie= tc_log->log_xid(thd, xid)))))
124 + DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT(););
126 + if (!is_real_trans)
128 - ha_rollback_trans(thd, all);
130 + error= commit_one_phase_low(thd, all, trans, is_real_trans);
131 + DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
134 - DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_SUICIDE(););
136 - error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
137 - DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_SUICIDE(););
140 + cookie= tc_log->log_and_order(thd, xid, all, need_commit_ordered);
144 + DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT(););
146 + error= commit_one_phase_low(thd, all, trans, is_real_trans) ? 2 : 0;
147 + DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
148 + if (is_real_trans) /* userstat.patch */
149 + thd->diff_commit_trans++; /* userstat.patch */
150 + RUN_HOOK(transaction, after_commit, (thd, FALSE));
152 + DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT(););
153 if(tc_log->unlog(cookie, xid))
158 - DBUG_EXECUTE_IF("crash_commit_after", DBUG_SUICIDE(););
160 - thd->diff_commit_trans++;
161 - RUN_HOOK(transaction, after_commit, (thd, FALSE));
163 + DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
166 + /* Come here if error and we need to rollback. */
168 + error= 1; /* Transaction was rolled back */
169 + ha_rollback_trans(thd, all);
172 if (rw_trans && mdl_request.ticket)
174 @@ -1260,9 +1293,6 @@
175 thd->mdl_context.release_lock(mdl_request.ticket);
178 - /* Free resources and perform other cleanup even for 'empty' transactions. */
179 - else if (is_real_trans)
180 - thd->transaction.cleanup();
184 @@ -1279,7 +1309,6 @@
186 int ha_commit_one_phase(THD *thd, bool all)
189 THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
191 "real" is a nick name for a transaction for which a commit will
192 @@ -1295,8 +1324,16 @@
193 transaction.all.ha_list, see why in trans_register_ha()).
195 bool is_real_trans=all || thd->transaction.all.ha_list == 0;
196 - Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
197 DBUG_ENTER("ha_commit_one_phase");
198 + DBUG_RETURN(commit_one_phase_low(thd, all, trans, is_real_trans));
202 +commit_one_phase_low(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans)
205 + Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
206 + DBUG_ENTER("commit_one_phase_low");
210 @@ -1894,7 +1931,16 @@
215 + Holding the LOCK_commit_ordered mutex ensures that we get the same
216 + snapshot for all engines (including the binary log). This allows us
217 + among other things to do backups with
218 + START TRANSACTION WITH CONSISTENT SNAPSHOT and
219 + have a consistent binlog position.
221 + mysql_mutex_lock(&LOCK_commit_ordered);
222 plugin_foreach(thd, snapshot_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &warn);
223 + mysql_mutex_unlock(&LOCK_commit_ordered);
226 Same idea as when one wants to CREATE TABLE in one engine which does not
230 and 'real commit' mean the same event.
232 int (*commit)(handlerton *hton, THD *thd, bool all);
234 + The commit_ordered() method is called prior to the commit() method, after
235 + the transaction manager has decided to commit (not rollback) the
236 + transaction. Unlike commit(), commit_ordered() is called only when the
237 + full transaction is committed, not for each commit of statement
238 + transaction in a multi-statement transaction.
240 + Note that, like prepare(), commit_ordered() is only called when 2-phase
241 + commit takes place. I.e. when no binary log and only a single engine
242 + participates in a transaction, one commit() is called, no
243 + commit_ordered(). So engines must be prepared for this.
245 + The calls to commit_ordered() in multiple parallel transactions are
246 + guaranteed to happen in the same order in every participating
247 + handler. This can be used to ensure the same commit order among multiple
248 + handlers (eg. in table handler and binlog). So if transaction T1 calls
249 + into commit_ordered() of handler A before T2, then T1 will also call
250 + commit_ordered() of handler B before T2.
252 + Engines that implement this method should during this call make the
253 + transaction visible to other transactions, thereby making the order of
254 + transaction commits be defined by the order of commit_ordered() calls.
256 + The intention is that commit_ordered() should do the minimal amount of
257 + work that needs to happen in consistent commit order among handlers. To
258 + preserve ordering, calls need to be serialised on a global mutex, so
259 + doing any time-consuming or blocking operations in commit_ordered() will
262 + Handlers can rely on commit_ordered() calls to be serialised (no two
263 + calls can run in parallel, so no extra locking on the handler part is
264 + required to ensure this).
266 + Note that commit_ordered() can be called from a different thread than the
267 + one handling the transaction! So it can not do anything that depends on
268 + thread local storage, in particular it can not call my_error() and
269 + friends (instead it can store the error code and delay the call of
270 + my_error() to the commit() method).
272 + Similarly, since commit_ordered() returns void, any return error code
273 + must be saved and returned from the commit() method instead.
275 + The commit_ordered method is optional, and can be left unset if not
276 + needed in a particular handler (then there will be no ordering guarantees
277 + wrt. other engines and binary log).
279 + void (*commit_ordered)(handlerton *hton, THD *thd, bool all);
280 int (*rollback)(handlerton *hton, THD *thd, bool all);
281 int (*prepare)(handlerton *hton, THD *thd, bool all);
282 int (*recover)(handlerton *hton, XID *xid_list, uint len);
287 #include "sql_plugin.h"
288 #include "rpl_handler.h"
289 +#include "debug_sync.h"
291 /* max size of the log message */
292 #define MAX_LOG_BUFFER_SIZE 1024
294 static int binlog_rollback(handlerton *hton, THD *thd, bool all);
295 static int binlog_prepare(handlerton *hton, THD *thd, bool all);
297 +static LEX_STRING const write_error_msg=
298 + { C_STRING_WITH_LEN("error writing to the binary log") };
300 +static my_bool mutexes_inited;
301 +mysql_mutex_t LOCK_group_commit_queue;
302 +mysql_mutex_t LOCK_commit_ordered;
304 +static ulonglong binlog_status_var_num_commits;
305 +static ulonglong binlog_status_var_num_group_commits;
307 +static SHOW_VAR binlog_status_vars_detail[]=
310 + (char *)&binlog_status_var_num_commits, SHOW_LONGLONG},
312 + (char *)&binlog_status_var_num_group_commits, SHOW_LONGLONG},
313 + {NullS, NullS, SHOW_LONG}
317 purge logs, master and slave sides both, related error code
323 - Helper class to hold a mutex for the duration of the
326 - Eliminates the need for explicit unlocking of mutexes on, e.g.,
327 - error returns. On passing a null pointer, the sentry will not do
333 - Mutex_sentry(mysql_mutex_t *mutex)
337 - mysql_mutex_lock(mutex);
343 - mysql_mutex_unlock(m_mutex);
350 - mysql_mutex_t *m_mutex;
352 - // It's not allowed to copy this object in any way
353 - Mutex_sentry(Mutex_sentry const&);
354 - void operator=(Mutex_sentry const&);
358 Helper classes to store non-transactional and transactional data
359 before copying it to the binary log.
362 binlog_cache_data(): m_pending(0), before_stmt_pos(MY_OFF_T_UNDEF),
363 incident(FALSE), changes_to_non_trans_temp_table_flag(FALSE),
364 saved_max_binlog_cache_size(0), ptr_binlog_cache_use(0),
365 - ptr_binlog_cache_disk_use(0)
366 + ptr_binlog_cache_disk_use(0), commit_bin_log_file_pos(0),
367 + using_xa(FALSE), xa_xid(0)
372 variable after truncating the cache.
374 cache_log.disk_writes= 0;
376 + commit_bin_log_file_pos= 0;
377 DBUG_ASSERT(empty());
382 binlog_cache_data& operator=(const binlog_cache_data& info);
383 binlog_cache_data(const binlog_cache_data& info);
387 + Binlog position after current commit, available to storage engines during
388 + commit_ordered() and commit().
390 + ulonglong commit_bin_log_file_pos;
393 + Flag set true if this transaction is committed with log_xid() as part of
400 class binlog_cache_mngr {
401 @@ -1627,7 +1629,7 @@
404 binlog_flush_cache(THD *thd, binlog_cache_data* cache_data, Log_event *end_evt,
405 - bool is_transactional)
406 + bool is_transactional, bool all)
408 DBUG_ENTER("binlog_flush_cache");
410 @@ -1646,8 +1648,8 @@
411 were, we would have to ensure that we're not ending a statement
412 inside a stored function.
414 - error= mysql_bin_log.write(thd, &cache_data->cache_log, end_evt,
415 - cache_data->has_incident());
416 + error= mysql_bin_log.write_transaction_to_binlog(thd, cache_data,
421 @@ -1666,12 +1668,12 @@
424 binlog_commit_flush_stmt_cache(THD *thd,
425 - binlog_cache_mngr *cache_mngr)
426 + binlog_cache_mngr *cache_mngr, bool all)
428 Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
429 FALSE, FALSE, TRUE, 0);
430 return (binlog_flush_cache(thd, &cache_mngr->stmt_cache, &end_evt,
436 @@ -1684,12 +1686,12 @@
437 nonzero if an error pops up when flushing the cache.
440 -binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr)
441 +binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr, bool all)
443 Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
444 TRUE, FALSE, TRUE, 0);
445 return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
451 @@ -1702,12 +1704,12 @@
452 nonzero if an error pops up when flushing the cache.
455 -binlog_rollback_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr)
456 +binlog_rollback_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr, bool all)
458 Query_log_event end_evt(thd, STRING_WITH_LEN("ROLLBACK"),
459 TRUE, FALSE, TRUE, 0);
460 return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
466 @@ -1722,11 +1724,11 @@
469 binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr,
471 + my_xid xid, bool all)
473 Xid_log_event end_evt(thd, xid);
474 return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
480 @@ -1788,7 +1790,7 @@
482 just pretend we can do 2pc, so that MySQL won't
484 - real work will be done in MYSQL_BIN_LOG::log_xid()
485 + real work will be done in MYSQL_BIN_LOG::log_and_order()
489 @@ -1821,7 +1823,7 @@
491 if (!cache_mngr->stmt_cache.empty())
493 - error= binlog_commit_flush_stmt_cache(thd, cache_mngr);
494 + error= binlog_commit_flush_stmt_cache(thd, cache_mngr, all);
497 if (cache_mngr->trx_cache.empty())
498 @@ -1840,7 +1842,7 @@
499 Otherwise, we accumulate the changes.
501 if (!error && ending_trans(thd, all))
502 - error= binlog_commit_flush_trx_cache(thd, cache_mngr);
503 + error= binlog_commit_flush_trx_cache(thd, cache_mngr, all);
506 This is part of the stmt rollback.
507 @@ -1884,7 +1886,7 @@
509 else if (!cache_mngr->stmt_cache.empty())
511 - error= binlog_commit_flush_stmt_cache(thd, cache_mngr);
512 + error= binlog_commit_flush_stmt_cache(thd, cache_mngr, all);
515 if (cache_mngr->trx_cache.empty())
516 @@ -1932,7 +1934,7 @@
517 (trans_has_updated_non_trans_table(thd) &&
518 ending_single_stmt_trans(thd,all) &&
519 thd->variables.binlog_format == BINLOG_FORMAT_MIXED)))
520 - error= binlog_rollback_flush_trx_cache(thd, cache_mngr);
521 + error= binlog_rollback_flush_trx_cache(thd, cache_mngr, all);
523 Truncate the cache if:
524 . aborting a single or multi-statement transaction or;
525 @@ -2907,6 +2909,7 @@
526 MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period)
527 :bytes_written(0), prepared_xids(0), file_id(1), open_count(1),
528 need_start_event(TRUE),
529 + group_commit_queue(0), num_commits(0), num_group_commits(0),
530 sync_period_ptr(sync_period),
531 is_relay_log(0), signal_cnt(0),
532 description_event_for_exec(0), description_event_for_queue(0)
533 @@ -5279,19 +5282,15 @@
536 cache Cache to write to the binary log
537 - lock_log True if the LOCK_log mutex should be aquired, false otherwise
538 - sync_log True if the log should be flushed and synced
541 Write the contents of the cache to the binary log. The cache will
542 be reset as a READ_CACHE to be able to read the contents from it.
545 -int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache,
546 - bool lock_log, bool sync_log)
547 +int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache)
549 - Mutex_sentry sentry(lock_log ? &LOCK_log : NULL);
551 + mysql_mutex_assert_owner(&LOCK_log);
552 if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
553 return ER_ERROR_ON_WRITE;
554 uint length= my_b_bytes_in_cache(cache), group, carry, hdr_offs;
555 @@ -5402,6 +5401,8 @@
558 /* Write data to the binary log file */
559 + DBUG_EXECUTE_IF("fail_binlog_write_1",
560 + errno= 28; return ER_ERROR_ON_WRITE;);
561 if (my_b_write(&log_file, cache->read_pos, length))
562 return ER_ERROR_ON_WRITE;
563 thd->binlog_bytes_written+= length;
564 @@ -5410,9 +5411,6 @@
566 DBUG_ASSERT(carry == 0);
569 - return flush_and_sync(0);
574 @@ -5453,8 +5451,6 @@
578 - LEX_STRING const write_error_msg=
579 - { C_STRING_WITH_LEN("error writing to the binary log") };
580 Incident incident= INCIDENT_LOST_EVENTS;
581 Incident_log_event ev(thd, incident, write_error_msg);
583 @@ -5496,104 +5492,320 @@
584 'cache' needs to be reinitialized after this functions returns.
587 -bool MYSQL_BIN_LOG::write(THD *thd, IO_CACHE *cache, Log_event *commit_event,
590 +MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd, binlog_cache_data *cache_data,
591 + Log_event *end_ev, bool all)
593 - DBUG_ENTER("MYSQL_BIN_LOG::write(THD *, IO_CACHE *, Log_event *)");
594 + group_commit_entry entry;
596 + DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_to_binlog");
599 + entry.cache_data= cache_data;
604 + Log "BEGIN" at the beginning of every transaction. Here, a transaction is
605 + either a BEGIN..COMMIT block or a single statement in autocommit mode.
607 + Create the necessary events here, where we have the correct THD (and
610 + Due to group commit the actual writing to binlog may happen in a different
613 + Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, FALSE, TRUE, 0);
614 + entry.begin_event= &qinfo;
615 + entry.end_event= end_ev;
616 + if (cache_data->has_incident())
618 + Incident_log_event inc_ev(thd, INCIDENT_LOST_EVENTS, write_error_msg);
619 + entry.incident_event= &inc_ev;
620 + ret = write_transaction_to_binlog_events(&entry);
624 + entry.incident_event= NULL;
625 + ret = write_transaction_to_binlog_events(&entry);
627 + if (!ret) /* userstat.patch */
628 + thd->binlog_bytes_written += qinfo.data_written; /* userstat.patch */
633 +MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
636 + To facilitate group commit for the binlog, we first queue up ourselves in
637 + the group commit queue. Then the first thread to enter the queue waits for
638 + the LOCK_log mutex, and commits for everyone in the queue once it gets the
639 + lock. Any other threads in the queue just wait for the first one to finish
640 + the commit and wake them up.
642 + entry->thd->clear_wakeup_ready();
643 + mysql_mutex_lock(&LOCK_group_commit_queue);
644 + group_commit_entry *orig_queue= group_commit_queue;
645 + entry->next= orig_queue;
646 + group_commit_queue= entry;
647 + DEBUG_SYNC(entry->thd, "commit_group_commit_queue");
648 + mysql_mutex_unlock(&LOCK_group_commit_queue);
651 + The first in the queue handles group commit for all; the others just wait
652 + to be signalled when group commit is done.
654 + if (orig_queue != NULL)
655 + entry->thd->wait_for_wakeup_ready();
657 + trx_group_commit_leader(entry);
659 + if (likely(!entry->error))
662 + switch (entry->error)
664 + case ER_ERROR_ON_WRITE:
665 + my_error(ER_ERROR_ON_WRITE, MYF(ME_NOREFRESH), name, entry->commit_errno);
667 + case ER_ERROR_ON_READ:
668 + my_error(ER_ERROR_ON_READ, MYF(ME_NOREFRESH),
669 + entry->cache_data->cache_log.file_name, entry->commit_errno);
673 + There are not (and should not be) any errors thrown not covered above.
674 + But just in case one is added later without updating the above switch
675 + statement, include a catch-all.
677 + my_printf_error(entry->error,
678 + "Error writing transaction to binary log: %d",
679 + MYF(ME_NOREFRESH), entry->error);
683 + Since we return error, this transaction XID will not be committed, so
684 + we need to mark it as not needed for recovery (unlog() is not called
685 + for a transaction if log_xid() fails).
687 + if (entry->cache_data->using_xa && entry->cache_data->xa_xid)
694 + Do binlog group commit as the lead thread.
696 + This must be called when this thread/transaction is queued at the start of
697 + the group_commit_queue. It will wait to obtain the LOCK_log mutex, then group
698 + commit all the transactions in the queue (more may have entered while waiting
699 + for LOCK_log). After commit is done, all other threads in the queue will be
704 +MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
706 + DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader");
708 + uint write_count= 0;
711 + Lock the LOCK_log(), and once we get it, collect any additional writes
712 + that queued up while we were waiting.
714 mysql_mutex_lock(&LOCK_log);
715 + DEBUG_SYNC(leader->thd, "commit_after_get_LOCK_log");
716 + mysql_mutex_lock(&LOCK_group_commit_queue);
717 + group_commit_entry *current= group_commit_queue;
718 + group_commit_queue= NULL;
719 + mysql_mutex_unlock(&LOCK_group_commit_queue);
721 + /* As the queue is in reverse order of entering, reverse it. */
722 + group_commit_entry *queue= NULL;
725 + group_commit_entry *next= current->next;
726 + current->next= queue;
730 + DBUG_ASSERT(leader == queue /* the leader should be first in queue */);
732 + /* Now we have in queue the list of transactions to be committed in order. */
733 DBUG_ASSERT(is_open());
734 if (likely(is_open())) // Should always be true
737 - We only bother to write to the binary log if there is anything
740 - if (my_b_tell(cache) > 0)
741 + Commit every transaction in the queue.
743 + Note that we are doing this in a different thread than the one running
744 + the transaction! So we are limited in the operations we can do. In
745 + particular, we cannot call my_error() on behalf of a transaction, as
746 + that obtains the THD from thread local storage. Instead, we must set
747 + current->error and let the thread do the error reporting itself once
750 + for (current= queue; current != NULL; current= current->next)
752 + binlog_cache_data *cache_data= current->cache_data;
753 + IO_CACHE *cache= &cache_data->cache_log;
756 - Log "BEGIN" at the beginning of every transaction. Here, a
757 - transaction is either a BEGIN..COMMIT block or a single
758 - statement in autocommit mode.
759 + We only bother to write to the binary log if there is anything
762 - Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, FALSE, TRUE, 0);
763 - if (qinfo.write(&log_file))
765 - thd->binlog_bytes_written+= qinfo.data_written;
766 - DBUG_EXECUTE_IF("crash_before_writing_xid",
768 - if ((write_error= write_cache(thd, cache, false, true)))
769 - DBUG_PRINT("info", ("error writing binlog cache: %d",
771 - DBUG_PRINT("info", ("crashing before writing xid"));
774 + if (my_b_tell(cache) > 0)
776 + if ((current->error= write_transaction(current)))
777 + current->commit_errno= errno;
779 - if ((write_error= write_cache(thd, cache, false, false)))
784 - if (commit_event && commit_event->write(&log_file))
787 - thd->binlog_bytes_written+= commit_event->data_written;
788 + cache_data->commit_bin_log_file_pos= my_b_write_tell(&log_file);
789 + if (cache_data->using_xa && cache_data->xa_xid)
793 - if (incident && write_incident(thd, FALSE))
796 + if (write_count > 0)
799 if (flush_and_sync(&synced))
801 - DBUG_EXECUTE_IF("half_binlogged_transaction", DBUG_SUICIDE(););
802 - if (cache->error) // Error on read
804 - sql_print_error(ER(ER_ERROR_ON_READ), cache->file_name, errno);
805 - write_error=1; // Don't give more errors
807 + for (current= queue; current != NULL; current= current->next)
809 + if (!current->error)
811 + current->error= ER_ERROR_ON_WRITE;
812 + current->commit_errno= errno;
821 if (RUN_HOOK(binlog_storage, after_flush,
822 - (thd, log_file_name, log_file.pos_in_file, synced)))
823 + (leader->thd, log_file_name, log_file.pos_in_file, synced)))
825 sql_print_error("Failed to run 'after_flush' hooks");
828 + for (current= queue; current != NULL; current= current->next)
830 + if (!current->error)
832 + current->error= ER_ERROR_ON_WRITE;
833 + current->commit_errno= errno;
842 - if commit_event is Xid_log_event, increase the number of
843 - prepared_xids (it's decreasd in ::unlog()). Binlog cannot be rotated
844 + if any commit_events are Xid_log_event, increase the number of
845 + prepared_xids (it's decreased in ::unlog()). Binlog cannot be rotated
846 if there're prepared xids in it - see the comment in new_file() for
848 - If the commit_event is not Xid_log_event (then it's a Query_log_event)
849 - rotate binlog, if necessary.
850 + If no Xid_log_events (then it's all Query_log_event) rotate binlog,
853 - if (commit_event && commit_event->get_type_code() == XID_EVENT)
856 - mysql_mutex_lock(&LOCK_prep_xids);
858 - mysql_mutex_unlock(&LOCK_prep_xids);
859 + mark_xids_active(xid_count);
862 if (rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED))
865 + for (current= queue; current != NULL; current= current->next)
867 + if (!current->error)
869 + current->error= ER_ERROR_ON_WRITE;
870 + current->commit_errno= errno;
875 + DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered");
876 + mysql_mutex_lock(&LOCK_commit_ordered);
878 + We cannot unlock LOCK_log until we have locked LOCK_commit_ordered;
879 + otherwise scheduling could allow the next group commit to run ahead of us,
880 + messing up the order of commit_ordered() calls. But as soon as
881 + LOCK_commit_ordered is obtained, we can let the next group commit start.
883 mysql_mutex_unlock(&LOCK_log);
884 + DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log");
885 + ++num_group_commits;
892 + Wakeup each participant waiting for our group commit, first calling the
893 + commit_ordered() methods for any transactions doing 2-phase commit.
896 + while (current != NULL)
899 - sql_print_error(ER(ER_ERROR_ON_WRITE), name, errno);
900 + group_commit_entry *next;
902 + DEBUG_SYNC(leader->thd, "commit_loop_entry_commit_ordered");
904 + if (current->cache_data->using_xa && !current->error)
905 + run_commit_ordered(current->thd, current->all);
908 + Careful not to access current->next after waking up the other thread! As
909 + it may change immediately after wakeup.
911 + next= current->next;
912 + if (current != leader) // Don't wake up ourself
913 + current->thd->signal_wakeup_ready();
916 - mysql_mutex_unlock(&LOCK_log);
918 + DEBUG_SYNC(leader->thd, "commit_after_group_run_commit_ordered");
919 + mysql_mutex_unlock(&LOCK_commit_ordered);
925 +MYSQL_BIN_LOG::write_transaction(group_commit_entry *entry)
927 + binlog_cache_data *cache_data= entry->cache_data;
928 + IO_CACHE *cache= &cache_data->cache_log;
930 + if (entry->begin_event->write(&log_file))
931 + return ER_ERROR_ON_WRITE;
933 + DBUG_EXECUTE_IF("crash_before_writing_xid",
935 + if ((write_cache(entry->thd, cache)))
936 + DBUG_PRINT("info", ("error writing binlog cache"));
940 + DBUG_PRINT("info", ("crashing before writing xid"));
944 + if (write_cache(entry->thd, cache))
945 + return ER_ERROR_ON_WRITE;
947 + if (entry->end_event->write(&log_file))
948 + return ER_ERROR_ON_WRITE;
950 + if (entry->incident_event && entry->incident_event->write(&log_file))
951 + return ER_ERROR_ON_WRITE;
953 + if (cache->error) // Error on read
954 + return ER_ERROR_ON_READ;
960 Wait until we get a signal that the relay log has been updated.
961 @@ -5999,6 +6211,68 @@
968 + mysql_mutex_init(key_LOCK_group_commit_queue, &LOCK_group_commit_queue, MY_MUTEX_INIT_SLOW);
969 + mysql_mutex_init(key_LOCK_commit_ordered, &LOCK_commit_ordered, MY_MUTEX_INIT_SLOW);
970 + mutexes_inited= TRUE;
977 + if (mutexes_inited)
979 + mysql_mutex_destroy(&LOCK_group_commit_queue);
980 + mysql_mutex_destroy(&LOCK_commit_ordered);
981 + mutexes_inited= FALSE;
987 +TC_LOG::run_commit_ordered(THD *thd, bool all)
989 + Ha_trx_info *ha_info=
990 + all ? thd->transaction.all.ha_list : thd->transaction.stmt.ha_list;
992 + mysql_mutex_assert_owner(&LOCK_commit_ordered);
993 + for (; ha_info; ha_info= ha_info->next())
995 + handlerton *ht= ha_info->ht();
996 + if (!ht->commit_ordered)
998 + ht->commit_ordered(ht, thd, all);
999 + DEBUG_SYNC(thd, "commit_after_run_commit_ordered");
1003 +int TC_LOG_MMAP::log_and_order(THD *thd, my_xid xid, bool all,
1004 + bool need_commit_ordered)
1010 + cookie= log_one_transaction(xid);
1012 + if (need_commit_ordered)
1014 + /* Only run commit_ordered() if log_xid was successful. */
1017 + mysql_mutex_lock(&LOCK_commit_ordered);
1018 + run_commit_ordered(thd, all);
1019 + mysql_mutex_unlock(&LOCK_commit_ordered);
1027 /********* transaction coordinator log for 2pc - mmap() based solution *******/
1030 @@ -6135,6 +6409,7 @@
1031 mysql_mutex_init(key_LOCK_pool, &LOCK_pool, MY_MUTEX_INIT_FAST);
1032 mysql_cond_init(key_COND_active, &COND_active, 0);
1033 mysql_cond_init(key_COND_pool, &COND_pool, 0);
1034 + mysql_cond_init(key_COND_queue_busy, &COND_queue_busy, 0);
1038 @@ -6142,6 +6417,8 @@
1041 pool_last=pages+npages-1;
1042 + commit_ordered_queue= NULL;
1043 + commit_ordered_queue_busy= false;
1047 @@ -6247,7 +6524,7 @@
1048 to the position in memory where xid was logged to.
1051 -int TC_LOG_MMAP::log_xid(THD *thd, my_xid xid)
1052 +int TC_LOG_MMAP::log_one_transaction(my_xid xid)
1056 @@ -6386,7 +6663,9 @@
1057 mysql_mutex_destroy(&LOCK_sync);
1058 mysql_mutex_destroy(&LOCK_active);
1059 mysql_mutex_destroy(&LOCK_pool);
1060 + mysql_cond_destroy(&COND_active);
1061 mysql_cond_destroy(&COND_pool);
1062 + mysql_cond_destroy(&COND_queue_busy);
1064 data[0]='A'; // garble the first (signature) byte, in case mysql_file_delete fails
1066 @@ -6596,42 +6875,87 @@
1067 mysql_cond_destroy(&COND_prep_xids);
1074 + Do a binlog log_xid() for a group of transactions, linked through
1075 + thd->next_commit_ordered.
1082 -int TC_LOG_BINLOG::log_xid(THD *thd, my_xid xid)
1083 +int TC_LOG_BINLOG::log_and_order(THD *thd, my_xid xid, bool all,
1084 + bool need_commit_ordered __attribute__((unused)))
1086 - DBUG_ENTER("TC_LOG_BINLOG::log");
1087 + DBUG_ENTER("TC_LOG_BINLOG::log_and_order");
1088 binlog_cache_mngr *cache_mngr=
1089 (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
1091 + cache_mngr->trx_cache.using_xa= TRUE;
1092 + cache_mngr->trx_cache.xa_xid= xid;
1094 We always commit the entire transaction when writing an XID. Also
1095 note that the return value is inverted.
1097 - DBUG_RETURN(!binlog_commit_flush_stmt_cache(thd, cache_mngr) &&
1098 - !binlog_commit_flush_trx_cache(thd, cache_mngr, xid));
1099 + DBUG_RETURN(!binlog_commit_flush_stmt_cache(thd, cache_mngr, all) &&
1100 + !binlog_commit_flush_trx_cache(thd, cache_mngr, xid, all));
1103 -int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
1105 + After an XID is logged, we need to hold on to the current binlog file until
1106 + it is fully committed in the storage engine. The reason is that crash
1107 + recovery only looks at the latest binlog, so we must make sure there are no
1108 + outstanding prepared (but not committed) transactions before rotating the
1111 + To handle this, we keep a count of outstanding XIDs. This function is used
1112 + to increase this count when committing one or more transactions to the
1116 +TC_LOG_BINLOG::mark_xids_active(uint xid_count)
1118 - DBUG_ENTER("TC_LOG_BINLOG::unlog");
1119 + DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active");
1120 + DBUG_PRINT("info", ("xid_count=%u", xid_count));
1121 + mysql_mutex_lock(&LOCK_prep_xids);
1122 + prepared_xids+= xid_count;
1123 + mysql_mutex_unlock(&LOCK_prep_xids);
1128 + Once an XID is committed, it is safe to rotate the binary log, as it can no
1129 + longer be needed during crash recovery.
1131 + This function is called to mark an XID this way. It needs to decrease the
1132 + count of pending XIDs, and signal the log rotator thread when it reaches zero.
1135 +TC_LOG_BINLOG::mark_xid_done()
1137 + my_bool send_signal;
1139 + DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done");
1140 mysql_mutex_lock(&LOCK_prep_xids);
1141 // prepared_xids can be 0 if the transaction had ignorable errors.
1142 DBUG_ASSERT(prepared_xids >= 0);
1143 if (prepared_xids > 0)
1145 - if (prepared_xids == 0) {
1146 + send_signal= (prepared_xids == 0);
1147 + mysql_mutex_unlock(&LOCK_prep_xids);
1148 + if (send_signal) {
1149 DBUG_PRINT("info", ("prepared_xids=%lu", prepared_xids));
1150 mysql_cond_signal(&COND_prep_xids);
1152 - mysql_mutex_unlock(&LOCK_prep_xids);
1153 - DBUG_RETURN(rotate_and_purge(0)); // as ::write() did not rotate
1157 +int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
1159 + DBUG_ENTER("TC_LOG_BINLOG::unlog");
1162 + DBUG_RETURN(rotate_and_purge(0));
1165 int TC_LOG_BINLOG::recover(IO_CACHE *log, Format_description_log_event *fdle)
1166 @@ -6700,9 +7024,67 @@
1168 return (ulonglong) mysql_bin_log.get_log_file()->pos_in_file;
1171 + Get the current position of the MySQL binlog for the transaction currently being
1174 + This is valid to call from within storage engine commit_ordered() and
1175 + commit() methods only.
1177 + Since it stores the position inside THD, it is safe to call without any
1180 + Note that currently the binlog file name is not stored inside THD, but this
1181 + is still safe as it can only change when the log is rotated, and we never
1182 + rotate the binlog while commits are pending inside storage engines.
1185 +void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file)
1187 + binlog_cache_mngr *cache_mngr;
1188 + if (binlog_hton->state == SHOW_OPTION_YES
1189 + && (cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton)))
1191 + *out_pos= cache_mngr->trx_cache.commit_bin_log_file_pos;
1192 + *out_file= mysql_bin_log.get_log_fname();
1200 #endif /* INNODB_COMPATIBILITY_HOOKS */
1203 +static int show_binlog_vars(THD *thd, SHOW_VAR *var, char *buff)
1205 + mysql_bin_log.set_status_variables();
1206 + var->type= SHOW_ARRAY;
1207 + var->value= (char *)&binlog_status_vars_detail;
1211 +static SHOW_VAR binlog_status_vars_top[]= {
1212 + {"binlog", (char *) &show_binlog_vars, SHOW_FUNC},
1213 + {NullS, NullS, SHOW_LONG}
1217 + Copy out current values of status variables, for SHOW STATUS or
1218 + information_schema.global_status.
1220 + This is called only under LOCK_status, so we can fill in a static array.
1223 +TC_LOG_BINLOG::set_status_variables()
1225 + mysql_mutex_lock(&LOCK_commit_ordered);
1226 + binlog_status_var_num_commits= this->num_commits;
1227 + binlog_status_var_num_group_commits= this->num_group_commits;
1228 + mysql_mutex_unlock(&LOCK_commit_ordered);
1231 struct st_mysql_storage_engine binlog_storage_engine=
1232 { MYSQL_HANDLERTON_INTERFACE_VERSION };
1234 @@ -6717,7 +7099,7 @@
1235 binlog_init, /* Plugin Init */
1236 NULL, /* Plugin Deinit */
1238 - NULL, /* status variables */
1239 + binlog_status_vars_top, /* status variables */
1240 NULL, /* system variables */
1241 NULL, /* config options */
1247 virtual int open(const char *opt_name)=0;
1248 virtual void close()=0;
1249 - virtual int log_xid(THD *thd, my_xid xid)=0;
1250 + virtual int log_and_order(THD *thd, my_xid xid, bool all,
1251 + bool need_commit_ordered)=0;
1252 virtual int unlog(ulong cookie, my_xid xid)=0;
1255 + void run_commit_ordered(THD *thd, bool all);
1259 + Locks used to ensure serialised execution of
1260 + TC_LOG::run_commit_ordered(), or any other code that calls handler
1261 + commit_ordered() methods.
1263 +extern mysql_mutex_t LOCK_group_commit_queue;
1264 +extern mysql_mutex_t LOCK_commit_ordered;
1266 +extern void TC_init();
1267 +extern void TC_destroy();
1269 class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging
1273 int open(const char *opt_name) { return 0; }
1275 - int log_xid(THD *thd, my_xid xid) { return 1; }
1277 + TC_LOG_DUMMY is only used when there are <= 1 XA-capable engines, and we
1278 + only use internal XA during commit when >= 2 XA-capable engines
1281 + int log_and_order(THD *thd, my_xid xid, bool all,
1282 + bool need_commit_ordered)
1284 + DBUG_ASSERT(0 /* Internal error - TC_LOG_DUMMY::log_and_order() called */);
1287 int unlog(ulong cookie, my_xid xid) { return 0; }
1291 mysql_cond_t cond; // to wait for a sync
1294 + /* List of THDs for which to invoke commit_ordered(), in order. */
1295 + struct commit_entry
1297 + struct commit_entry *next;
1301 char logname[FN_REFLEN];
1303 my_off_t file_length;
1304 @@ -94,16 +126,38 @@
1306 mysql_mutex_t LOCK_active, LOCK_pool, LOCK_sync;
1307 mysql_cond_t COND_pool, COND_active;
1309 + Queue of threads that need to call commit_ordered().
1310 + Access to this queue must be protected by LOCK_group_commit_queue
1312 + commit_entry *commit_ordered_queue;
1314 + This flag and condition are used to reserve the queue while threads in it
1315 + each run the commit_ordered() methods one after the other. Only once the
1316 + last commit_ordered() in the queue is done can we start on a new queue
1319 + Since we start this process in the first thread in the queue and finish in
1320 + the last (and possibly different) thread, we need a condition variable for
1321 + this (we cannot unlock a mutex in a different thread than the one who
1324 + The condition is used together with the LOCK_group_commit_queue mutex.
1326 + my_bool commit_ordered_queue_busy;
1327 + mysql_cond_t COND_queue_busy;
1330 TC_LOG_MMAP(): inited(0) {}
1331 int open(const char *opt_name);
1333 - int log_xid(THD *thd, my_xid xid);
1334 + int log_and_order(THD *thd, my_xid xid, bool all,
1335 + bool need_commit_ordered);
1336 int unlog(ulong cookie, my_xid xid);
1340 + int log_one_transaction(my_xid xid);
1341 void get_active_from_pool();
1344 @@ -271,9 +325,31 @@
1348 +class binlog_cache_data;
1349 class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
1352 + struct group_commit_entry
1354 + struct group_commit_entry *next;
1356 + binlog_cache_data *cache_data;
1358 + Extra events (BEGIN, COMMIT/ROLLBACK/XID, and possibly INCIDENT) to be
1359 + written during group commit. The incident_event is only valid if
1360 + cache_data->has_incident() is true.
1362 + Log_event *begin_event;
1363 + Log_event *end_event;
1364 + Log_event *incident_event;
1365 + /* Set during group commit to record any per-thread error. */
1368 + /* This is the `all' parameter for ha_commit_ordered(). */
1370 + /* True if we come in through XA log_and_order(), false otherwise. */
1373 #ifdef HAVE_PSI_INTERFACE
1374 /** The instrumentation key to use for @ LOCK_index. */
1375 PSI_mutex_key m_key_LOCK_index;
1376 @@ -325,6 +401,12 @@
1377 In 5.0 it's 0 for relay logs too!
1379 bool no_auto_events;
1380 + /* Queue of transactions queued up to participate in group commit. */
1381 + group_commit_entry *group_commit_queue;
1382 + /* Total number of committed transactions. */
1383 + ulonglong num_commits;
1384 + /* Number of group commits done. */
1385 + ulonglong num_group_commits;
1387 /* pointer to the sync period variable, for binlog this will be
1388 sync_binlog_period, for relay log this will be
1389 @@ -346,6 +428,11 @@
1391 int new_file_without_locking();
1392 int new_file_impl(bool need_lock);
1393 + int write_transaction(group_commit_entry *entry);
1394 + bool write_transaction_to_binlog_events(group_commit_entry *entry);
1395 + void trx_group_commit_leader(group_commit_entry *leader);
1396 + void mark_xid_done();
1397 + void mark_xids_active(uint xid_count);
1400 MYSQL_LOG::generate_name;
1403 int open(const char *opt_name);
1405 - int log_xid(THD *thd, my_xid xid);
1406 + int log_and_order(THD *thd, my_xid xid, bool all,
1407 + bool need_commit_ordered);
1408 int unlog(ulong cookie, my_xid xid);
1409 int recover(IO_CACHE *log, Format_description_log_event *fdle);
1410 #if !defined(MYSQL_CLIENT)
1411 @@ -434,11 +522,11 @@
1414 bool write(Log_event* event_info); // binary log write
1415 - bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event, bool incident);
1416 + bool write_transaction_to_binlog(THD *thd, binlog_cache_data *cache_data,
1417 + Log_event *end_ev, bool all);
1418 bool write_incident(THD *thd, bool lock);
1420 - int write_cache(THD *thd, IO_CACHE *cache,
1421 - bool lock_log, bool flush_and_sync);
1422 + int write_cache(THD *thd, IO_CACHE *cache);
1423 void set_write_error(THD *thd, bool is_transactional);
1424 bool check_write_error(THD *thd);
1427 inline void unlock_index() { mysql_mutex_unlock(&LOCK_index);}
1428 inline IO_CACHE *get_index_file() { return &index_file;}
1429 inline uint32 get_open_count() { return open_count; }
1430 + void set_status_variables();
1433 class Log_event_handler
1436 @@ -1490,6 +1490,7 @@
1441 delegates_destroy();
1444 @@ -4061,6 +4062,8 @@
1445 query_response_time_init();
1446 #endif // HAVE_RESPONSE_TIME_DISTRIBUTION
1447 /* We have to initialize the storage engines before CSV logging */
1450 init_global_table_stats();
1451 init_global_index_stats();
1453 @@ -8004,6 +8007,7 @@
1454 key_LOCK_error_messages, key_LOG_INFO_lock, key_LOCK_thread_count,
1455 key_PARTITION_LOCK_auto_inc;
1456 PSI_mutex_key key_RELAYLOG_LOCK_index;
1457 +PSI_mutex_key key_LOCK_wakeup_ready, key_LOCK_group_commit_queue, key_LOCK_commit_ordered;
1459 static PSI_mutex_info all_server_mutexes[]=
1461 @@ -8024,6 +8028,7 @@
1462 { &key_delayed_insert_mutex, "Delayed_insert::mutex", 0},
1463 { &key_hash_filo_lock, "hash_filo::lock", 0},
1464 { &key_LOCK_active_mi, "LOCK_active_mi", PSI_FLAG_GLOBAL},
1465 + { &key_LOCK_commit_ordered, "LOCK_commit_ordered", PSI_FLAG_GLOBAL},
1466 { &key_LOCK_connection_count, "LOCK_connection_count", PSI_FLAG_GLOBAL},
1467 { &key_LOCK_crypt, "LOCK_crypt", PSI_FLAG_GLOBAL},
1468 { &key_LOCK_delayed_create, "LOCK_delayed_create", PSI_FLAG_GLOBAL},
1469 @@ -8039,6 +8044,7 @@
1470 "LOCK_global_index_stats", PSI_FLAG_GLOBAL},
1471 { &key_LOCK_gdl, "LOCK_gdl", PSI_FLAG_GLOBAL},
1472 { &key_LOCK_global_system_variables, "LOCK_global_system_variables", PSI_FLAG_GLOBAL},
1473 + { &key_LOCK_group_commit_queue, "LOCK_group_commit_queue", PSI_FLAG_GLOBAL},
1474 { &key_LOCK_manager, "LOCK_manager", PSI_FLAG_GLOBAL},
1475 { &key_LOCK_prepared_stmt_count, "LOCK_prepared_stmt_count", PSI_FLAG_GLOBAL},
1476 { &key_LOCK_rpl_status, "LOCK_rpl_status", PSI_FLAG_GLOBAL},
1477 @@ -8050,6 +8056,7 @@
1478 { &key_LOCK_temporary_tables, "THD::LOCK_temporary_tables", 0},
1479 { &key_LOCK_user_conn, "LOCK_user_conn", PSI_FLAG_GLOBAL},
1480 { &key_LOCK_uuid_generator, "LOCK_uuid_generator", PSI_FLAG_GLOBAL},
1481 + { &key_LOCK_wakeup_ready, "THD::LOCK_wakeup_ready", 0},
1482 { &key_LOG_LOCK_log, "LOG::LOCK_log", 0},
1483 { &key_master_info_data_lock, "Master_info::data_lock", 0},
1484 { &key_master_info_run_lock, "Master_info::run_lock", 0},
1485 @@ -8097,6 +8104,7 @@
1486 key_TABLE_SHARE_cond, key_user_level_lock_cond,
1487 key_COND_thread_count, key_COND_thread_cache, key_COND_flush_thread_cache;
1488 PSI_cond_key key_RELAYLOG_update_cond;
1489 +PSI_cond_key key_COND_wakeup_ready, key_COND_queue_busy;
1491 static PSI_cond_info all_server_conds[]=
1493 @@ -8113,8 +8121,10 @@
1494 { &key_RELAYLOG_update_cond, "MYSQL_RELAY_LOG::update_cond", 0},
1495 { &key_COND_cache_status_changed, "Query_cache::COND_cache_status_changed", 0},
1496 { &key_COND_manager, "COND_manager", PSI_FLAG_GLOBAL},
1497 + { &key_COND_queue_busy, "COND_queue_busy", PSI_FLAG_GLOBAL},
1498 { &key_COND_rpl_status, "COND_rpl_status", PSI_FLAG_GLOBAL},
1499 { &key_COND_server_started, "COND_server_started", PSI_FLAG_GLOBAL},
1500 + { &key_COND_wakeup_ready, "THD::COND_wakeup_ready", 0},
1501 { &key_delayed_insert_cond, "Delayed_insert::cond", 0},
1502 { &key_delayed_insert_cond_client, "Delayed_insert::cond_client", 0},
1503 { &key_item_func_sleep_cond, "Item_func_sleep::cond", 0},
1507 key_structure_guard_mutex, key_TABLE_SHARE_LOCK_ha_data,
1508 key_LOCK_error_messages, key_LOCK_thread_count, key_PARTITION_LOCK_auto_inc;
1509 extern PSI_mutex_key key_RELAYLOG_LOCK_index;
1510 +extern PSI_mutex_key key_LOCK_wakeup_ready, key_LOCK_group_commit_queue, key_LOCK_commit_ordered;
1512 extern PSI_rwlock_key key_rwlock_LOCK_grant, key_rwlock_LOCK_logger,
1513 key_rwlock_LOCK_sys_init_connect, key_rwlock_LOCK_sys_init_slave,
1515 key_TABLE_SHARE_cond, key_user_level_lock_cond,
1516 key_COND_thread_count, key_COND_thread_cache, key_COND_flush_thread_cache;
1517 extern PSI_cond_key key_RELAYLOG_update_cond;
1518 +extern PSI_cond_key key_COND_wakeup_ready, key_COND_queue_busy;
1520 extern PSI_thread_key key_thread_bootstrap, key_thread_delayed_insert,
1521 key_thread_handle_manager, key_thread_kill_server, key_thread_main,
1522 --- a/sql/sql_class.cc
1523 +++ b/sql/sql_class.cc
1525 mysql_mutex_init(key_LOCK_thd_data, &LOCK_thd_data, MY_MUTEX_INIT_FAST);
1526 mysql_mutex_init(key_LOCK_temporary_tables, &LOCK_temporary_tables,
1527 MY_MUTEX_INIT_FAST);
1528 + mysql_mutex_init(key_LOCK_wakeup_ready, &LOCK_wakeup_ready, MY_MUTEX_INIT_FAST);
1529 + mysql_cond_init(key_COND_wakeup_ready, &COND_wakeup_ready, NULL);
1531 /* Variables with default values */
1533 @@ -1516,6 +1518,8 @@
1536 free_root(&transaction.mem_root,MYF(0));
1537 + mysql_cond_destroy(&COND_wakeup_ready);
1538 + mysql_mutex_destroy(&LOCK_wakeup_ready);
1539 mysql_mutex_destroy(&LOCK_thd_data);
1540 mysql_mutex_destroy(&LOCK_temporary_tables);
1542 @@ -5199,6 +5203,24 @@
1547 +THD::wait_for_wakeup_ready()
1549 + mysql_mutex_lock(&LOCK_wakeup_ready);
1550 + while (!wakeup_ready)
1551 + mysql_cond_wait(&COND_wakeup_ready, &LOCK_wakeup_ready);
1552 + mysql_mutex_unlock(&LOCK_wakeup_ready);
1556 +THD::signal_wakeup_ready()
1558 + mysql_mutex_lock(&LOCK_wakeup_ready);
1559 + wakeup_ready= true;
1560 + mysql_mutex_unlock(&LOCK_wakeup_ready);
1561 + mysql_cond_signal(&COND_wakeup_ready);
1564 bool Discrete_intervals_list::append(ulonglong start, ulonglong val,
1567 --- a/sql/sql_class.h
1568 +++ b/sql/sql_class.h
1569 @@ -3017,6 +3017,14 @@
1570 LEX_STRING get_invoker_user() { return invoker_user; }
1571 LEX_STRING get_invoker_host() { return invoker_host; }
1572 bool has_invoker() { return invoker_user.length > 0; }
1573 + void clear_wakeup_ready() { wakeup_ready= false; }
1575 + Sleep waiting for others to wake us up with signal_wakeup_ready().
1576 + Must call clear_wakeup_ready() before waiting.
1578 + void wait_for_wakeup_ready();
1579 + /* Wake this thread up from wait_for_wakeup_ready(). */
1580 + void signal_wakeup_ready();
1583 /** The current internal error handler for this thread, or NULL. */
1584 @@ -3059,6 +3067,16 @@
1586 LEX_STRING invoker_user;
1587 LEX_STRING invoker_host;
1589 + Flag, mutex and condition for a thread to wait for a signal from another
1592 + Currently used to wait for group commit to complete, can also be used for
1595 + bool wakeup_ready;
1596 + mysql_mutex_t LOCK_wakeup_ready;
1597 + mysql_cond_t COND_wakeup_ready;
1600 /* Returns string as 'IP' for the client-side of the connection represented by
1601 --- a/sql/sql_parse.cc
1602 +++ b/sql/sql_parse.cc
1603 @@ -889,6 +889,10 @@
1604 DBUG_ENTER("dispatch_command");
1605 DBUG_PRINT("info",("packet: '%*.s'; command: %d", packet_length, packet, command));
1607 + DBUG_EXECUTE_IF("crash_dispatch_command_before",
1608 + { DBUG_PRINT("crash_dispatch_command_before", ("now"));
1611 #if defined(ENABLED_PROFILING)
1612 thd->profiling.start_new_query();
1614 --- a/mysql-test/suite/perfschema/r/dml_setup_instruments.result
1615 +++ b/mysql-test/suite/perfschema/r/dml_setup_instruments.result
1617 wait/synch/mutex/sql/HA_DATA_PARTITION::LOCK_auto_inc YES YES
1618 wait/synch/mutex/sql/LOCK_active_mi YES YES
1619 wait/synch/mutex/sql/LOCK_audit_mask YES YES
1620 +wait/synch/mutex/sql/LOCK_commit_ordered YES YES
1621 wait/synch/mutex/sql/LOCK_connection_count YES YES
1622 wait/synch/mutex/sql/LOCK_crypt YES YES
1623 -wait/synch/mutex/sql/LOCK_delayed_create YES YES
1624 select * from performance_schema.setup_instruments
1625 where name like 'Wait/Synch/Rwlock/sql/%'
1626 and name not in ('wait/synch/rwlock/sql/CRYPTO_dynlock_value::lock')
1629 wait/synch/cond/sql/COND_flush_thread_cache YES YES
1630 wait/synch/cond/sql/COND_manager YES YES
1631 +wait/synch/cond/sql/COND_queue_busy YES YES
1632 wait/synch/cond/sql/COND_queue_state YES YES
1633 wait/synch/cond/sql/COND_rpl_status YES YES
1634 wait/synch/cond/sql/COND_server_started YES YES
1636 wait/synch/cond/sql/COND_thread_count YES YES
1637 wait/synch/cond/sql/Delayed_insert::cond YES YES
1638 wait/synch/cond/sql/Delayed_insert::cond_client YES YES
1639 -wait/synch/cond/sql/Event_scheduler::COND_state YES YES
1640 select * from performance_schema.setup_instruments
1642 select * from performance_schema.setup_instruments
1643 --- a/storage/innobase/handler/ha_innodb.cc
1644 +++ b/storage/innobase/handler/ha_innodb.cc
1646 static INNOBASE_SHARE *get_share(const char *table_name);
1647 static void free_share(INNOBASE_SHARE *share);
1648 static int innobase_close_connection(handlerton *hton, THD* thd);
1649 +#ifdef EXTENDED_FOR_COMMIT_ORDERED
1650 +static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all);
1652 static int innobase_commit(handlerton *hton, THD* thd, bool all);
1653 static int innobase_rollback(handlerton *hton, THD* thd, bool all);
1654 static int innobase_rollback_to_savepoint(handlerton *hton, THD* thd,
1655 @@ -1699,7 +1702,10 @@
1656 trx_t* trx) /*!< in/out: InnoDB transaction handle */
1658 DBUG_ENTER("innobase_trx_init");
1659 +#ifndef EXTENDED_FOR_COMMIT_ORDERED
1660 + /* used by innobase_commit_ordered */
1661 DBUG_ASSERT(EQ_CURRENT_THD(thd));
1663 DBUG_ASSERT(thd == trx->mysql_thd);
1665 trx->check_foreigns = !thd_test_options(
1666 @@ -1760,7 +1766,10 @@
1668 trx_t*& trx = thd_to_trx(thd);
1670 +#ifndef EXTENDED_FOR_COMMIT_ORDERED
1671 + /* used by innobase_commit_ordered */
1672 ut_ad(EQ_CURRENT_THD(thd));
1676 trx = innobase_trx_allocate(thd);
1677 @@ -1846,6 +1855,7 @@
1679 trx->is_registered = 0;
1680 trx->owns_prepare_mutex = 0;
1681 + trx->called_commit_ordered = 0;
1684 /*********************************************************************//**
1685 @@ -1861,6 +1871,29 @@
1688 /*********************************************************************//**
1692 +trx_called_commit_ordered_set(
1693 +/*==========================*/
1696 + ut_a(trx_is_registered_for_2pc(trx));
1697 + trx->called_commit_ordered = 1;
1700 +/*********************************************************************//**
1704 +trx_called_commit_ordered(
1705 +/*======================*/
1708 + return(trx->called_commit_ordered == 1);
1711 +/*********************************************************************//**
1712 Check if transaction is started.
1713 @reutrn true if transaction is in state started */
1715 @@ -2435,6 +2468,9 @@
1716 innobase_hton->savepoint_set=innobase_savepoint;
1717 innobase_hton->savepoint_rollback=innobase_rollback_to_savepoint;
1718 innobase_hton->savepoint_release=innobase_release_savepoint;
1719 +#ifdef EXTENDED_FOR_COMMIT_ORDERED
1720 + innobase_hton->commit_ordered=innobase_commit_ordered;
1722 innobase_hton->commit=innobase_commit;
1723 innobase_hton->rollback=innobase_rollback;
1724 innobase_hton->prepare=innobase_xa_prepare;
1725 @@ -3187,6 +3223,126 @@
1729 +#ifdef EXTENDED_FOR_COMMIT_ORDERED
1731 + InnoDB is coded with the intention that a trx is always accessed by its owner thd
1732 + (it is not protected by any mutex/lock).
1733 + So the caller of innobase_commit_ordered() must take care of cache coherency
1734 + of the trx between CPUs if it is called from another thd.
1736 + In MariaDB's first implementation the coherency seems to be protected by
1737 + the pthread_mutex LOCK_wakeup_ready, so this is not a problem for now.
1739 + But we should be aware of the importance of the coherency.
1741 +/*****************************************************************//**
1742 +Low-level function for innobase_commit_ordered().*/
1745 +innobase_commit_ordered_low(
1746 +/*========================*/
1747 + trx_t* trx, /*!< in: Innodb transaction */
1748 + THD* thd) /*!< in: MySQL thread handle */
1750 + ulonglong tmp_pos;
1751 + DBUG_ENTER("innobase_commit_ordered");
1753 + /* This part was from innobase_commit() */
1755 + /* We need current binlog position for ibbackup to work.
1756 + Note, the position is current because commit_ordered is guaranteed
1757 + to be called in same sequence as writing to binlog. */
1759 + if (innobase_commit_concurrency > 0) {
1760 + mysql_mutex_lock(&commit_cond_m);
1763 + if (commit_threads > innobase_commit_concurrency) {
1765 + mysql_cond_wait(&commit_cond,
1767 + mysql_mutex_unlock(&commit_cond_m);
1771 + mysql_mutex_unlock(&commit_cond_m);
1775 + mysql_bin_log_commit_pos(thd, &tmp_pos, &(trx->mysql_log_file_name));
1776 + trx->mysql_log_offset = (ib_int64_t) tmp_pos;
1778 + /* Don't do write + flush right now. For group commit
1779 + to work we want to do the flush in the innobase_commit()
1780 + method, which runs without holding any locks. */
1781 + trx->flush_log_later = TRUE;
1782 + innobase_commit_low(trx);
1783 + trx->flush_log_later = FALSE;
1785 + if (innobase_commit_concurrency > 0) {
1786 + mysql_mutex_lock(&commit_cond_m);
1788 + mysql_cond_signal(&commit_cond);
1789 + mysql_mutex_unlock(&commit_cond_m);
1795 +/*****************************************************************//**
1796 +Perform the first, fast part of InnoDB commit.
1798 +Doing it in this call ensures that we get the same commit order here
1799 +as in binlog and any other participating transactional storage engines.
1801 +Note that we want to do as little as really needed here, as we run
1802 +under a global mutex. The expensive fsync() is done later, in
1803 +innobase_commit(), without a lock so group commit can take place.
1805 +Note also that this method can be called from a different thread than
1806 +the one handling the rest of the transaction. */
1809 +innobase_commit_ordered(
1810 +/*====================*/
1811 + handlerton *hton, /*!< in: Innodb handlerton */
1812 + THD* thd, /*!< in: MySQL thread handle of the user for whom
1813 + the transaction should be committed */
1814 + bool all) /*!< in: TRUE - commit transaction
1815 + FALSE - the current SQL statement ended */
1818 + DBUG_ENTER("innobase_commit_ordered");
1819 + DBUG_ASSERT(hton == innodb_hton_ptr);
1821 + trx = check_trx_exists(thd);
1823 + /* Since we will reserve the kernel mutex, we have to release
1824 + the search system latch first to obey the latching order. */
1826 + if (trx->has_search_latch) {
1827 + trx_search_latch_release_if_reserved(trx);
1830 + if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
1831 + /* We cannot throw error here; instead we will catch this error
1832 + again in innobase_commit() and report it from there. */
1836 + /* commit_ordered is only called when committing the whole transaction
1837 + (or an SQL statement when autocommit is on). */
1838 + DBUG_ASSERT(all ||
1839 + (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)));
1841 + innobase_commit_ordered_low(trx, thd);
1843 + trx_called_commit_ordered_set(trx);
1847 +#endif /* EXTENDED_FOR_COMMIT_ORDERED */
1849 /*****************************************************************//**
1850 Commits a transaction in an InnoDB database or marks an SQL statement
1852 @@ -3238,6 +3394,16 @@
1853 /* We were instructed to commit the whole transaction, or
1854 this is an SQL statement end and autocommit is on */
1856 +#ifdef EXTENDED_FOR_COMMIT_ORDERED
1857 + ut_ad(!trx_has_prepare_commit_mutex(trx));
1859 + /* Run the fast part of commit if we did not already. */
1860 + if (!trx_called_commit_ordered(trx)) {
1861 + innobase_commit_ordered_low(trx, thd);
1864 + ut_ad(!trx_called_commit_ordered(trx));
1866 /* We need current binlog position for ibbackup to work.
1867 Note, the position is current because of
1868 prepare_commit_mutex */
1869 @@ -3292,6 +3458,7 @@
1871 mysql_mutex_unlock(&prepare_commit_mutex);
1873 +#endif /* EXTENDED_FOR_COMMIT_ORDERED */
1875 trx_deregister_from_2pc(trx);
1877 @@ -10973,6 +11140,7 @@
1879 srv_active_wake_master_thread();
1881 +#ifndef EXTENDED_FOR_COMMIT_ORDERED
1882 if (thd_sql_command(thd) != SQLCOM_XA_PREPARE
1884 || !thd_test_options(
1885 @@ -10999,6 +11167,7 @@
1886 mysql_mutex_lock(&prepare_commit_mutex);
1887 trx_owns_prepare_commit_mutex_set(trx);
1889 +#endif /* ifndef EXTENDED_FOR_COMMIT_ORDERED */
1893 --- a/storage/innobase/handler/ha_innodb.h
1894 +++ b/storage/innobase/handler/ha_innodb.h
1895 @@ -240,6 +240,12 @@
1896 struct charset_info_st *thd_charset(MYSQL_THD thd);
1897 LEX_STRING *thd_query_string(MYSQL_THD thd);
1899 +#ifdef EXTENDED_FOR_COMMIT_ORDERED
1900 +/** Get the file name and position of the MySQL binlog corresponding to the
1903 +void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file);
1905 /** Get the file name of the MySQL binlog.
1906 * @return the name of the binlog file
1909 * @return byte offset from the beginning of the binlog
1911 ulonglong mysql_bin_log_file_pos(void);
1915 Check if a user thread is a replication slave thread
1916 --- a/storage/innobase/include/trx0trx.h
1917 +++ b/storage/innobase/include/trx0trx.h
1919 this is set to 1 then registered should
1920 also be set to 1. This is used in the
1922 + unsigned called_commit_ordered:1;/* 1 if innobase_commit_ordered has run. */
1923 /*------------------------------*/
1924 ulint isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */
1925 ulint check_foreigns; /* normally TRUE, but if the user
1926 --- a/storage/innobase/trx/trx0trx.c
1927 +++ b/storage/innobase/trx/trx0trx.c
1930 trx->is_registered = 0;
1931 trx->owns_prepare_mutex = 0;
1932 + trx->called_commit_ordered = 0;
1934 trx->start_time = ut_time();
1937 +++ b/mysql-test/r/group_commit.result
1939 +CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
1940 +SELECT variable_value INTO @commits FROM information_schema.global_status
1941 +WHERE variable_name = 'binlog_commits';
1942 +SELECT variable_value INTO @group_commits FROM information_schema.global_status
1943 +WHERE variable_name = 'binlog_group_commits';
1944 +SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group1_running WAIT_FOR group2_queued";
1945 +INSERT INTO t1 VALUES ("con1");
1946 +set DEBUG_SYNC= "now WAIT_FOR group1_running";
1947 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con2";
1948 +SET DEBUG_SYNC= "commit_after_release_LOCK_log WAIT_FOR group3_committed";
1949 +SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
1950 +INSERT INTO t1 VALUES ("con2");
1951 +SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
1952 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con3";
1953 +INSERT INTO t1 VALUES ("con3");
1954 +SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
1955 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con4";
1956 +INSERT INTO t1 VALUES ("con4");
1957 +SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
1958 +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
1959 +SELECT * FROM t1 ORDER BY a;
1961 +SET DEBUG_SYNC= "now SIGNAL group2_queued";
1962 +SELECT * FROM t1 ORDER BY a;
1965 +SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
1966 +SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con5_leader WAIT_FOR con6_queued";
1967 +INSERT INTO t1 VALUES ("con5");
1968 +SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
1969 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con6_queued";
1970 +INSERT INTO t1 VALUES ("con6");
1971 +SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
1972 +SELECT * FROM t1 ORDER BY a;
1975 +SET DEBUG_SYNC= "now SIGNAL group3_committed";
1976 +SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
1977 +SELECT * FROM t1 ORDER BY a;
1983 +SET DEBUG_SYNC= "now SIGNAL group2_checked";
1984 +SELECT * FROM t1 ORDER BY a;
1992 +SELECT variable_value - @commits FROM information_schema.global_status
1993 +WHERE variable_name = 'binlog_commits';
1994 +variable_value - @commits
1996 +SELECT variable_value - @group_commits FROM information_schema.global_status
1997 +WHERE variable_name = 'binlog_group_commits';
1998 +variable_value - @group_commits
2000 +SET DEBUG_SYNC= 'RESET';
2003 +++ b/mysql-test/r/group_commit_binlog_pos.result
2005 +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2006 +INSERT INTO t1 VALUES (0);
2007 +SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con1_waiting WAIT_FOR con3_queued";
2008 +SET DEBUG_SYNC= "commit_loop_entry_commit_ordered SIGNAL con1_loop WAIT_FOR con1_loop_cont EXECUTE 3";
2009 +INSERT INTO t1 VALUES (1);
2010 +SET DEBUG_SYNC= "now WAIT_FOR con1_waiting";
2011 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con2_queued";
2012 +INSERT INTO t1 VALUES (2);
2013 +SET DEBUG_SYNC= "now WAIT_FOR con2_queued";
2014 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con3_queued";
2015 +INSERT INTO t1 VALUES (3);
2016 +SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2017 +SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2018 +SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2019 +SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2020 +SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2021 +SELECT * FROM t1 ORDER BY a;
2026 +SET SESSION debug="+d,crash_dispatch_command_before";
2028 +Got one of the listed errors
2029 +Got one of the listed errors
2030 +Got one of the listed errors
2031 +SELECT * FROM t1 ORDER BY a;
2037 +InnoDB: Last MySQL binlog file position 0 768, file name ./master-bin.000001
2038 +SET DEBUG_SYNC= 'RESET';
2041 +++ b/mysql-test/r/group_commit_crash.result
2043 +CREATE TABLE t1(a CHAR(255),
2047 +id INT AUTO_INCREMENT,
2048 +PRIMARY KEY(id)) ENGINE=InnoDB;
2049 +create table t2 like t1;
2050 +create procedure setcrash(IN i INT)
2053 +WHEN 1 THEN SET SESSION debug="d,crash_commit_after_prepare";
2054 +WHEN 2 THEN SET SESSION debug="d,crash_commit_after_log";
2055 +WHEN 3 THEN SET SESSION debug="d,crash_commit_before_unlog";
2056 +WHEN 4 THEN SET SESSION debug="d,crash_commit_after";
2057 +WHEN 5 THEN SET SESSION debug="d,crash_commit_before";
2062 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2063 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2064 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2065 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2066 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2067 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2068 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2069 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2070 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2071 +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2074 +insert into t1 select * from t2;
2077 +Got one of the listed errors
2078 +SELECT * FROM t1 ORDER BY id;
2080 +SHOW BINLOG EVENTS LIMIT 2,1;
2081 +Log_name Pos Event_type Server_id End_log_pos Info
2085 +insert into t1 select * from t2;
2088 +Got one of the listed errors
2089 +SELECT * FROM t1 ORDER BY id;
2101 +SHOW BINLOG EVENTS LIMIT 2,1;
2102 +Log_name Pos Event_type Server_id End_log_pos Info
2103 +master-bin.000001 175 Query 1 269 use `test`; insert into t1 select * from t2
2107 +insert into t1 select * from t2;
2110 +Got one of the listed errors
2111 +SELECT * FROM t1 ORDER BY id;
2123 +SHOW BINLOG EVENTS LIMIT 2,1;
2124 +Log_name Pos Event_type Server_id End_log_pos Info
2125 +master-bin.000001 175 Query 1 269 use `test`; insert into t1 select * from t2
2129 +insert into t1 select * from t2;
2132 +Got one of the listed errors
2133 +SELECT * FROM t1 ORDER BY id;
2145 +SHOW BINLOG EVENTS LIMIT 2,1;
2146 +Log_name Pos Event_type Server_id End_log_pos Info
2147 +master-bin.000001 175 Query 1 269 use `test`; insert into t1 select * from t2
2151 +insert into t1 select * from t2;
2154 +Got one of the listed errors
2155 +SELECT * FROM t1 ORDER BY id;
2157 +SHOW BINLOG EVENTS LIMIT 2,1;
2158 +Log_name Pos Event_type Server_id End_log_pos Info
2162 +DROP PROCEDURE setcrash;
2164 +++ b/mysql-test/r/xa_binlog.result
2166 +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB;
2167 +SET binlog_format= mixed;
2170 +INSERT INTO t1 VALUES (1);
2172 +XA PREPARE 'xatest';
2173 +XA COMMIT 'xatest';
2175 +INSERT INTO t1 VALUES (2);
2177 +XA COMMIT 'xatest' ONE PHASE;
2179 +INSERT INTO t1 VALUES (3);
2181 +SELECT * FROM t1 ORDER BY a;
2186 +SHOW BINLOG EVENTS LIMIT 1,9;
2187 +Log_name Pos Event_type Server_id End_log_pos Info
2188 +master-bin.000001 # Query 1 # BEGIN
2189 +master-bin.000001 # Query 1 # use `test`; INSERT INTO t1 VALUES (1)
2190 +master-bin.000001 # Query 1 # COMMIT
2191 +master-bin.000001 # Query 1 # BEGIN
2192 +master-bin.000001 # Query 1 # use `test`; INSERT INTO t1 VALUES (2)
2193 +master-bin.000001 # Xid 1 # COMMIT /* xid=XX */
2194 +master-bin.000001 # Query 1 # BEGIN
2195 +master-bin.000001 # Query 1 # use `test`; INSERT INTO t1 VALUES (3)
2196 +master-bin.000001 # Xid 1 # COMMIT /* xid=XX */
2199 +++ b/mysql-test/suite/binlog/r/binlog_ioerr.result
2201 +CALL mtr.add_suppression("Error writing file 'master-bin'");
2203 +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2204 +INSERT INTO t1 VALUES(0);
2205 +SET SESSION debug='+d,fail_binlog_write_1';
2206 +INSERT INTO t1 VALUES(1);
2207 +ERROR HY000: Error writing file 'master-bin' (errno: 28)
2208 +INSERT INTO t1 VALUES(2);
2209 +ERROR HY000: Error writing file 'master-bin' (errno: 28)
2210 +SET SESSION debug='';
2211 +INSERT INTO t1 VALUES(3);
2216 +SHOW BINLOG EVENTS;
2217 +Log_name Pos Event_type Server_id End_log_pos Info
2218 +BINLOG POS Format_desc 1 ENDPOS Server ver: #, Binlog ver: #
2219 +BINLOG POS Query 1 ENDPOS use `test`; CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb
2220 +BINLOG POS Query 1 ENDPOS BEGIN
2221 +BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(0)
2222 +BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
2223 +BINLOG POS Query 1 ENDPOS BEGIN
2224 +BINLOG POS Query 1 ENDPOS BEGIN
2225 +BINLOG POS Query 1 ENDPOS BEGIN
2226 +BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(3)
2227 +BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
2230 +++ b/mysql-test/suite/binlog/t/binlog_ioerr.test
2232 +source include/have_debug.inc;
2233 +source include/have_innodb.inc;
2234 +source include/have_log_bin.inc;
2235 +source include/have_binlog_format_mixed_or_statement.inc;
2237 +CALL mtr.add_suppression("Error writing file 'master-bin'");
2241 +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2242 +INSERT INTO t1 VALUES(0);
2243 +SET SESSION debug='+d,fail_binlog_write_1';
2244 +--error ER_ERROR_ON_WRITE
2245 +INSERT INTO t1 VALUES(1);
2246 +--error ER_ERROR_ON_WRITE
2247 +INSERT INTO t1 VALUES(2);
2248 +SET SESSION debug='';
2249 +INSERT INTO t1 VALUES(3);
2252 +# Actually the output from this currently shows a bug.
2253 +# The injected IO error leaves partially written transactions in the binlog in
2254 +# the form of stray "BEGIN" events.
2255 +# These should disappear from the output if binlog error handling is improved
2256 +# (see MySQL Bug#37148 and WL#1790).
2257 +--replace_regex /\/\* xid=.* \*\//\/* XID *\// /Server ver: .*, Binlog ver: .*/Server ver: #, Binlog ver: #/ /table_id: [0-9]+/table_id: #/
2258 +--replace_column 1 BINLOG 2 POS 5 ENDPOS
2259 +SHOW BINLOG EVENTS;
2263 +++ b/mysql-test/t/group_commit.test
2265 +--source include/have_debug_sync.inc
2266 +--source include/have_innodb.inc
2267 +--source include/have_log_bin.inc
2269 +# Test some group commit code paths by using debug_sync to do controlled
2270 +# commits of 6 transactions: first 1 alone, then 3 as a group, then 2 as a
2273 +# Group 3 is allowed to race as far as possible ahead before group 2 finishes
2274 +# to check some edge case for concurrency control.
2276 +CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
2278 +SELECT variable_value INTO @commits FROM information_schema.global_status
2279 + WHERE variable_name = 'binlog_commits';
2280 +SELECT variable_value INTO @group_commits FROM information_schema.global_status
2281 + WHERE variable_name = 'binlog_group_commits';
2283 +connect(con1,localhost,root,,);
2284 +connect(con2,localhost,root,,);
2285 +connect(con3,localhost,root,,);
2286 +connect(con4,localhost,root,,);
2287 +connect(con5,localhost,root,,);
2288 +connect(con6,localhost,root,,);
2290 +# Start group1 (with one thread) doing commit, waiting for
2291 +# group2 to queue up before finishing.
2294 +SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group1_running WAIT_FOR group2_queued";
2295 +send INSERT INTO t1 VALUES ("con1");
2297 +# Make group2 (with three threads) queue up.
2298 +# Make sure con2 is the group commit leader for group2.
2299 +# Make group2 wait with running commit_ordered() until group3 has committed.
2302 +set DEBUG_SYNC= "now WAIT_FOR group1_running";
2303 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con2";
2304 +SET DEBUG_SYNC= "commit_after_release_LOCK_log WAIT_FOR group3_committed";
2305 +SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
2306 +send INSERT INTO t1 VALUES ("con2");
2308 +SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
2309 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con3";
2310 +send INSERT INTO t1 VALUES ("con3");
2312 +SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
2313 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con4";
2314 +send INSERT INTO t1 VALUES ("con4");
2316 +# When group2 is queued, let group1 continue and queue group3.
2318 +connection default;
2319 +SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
2321 +# At this point, transaction 1 is still not visible as commit_ordered() has not
2323 +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
2324 +SELECT * FROM t1 ORDER BY a;
2326 +SET DEBUG_SYNC= "now SIGNAL group2_queued";
2330 +# Now transaction 1 is visible.
2331 +connection default;
2332 +SELECT * FROM t1 ORDER BY a;
2335 +SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
2336 +SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con5_leader WAIT_FOR con6_queued";
2337 +send INSERT INTO t1 VALUES ("con5");
2340 +SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
2341 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con6_queued";
2342 +send INSERT INTO t1 VALUES ("con6");
2344 +connection default;
2345 +SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
2346 +# Still only transaction 1 visible, as group2 has not yet run commit_ordered().
2347 +SELECT * FROM t1 ORDER BY a;
2348 +SET DEBUG_SYNC= "now SIGNAL group3_committed";
2349 +SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
2350 +# Now transactions 1-4 visible.
2351 +SELECT * FROM t1 ORDER BY a;
2352 +SET DEBUG_SYNC= "now SIGNAL group2_checked";
2369 +connection default;
2370 +# Check all transactions finally visible.
2371 +SELECT * FROM t1 ORDER BY a;
2373 +SELECT variable_value - @commits FROM information_schema.global_status
2374 + WHERE variable_name = 'binlog_commits';
2375 +SELECT variable_value - @group_commits FROM information_schema.global_status
2376 + WHERE variable_name = 'binlog_group_commits';
2378 +SET DEBUG_SYNC= 'RESET';
2381 +++ b/mysql-test/t/group_commit_binlog_pos-master.opt
2383 +--skip-stack-trace --skip-core-file
2385 +++ b/mysql-test/t/group_commit_binlog_pos.test
2387 +--source include/have_debug_sync.inc
2388 +--source include/have_innodb.inc
2389 +--source include/have_log_bin.inc
2390 +--source include/have_binlog_format_mixed_or_statement.inc
2392 +# Need DBUG to crash the server intentionally
2393 +--source include/have_debug.inc
2394 +# Don't test this under valgrind, memory leaks will occur as we crash
2395 +--source include/not_valgrind.inc
2397 +# The test case currently uses grep and tail, which may be unavailable on
2398 +# some Windows systems. But see MWL#191 for how to remove the need for grep.
2399 +--source include/not_windows.inc
2401 +# XtraDB stores the binlog position corresponding to the last commit, and
2402 +# prints it during crash recovery.
2403 +# Test that we get the correct position when we group commit several
2404 +# transactions together.
2406 +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2407 +INSERT INTO t1 VALUES (0);
2409 +connect(con1,localhost,root,,);
2410 +connect(con2,localhost,root,,);
2411 +connect(con3,localhost,root,,);
2413 +# Queue up three commits for group commit.
2416 +SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con1_waiting WAIT_FOR con3_queued";
2417 +SET DEBUG_SYNC= "commit_loop_entry_commit_ordered SIGNAL con1_loop WAIT_FOR con1_loop_cont EXECUTE 3";
2418 +send INSERT INTO t1 VALUES (1);
2421 +SET DEBUG_SYNC= "now WAIT_FOR con1_waiting";
2422 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con2_queued";
2423 +send INSERT INTO t1 VALUES (2);
2426 +SET DEBUG_SYNC= "now WAIT_FOR con2_queued";
2427 +SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con3_queued";
2428 +send INSERT INTO t1 VALUES (3);
2430 +connection default;
2431 +SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2432 +# At this point, no transactions are committed.
2433 +SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2434 +SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2435 +# At this point, 1 transaction is committed.
2436 +SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2437 +SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2439 +# At this point, 2 transactions are committed.
2440 +SELECT * FROM t1 ORDER BY a;
2445 +# Now crash the server with 1+2 in-memory committed, 3 only prepared.
2446 +connection default;
2447 +system echo wait-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
2448 +SET SESSION debug="+d,crash_dispatch_command_before";
2459 +system echo restart-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
2461 +connection default;
2463 +--source include/wait_until_connected_again.inc
2465 +# Crash recovery should recover all three transactions.
2466 +SELECT * FROM t1 ORDER BY a;
2468 +# Check that the binlog position reported by InnoDB is the correct one
2469 +# for the end of the second transaction (as can be checked with
2471 +let $MYSQLD_DATADIR= `SELECT @@datadir`;
2472 +--exec grep 'InnoDB: Last MySQL binlog file position' $MYSQLD_DATADIR/../../log/mysqld.1.err | tail -1
2474 +SET DEBUG_SYNC= 'RESET';
2477 +++ b/mysql-test/t/group_commit_crash-master.opt
2479 +--skip-stack-trace --skip-core-file
2481 +++ b/mysql-test/t/group_commit_crash.test
2483 +# Testing group commit by crashing a few times.
2484 +# Test adapted from the Facebook patch: lp:mysqlatfacebook
2485 +--source include/not_embedded.inc
2486 +# Don't test this under valgrind, memory leaks will occur
2487 +--source include/not_valgrind.inc
2489 +# Binary must be compiled with debug for crash to occur
2490 +--source include/have_debug.inc
2491 +--source include/have_innodb.inc
2492 +--source include/have_log_bin.inc
2494 +let $innodb_file_format_max_orig=`select @@innodb_file_format_max`;
2495 +CREATE TABLE t1(a CHAR(255),
2499 + id INT AUTO_INCREMENT,
2500 + PRIMARY KEY(id)) ENGINE=InnoDB;
2501 +create table t2 like t1;
2503 +create procedure setcrash(IN i INT)
2506 + WHEN 1 THEN SET SESSION debug="d,crash_commit_after_prepare";
2507 + WHEN 2 THEN SET SESSION debug="d,crash_commit_after_log";
2508 + WHEN 3 THEN SET SESSION debug="d,crash_commit_before_unlog";
2509 + WHEN 4 THEN SET SESSION debug="d,crash_commit_after";
2510 + WHEN 5 THEN SET SESSION debug="d,crash_commit_before";
2515 +# Avoid getting a crashed mysql.proc table.
2520 +let $numinserts = 10;
2521 +while ($numinserts)
2524 + INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2533 + START TRANSACTION;
2534 + insert into t1 select * from t2;
2535 + # Write file to make mysql-test-run.pl expect crash
2536 + --exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
2538 + eval call setcrash($numtests);
2540 + # Run the crashing query
2544 + # Poll the server waiting for it to be back online again.
2545 + --source include/wait_until_connected_again.inc
2547 + # table and binlog should be in sync.
2548 + SELECT * FROM t1 ORDER BY id;
2549 + SHOW BINLOG EVENTS LIMIT 2,1;
2559 +DROP PROCEDURE setcrash;
2560 +--disable_query_log
2561 +eval SET GLOBAL innodb_file_format_max=$innodb_file_format_max_orig;
2564 +++ b/mysql-test/t/xa_binlog.test
2566 +--source include/have_innodb.inc
2567 +--source include/have_log_bin.inc
2569 +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB;
2571 +# Fix binlog format (otherwise SHOW BINLOG EVENTS will fluctuate).
2572 +SET binlog_format= mixed;
2577 +INSERT INTO t1 VALUES (1);
2579 +XA PREPARE 'xatest';
2580 +XA COMMIT 'xatest';
2583 +INSERT INTO t1 VALUES (2);
2585 +XA COMMIT 'xatest' ONE PHASE;
2588 +INSERT INTO t1 VALUES (3);
2591 +SELECT * FROM t1 ORDER BY a;
2593 +--replace_column 2 # 5 #
2594 +--replace_regex /xid=[0-9]+/xid=XX/
2595 +SHOW BINLOG EVENTS LIMIT 1,9;