]> git.pld-linux.org Git - packages/mysql.git/blame - group_commit.patch
- up to 5.5.18
[packages/mysql.git] / group_commit.patch
CommitLineData
13ceb006
AM
1--- a/include/my_sys.h
2+++ b/include/my_sys.h
3@@ -524,6 +524,8 @@
4
5 #define my_b_tell(info) ((info)->pos_in_file + \
6 (size_t) (*(info)->current_pos - (info)->request_pos))
7+#define my_b_write_tell(info) ((info)->pos_in_file + \
8+ ((info)->write_pos - (info)->write_buffer))
9
10 #define my_b_get_buffer_start(info) (info)->request_pos
11 #define my_b_get_bytes_in_buffer(info) (char*) (info)->read_end - \
12--- a/include/mysql/plugin.h
13+++ b/include/mysql/plugin.h
14@@ -559,6 +559,8 @@
15
16 #define EXTENDED_FOR_USERSTAT
17
18+#define EXTENDED_FOR_COMMIT_ORDERED
19+
20 /**
21 Create a temporary file.
22
23--- a/sql/handler.cc
24+++ b/sql/handler.cc
25@@ -90,6 +90,8 @@
26 static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
27 uint known_extensions_id= 0;
28
29+static int commit_one_phase_low(THD *thd, bool all, THD_TRANS *trans,
30+ bool is_real_trans);
31
32
33 static plugin_ref ha_default_plugin(THD *thd)
34@@ -1119,7 +1121,8 @@
35 */
36 bool is_real_trans= all || thd->transaction.all.ha_list == 0;
37 Ha_trx_info *ha_info= trans->ha_list;
38- my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
39+ bool need_commit_ordered;
40+ my_xid xid;
41 DBUG_ENTER("ha_commit_trans");
42
43 /*
44@@ -1152,13 +1155,20 @@
45 DBUG_RETURN(2);
46 }
47
48- if (ha_info)
49+ if (!ha_info)
50+ {
51+ /* Free resources and perform other cleanup even for 'empty' transactions. */
52+ if (is_real_trans)
53+ thd->transaction.cleanup();
54+ DBUG_RETURN(0);
55+ }
56+ else
57 {
58 uint rw_ha_count;
59 bool rw_trans;
60 MDL_request mdl_request;
61
62- DBUG_EXECUTE_IF("crash_commit_before", DBUG_SUICIDE(););
63+ DBUG_EXECUTE_IF("crash_commit_before", abort(););
64
65 /* Close all cursors that can not survive COMMIT */
66 if (is_real_trans) /* not a statement commit */
67@@ -1197,57 +1207,80 @@
68 !thd->slave_thread)
69 {
70 my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
71- ha_rollback_trans(thd, all);
72- error= 1;
73- goto end;
74+ goto err;
75 }
76
77- if (!trans->no_2pc && (rw_ha_count > 1))
78+ if (trans->no_2pc || (rw_ha_count <= 1))
79 {
80- for (; ha_info && !error; ha_info= ha_info->next())
81+ error= ha_commit_one_phase(thd, all);
82+ DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
83+ goto end;
84+ }
85+
86+ need_commit_ordered= FALSE;
87+ xid= thd->transaction.xid_state.xid.get_my_xid();
88+
89+ for (Ha_trx_info *hi= ha_info; hi; hi= hi->next())
90 {
91 int err;
92- handlerton *ht= ha_info->ht();
93+ handlerton *ht= hi->ht();
94 /*
95 Do not call two-phase commit if this particular
96 transaction is read-only. This allows for simpler
97 implementation in engines that are always read-only.
98 */
99- if (! ha_info->is_trx_read_write())
100+ if (! hi->is_trx_read_write())
101 continue;
102 /*
103 Sic: we know that prepare() is not NULL since otherwise
104 trans->no_2pc would have been set.
105 */
106- if ((err= ht->prepare(ht, thd, all)))
107- {
108- my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
109- error= 1;
110- }
111+ err= ht->prepare(ht, thd, all);
112 status_var_increment(thd->status_var.ha_prepare_count);
113+ if (err)
114+ my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
115+
116+ if (err)
117+ goto err;
118+
119+ need_commit_ordered|= (ht->commit_ordered != NULL);
120 }
121- DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_SUICIDE(););
122- if (error || (is_real_trans && xid &&
123- (error= !(cookie= tc_log->log_xid(thd, xid)))))
124+ DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT(););
125+
126+ if (!is_real_trans)
127 {
128- ha_rollback_trans(thd, all);
129- error= 1;
130+ error= commit_one_phase_low(thd, all, trans, is_real_trans);
131+ DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
132 goto end;
133 }
134- DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_SUICIDE(););
135- }
136- error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
137- DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_SUICIDE(););
138- if (cookie)
139+
140+ cookie= tc_log->log_and_order(thd, xid, all, need_commit_ordered);
141+ if (!cookie)
142+ goto err;
143+
144+ DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT(););
145+
146+ error= commit_one_phase_low(thd, all, trans, is_real_trans) ? 2 : 0;
147+ DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
148+ if (is_real_trans) /* userstat.patch */
149+ thd->diff_commit_trans++; /* userstat.patch */
150+ RUN_HOOK(transaction, after_commit, (thd, FALSE));
151+
152+ DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT(););
153 if(tc_log->unlog(cookie, xid))
154 {
155 error= 2;
156 goto end;
157 }
158- DBUG_EXECUTE_IF("crash_commit_after", DBUG_SUICIDE(););
159- if (is_real_trans)
160- thd->diff_commit_trans++;
161- RUN_HOOK(transaction, after_commit, (thd, FALSE));
162+
163+ DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
164+ goto end;
165+
166+ /* Come here if error and we need to rollback. */
167+err:
168+ error= 1; /* Transaction was rolled back */
169+ ha_rollback_trans(thd, all);
170+
171 end:
172 if (rw_trans && mdl_request.ticket)
173 {
174@@ -1260,9 +1293,6 @@
175 thd->mdl_context.release_lock(mdl_request.ticket);
176 }
177 }
178- /* Free resources and perform other cleanup even for 'empty' transactions. */
179- else if (is_real_trans)
180- thd->transaction.cleanup();
181 DBUG_RETURN(error);
182 }
183
184@@ -1279,7 +1309,6 @@
185
186 int ha_commit_one_phase(THD *thd, bool all)
187 {
188- int error=0;
189 THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
190 /*
191 "real" is a nick name for a transaction for which a commit will
192@@ -1295,8 +1324,16 @@
193 transaction.all.ha_list, see why in trans_register_ha()).
194 */
195 bool is_real_trans=all || thd->transaction.all.ha_list == 0;
196- Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
197 DBUG_ENTER("ha_commit_one_phase");
198+ DBUG_RETURN(commit_one_phase_low(thd, all, trans, is_real_trans));
199+}
200+
201+static int
202+commit_one_phase_low(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans)
203+{
204+ int error= 0;
205+ Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
206+ DBUG_ENTER("commit_one_phase_low");
207
208 if (ha_info)
209 {
210@@ -1894,7 +1931,16 @@
211 {
212 bool warn= true;
213
214+ /*
215+ Holding the LOCK_commit_ordered mutex ensures that we get the same
216+ snapshot for all engines (including the binary log). This allows us
217+ among other things to do backups with
218+ START TRANSACTION WITH CONSISTENT SNAPSHOT and
219+ have a consistent binlog position.
220+ */
221+ mysql_mutex_lock(&LOCK_commit_ordered);
222 plugin_foreach(thd, snapshot_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &warn);
223+ mysql_mutex_unlock(&LOCK_commit_ordered);
224
225 /*
226 Same idea as when one wants to CREATE TABLE in one engine which does not
227--- a/sql/handler.h
228+++ b/sql/handler.h
229@@ -756,6 +756,53 @@
230 and 'real commit' mean the same event.
231 */
232 int (*commit)(handlerton *hton, THD *thd, bool all);
233+ /*
234+ The commit_ordered() method is called prior to the commit() method, after
235+ the transaction manager has decided to commit (not rollback) the
236+ transaction. Unlike commit(), commit_ordered() is called only when the
237+ full transaction is committed, not for each commit of statement
238+ transaction in a multi-statement transaction.
239+
240+ Note that, like prepare(), commit_ordered() is only called when 2-phase
241+ commit takes place. I.e. when no binary log and only a single engine
242+ participates in a transaction, one commit() is called, no
243+ commit_ordered(). So engines must be prepared for this.
244+
245+ The calls to commit_ordered() in multiple parallel transactions are
246+ guaranteed to happen in the same order in every participating
247+ handler. This can be used to ensure the same commit order among multiple
248+ handlers (eg. in table handler and binlog). So if transaction T1 calls
249+ into commit_ordered() of handler A before T2, then T1 will also call
250+ commit_ordered() of handler B before T2.
251+
252+ Engines that implement this method should during this call make the
253+ transaction visible to other transactions, thereby making the order of
254+ transaction commits be defined by the order of commit_ordered() calls.
255+
256+ The intention is that commit_ordered() should do the minimal amount of
257+ work that needs to happen in consistent commit order among handlers. To
258+ preserve ordering, calls need to be serialised on a global mutex, so
259+ doing any time-consuming or blocking operations in commit_ordered() will
260+ limit scalability.
261+
262+ Handlers can rely on commit_ordered() calls to be serialised (no two
263+ calls can run in parallel, so no extra locking on the handler part is
264+ required to ensure this).
265+
266+ Note that commit_ordered() can be called from a different thread than the
267+ one handling the transaction! So it can not do anything that depends on
268+ thread local storage, in particular it can not call my_error() and
269+ friends (instead it can store the error code and delay the call of
270+ my_error() to the commit() method).
271+
272+ Similarly, since commit_ordered() returns void, any return error code
273+ must be saved and returned from the commit() method instead.
274+
275+ The commit_ordered method is optional, and can be left unset if not
276+ needed in a particular handler (then there will be no ordering guarantees
277+ wrt. other engines and binary log).
278+ */
279+ void (*commit_ordered)(handlerton *hton, THD *thd, bool all);
280 int (*rollback)(handlerton *hton, THD *thd, bool all);
281 int (*prepare)(handlerton *hton, THD *thd, bool all);
282 int (*recover)(handlerton *hton, XID *xid_list, uint len);
283--- a/sql/log.cc
284+++ b/sql/log.cc
285@@ -49,6 +49,7 @@
286
287 #include "sql_plugin.h"
288 #include "rpl_handler.h"
289+#include "debug_sync.h"
290
291 /* max size of the log message */
292 #define MAX_LOG_BUFFER_SIZE 1024
293@@ -71,6 +72,25 @@
294 static int binlog_rollback(handlerton *hton, THD *thd, bool all);
295 static int binlog_prepare(handlerton *hton, THD *thd, bool all);
296
297+static LEX_STRING const write_error_msg=
298+ { C_STRING_WITH_LEN("error writing to the binary log") };
299+
300+static my_bool mutexes_inited;
301+mysql_mutex_t LOCK_group_commit_queue;
302+mysql_mutex_t LOCK_commit_ordered;
303+
304+static ulonglong binlog_status_var_num_commits;
305+static ulonglong binlog_status_var_num_group_commits;
306+
307+static SHOW_VAR binlog_status_vars_detail[]=
308+{
309+ {"commits",
310+ (char *)&binlog_status_var_num_commits, SHOW_LONGLONG},
311+ {"group_commits",
312+ (char *)&binlog_status_var_num_group_commits, SHOW_LONGLONG},
313+ {NullS, NullS, SHOW_LONG}
314+};
315+
316 /**
317 purge logs, master and slave sides both, related error code
318 convertor.
319@@ -167,41 +187,6 @@
320 }
321
322 /*
323- Helper class to hold a mutex for the duration of the
324- block.
325-
326- Eliminates the need for explicit unlocking of mutexes on, e.g.,
327- error returns. On passing a null pointer, the sentry will not do
328- anything.
329- */
330-class Mutex_sentry
331-{
332-public:
333- Mutex_sentry(mysql_mutex_t *mutex)
334- : m_mutex(mutex)
335- {
336- if (m_mutex)
337- mysql_mutex_lock(mutex);
338- }
339-
340- ~Mutex_sentry()
341- {
342- if (m_mutex)
343- mysql_mutex_unlock(m_mutex);
344-#ifndef DBUG_OFF
345- m_mutex= 0;
346-#endif
347- }
348-
349-private:
350- mysql_mutex_t *m_mutex;
351-
352- // It's not allowed to copy this object in any way
353- Mutex_sentry(Mutex_sentry const&);
354- void operator=(Mutex_sentry const&);
355-};
356-
357-/*
358 Helper classes to store non-transactional and transactional data
359 before copying it to the binary log.
360 */
361@@ -211,7 +196,8 @@
362 binlog_cache_data(): m_pending(0), before_stmt_pos(MY_OFF_T_UNDEF),
363 incident(FALSE), changes_to_non_trans_temp_table_flag(FALSE),
364 saved_max_binlog_cache_size(0), ptr_binlog_cache_use(0),
365- ptr_binlog_cache_disk_use(0)
366+ ptr_binlog_cache_disk_use(0), commit_bin_log_file_pos(0),
367+ using_xa(FALSE), xa_xid(0)
368 { }
369
370 ~binlog_cache_data()
371@@ -270,6 +256,8 @@
372 variable after truncating the cache.
373 */
374 cache_log.disk_writes= 0;
375+ using_xa= FALSE;
376+ commit_bin_log_file_pos= 0;
377 DBUG_ASSERT(empty());
378 }
379
380@@ -411,6 +399,20 @@
381
382 binlog_cache_data& operator=(const binlog_cache_data& info);
383 binlog_cache_data(const binlog_cache_data& info);
384+
385+public:
386+ /*
387+ Binlog position after current commit, available to storage engines during
388+ commit_ordered() and commit().
389+ */
390+ ulonglong commit_bin_log_file_pos;
391+
392+ /*
393+ Flag set true if this transaction is committed with log_xid() as part of
394+ XA, false if not.
395+ */
396+ bool using_xa;
397+ my_xid xa_xid;
398 };
399
400 class binlog_cache_mngr {
401@@ -1627,7 +1629,7 @@
402 */
403 static inline int
404 binlog_flush_cache(THD *thd, binlog_cache_data* cache_data, Log_event *end_evt,
405- bool is_transactional)
406+ bool is_transactional, bool all)
407 {
408 DBUG_ENTER("binlog_flush_cache");
409 int error= 0;
410@@ -1646,8 +1648,8 @@
411 were, we would have to ensure that we're not ending a statement
412 inside a stored function.
413 */
414- error= mysql_bin_log.write(thd, &cache_data->cache_log, end_evt,
415- cache_data->has_incident());
416+ error= mysql_bin_log.write_transaction_to_binlog(thd, cache_data,
417+ end_evt, all);
418 }
419 cache_data->reset();
420
421@@ -1666,12 +1668,12 @@
422 */
423 static inline int
424 binlog_commit_flush_stmt_cache(THD *thd,
425- binlog_cache_mngr *cache_mngr)
426+ binlog_cache_mngr *cache_mngr, bool all)
427 {
428 Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
429 FALSE, FALSE, TRUE, 0);
430 return (binlog_flush_cache(thd, &cache_mngr->stmt_cache, &end_evt,
431- FALSE));
432+ FALSE, all));
433 }
434
435 /**
436@@ -1684,12 +1686,12 @@
437 nonzero if an error pops up when flushing the cache.
438 */
439 static inline int
440-binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr)
441+binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr, bool all)
442 {
443 Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
444 TRUE, FALSE, TRUE, 0);
445 return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
446- TRUE));
447+ TRUE, all));
448 }
449
450 /**
451@@ -1702,12 +1704,12 @@
452 nonzero if an error pops up when flushing the cache.
453 */
454 static inline int
455-binlog_rollback_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr)
456+binlog_rollback_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr, bool all)
457 {
458 Query_log_event end_evt(thd, STRING_WITH_LEN("ROLLBACK"),
459 TRUE, FALSE, TRUE, 0);
460 return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
461- TRUE));
462+ TRUE, all));
463 }
464
465 /**
466@@ -1722,11 +1724,11 @@
467 */
468 static inline int
469 binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr,
470- my_xid xid)
471+ my_xid xid, bool all)
472 {
473 Xid_log_event end_evt(thd, xid);
474 return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
475- TRUE));
476+ TRUE, all));
477 }
478
479 /**
480@@ -1788,7 +1790,7 @@
481 do nothing.
482 just pretend we can do 2pc, so that MySQL won't
483 switch to 1pc.
484- real work will be done in MYSQL_BIN_LOG::log_xid()
485+ real work will be done in MYSQL_BIN_LOG::log_and_order()
486 */
487 return 0;
488 }
489@@ -1821,7 +1823,7 @@
490
491 if (!cache_mngr->stmt_cache.empty())
492 {
493- error= binlog_commit_flush_stmt_cache(thd, cache_mngr);
494+ error= binlog_commit_flush_stmt_cache(thd, cache_mngr, all);
495 }
496
497 if (cache_mngr->trx_cache.empty())
498@@ -1840,7 +1842,7 @@
499 Otherwise, we accumulate the changes.
500 */
501 if (!error && ending_trans(thd, all))
502- error= binlog_commit_flush_trx_cache(thd, cache_mngr);
503+ error= binlog_commit_flush_trx_cache(thd, cache_mngr, all);
504
505 /*
506 This is part of the stmt rollback.
507@@ -1884,7 +1886,7 @@
508 }
509 else if (!cache_mngr->stmt_cache.empty())
510 {
511- error= binlog_commit_flush_stmt_cache(thd, cache_mngr);
512+ error= binlog_commit_flush_stmt_cache(thd, cache_mngr, all);
513 }
514
515 if (cache_mngr->trx_cache.empty())
516@@ -1932,7 +1934,7 @@
517 (trans_has_updated_non_trans_table(thd) &&
518 ending_single_stmt_trans(thd,all) &&
519 thd->variables.binlog_format == BINLOG_FORMAT_MIXED)))
520- error= binlog_rollback_flush_trx_cache(thd, cache_mngr);
521+ error= binlog_rollback_flush_trx_cache(thd, cache_mngr, all);
522 /*
523 Truncate the cache if:
524 . aborting a single or multi-statement transaction or;
525@@ -2907,6 +2909,7 @@
526 MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period)
527 :bytes_written(0), prepared_xids(0), file_id(1), open_count(1),
528 need_start_event(TRUE),
529+ group_commit_queue(0), num_commits(0), num_group_commits(0),
530 sync_period_ptr(sync_period),
531 is_relay_log(0), signal_cnt(0),
532 description_event_for_exec(0), description_event_for_queue(0)
533@@ -5279,19 +5282,15 @@
534 SYNOPSIS
535 write_cache()
536 cache Cache to write to the binary log
537- lock_log True if the LOCK_log mutex should be aquired, false otherwise
538- sync_log True if the log should be flushed and synced
539
540 DESCRIPTION
541 Write the contents of the cache to the binary log. The cache will
542 be reset as a READ_CACHE to be able to read the contents from it.
543 */
544
545-int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache,
546- bool lock_log, bool sync_log)
547+int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache)
548 {
549- Mutex_sentry sentry(lock_log ? &LOCK_log : NULL);
550-
551+ mysql_mutex_assert_owner(&LOCK_log);
552 if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
553 return ER_ERROR_ON_WRITE;
554 uint length= my_b_bytes_in_cache(cache), group, carry, hdr_offs;
555@@ -5402,6 +5401,8 @@
556 }
557
558 /* Write data to the binary log file */
559+ DBUG_EXECUTE_IF("fail_binlog_write_1",
560+ errno= 28; return ER_ERROR_ON_WRITE;);
561 if (my_b_write(&log_file, cache->read_pos, length))
562 return ER_ERROR_ON_WRITE;
563 thd->binlog_bytes_written+= length;
564@@ -5410,9 +5411,6 @@
565
566 DBUG_ASSERT(carry == 0);
567
568- if (sync_log)
569- return flush_and_sync(0);
570-
571 return 0; // All OK
572 }
573
574@@ -5453,8 +5451,6 @@
575 if (!is_open())
576 DBUG_RETURN(error);
577
578- LEX_STRING const write_error_msg=
579- { C_STRING_WITH_LEN("error writing to the binary log") };
580 Incident incident= INCIDENT_LOST_EVENTS;
581 Incident_log_event ev(thd, incident, write_error_msg);
582 if (lock)
583@@ -5496,104 +5492,320 @@
584 'cache' needs to be reinitialized after this functions returns.
585 */
586
587-bool MYSQL_BIN_LOG::write(THD *thd, IO_CACHE *cache, Log_event *commit_event,
588- bool incident)
589+bool
590+MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd, binlog_cache_data *cache_data,
591+ Log_event *end_ev, bool all)
592 {
593- DBUG_ENTER("MYSQL_BIN_LOG::write(THD *, IO_CACHE *, Log_event *)");
594+ group_commit_entry entry;
595+ bool ret;
596+ DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_to_binlog");
597+
598+ entry.thd= thd;
599+ entry.cache_data= cache_data;
600+ entry.error= 0;
601+ entry.all= all;
602+
603+ /*
604+ Log "BEGIN" at the beginning of every transaction. Here, a transaction is
605+ either a BEGIN..COMMIT block or a single statement in autocommit mode.
606+
607+ Create the necessary events here, where we have the correct THD (and
608+ thread context).
609+
610+ Due to group commit the actual writing to binlog may happen in a different
611+ thread.
612+ */
613+ Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, FALSE, TRUE, 0);
614+ entry.begin_event= &qinfo;
615+ entry.end_event= end_ev;
616+ if (cache_data->has_incident())
617+ {
618+ Incident_log_event inc_ev(thd, INCIDENT_LOST_EVENTS, write_error_msg);
619+ entry.incident_event= &inc_ev;
620+ ret = write_transaction_to_binlog_events(&entry);
621+ }
622+ else
623+ {
624+ entry.incident_event= NULL;
625+ ret = write_transaction_to_binlog_events(&entry);
626+ }
627+ if (!ret) /* userstat.patch */
628+ thd->binlog_bytes_written += qinfo.data_written; /* userstat.patch */
629+ DBUG_RETURN(ret);
630+}
631+
632+bool
633+MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
634+{
635+ /*
636+ To facilitate group commit for the binlog, we first queue up ourselves in
637+ the group commit queue. Then the first thread to enter the queue waits for
638+ the LOCK_log mutex, and commits for everyone in the queue once it gets the
639+ lock. Any other threads in the queue just wait for the first one to finish
640+ the commit and wake them up.
641+ */
642+ entry->thd->clear_wakeup_ready();
643+ mysql_mutex_lock(&LOCK_group_commit_queue);
644+ group_commit_entry *orig_queue= group_commit_queue;
645+ entry->next= orig_queue;
646+ group_commit_queue= entry;
647+ DEBUG_SYNC(entry->thd, "commit_group_commit_queue");
648+ mysql_mutex_unlock(&LOCK_group_commit_queue);
649+
650+ /*
651+ The first in the queue handles group commit for all; the others just wait
652+ to be signalled when group commit is done.
653+ */
654+ if (orig_queue != NULL)
655+ entry->thd->wait_for_wakeup_ready();
656+ else
657+ trx_group_commit_leader(entry);
658+
659+ if (likely(!entry->error))
660+ return 0;
661+
662+ switch (entry->error)
663+ {
664+ case ER_ERROR_ON_WRITE:
665+ my_error(ER_ERROR_ON_WRITE, MYF(ME_NOREFRESH), name, entry->commit_errno);
666+ break;
667+ case ER_ERROR_ON_READ:
668+ my_error(ER_ERROR_ON_READ, MYF(ME_NOREFRESH),
669+ entry->cache_data->cache_log.file_name, entry->commit_errno);
670+ break;
671+ default:
672+ /*
673+ There are not (and should not be) any errors thrown not covered above.
674+ But just in case one is added later without updating the above switch
675+ statement, include a catch-all.
676+ */
677+ my_printf_error(entry->error,
678+ "Error writing transaction to binary log: %d",
679+ MYF(ME_NOREFRESH), entry->error);
680+ }
681+
682+ /*
683+ Since we return error, this transaction XID will not be committed, so
684+ we need to mark it as not needed for recovery (unlog() is not called
685+ for a transaction if log_xid() fails).
686+ */
687+ if (entry->cache_data->using_xa && entry->cache_data->xa_xid)
688+ mark_xid_done();
689+
690+ return 1;
691+}
692+
693+/*
694+ Do binlog group commit as the lead thread.
695+
696+ This must be called when this thread/transaction is queued at the start of
697+ the group_commit_queue. It will wait to obtain the LOCK_log mutex, then group
698+ commit all the transactions in the queue (more may have entered while waiting
699+ for LOCK_log). After commit is done, all other threads in the queue will be
700+ signalled.
701+
702+ */
703+void
704+MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
705+{
706+ DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader");
707+ uint xid_count= 0;
708+ uint write_count= 0;
709+
710+ /*
711+ Lock the LOCK_log mutex, and once we get it, collect any additional writes
712+ that queued up while we were waiting.
713+ */
714 mysql_mutex_lock(&LOCK_log);
715+ DEBUG_SYNC(leader->thd, "commit_after_get_LOCK_log");
716+ mysql_mutex_lock(&LOCK_group_commit_queue);
717+ group_commit_entry *current= group_commit_queue;
718+ group_commit_queue= NULL;
719+ mysql_mutex_unlock(&LOCK_group_commit_queue);
720+
721+ /* As the queue is in reverse order of entering, reverse it. */
722+ group_commit_entry *queue= NULL;
723+ while (current)
724+ {
725+ group_commit_entry *next= current->next;
726+ current->next= queue;
727+ queue= current;
728+ current= next;
729+ }
730+ DBUG_ASSERT(leader == queue /* the leader should be first in queue */);
731
732+ /* Now we have in queue the list of transactions to be committed in order. */
733 DBUG_ASSERT(is_open());
734 if (likely(is_open())) // Should always be true
735 {
736 /*
737- We only bother to write to the binary log if there is anything
738- to write.
739- */
740- if (my_b_tell(cache) > 0)
741+ Commit every transaction in the queue.
742+
743+ Note that we are doing this in a different thread than the one running
744+ the transaction! So we are limited in the operations we can do. In
745+ particular, we cannot call my_error() on behalf of a transaction, as
746+ that obtains the THD from thread local storage. Instead, we must set
747+ current->error and let the thread do the error reporting itself once
748+ we wake it up.
749+ */
750+ for (current= queue; current != NULL; current= current->next)
751 {
752+ binlog_cache_data *cache_data= current->cache_data;
753+ IO_CACHE *cache= &cache_data->cache_log;
754+
755 /*
756- Log "BEGIN" at the beginning of every transaction. Here, a
757- transaction is either a BEGIN..COMMIT block or a single
758- statement in autocommit mode.
759+ We only bother to write to the binary log if there is anything
760+ to write.
761 */
762- Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, FALSE, TRUE, 0);
763- if (qinfo.write(&log_file))
764- goto err;
765- thd->binlog_bytes_written+= qinfo.data_written;
766- DBUG_EXECUTE_IF("crash_before_writing_xid",
767- {
768- if ((write_error= write_cache(thd, cache, false, true)))
769- DBUG_PRINT("info", ("error writing binlog cache: %d",
770- write_error));
771- DBUG_PRINT("info", ("crashing before writing xid"));
772- DBUG_SUICIDE();
773- });
774+ if (my_b_tell(cache) > 0)
775+ {
776+ if ((current->error= write_transaction(current)))
777+ current->commit_errno= errno;
778
779- if ((write_error= write_cache(thd, cache, false, false)))
780- goto err;
781+ write_count++;
782+ }
783
784- if (commit_event && commit_event->write(&log_file))
785- goto err;
786- if (commit_event)
787- thd->binlog_bytes_written+= commit_event->data_written;
788+ cache_data->commit_bin_log_file_pos= my_b_write_tell(&log_file);
789+ if (cache_data->using_xa && cache_data->xa_xid)
790+ xid_count++;
791+ }
792
793- if (incident && write_incident(thd, FALSE))
794- goto err;
795
796+ if (write_count > 0)
797+ {
798 bool synced= 0;
799 if (flush_and_sync(&synced))
800- goto err;
801- DBUG_EXECUTE_IF("half_binlogged_transaction", DBUG_SUICIDE(););
802- if (cache->error) // Error on read
803 {
804- sql_print_error(ER(ER_ERROR_ON_READ), cache->file_name, errno);
805- write_error=1; // Don't give more errors
806- goto err;
807+ for (current= queue; current != NULL; current= current->next)
808+ {
809+ if (!current->error)
810+ {
811+ current->error= ER_ERROR_ON_WRITE;
812+ current->commit_errno= errno;
813+ }
814+ }
815+ }
816+ else
817+ {
818+ signal_update();
819 }
820
821 if (RUN_HOOK(binlog_storage, after_flush,
822- (thd, log_file_name, log_file.pos_in_file, synced)))
823+ (leader->thd, log_file_name, log_file.pos_in_file, synced)))
824 {
825 sql_print_error("Failed to run 'after_flush' hooks");
826- write_error=1;
827- goto err;
828+ for (current= queue; current != NULL; current= current->next)
829+ {
830+ if (!current->error)
831+ {
832+ current->error= ER_ERROR_ON_WRITE;
833+ current->commit_errno= errno;
834+ }
835+ }
836 }
837
838- signal_update();
839 }
840
841 /*
842- if commit_event is Xid_log_event, increase the number of
843- prepared_xids (it's decreasd in ::unlog()). Binlog cannot be rotated
844+ if any commit_events are Xid_log_event, increase the number of
845+ prepared_xids (it's decreased in ::unlog()). Binlog cannot be rotated
846 if there're prepared xids in it - see the comment in new_file() for
847 an explanation.
848- If the commit_event is not Xid_log_event (then it's a Query_log_event)
849- rotate binlog, if necessary.
850+ If no Xid_log_events (then it's all Query_log_event) rotate binlog,
851+ if necessary.
852 */
853- if (commit_event && commit_event->get_type_code() == XID_EVENT)
854+ if (xid_count > 0)
855 {
856- mysql_mutex_lock(&LOCK_prep_xids);
857- prepared_xids++;
858- mysql_mutex_unlock(&LOCK_prep_xids);
859+ mark_xids_active(xid_count);
860 }
861 else
862 if (rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED))
863- goto err;
864+ {
865+ for (current= queue; current != NULL; current= current->next)
866+ {
867+ if (!current->error)
868+ {
869+ current->error= ER_ERROR_ON_WRITE;
870+ current->commit_errno= errno;
871+ }
872+ }
873+ }
874 }
875+ DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered");
876+ mysql_mutex_lock(&LOCK_commit_ordered);
877+ /*
878+ We cannot unlock LOCK_log until we have locked LOCK_commit_ordered;
879+ otherwise scheduling could allow the next group commit to run ahead of us,
880+ messing up the order of commit_ordered() calls. But as soon as
881+ LOCK_commit_ordered is obtained, we can let the next group commit start.
882+ */
883 mysql_mutex_unlock(&LOCK_log);
884+ DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log");
885+ ++num_group_commits;
886
887- DBUG_RETURN(0);
888-
889-err:
890- if (!write_error)
891+ /*
892+ Wakeup each participant waiting for our group commit, first calling the
893+ commit_ordered() methods for any transactions doing 2-phase commit.
894+ */
895+ current= queue;
896+ while (current != NULL)
897 {
898- write_error= 1;
899- sql_print_error(ER(ER_ERROR_ON_WRITE), name, errno);
900+ group_commit_entry *next;
901+
902+ DEBUG_SYNC(leader->thd, "commit_loop_entry_commit_ordered");
903+ ++num_commits;
904+ if (current->cache_data->using_xa && !current->error)
905+ run_commit_ordered(current->thd, current->all);
906+
907+ /*
908+ Be careful not to access current->next after waking up the other thread,
909+ as it may change immediately after wakeup.
910+ */
911+ next= current->next;
912+ if (current != leader) // Don't wake up ourself
913+ current->thd->signal_wakeup_ready();
914+ current= next;
915 }
916- mysql_mutex_unlock(&LOCK_log);
917- DBUG_RETURN(1);
918+ DEBUG_SYNC(leader->thd, "commit_after_group_run_commit_ordered");
919+ mysql_mutex_unlock(&LOCK_commit_ordered);
920+
921+ DBUG_VOID_RETURN;
922 }
923
924+int
925+MYSQL_BIN_LOG::write_transaction(group_commit_entry *entry)
926+{
927+ binlog_cache_data *cache_data= entry->cache_data;
928+ IO_CACHE *cache= &cache_data->cache_log;
929+
930+ if (entry->begin_event->write(&log_file))
931+ return ER_ERROR_ON_WRITE;
932+
933+ DBUG_EXECUTE_IF("crash_before_writing_xid",
934+ {
935+ if ((write_cache(entry->thd, cache)))
936+ DBUG_PRINT("info", ("error writing binlog cache"));
937+ else
938+ flush_and_sync(0);
939+
940+ DBUG_PRINT("info", ("crashing before writing xid"));
941+ abort();
942+ });
943+
944+ if (write_cache(entry->thd, cache))
945+ return ER_ERROR_ON_WRITE;
946+
947+ if (entry->end_event->write(&log_file))
948+ return ER_ERROR_ON_WRITE;
949+
950+ if (entry->incident_event && entry->incident_event->write(&log_file))
951+ return ER_ERROR_ON_WRITE;
952+
953+ if (cache->error) // Error on read
954+ return ER_ERROR_ON_READ;
955+
956+ return 0;
957+}
958
959 /**
960 Wait until we get a signal that the relay log has been updated.
961@@ -5999,6 +6211,68 @@
962 }
963
964
965+void
966+TC_init()
967+{
968+ mysql_mutex_init(key_LOCK_group_commit_queue, &LOCK_group_commit_queue, MY_MUTEX_INIT_SLOW);
969+ mysql_mutex_init(key_LOCK_commit_ordered, &LOCK_commit_ordered, MY_MUTEX_INIT_SLOW);
970+ mutexes_inited= TRUE;
971+}
972+
973+
974+void
975+TC_destroy()
976+{
977+ if (mutexes_inited)
978+ {
979+ mysql_mutex_destroy(&LOCK_group_commit_queue);
980+ mysql_mutex_destroy(&LOCK_commit_ordered);
981+ mutexes_inited= FALSE;
982+ }
983+}
984+
985+
986+void
987+TC_LOG::run_commit_ordered(THD *thd, bool all)
988+{
989+ Ha_trx_info *ha_info=
990+ all ? thd->transaction.all.ha_list : thd->transaction.stmt.ha_list;
991+
992+ mysql_mutex_assert_owner(&LOCK_commit_ordered);
993+ for (; ha_info; ha_info= ha_info->next())
994+ {
995+ handlerton *ht= ha_info->ht();
996+ if (!ht->commit_ordered)
997+ continue;
998+ ht->commit_ordered(ht, thd, all);
999+ DEBUG_SYNC(thd, "commit_after_run_commit_ordered");
1000+ }
1001+}
1002+
1003+int TC_LOG_MMAP::log_and_order(THD *thd, my_xid xid, bool all,
1004+ bool need_commit_ordered)
1005+{
1006+ int cookie;
1007+
1008+ cookie= 0;
1009+ if (xid)
1010+ cookie= log_one_transaction(xid);
1011+
1012+ if (need_commit_ordered)
1013+ {
1014+ /* Only run commit_ordered() if log_xid was successful. */
1015+ if (cookie)
1016+ {
1017+ mysql_mutex_lock(&LOCK_commit_ordered);
1018+ run_commit_ordered(thd, all);
1019+ mysql_mutex_unlock(&LOCK_commit_ordered);
1020+ }
1021+ }
1022+
1023+ return cookie;
1024+}
1025+
1026+
1027 /********* transaction coordinator log for 2pc - mmap() based solution *******/
1028
1029 /*
1030@@ -6135,6 +6409,7 @@
1031 mysql_mutex_init(key_LOCK_pool, &LOCK_pool, MY_MUTEX_INIT_FAST);
1032 mysql_cond_init(key_COND_active, &COND_active, 0);
1033 mysql_cond_init(key_COND_pool, &COND_pool, 0);
1034+ mysql_cond_init(key_COND_queue_busy, &COND_queue_busy, 0);
1035
1036 inited=6;
1037
1038@@ -6142,6 +6417,8 @@
1039 active=pages;
1040 pool=pages+1;
1041 pool_last=pages+npages-1;
1042+ commit_ordered_queue= NULL;
1043+ commit_ordered_queue_busy= false;
1044
1045 return 0;
1046
1047@@ -6247,7 +6524,7 @@
1048 to the position in memory where xid was logged to.
1049 */
1050
1051-int TC_LOG_MMAP::log_xid(THD *thd, my_xid xid)
1052+int TC_LOG_MMAP::log_one_transaction(my_xid xid)
1053 {
1054 int err;
1055 PAGE *p;
1056@@ -6386,7 +6663,9 @@
1057 mysql_mutex_destroy(&LOCK_sync);
1058 mysql_mutex_destroy(&LOCK_active);
1059 mysql_mutex_destroy(&LOCK_pool);
1060+ mysql_cond_destroy(&COND_active);
1061 mysql_cond_destroy(&COND_pool);
1062+ mysql_cond_destroy(&COND_queue_busy);
1063 case 5:
1064 data[0]='A'; // garble the first (signature) byte, in case mysql_file_delete fails
1065 case 4:
1066@@ -6596,42 +6875,87 @@
1067 mysql_cond_destroy(&COND_prep_xids);
1068 }
1069
1070-/**
1071- @todo
1072- group commit
1073+/*
1074+ Do a binlog log_xid() for a group of transactions, linked through
1075+ thd->next_commit_ordered.
1076
1077 @retval
1078 0 error
1079 @retval
1080 1 success
1081 */
1082-int TC_LOG_BINLOG::log_xid(THD *thd, my_xid xid)
1083+int TC_LOG_BINLOG::log_and_order(THD *thd, my_xid xid, bool all,
1084+ bool need_commit_ordered __attribute__((unused)))
1085 {
1086- DBUG_ENTER("TC_LOG_BINLOG::log");
1087+ DBUG_ENTER("TC_LOG_BINLOG::log_and_order");
1088 binlog_cache_mngr *cache_mngr=
1089 (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
1090+
1091+ cache_mngr->trx_cache.using_xa= TRUE;
1092+ cache_mngr->trx_cache.xa_xid= xid;
1093 /*
1094 We always commit the entire transaction when writing an XID. Also
1095 note that the return value is inverted.
1096 */
1097- DBUG_RETURN(!binlog_commit_flush_stmt_cache(thd, cache_mngr) &&
1098- !binlog_commit_flush_trx_cache(thd, cache_mngr, xid));
1099+ DBUG_RETURN(!binlog_commit_flush_stmt_cache(thd, cache_mngr, all) &&
1100+ !binlog_commit_flush_trx_cache(thd, cache_mngr, xid, all));
1101 }
1102
1103-int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
1104+/*
1105+ After an XID is logged, we need to hold on to the current binlog file until
1106+ it is fully committed in the storage engine. The reason is that crash
1107+ recovery only looks at the latest binlog, so we must make sure there are no
1108+ outstanding prepared (but not committed) transactions before rotating the
1109+ binlog.
1110+
1111+ To handle this, we keep a count of outstanding XIDs. This function is used
1112+ to increase this count when committing one or more transactions to the
1113+ binary log.
1114+*/
1115+void
1116+TC_LOG_BINLOG::mark_xids_active(uint xid_count)
1117 {
1118- DBUG_ENTER("TC_LOG_BINLOG::unlog");
1119+ DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active");
1120+ DBUG_PRINT("info", ("xid_count=%u", xid_count));
1121+ mysql_mutex_lock(&LOCK_prep_xids);
1122+ prepared_xids+= xid_count;
1123+ mysql_mutex_unlock(&LOCK_prep_xids);
1124+ DBUG_VOID_RETURN;
1125+}
1126+
1127+/*
1128+ Once an XID is committed, it is safe to rotate the binary log, as it can no
1129+ longer be needed during crash recovery.
1130+
1131+ This function is called to mark an XID this way. It needs to decrease the
1132+ count of pending XIDs, and signal the log rotator thread when it reaches zero.
1133+*/
1134+void
1135+TC_LOG_BINLOG::mark_xid_done()
1136+{
1137+ my_bool send_signal;
1138+
1139+ DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done");
1140 mysql_mutex_lock(&LOCK_prep_xids);
1141 // prepared_xids can be 0 if the transaction had ignorable errors.
1142 DBUG_ASSERT(prepared_xids >= 0);
1143 if (prepared_xids > 0)
1144 prepared_xids--;
1145- if (prepared_xids == 0) {
1146+ send_signal= (prepared_xids == 0);
1147+ mysql_mutex_unlock(&LOCK_prep_xids);
1148+ if (send_signal) {
1149 DBUG_PRINT("info", ("prepared_xids=%lu", prepared_xids));
1150 mysql_cond_signal(&COND_prep_xids);
1151 }
1152- mysql_mutex_unlock(&LOCK_prep_xids);
1153- DBUG_RETURN(rotate_and_purge(0)); // as ::write() did not rotate
1154+ DBUG_VOID_RETURN;
1155+}
1156+
1157+int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
1158+{
1159+ DBUG_ENTER("TC_LOG_BINLOG::unlog");
1160+ if (xid)
1161+ mark_xid_done();
1162+ DBUG_RETURN(rotate_and_purge(0));
1163 }
1164
1165 int TC_LOG_BINLOG::recover(IO_CACHE *log, Format_description_log_event *fdle)
1166@@ -6700,9 +7024,67 @@
1167 {
1168 return (ulonglong) mysql_bin_log.get_log_file()->pos_in_file;
1169 }
1170+/*
1171+ Get the current position of the MySQL binlog for the transaction currently
1172+ being committed, returned through *out_pos and *out_file.
1173+
1174+ This is valid to call from within storage engine commit_ordered() and
1175+ commit() methods only.
1176+
1177+ Since it stores the position inside THD, it is safe to call without any
1178+ locking. If binlog_hton->state is not SHOW_OPTION_YES, or the THD has no
1179+ binlog cache manager, *out_pos is set to 0 and *out_file to NULL.
1180+ Note that currently the binlog file name is not stored inside THD, but this
1181+ is still safe as it can only change when the log is rotated, and we never
1182+ rotate the binlog while commits are pending inside storage engines.
1183+*/
1184+extern "C"
1185+void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file)
1186+{
1187+ binlog_cache_mngr *cache_mngr;
1188+ if (binlog_hton->state == SHOW_OPTION_YES
1189+ && (cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton)))
1190+ {
1191+ *out_pos= cache_mngr->trx_cache.commit_bin_log_file_pos;
1192+ *out_file= mysql_bin_log.get_log_fname();
1193+ }
1194+ else
1195+ {
1196+ *out_pos= 0; /* integer offset, not a pointer: use 0 rather than NULL */
1197+ *out_file= NULL;
1198+ }
1199+}
1200 #endif /* INNODB_COMPATIBILITY_HOOKS */
1201
1202
1203+static int show_binlog_vars(THD *thd, SHOW_VAR *var, char *buff)
1204+{
1205+ mysql_bin_log.set_status_variables();
1206+ var->type= SHOW_ARRAY;
1207+ var->value= (char *)&binlog_status_vars_detail;
1208+ return 0;
1209+}
1210+
1211+static SHOW_VAR binlog_status_vars_top[]= {
1212+ {"binlog", (char *) &show_binlog_vars, SHOW_FUNC},
1213+ {NullS, NullS, SHOW_LONG}
1214+};
1215+
1216+/*
1217+ Copy out current values of status variables, for SHOW STATUS or
1218+ information_schema.global_status.
1219+
1220+ This is called only under LOCK_status, so we can fill in a static array.
1221+*/
1222+void
1223+TC_LOG_BINLOG::set_status_variables()
1224+{
1225+ mysql_mutex_lock(&LOCK_commit_ordered);
1226+ binlog_status_var_num_commits= this->num_commits;
1227+ binlog_status_var_num_group_commits= this->num_group_commits;
1228+ mysql_mutex_unlock(&LOCK_commit_ordered);
1229+}
1230+
1231 struct st_mysql_storage_engine binlog_storage_engine=
1232 { MYSQL_HANDLERTON_INTERFACE_VERSION };
1233
1234@@ -6717,7 +7099,7 @@
1235 binlog_init, /* Plugin Init */
1236 NULL, /* Plugin Deinit */
1237 0x0100 /* 1.0 */,
1238- NULL, /* status variables */
1239+ binlog_status_vars_top, /* status variables */
1240 NULL, /* system variables */
1241 NULL, /* config options */
1242 0, /* flags */
1243--- a/sql/log.h
1244+++ b/sql/log.h
1245@@ -44,17 +44,42 @@
1246
1247 virtual int open(const char *opt_name)=0;
1248 virtual void close()=0;
1249- virtual int log_xid(THD *thd, my_xid xid)=0;
1250+ virtual int log_and_order(THD *thd, my_xid xid, bool all,
1251+ bool need_commit_ordered)=0;
1252 virtual int unlog(ulong cookie, my_xid xid)=0;
1253+
1254+ protected:
1255+ void run_commit_ordered(THD *thd, bool all);
1256 };
1257
1258+/*
1259+ Locks used to ensure serialised execution of
1260+ TC_LOG::run_commit_ordered(), or any other code that calls handler
1261+ commit_ordered() methods.
1262+*/
1263+extern mysql_mutex_t LOCK_group_commit_queue;
1264+extern mysql_mutex_t LOCK_commit_ordered;
1265+
1266+extern void TC_init();
1267+extern void TC_destroy();
1268+
1269 class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging
1270 {
1271 public:
1272 TC_LOG_DUMMY() {}
1273 int open(const char *opt_name) { return 0; }
1274 void close() { }
1275- int log_xid(THD *thd, my_xid xid) { return 1; }
1276+ /*
1277+ TC_LOG_DUMMY is only used when there are <= 1 XA-capable engines, and we
1278+ only use internal XA during commit when >= 2 XA-capable engines
1279+ participate.
1280+ */
1281+ int log_and_order(THD *thd, my_xid xid, bool all,
1282+ bool need_commit_ordered)
1283+ {
1284+ DBUG_ASSERT(0 /* Internal error - TC_LOG_DUMMY::log_and_order() called */);
1285+ return 1;
1286+ }
1287 int unlog(ulong cookie, my_xid xid) { return 0; }
1288 };
1289
1290@@ -80,6 +105,13 @@
1291 mysql_cond_t cond; // to wait for a sync
1292 } PAGE;
1293
1294+ /* List of THDs for which to invoke commit_ordered(), in order. */
1295+ struct commit_entry
1296+ {
1297+ struct commit_entry *next;
1298+ THD *thd;
1299+ };
1300+
1301 char logname[FN_REFLEN];
1302 File fd;
1303 my_off_t file_length;
1304@@ -94,16 +126,38 @@
1305 */
1306 mysql_mutex_t LOCK_active, LOCK_pool, LOCK_sync;
1307 mysql_cond_t COND_pool, COND_active;
1308+ /*
1309+ Queue of threads that need to call commit_ordered().
1310+ Access to this queue must be protected by LOCK_group_commit_queue
1311+ */
1312+ commit_entry *commit_ordered_queue;
1313+ /*
1314+ This flag and condition is used to reserve the queue while threads in it
1315+ each run the commit_ordered() methods one after the other. Only once the
1316+ last commit_ordered() in the queue is done can we start on a new queue
1317+ run.
1318+
1319+ Since we start this process in the first thread in the queue and finish in
1320+ the last (and possibly different) thread, we need a condition variable for
1321+ this (we cannot unlock a mutex in a different thread than the one who
1322+ locked it).
1323+
1324+ The condition is used together with the LOCK_group_commit_queue mutex.
1325+ */
1326+ my_bool commit_ordered_queue_busy;
1327+ mysql_cond_t COND_queue_busy;
1328
1329 public:
1330 TC_LOG_MMAP(): inited(0) {}
1331 int open(const char *opt_name);
1332 void close();
1333- int log_xid(THD *thd, my_xid xid);
1334+ int log_and_order(THD *thd, my_xid xid, bool all,
1335+ bool need_commit_ordered);
1336 int unlog(ulong cookie, my_xid xid);
1337 int recover();
1338
1339 private:
1340+ int log_one_transaction(my_xid xid);
1341 void get_active_from_pool();
1342 int sync();
1343 int overflow();
1344@@ -271,9 +325,31 @@
1345 time_t last_time;
1346 };
1347
1348+class binlog_cache_data;
1349 class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
1350 {
1351 private:
1352+ struct group_commit_entry
1353+ {
1354+ struct group_commit_entry *next;
1355+ THD *thd;
1356+ binlog_cache_data *cache_data;
1357+ /*
1358+ Extra events (BEGIN, COMMIT/ROLLBACK/XID, and possibly INCIDENT) to be
1359+ written during group commit. The incident_event is only valid if
1360+ trx_data->has_incident() is true.
1361+ */
1362+ Log_event *begin_event;
1363+ Log_event *end_event;
1364+ Log_event *incident_event;
1365+ /* Set during group commit to record any per-thread error. */
1366+ int error;
1367+ int commit_errno;
1368+ /* This is the `all' parameter for ha_commit_ordered(). */
1369+ bool all;
1370+ /* True if we come in through XA log_and_order(), false otherwise. */
1371+ };
1372+
1373 #ifdef HAVE_PSI_INTERFACE
1374 /** The instrumentation key to use for @ LOCK_index. */
1375 PSI_mutex_key m_key_LOCK_index;
1376@@ -325,6 +401,12 @@
1377 In 5.0 it's 0 for relay logs too!
1378 */
1379 bool no_auto_events;
1380+ /* Queue of transactions queued up to participate in group commit. */
1381+ group_commit_entry *group_commit_queue;
1382+ /* Total number of committed transactions. */
1383+ ulonglong num_commits;
1384+ /* Number of group commits done. */
1385+ ulonglong num_group_commits;
1386
1387 /* pointer to the sync period variable, for binlog this will be
1388 sync_binlog_period, for relay log this will be
1389@@ -346,6 +428,11 @@
1390 */
1391 int new_file_without_locking();
1392 int new_file_impl(bool need_lock);
1393+ int write_transaction(group_commit_entry *entry);
1394+ bool write_transaction_to_binlog_events(group_commit_entry *entry);
1395+ void trx_group_commit_leader(group_commit_entry *leader);
1396+ void mark_xid_done();
1397+ void mark_xids_active(uint xid_count);
1398
1399 public:
1400 MYSQL_LOG::generate_name;
1401@@ -387,7 +474,8 @@
1402
1403 int open(const char *opt_name);
1404 void close();
1405- int log_xid(THD *thd, my_xid xid);
1406+ int log_and_order(THD *thd, my_xid xid, bool all,
1407+ bool need_commit_ordered);
1408 int unlog(ulong cookie, my_xid xid);
1409 int recover(IO_CACHE *log, Format_description_log_event *fdle);
1410 #if !defined(MYSQL_CLIENT)
1411@@ -434,11 +522,11 @@
1412 int new_file();
1413
1414 bool write(Log_event* event_info); // binary log write
1415- bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event, bool incident);
1416+ bool write_transaction_to_binlog(THD *thd, binlog_cache_data *cache_data,
1417+ Log_event *end_ev, bool all);
1418 bool write_incident(THD *thd, bool lock);
1419
1420- int write_cache(THD *thd, IO_CACHE *cache,
1421- bool lock_log, bool flush_and_sync);
1422+ int write_cache(THD *thd, IO_CACHE *cache);
1423 void set_write_error(THD *thd, bool is_transactional);
1424 bool check_write_error(THD *thd);
1425
1426@@ -507,6 +595,7 @@
1427 inline void unlock_index() { mysql_mutex_unlock(&LOCK_index);}
1428 inline IO_CACHE *get_index_file() { return &index_file;}
1429 inline uint32 get_open_count() { return open_count; }
1430+ void set_status_variables();
1431 };
1432
1433 class Log_event_handler
1434--- a/sql/mysqld.cc
1435+++ b/sql/mysqld.cc
1436@@ -1490,6 +1490,7 @@
1437 ha_end();
1438 if (tc_log)
1439 tc_log->close();
1440+ TC_destroy();
1441 delegates_destroy();
1442 xid_cache_free();
1443 table_def_free();
1444@@ -4061,6 +4062,8 @@
1445 query_response_time_init();
1446 #endif // HAVE_RESPONSE_TIME_DISTRIBUTION
1447 /* We have to initialize the storage engines before CSV logging */
1448+ TC_init();
1449+
1450 init_global_table_stats();
1451 init_global_index_stats();
1452
1453@@ -8004,6 +8007,7 @@
1454 key_LOCK_error_messages, key_LOG_INFO_lock, key_LOCK_thread_count,
1455 key_PARTITION_LOCK_auto_inc;
1456 PSI_mutex_key key_RELAYLOG_LOCK_index;
1457+PSI_mutex_key key_LOCK_wakeup_ready, key_LOCK_group_commit_queue, key_LOCK_commit_ordered;
1458
1459 static PSI_mutex_info all_server_mutexes[]=
1460 {
1461@@ -8024,6 +8028,7 @@
1462 { &key_delayed_insert_mutex, "Delayed_insert::mutex", 0},
1463 { &key_hash_filo_lock, "hash_filo::lock", 0},
1464 { &key_LOCK_active_mi, "LOCK_active_mi", PSI_FLAG_GLOBAL},
1465+ { &key_LOCK_commit_ordered, "LOCK_commit_ordered", PSI_FLAG_GLOBAL},
1466 { &key_LOCK_connection_count, "LOCK_connection_count", PSI_FLAG_GLOBAL},
1467 { &key_LOCK_crypt, "LOCK_crypt", PSI_FLAG_GLOBAL},
1468 { &key_LOCK_delayed_create, "LOCK_delayed_create", PSI_FLAG_GLOBAL},
1469@@ -8039,6 +8044,7 @@
1470 "LOCK_global_index_stats", PSI_FLAG_GLOBAL},
1471 { &key_LOCK_gdl, "LOCK_gdl", PSI_FLAG_GLOBAL},
1472 { &key_LOCK_global_system_variables, "LOCK_global_system_variables", PSI_FLAG_GLOBAL},
1473+ { &key_LOCK_group_commit_queue, "LOCK_group_commit_queue", PSI_FLAG_GLOBAL},
1474 { &key_LOCK_manager, "LOCK_manager", PSI_FLAG_GLOBAL},
1475 { &key_LOCK_prepared_stmt_count, "LOCK_prepared_stmt_count", PSI_FLAG_GLOBAL},
1476 { &key_LOCK_rpl_status, "LOCK_rpl_status", PSI_FLAG_GLOBAL},
1477@@ -8050,6 +8056,7 @@
1478 { &key_LOCK_temporary_tables, "THD::LOCK_temporary_tables", 0},
1479 { &key_LOCK_user_conn, "LOCK_user_conn", PSI_FLAG_GLOBAL},
1480 { &key_LOCK_uuid_generator, "LOCK_uuid_generator", PSI_FLAG_GLOBAL},
1481+ { &key_LOCK_wakeup_ready, "THD::LOCK_wakeup_ready", 0},
1482 { &key_LOG_LOCK_log, "LOG::LOCK_log", 0},
1483 { &key_master_info_data_lock, "Master_info::data_lock", 0},
1484 { &key_master_info_run_lock, "Master_info::run_lock", 0},
1485@@ -8097,6 +8104,7 @@
1486 key_TABLE_SHARE_cond, key_user_level_lock_cond,
1487 key_COND_thread_count, key_COND_thread_cache, key_COND_flush_thread_cache;
1488 PSI_cond_key key_RELAYLOG_update_cond;
1489+PSI_cond_key key_COND_wakeup_ready, key_COND_queue_busy;
1490
1491 static PSI_cond_info all_server_conds[]=
1492 {
1493@@ -8113,8 +8121,10 @@
1494 { &key_RELAYLOG_update_cond, "MYSQL_RELAY_LOG::update_cond", 0},
1495 { &key_COND_cache_status_changed, "Query_cache::COND_cache_status_changed", 0},
1496 { &key_COND_manager, "COND_manager", PSI_FLAG_GLOBAL},
1497+ { &key_COND_queue_busy, "COND_queue_busy", PSI_FLAG_GLOBAL},
1498 { &key_COND_rpl_status, "COND_rpl_status", PSI_FLAG_GLOBAL},
1499 { &key_COND_server_started, "COND_server_started", PSI_FLAG_GLOBAL},
1500+ { &key_COND_wakeup_ready, "THD::COND_wakeup_ready", 0},
1501 { &key_delayed_insert_cond, "Delayed_insert::cond", 0},
1502 { &key_delayed_insert_cond_client, "Delayed_insert::cond_client", 0},
1503 { &key_item_func_sleep_cond, "Item_func_sleep::cond", 0},
1504--- a/sql/mysqld.h
1505+++ b/sql/mysqld.h
1506@@ -273,6 +273,7 @@
1507 key_structure_guard_mutex, key_TABLE_SHARE_LOCK_ha_data,
1508 key_LOCK_error_messages, key_LOCK_thread_count, key_PARTITION_LOCK_auto_inc;
1509 extern PSI_mutex_key key_RELAYLOG_LOCK_index;
1510+extern PSI_mutex_key key_LOCK_wakeup_ready, key_LOCK_group_commit_queue, key_LOCK_commit_ordered;
1511
1512 extern PSI_rwlock_key key_rwlock_LOCK_grant, key_rwlock_LOCK_logger,
1513 key_rwlock_LOCK_sys_init_connect, key_rwlock_LOCK_sys_init_slave,
1514@@ -293,6 +294,7 @@
1515 key_TABLE_SHARE_cond, key_user_level_lock_cond,
1516 key_COND_thread_count, key_COND_thread_cache, key_COND_flush_thread_cache;
1517 extern PSI_cond_key key_RELAYLOG_update_cond;
1518+extern PSI_cond_key key_COND_wakeup_ready, key_COND_queue_busy;
1519
1520 extern PSI_thread_key key_thread_bootstrap, key_thread_delayed_insert,
1521 key_thread_handle_manager, key_thread_kill_server, key_thread_main,
1522--- a/sql/sql_class.cc
1523+++ b/sql/sql_class.cc
1524@@ -912,6 +912,8 @@
1525 mysql_mutex_init(key_LOCK_thd_data, &LOCK_thd_data, MY_MUTEX_INIT_FAST);
1526 mysql_mutex_init(key_LOCK_temporary_tables, &LOCK_temporary_tables,
1527 MY_MUTEX_INIT_FAST);
1528+ mysql_mutex_init(key_LOCK_wakeup_ready, &LOCK_wakeup_ready, MY_MUTEX_INIT_FAST);
1529+ mysql_cond_init(key_COND_wakeup_ready, &COND_wakeup_ready, NULL);
1530
1531 /* Variables with default values */
1532 proc_info="login";
1533@@ -1516,6 +1518,8 @@
1534 my_free(db);
1535 db= NULL;
1536 free_root(&transaction.mem_root,MYF(0));
1537+ mysql_cond_destroy(&COND_wakeup_ready);
1538+ mysql_mutex_destroy(&LOCK_wakeup_ready);
1539 mysql_mutex_destroy(&LOCK_thd_data);
1540 mysql_mutex_destroy(&LOCK_temporary_tables);
1541 #ifndef DBUG_OFF
1542@@ -5199,6 +5203,24 @@
1543 DBUG_RETURN(0);
1544 }
1545
1546+void
1547+THD::wait_for_wakeup_ready()
1548+{
1549+ mysql_mutex_lock(&LOCK_wakeup_ready);
1550+ while (!wakeup_ready)
1551+ mysql_cond_wait(&COND_wakeup_ready, &LOCK_wakeup_ready);
1552+ mysql_mutex_unlock(&LOCK_wakeup_ready);
1553+}
1554+
1555+void
1556+THD::signal_wakeup_ready()
1557+{
1558+ mysql_mutex_lock(&LOCK_wakeup_ready);
1559+ wakeup_ready= true;
1560+ mysql_mutex_unlock(&LOCK_wakeup_ready);
1561+ mysql_cond_signal(&COND_wakeup_ready);
1562+}
1563+
1564 bool Discrete_intervals_list::append(ulonglong start, ulonglong val,
1565 ulonglong incr)
1566 {
1567--- a/sql/sql_class.h
1568+++ b/sql/sql_class.h
1569@@ -3017,6 +3017,14 @@
1570 LEX_STRING get_invoker_user() { return invoker_user; }
1571 LEX_STRING get_invoker_host() { return invoker_host; }
1572 bool has_invoker() { return invoker_user.length > 0; }
1573+ void clear_wakeup_ready() { wakeup_ready= false; }
1574+ /*
1575+ Sleep waiting for others to wake us up with signal_wakeup_ready().
1576+ Must call clear_wakeup_ready() before waiting.
1577+ */
1578+ void wait_for_wakeup_ready();
1579+ /* Wake this thread up from wait_for_wakeup_ready(). */
1580+ void signal_wakeup_ready();
1581 private:
1582
1583 /** The current internal error handler for this thread, or NULL. */
1584@@ -3059,6 +3067,16 @@
1585 */
1586 LEX_STRING invoker_user;
1587 LEX_STRING invoker_host;
1588+ /*
1589+ Flag, mutex and condition for a thread to wait for a signal from another
1590+ thread.
1591+
1592+ Currently used to wait for group commit to complete, can also be used for
1593+ other purposes.
1594+ */
1595+ bool wakeup_ready;
1596+ mysql_mutex_t LOCK_wakeup_ready;
1597+ mysql_cond_t COND_wakeup_ready;
1598 };
1599
1600 /* Returns string as 'IP' for the client-side of the connection represented by
1601--- a/sql/sql_parse.cc
1602+++ b/sql/sql_parse.cc
1603@@ -889,6 +889,10 @@
1604 DBUG_ENTER("dispatch_command");
1605 DBUG_PRINT("info",("packet: '%*.s'; command: %d", packet_length, packet, command));
1606
1607+ DBUG_EXECUTE_IF("crash_dispatch_command_before",
1608+ { DBUG_PRINT("crash_dispatch_command_before", ("now"));
1609+ DBUG_ABORT(); });
1610+
1611 #if defined(ENABLED_PROFILING)
1612 thd->profiling.start_new_query();
1613 #endif
1614--- a/mysql-test/suite/perfschema/r/dml_setup_instruments.result
1615+++ b/mysql-test/suite/perfschema/r/dml_setup_instruments.result
1616@@ -11,9 +11,9 @@
1617 wait/synch/mutex/sql/HA_DATA_PARTITION::LOCK_auto_inc YES YES
1618 wait/synch/mutex/sql/LOCK_active_mi YES YES
1619 wait/synch/mutex/sql/LOCK_audit_mask YES YES
1620+wait/synch/mutex/sql/LOCK_commit_ordered YES YES
1621 wait/synch/mutex/sql/LOCK_connection_count YES YES
1622 wait/synch/mutex/sql/LOCK_crypt YES YES
1623-wait/synch/mutex/sql/LOCK_delayed_create YES YES
1624 select * from performance_schema.setup_instruments
1625 where name like 'Wait/Synch/Rwlock/sql/%'
1626 and name not in ('wait/synch/rwlock/sql/CRYPTO_dynlock_value::lock')
1627@@ -38,6 +38,7 @@
1628 NAME ENABLED TIMED
1629 wait/synch/cond/sql/COND_flush_thread_cache YES YES
1630 wait/synch/cond/sql/COND_manager YES YES
1631+wait/synch/cond/sql/COND_queue_busy YES YES
1632 wait/synch/cond/sql/COND_queue_state YES YES
1633 wait/synch/cond/sql/COND_rpl_status YES YES
1634 wait/synch/cond/sql/COND_server_started YES YES
1635@@ -45,7 +46,6 @@
1636 wait/synch/cond/sql/COND_thread_count YES YES
1637 wait/synch/cond/sql/Delayed_insert::cond YES YES
1638 wait/synch/cond/sql/Delayed_insert::cond_client YES YES
1639-wait/synch/cond/sql/Event_scheduler::COND_state YES YES
1640 select * from performance_schema.setup_instruments
1641 where name='Wait';
1642 select * from performance_schema.setup_instruments
1643--- a/storage/innobase/handler/ha_innodb.cc
1644+++ b/storage/innobase/handler/ha_innodb.cc
1645@@ -375,6 +375,9 @@
1646 static INNOBASE_SHARE *get_share(const char *table_name);
1647 static void free_share(INNOBASE_SHARE *share);
1648 static int innobase_close_connection(handlerton *hton, THD* thd);
1649+#ifdef EXTENDED_FOR_COMMIT_ORDERED
1650+static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all);
1651+#endif
1652 static int innobase_commit(handlerton *hton, THD* thd, bool all);
1653 static int innobase_rollback(handlerton *hton, THD* thd, bool all);
1654 static int innobase_rollback_to_savepoint(handlerton *hton, THD* thd,
1655@@ -1699,7 +1702,10 @@
1656 trx_t* trx) /*!< in/out: InnoDB transaction handle */
1657 {
1658 DBUG_ENTER("innobase_trx_init");
1659+#ifndef EXTENDED_FOR_COMMIT_ORDERED
1660+ /* used by innobase_commit_ordered */
1661 DBUG_ASSERT(EQ_CURRENT_THD(thd));
1662+#endif
1663 DBUG_ASSERT(thd == trx->mysql_thd);
1664
1665 trx->check_foreigns = !thd_test_options(
1666@@ -1760,7 +1766,10 @@
1667 {
1668 trx_t*& trx = thd_to_trx(thd);
1669
1670+#ifndef EXTENDED_FOR_COMMIT_ORDERED
1671+ /* used by innobase_commit_ordered */
1672 ut_ad(EQ_CURRENT_THD(thd));
1673+#endif
1674
1675 if (trx == NULL) {
1676 trx = innobase_trx_allocate(thd);
1677@@ -1846,6 +1855,7 @@
1678 {
1679 trx->is_registered = 0;
1680 trx->owns_prepare_mutex = 0;
1681+ trx->called_commit_ordered = 0;
1682 }
1683
1684 /*********************************************************************//**
1685@@ -1861,6 +1871,29 @@
1686 }
1687
1688 /*********************************************************************//**
1689+*/
1690+static inline
1691+void
1692+trx_called_commit_ordered_set(
1693+/*==========================*/
1694+ trx_t* trx)
1695+{
1696+ ut_a(trx_is_registered_for_2pc(trx));
1697+ trx->called_commit_ordered = 1;
1698+}
1699+
1700+/*********************************************************************//**
1701+*/
1702+static inline
1703+bool
1704+trx_called_commit_ordered(
1705+/*======================*/
1706+ const trx_t* trx)
1707+{
1708+ return(trx->called_commit_ordered == 1);
1709+}
1710+
1711+/*********************************************************************//**
1712 Check if transaction is started.
1713 @reutrn true if transaction is in state started */
1714 static
1715@@ -2435,6 +2468,9 @@
1716 innobase_hton->savepoint_set=innobase_savepoint;
1717 innobase_hton->savepoint_rollback=innobase_rollback_to_savepoint;
1718 innobase_hton->savepoint_release=innobase_release_savepoint;
1719+#ifdef EXTENDED_FOR_COMMIT_ORDERED
1720+ innobase_hton->commit_ordered=innobase_commit_ordered;
1721+#endif
1722 innobase_hton->commit=innobase_commit;
1723 innobase_hton->rollback=innobase_rollback;
1724 innobase_hton->prepare=innobase_xa_prepare;
1725@@ -3187,6 +3223,126 @@
1726 DBUG_RETURN(0);
1727 }
1728
1729+#ifdef EXTENDED_FOR_COMMIT_ORDERED
1730+/* MEMO:
1731+ InnoDB is coded with the intention that a trx is always accessed by its
1732+ owner thd (it is not protected by any mutex/lock).
1733+ So the caller of innobase_commit_ordered() must be conscious of cache
1734+ coherency of the trx between CPUs if it is called from another thd.
1735+
1736+ In MariaDB's first implementation the coherency seems to be protected by
1737+ the pthread mutex LOCK_wakeup_ready, so there is no problem for now.
1738+
1739+ But we should stay aware of the importance of this coherency.
1740+ */
1741+/*****************************************************************//**
1742+Low-level worker for innobase_commit_ordered(); also called by innobase_commit().*/
1743+static
1744+void
1745+innobase_commit_ordered_low(
1746+/*========================*/
1747+ trx_t* trx, /*!< in: Innodb transaction */
1748+ THD* thd) /*!< in: MySQL thread handle */
1749+{
1750+ ulonglong tmp_pos;
1751+ DBUG_ENTER("innobase_commit_ordered_low");
1752+
1753+ /* This part was from innobase_commit() */
1754+
1755+ /* We need current binlog position for ibbackup to work.
1756+ Note, the position is current because commit_ordered is guaranteed
1757+ to be called in same sequence as writing to binlog. */
1758+retry: /* re-check the commit concurrency limit after every wakeup */
1759+ if (innobase_commit_concurrency > 0) {
1760+ mysql_mutex_lock(&commit_cond_m);
1761+ commit_threads++;
1762+
1763+ if (commit_threads > innobase_commit_concurrency) {
1764+ commit_threads--; /* over the limit: give the slot back and wait */
1765+ mysql_cond_wait(&commit_cond,
1766+ &commit_cond_m);
1767+ mysql_mutex_unlock(&commit_cond_m);
1768+ goto retry;
1769+ }
1770+ else {
1771+ mysql_mutex_unlock(&commit_cond_m);
1772+ }
1773+ }
1774+
1775+ mysql_bin_log_commit_pos(thd, &tmp_pos, &(trx->mysql_log_file_name));
1776+ trx->mysql_log_offset = (ib_int64_t) tmp_pos;
1777+
1778+ /* Don't do write + flush right now. For group commit
1779+ to work we want to do the flush in the innobase_commit()
1780+ method, which runs without holding any locks. */
1781+ trx->flush_log_later = TRUE;
1782+ innobase_commit_low(trx);
1783+ trx->flush_log_later = FALSE;
1784+
1785+ if (innobase_commit_concurrency > 0) {
1786+ mysql_mutex_lock(&commit_cond_m);
1787+ commit_threads--; /* release our slot and wake one waiter */
1788+ mysql_cond_signal(&commit_cond);
1789+ mysql_mutex_unlock(&commit_cond_m);
1790+ }
1791+
1792+ DBUG_VOID_RETURN;
1793+}
1794+
1795+/*****************************************************************//**
1796+Perform the first, fast part of InnoDB commit.
1797+
1798+Doing it in this call ensures that we get the same commit order here
1799+as in binlog and any other participating transactional storage engines.
1800+
1801+Note that we want to do as little as really needed here, as we run
1802+under a global mutex. The expensive fsync() is done later, in
1803+innobase_commit(), without a lock so group commit can take place.
1804+
1805+Note also that this method can be called from a different thread than
1806+the one handling the rest of the transaction. */
1807+static
1808+void
1809+innobase_commit_ordered(
1810+/*====================*/
1811+ handlerton *hton, /*!< in: Innodb handlerton */
1812+ THD* thd, /*!< in: MySQL thread handle of the user for whom
1813+ the transaction should be committed */
1814+ bool all) /*!< in: TRUE - commit transaction
1815+ FALSE - the current SQL statement ended */
1816+{
1817+ trx_t* trx;
1818+ DBUG_ENTER("innobase_commit_ordered");
1819+ DBUG_ASSERT(hton == innodb_hton_ptr);
1820+
1821+ trx = check_trx_exists(thd);
1822+
1823+ /* Since we will reserve the kernel mutex, we have to release
1824+ the search system latch first to obey the latching order. */
1825+
1826+ if (trx->has_search_latch) {
1827+ trx_search_latch_release_if_reserved(trx);
1828+ }
1829+
1830+ if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
1831+ /* We cannot throw error here; instead we will catch this error
1832+ again in innobase_commit() and report it from there. */
1833+ DBUG_VOID_RETURN;
1834+ }
1835+
1836+ /* commit_ordered is only called when committing the whole transaction
1837+ (or an SQL statement when autocommit is on). */
1838+ DBUG_ASSERT(all ||
1839+ (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)));
1840+
1841+ innobase_commit_ordered_low(trx, thd);
1842+
1843+ trx_called_commit_ordered_set(trx);
1844+
1845+ DBUG_VOID_RETURN;
1846+}
1847+#endif /* EXTENDED_FOR_COMMIT_ORDERED */
1848+
1849 /*****************************************************************//**
1850 Commits a transaction in an InnoDB database or marks an SQL statement
1851 ended.
1852@@ -3238,6 +3394,16 @@
1853 /* We were instructed to commit the whole transaction, or
1854 this is an SQL statement end and autocommit is on */
1855
1856+#ifdef EXTENDED_FOR_COMMIT_ORDERED
1857+ ut_ad(!trx_has_prepare_commit_mutex(trx));
1858+
1859+ /* Run the fast part of commit if we did not already. */
1860+ if (!trx_called_commit_ordered(trx)) {
1861+ innobase_commit_ordered_low(trx, thd);
1862+ }
1863+#else
1864+ ut_ad(!trx_called_commit_ordered(trx));
1865+
1866 /* We need current binlog position for ibbackup to work.
1867 Note, the position is current because of
1868 prepare_commit_mutex */
1869@@ -3292,6 +3458,7 @@
1870
1871 mysql_mutex_unlock(&prepare_commit_mutex);
1872 }
1873+#endif /* EXTENDED_FOR_COMMIT_ORDERED */
1874
1875 trx_deregister_from_2pc(trx);
1876
1877@@ -10973,6 +11140,7 @@
1878
1879 srv_active_wake_master_thread();
1880
1881+#ifndef EXTENDED_FOR_COMMIT_ORDERED
1882 if (thd_sql_command(thd) != SQLCOM_XA_PREPARE
1883 && (all
1884 || !thd_test_options(
1885@@ -10999,6 +11167,7 @@
1886 mysql_mutex_lock(&prepare_commit_mutex);
1887 trx_owns_prepare_commit_mutex_set(trx);
1888 }
1889+#endif /* ifndef EXTENDED_FOR_COMMIT_ORDERED */
1890
1891 return(error);
1892 }
1893--- a/storage/innobase/handler/ha_innodb.h
1894+++ b/storage/innobase/handler/ha_innodb.h
1895@@ -240,6 +240,12 @@
1896 struct charset_info_st *thd_charset(MYSQL_THD thd);
1897 LEX_STRING *thd_query_string(MYSQL_THD thd);
1898
1899+#ifdef EXTENDED_FOR_COMMIT_ORDERED
1900+/** Get the file name and position of the MySQL binlog corresponding to the
1901+ * current commit.
1902+ */
1903+void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file);
1904+#else
1905 /** Get the file name of the MySQL binlog.
1906 * @return the name of the binlog file
1907 */
1908@@ -249,6 +255,7 @@
1909 * @return byte offset from the beginning of the binlog
1910 */
1911 ulonglong mysql_bin_log_file_pos(void);
1912+#endif
1913
1914 /**
1915 Check if a user thread is a replication slave thread
1916--- a/storage/innobase/include/trx0trx.h
1917+++ b/storage/innobase/include/trx0trx.h
1918@@ -494,6 +494,7 @@
1919 this is set to 1 then registered should
1920 also be set to 1. This is used in the
1921 XA code */
1922+ unsigned called_commit_ordered:1;/* 1 if innobase_commit_ordered has run. */
1923 /*------------------------------*/
1924 ulint isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */
1925 ulint check_foreigns; /* normally TRUE, but if the user
1926--- a/storage/innobase/trx/trx0trx.c
1927+++ b/storage/innobase/trx/trx0trx.c
1928@@ -111,6 +111,7 @@
1929
1930 trx->is_registered = 0;
1931 trx->owns_prepare_mutex = 0;
1932+ trx->called_commit_ordered = 0;
1933
1934 trx->start_time = ut_time();
1935
1936--- /dev/null
1937+++ b/mysql-test/r/group_commit.result
1938@@ -0,0 +1,63 @@
1939+CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
1940+SELECT variable_value INTO @commits FROM information_schema.global_status
1941+WHERE variable_name = 'binlog_commits';
1942+SELECT variable_value INTO @group_commits FROM information_schema.global_status
1943+WHERE variable_name = 'binlog_group_commits';
1944+SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group1_running WAIT_FOR group2_queued";
1945+INSERT INTO t1 VALUES ("con1");
1946+set DEBUG_SYNC= "now WAIT_FOR group1_running";
1947+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con2";
1948+SET DEBUG_SYNC= "commit_after_release_LOCK_log WAIT_FOR group3_committed";
1949+SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
1950+INSERT INTO t1 VALUES ("con2");
1951+SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
1952+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con3";
1953+INSERT INTO t1 VALUES ("con3");
1954+SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
1955+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con4";
1956+INSERT INTO t1 VALUES ("con4");
1957+SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
1958+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
1959+SELECT * FROM t1 ORDER BY a;
1960+a
1961+SET DEBUG_SYNC= "now SIGNAL group2_queued";
1962+SELECT * FROM t1 ORDER BY a;
1963+a
1964+con1
1965+SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
1966+SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con5_leader WAIT_FOR con6_queued";
1967+INSERT INTO t1 VALUES ("con5");
1968+SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
1969+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con6_queued";
1970+INSERT INTO t1 VALUES ("con6");
1971+SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
1972+SELECT * FROM t1 ORDER BY a;
1973+a
1974+con1
1975+SET DEBUG_SYNC= "now SIGNAL group3_committed";
1976+SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
1977+SELECT * FROM t1 ORDER BY a;
1978+a
1979+con1
1980+con2
1981+con3
1982+con4
1983+SET DEBUG_SYNC= "now SIGNAL group2_checked";
1984+SELECT * FROM t1 ORDER BY a;
1985+a
1986+con1
1987+con2
1988+con3
1989+con4
1990+con5
1991+con6
1992+SELECT variable_value - @commits FROM information_schema.global_status
1993+WHERE variable_name = 'binlog_commits';
1994+variable_value - @commits
1995+6
1996+SELECT variable_value - @group_commits FROM information_schema.global_status
1997+WHERE variable_name = 'binlog_group_commits';
1998+variable_value - @group_commits
1999+3
2000+SET DEBUG_SYNC= 'RESET';
2001+DROP TABLE t1;
2002--- /dev/null
2003+++ b/mysql-test/r/group_commit_binlog_pos.result
2004@@ -0,0 +1,35 @@
2005+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2006+INSERT INTO t1 VALUES (0);
2007+SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con1_waiting WAIT_FOR con3_queued";
2008+SET DEBUG_SYNC= "commit_loop_entry_commit_ordered SIGNAL con1_loop WAIT_FOR con1_loop_cont EXECUTE 3";
2009+INSERT INTO t1 VALUES (1);
2010+SET DEBUG_SYNC= "now WAIT_FOR con1_waiting";
2011+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con2_queued";
2012+INSERT INTO t1 VALUES (2);
2013+SET DEBUG_SYNC= "now WAIT_FOR con2_queued";
2014+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con3_queued";
2015+INSERT INTO t1 VALUES (3);
2016+SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2017+SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2018+SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2019+SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2020+SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2021+SELECT * FROM t1 ORDER BY a;
2022+a
2023+0
2024+1
2025+2
2026+SET SESSION debug="+d,crash_dispatch_command_before";
2027+SELECT 1;
2028+Got one of the listed errors
2029+Got one of the listed errors
2030+Got one of the listed errors
2031+SELECT * FROM t1 ORDER BY a;
2032+a
2033+0
2034+1
2035+2
2036+3
2037+InnoDB: Last MySQL binlog file position 0 768, file name ./master-bin.000001
2038+SET DEBUG_SYNC= 'RESET';
2039+DROP TABLE t1;
2040--- /dev/null
2041+++ b/mysql-test/r/group_commit_crash.result
2042@@ -0,0 +1,120 @@
2043+CREATE TABLE t1(a CHAR(255),
2044+b CHAR(255),
2045+c CHAR(255),
2046+d CHAR(255),
2047+id INT AUTO_INCREMENT,
2048+PRIMARY KEY(id)) ENGINE=InnoDB;
2049+create table t2 like t1;
2050+create procedure setcrash(IN i INT)
2051+begin
2052+CASE i
2053+WHEN 1 THEN SET SESSION debug="d,crash_commit_after_prepare";
2054+WHEN 2 THEN SET SESSION debug="d,crash_commit_after_log";
2055+WHEN 3 THEN SET SESSION debug="d,crash_commit_before_unlog";
2056+WHEN 4 THEN SET SESSION debug="d,crash_commit_after";
2057+WHEN 5 THEN SET SESSION debug="d,crash_commit_before";
2058+ELSE BEGIN END;
2059+END CASE;
2060+end //
2061+FLUSH TABLES;
2062+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2063+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2064+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2065+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2066+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2067+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2068+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2069+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2070+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2071+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2072+RESET MASTER;
2073+START TRANSACTION;
2074+insert into t1 select * from t2;
2075+call setcrash(5);
2076+COMMIT;
2077+Got one of the listed errors
2078+SELECT * FROM t1 ORDER BY id;
2079+a b c d id
2080+SHOW BINLOG EVENTS LIMIT 2,1;
2081+Log_name Pos Event_type Server_id End_log_pos Info
2082+delete from t1;
2083+RESET MASTER;
2084+START TRANSACTION;
2085+insert into t1 select * from t2;
2086+call setcrash(4);
2087+COMMIT;
2088+Got one of the listed errors
2089+SELECT * FROM t1 ORDER BY id;
2090+a b c d id
2091+a b c d 1
2092+a b c d 2
2093+a b c d 3
2094+a b c d 4
2095+a b c d 5
2096+a b c d 6
2097+a b c d 7
2098+a b c d 8
2099+a b c d 9
2100+a b c d 10
2101+SHOW BINLOG EVENTS LIMIT 2,1;
2102+Log_name Pos Event_type Server_id End_log_pos Info
2103+master-bin.000001 175 Query 1 269 use `test`; insert into t1 select * from t2
2104+delete from t1;
2105+RESET MASTER;
2106+START TRANSACTION;
2107+insert into t1 select * from t2;
2108+call setcrash(3);
2109+COMMIT;
2110+Got one of the listed errors
2111+SELECT * FROM t1 ORDER BY id;
2112+a b c d id
2113+a b c d 1
2114+a b c d 2
2115+a b c d 3
2116+a b c d 4
2117+a b c d 5
2118+a b c d 6
2119+a b c d 7
2120+a b c d 8
2121+a b c d 9
2122+a b c d 10
2123+SHOW BINLOG EVENTS LIMIT 2,1;
2124+Log_name Pos Event_type Server_id End_log_pos Info
2125+master-bin.000001 175 Query 1 269 use `test`; insert into t1 select * from t2
2126+delete from t1;
2127+RESET MASTER;
2128+START TRANSACTION;
2129+insert into t1 select * from t2;
2130+call setcrash(2);
2131+COMMIT;
2132+Got one of the listed errors
2133+SELECT * FROM t1 ORDER BY id;
2134+a b c d id
2135+a b c d 1
2136+a b c d 2
2137+a b c d 3
2138+a b c d 4
2139+a b c d 5
2140+a b c d 6
2141+a b c d 7
2142+a b c d 8
2143+a b c d 9
2144+a b c d 10
2145+SHOW BINLOG EVENTS LIMIT 2,1;
2146+Log_name Pos Event_type Server_id End_log_pos Info
2147+master-bin.000001 175 Query 1 269 use `test`; insert into t1 select * from t2
2148+delete from t1;
2149+RESET MASTER;
2150+START TRANSACTION;
2151+insert into t1 select * from t2;
2152+call setcrash(1);
2153+COMMIT;
2154+Got one of the listed errors
2155+SELECT * FROM t1 ORDER BY id;
2156+a b c d id
2157+SHOW BINLOG EVENTS LIMIT 2,1;
2158+Log_name Pos Event_type Server_id End_log_pos Info
2159+delete from t1;
2160+DROP TABLE t1;
2161+DROP TABLE t2;
2162+DROP PROCEDURE setcrash;
2163--- /dev/null
2164+++ b/mysql-test/r/xa_binlog.result
2165@@ -0,0 +1,32 @@
2166+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB;
2167+SET binlog_format= mixed;
2168+RESET MASTER;
2169+XA START 'xatest';
2170+INSERT INTO t1 VALUES (1);
2171+XA END 'xatest';
2172+XA PREPARE 'xatest';
2173+XA COMMIT 'xatest';
2174+XA START 'xatest';
2175+INSERT INTO t1 VALUES (2);
2176+XA END 'xatest';
2177+XA COMMIT 'xatest' ONE PHASE;
2178+BEGIN;
2179+INSERT INTO t1 VALUES (3);
2180+COMMIT;
2181+SELECT * FROM t1 ORDER BY a;
2182+a
2183+1
2184+2
2185+3
2186+SHOW BINLOG EVENTS LIMIT 1,9;
2187+Log_name Pos Event_type Server_id End_log_pos Info
2188+master-bin.000001 # Query 1 # BEGIN
2189+master-bin.000001 # Query 1 # use `test`; INSERT INTO t1 VALUES (1)
2190+master-bin.000001 # Query 1 # COMMIT
2191+master-bin.000001 # Query 1 # BEGIN
2192+master-bin.000001 # Query 1 # use `test`; INSERT INTO t1 VALUES (2)
2193+master-bin.000001 # Xid 1 # COMMIT /* xid=XX */
2194+master-bin.000001 # Query 1 # BEGIN
2195+master-bin.000001 # Query 1 # use `test`; INSERT INTO t1 VALUES (3)
2196+master-bin.000001 # Xid 1 # COMMIT /* xid=XX */
2197+DROP TABLE t1;
2198--- /dev/null
2199+++ b/mysql-test/suite/binlog/r/binlog_ioerr.result
2200@@ -0,0 +1,28 @@
2201+CALL mtr.add_suppression("Error writing file 'master-bin'");
2202+RESET MASTER;
2203+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2204+INSERT INTO t1 VALUES(0);
2205+SET SESSION debug='+d,fail_binlog_write_1';
2206+INSERT INTO t1 VALUES(1);
2207+ERROR HY000: Error writing file 'master-bin' (errno: 28)
2208+INSERT INTO t1 VALUES(2);
2209+ERROR HY000: Error writing file 'master-bin' (errno: 28)
2210+SET SESSION debug='';
2211+INSERT INTO t1 VALUES(3);
2212+SELECT * FROM t1;
2213+a
2214+0
2215+3
2216+SHOW BINLOG EVENTS;
2217+Log_name Pos Event_type Server_id End_log_pos Info
2218+BINLOG POS Format_desc 1 ENDPOS Server ver: #, Binlog ver: #
2219+BINLOG POS Query 1 ENDPOS use `test`; CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb
2220+BINLOG POS Query 1 ENDPOS BEGIN
2221+BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(0)
2222+BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
2223+BINLOG POS Query 1 ENDPOS BEGIN
2224+BINLOG POS Query 1 ENDPOS BEGIN
2225+BINLOG POS Query 1 ENDPOS BEGIN
2226+BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(3)
2227+BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
2228+DROP TABLE t1;
2229--- /dev/null
2230+++ b/mysql-test/suite/binlog/t/binlog_ioerr.test
2231@@ -0,0 +1,30 @@
2232+source include/have_debug.inc;
2233+source include/have_innodb.inc;
2234+source include/have_log_bin.inc;
2235+source include/have_binlog_format_mixed_or_statement.inc;
2236+
2237+CALL mtr.add_suppression("Error writing file 'master-bin'");
2238+
2239+RESET MASTER;
2240+
2241+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2242+INSERT INTO t1 VALUES(0);
2243+SET SESSION debug='+d,fail_binlog_write_1';
2244+--error ER_ERROR_ON_WRITE
2245+INSERT INTO t1 VALUES(1);
2246+--error ER_ERROR_ON_WRITE
2247+INSERT INTO t1 VALUES(2);
2248+SET SESSION debug='';
2249+INSERT INTO t1 VALUES(3);
2250+SELECT * FROM t1;
2251+
2252+# Actually the output from this currently shows a bug.
2253+# The injected IO error leaves partially written transactions in the binlog in
2254+# the form of stray "BEGIN" events.
2255+# These should disappear from the output if binlog error handling is improved
2256+# (see MySQL Bug#37148 and WL#1790).
2257+--replace_regex /\/\* xid=.* \*\//\/* XID *\// /Server ver: .*, Binlog ver: .*/Server ver: #, Binlog ver: #/ /table_id: [0-9]+/table_id: #/
2258+--replace_column 1 BINLOG 2 POS 5 ENDPOS
2259+SHOW BINLOG EVENTS;
2260+
2261+DROP TABLE t1;
2262--- /dev/null
2263+++ b/mysql-test/t/group_commit.test
2264@@ -0,0 +1,115 @@
2265+--source include/have_debug_sync.inc
2266+--source include/have_innodb.inc
2267+--source include/have_log_bin.inc
2268+
2269+# Test some group commit code paths by using debug_sync to do controlled
2270+# commits of 6 transactions: first 1 alone, then 3 as a group, then 2 as a
2271+# group.
2272+#
2273+# Group 3 is allowed to race as far as possible ahead before group 2 finishes
2274+# to check some edge case for concurrency control.
2275+
2276+CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
2277+
2278+SELECT variable_value INTO @commits FROM information_schema.global_status
2279+ WHERE variable_name = 'binlog_commits';
2280+SELECT variable_value INTO @group_commits FROM information_schema.global_status
2281+ WHERE variable_name = 'binlog_group_commits';
2282+
2283+connect(con1,localhost,root,,);
2284+connect(con2,localhost,root,,);
2285+connect(con3,localhost,root,,);
2286+connect(con4,localhost,root,,);
2287+connect(con5,localhost,root,,);
2288+connect(con6,localhost,root,,);
2289+
2290+# Start group1 (with one thread) doing commit, waiting for
2291+# group2 to queue up before finishing.
2292+
2293+connection con1;
2294+SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group1_running WAIT_FOR group2_queued";
2295+send INSERT INTO t1 VALUES ("con1");
2296+
2297+# Make group2 (with three threads) queue up.
2298+# Make sure con2 is the group commit leader for group2.
2299+# Make group2 wait with running commit_ordered() until group3 has committed.
2300+
2301+connection con2;
2302+set DEBUG_SYNC= "now WAIT_FOR group1_running";
2303+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con2";
2304+SET DEBUG_SYNC= "commit_after_release_LOCK_log WAIT_FOR group3_committed";
2305+SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
2306+send INSERT INTO t1 VALUES ("con2");
2307+connection con3;
2308+SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
2309+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con3";
2310+send INSERT INTO t1 VALUES ("con3");
2311+connection con4;
2312+SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
2313+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con4";
2314+send INSERT INTO t1 VALUES ("con4");
2315+
2316+# When group2 is queued, let group1 continue and queue group3.
2317+
2318+connection default;
2319+SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
2320+
2321+# At this point, transaction 1 is still not visible as commit_ordered() has not
2322+# been called yet.
2323+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
2324+SELECT * FROM t1 ORDER BY a;
2325+
2326+SET DEBUG_SYNC= "now SIGNAL group2_queued";
2327+connection con1;
2328+reap;
2329+
2330+# Now transaction 1 is visible.
2331+connection default;
2332+SELECT * FROM t1 ORDER BY a;
2333+
2334+connection con5;
2335+SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
2336+SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con5_leader WAIT_FOR con6_queued";
2337+send INSERT INTO t1 VALUES ("con5");
2338+
2339+connection con6;
2340+SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
2341+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con6_queued";
2342+send INSERT INTO t1 VALUES ("con6");
2343+
2344+connection default;
2345+SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
2346+# Still only transaction 1 visible, as group2 has not yet run commit_ordered().
2347+SELECT * FROM t1 ORDER BY a;
2348+SET DEBUG_SYNC= "now SIGNAL group3_committed";
2349+SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
2350+# Now transactions 1-4 visible.
2351+SELECT * FROM t1 ORDER BY a;
2352+SET DEBUG_SYNC= "now SIGNAL group2_checked";
2353+
2354+connection con2;
2355+reap;
2356+
2357+connection con3;
2358+reap;
2359+
2360+connection con4;
2361+reap;
2362+
2363+connection con5;
2364+reap;
2365+
2366+connection con6;
2367+reap;
2368+
2369+connection default;
2370+# Check all transactions finally visible.
2371+SELECT * FROM t1 ORDER BY a;
2372+
2373+SELECT variable_value - @commits FROM information_schema.global_status
2374+ WHERE variable_name = 'binlog_commits';
2375+SELECT variable_value - @group_commits FROM information_schema.global_status
2376+ WHERE variable_name = 'binlog_group_commits';
2377+
2378+SET DEBUG_SYNC= 'RESET';
2379+DROP TABLE t1;
2380--- /dev/null
2381+++ b/mysql-test/t/group_commit_binlog_pos-master.opt
2382@@ -0,0 +1 @@
2383+--skip-stack-trace --skip-core-file
2384--- /dev/null
2385+++ b/mysql-test/t/group_commit_binlog_pos.test
2386@@ -0,0 +1,89 @@
2387+--source include/have_debug_sync.inc
2388+--source include/have_innodb.inc
2389+--source include/have_log_bin.inc
2390+--source include/have_binlog_format_mixed_or_statement.inc
2391+
2392+# Need DBUG to crash the server intentionally
2393+--source include/have_debug.inc
2394+# Don't test this under valgrind, memory leaks will occur as we crash
2395+--source include/not_valgrind.inc
2396+
2397+# The test case currently uses grep and tail, which may be unavailable on
2398+# some Windows systems. But see MWL#191 for how to remove the need for grep.
2399+--source include/not_windows.inc
2400+
2401+# XtraDB stores the binlog position corresponding to the last commit, and
2402+# prints it during crash recovery.
2403+# Test that we get the correct position when we group commit several
2404+# transactions together.
2405+
2406+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2407+INSERT INTO t1 VALUES (0);
2408+
2409+connect(con1,localhost,root,,);
2410+connect(con2,localhost,root,,);
2411+connect(con3,localhost,root,,);
2412+
2413+# Queue up three commits for group commit.
2414+
2415+connection con1;
2416+SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con1_waiting WAIT_FOR con3_queued";
2417+SET DEBUG_SYNC= "commit_loop_entry_commit_ordered SIGNAL con1_loop WAIT_FOR con1_loop_cont EXECUTE 3";
2418+send INSERT INTO t1 VALUES (1);
2419+
2420+connection con2;
2421+SET DEBUG_SYNC= "now WAIT_FOR con1_waiting";
2422+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con2_queued";
2423+send INSERT INTO t1 VALUES (2);
2424+
2425+connection con3;
2426+SET DEBUG_SYNC= "now WAIT_FOR con2_queued";
2427+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con3_queued";
2428+send INSERT INTO t1 VALUES (3);
2429+
2430+connection default;
2431+SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2432+# At this point, no transactions are committed.
2433+SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2434+SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2435+# At this point, 1 transaction is committed.
2436+SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2437+SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2438+
2439+# At this point, 2 transactions are committed.
2440+SELECT * FROM t1 ORDER BY a;
2441+
2442+connection con2;
2443+reap;
2444+
2445+# Now crash the server with 1+2 in-memory committed, 3 only prepared.
2446+connection default;
2447+system echo wait-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
2448+SET SESSION debug="+d,crash_dispatch_command_before";
2449+--error 2006,2013
2450+SELECT 1;
2451+
2452+connection con1;
2453+--error 2006,2013
2454+reap;
2455+connection con3;
2456+--error 2006,2013
2457+reap;
2458+
2459+system echo restart-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
2460+
2461+connection default;
2462+--enable_reconnect
2463+--source include/wait_until_connected_again.inc
2464+
2465+# Crash recovery should recover all three transactions.
2466+SELECT * FROM t1 ORDER BY a;
2467+
2468+# Check that the binlog position reported by InnoDB is the correct one
2469+# for the end of the second transaction (as can be checked with
2470+# mysqlbinlog).
2471+let $MYSQLD_DATADIR= `SELECT @@datadir`;
2472+--exec grep 'InnoDB: Last MySQL binlog file position' $MYSQLD_DATADIR/../../log/mysqld.1.err | tail -1
2473+
2474+SET DEBUG_SYNC= 'RESET';
2475+DROP TABLE t1;
2476--- /dev/null
2477+++ b/mysql-test/t/group_commit_crash-master.opt
2478@@ -0,0 +1 @@
2479+--skip-stack-trace --skip-core-file
2480--- /dev/null
2481+++ b/mysql-test/t/group_commit_crash.test
2482@@ -0,0 +1,80 @@
2483+# Testing group commit by crashing a few times.
2484+# Test adapted from the Facebook patch: lp:mysqlatfacebook
2485+--source include/not_embedded.inc
2486+# Don't test this under valgrind, memory leaks will occur
2487+--source include/not_valgrind.inc
2488+
2489+# Binary must be compiled with debug for crash to occur
2490+--source include/have_debug.inc
2491+--source include/have_innodb.inc
2492+--source include/have_log_bin.inc
2493+
2494+let $innodb_file_format_max_orig=`select @@innodb_file_format_max`;
2495+CREATE TABLE t1(a CHAR(255),
2496+ b CHAR(255),
2497+ c CHAR(255),
2498+ d CHAR(255),
2499+ id INT AUTO_INCREMENT,
2500+ PRIMARY KEY(id)) ENGINE=InnoDB;
2501+create table t2 like t1;
2502+delimiter //;
2503+create procedure setcrash(IN i INT)
2504+begin
2505+ CASE i
2506+ WHEN 1 THEN SET SESSION debug="d,crash_commit_after_prepare";
2507+ WHEN 2 THEN SET SESSION debug="d,crash_commit_after_log";
2508+ WHEN 3 THEN SET SESSION debug="d,crash_commit_before_unlog";
2509+ WHEN 4 THEN SET SESSION debug="d,crash_commit_after";
2510+ WHEN 5 THEN SET SESSION debug="d,crash_commit_before";
2511+ ELSE BEGIN END;
2512+ END CASE;
2513+end //
2514+delimiter ;//
2515+# Avoid getting a crashed mysql.proc table.
2516+FLUSH TABLES;
2517+
2518+let $numtests = 5;
2519+
2520+let $numinserts = 10;
2521+while ($numinserts)
2522+{
2523+ dec $numinserts;
2524+ INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2525+}
2526+
2527+--enable_reconnect
2528+
2529+while ($numtests)
2530+{
2531+ RESET MASTER;
2532+
2533+ START TRANSACTION;
2534+ insert into t1 select * from t2;
2535+ # Write file to make mysql-test-run.pl expect crash
2536+ --exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
2537+
2538+ eval call setcrash($numtests);
2539+
2540+ # Run the crashing query
2541+ --error 2006,2013
2542+ COMMIT;
2543+
2544+ # Poll the server waiting for it to be back online again.
2545+ --source include/wait_until_connected_again.inc
2546+
2547+ # table and binlog should be in sync.
2548+ SELECT * FROM t1 ORDER BY id;
2549+ SHOW BINLOG EVENTS LIMIT 2,1;
2550+
2551+ delete from t1;
2552+
2553+ dec $numtests;
2554+}
2555+
2556+# final cleanup
2557+DROP TABLE t1;
2558+DROP TABLE t2;
2559+DROP PROCEDURE setcrash;
2560+--disable_query_log
2561+eval SET GLOBAL innodb_file_format_max=$innodb_file_format_max_orig;
2562+--enable_query_log
2563--- /dev/null
2564+++ b/mysql-test/t/xa_binlog.test
2565@@ -0,0 +1,32 @@
2566+--source include/have_innodb.inc
2567+--source include/have_log_bin.inc
2568+
2569+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB;
2570+
2571+# Fix binlog format (otherwise SHOW BINLOG EVENTS will fluctuate).
2572+SET binlog_format= mixed;
2573+
2574+RESET MASTER;
2575+
2576+XA START 'xatest';
2577+INSERT INTO t1 VALUES (1);
2578+XA END 'xatest';
2579+XA PREPARE 'xatest';
2580+XA COMMIT 'xatest';
2581+
2582+XA START 'xatest';
2583+INSERT INTO t1 VALUES (2);
2584+XA END 'xatest';
2585+XA COMMIT 'xatest' ONE PHASE;
2586+
2587+BEGIN;
2588+INSERT INTO t1 VALUES (3);
2589+COMMIT;
2590+
2591+SELECT * FROM t1 ORDER BY a;
2592+
2593+--replace_column 2 # 5 #
2594+--replace_regex /xid=[0-9]+/xid=XX/
2595+SHOW BINLOG EVENTS LIMIT 1,9;
2596+
2597+DROP TABLE t1;
This page took 0.316293 seconds and 4 git commands to generate.