]> git.pld-linux.org Git - packages/mysql.git/blame - group_commit.patch
- raise error visibility
[packages/mysql.git] / group_commit.patch
CommitLineData
13ceb006
AM
1--- a/include/my_sys.h
2+++ b/include/my_sys.h
3@@ -524,6 +524,8 @@
4
5 #define my_b_tell(info) ((info)->pos_in_file + \
6 (size_t) (*(info)->current_pos - (info)->request_pos))
7+#define my_b_write_tell(info) ((info)->pos_in_file + \
8+ ((info)->write_pos - (info)->write_buffer))
9
10 #define my_b_get_buffer_start(info) (info)->request_pos
11 #define my_b_get_bytes_in_buffer(info) (char*) (info)->read_end - \
12--- a/include/mysql/plugin.h
13+++ b/include/mysql/plugin.h
14@@ -559,6 +559,8 @@
15
16 #define EXTENDED_FOR_USERSTAT
17
18+#define EXTENDED_FOR_COMMIT_ORDERED
19+
20 /**
21 Create a temporary file.
22
23--- a/sql/handler.cc
24+++ b/sql/handler.cc
25@@ -90,6 +90,8 @@
26 static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
27 uint known_extensions_id= 0;
28
29+static int commit_one_phase_low(THD *thd, bool all, THD_TRANS *trans,
30+ bool is_real_trans);
31
32
33 static plugin_ref ha_default_plugin(THD *thd)
34@@ -1119,7 +1121,8 @@
35 */
36 bool is_real_trans= all || thd->transaction.all.ha_list == 0;
37 Ha_trx_info *ha_info= trans->ha_list;
38- my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
39+ bool need_commit_ordered;
40+ my_xid xid;
41 DBUG_ENTER("ha_commit_trans");
42
43 /*
44@@ -1152,13 +1155,20 @@
45 DBUG_RETURN(2);
46 }
47
48- if (ha_info)
49+ if (!ha_info)
50+ {
51+ /* Free resources and perform other cleanup even for 'empty' transactions. */
52+ if (is_real_trans)
53+ thd->transaction.cleanup();
54+ DBUG_RETURN(0);
55+ }
56+ else
57 {
58 uint rw_ha_count;
59 bool rw_trans;
60 MDL_request mdl_request;
61
62- DBUG_EXECUTE_IF("crash_commit_before", DBUG_SUICIDE(););
63+ DBUG_EXECUTE_IF("crash_commit_before", abort(););
64
65 /* Close all cursors that can not survive COMMIT */
66 if (is_real_trans) /* not a statement commit */
67@@ -1197,57 +1207,80 @@
68 !thd->slave_thread)
69 {
70 my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
71- ha_rollback_trans(thd, all);
72- error= 1;
73- goto end;
74+ goto err;
75 }
76
77- if (!trans->no_2pc && (rw_ha_count > 1))
78+ if (trans->no_2pc || (rw_ha_count <= 1))
79 {
80- for (; ha_info && !error; ha_info= ha_info->next())
81+ error= ha_commit_one_phase(thd, all);
82+ DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
83+ goto end;
84+ }
85+
86+ need_commit_ordered= FALSE;
87+ xid= thd->transaction.xid_state.xid.get_my_xid();
88+
89+ for (Ha_trx_info *hi= ha_info; hi; hi= hi->next())
90 {
91 int err;
92- handlerton *ht= ha_info->ht();
93+ handlerton *ht= hi->ht();
94 /*
95 Do not call two-phase commit if this particular
96 transaction is read-only. This allows for simpler
97 implementation in engines that are always read-only.
98 */
99- if (! ha_info->is_trx_read_write())
100+ if (! hi->is_trx_read_write())
101 continue;
102 /*
103 Sic: we know that prepare() is not NULL since otherwise
104 trans->no_2pc would have been set.
105 */
106- if ((err= ht->prepare(ht, thd, all)))
107- {
108- my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
109- error= 1;
110- }
111+ err= ht->prepare(ht, thd, all);
112 status_var_increment(thd->status_var.ha_prepare_count);
113+ if (err)
114+ my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
115+
116+ if (err)
117+ goto err;
118+
119+ need_commit_ordered|= (ht->commit_ordered != NULL);
120 }
121- DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_SUICIDE(););
122- if (error || (is_real_trans && xid &&
123- (error= !(cookie= tc_log->log_xid(thd, xid)))))
124+ DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT(););
125+
126+ if (!is_real_trans)
127 {
128- ha_rollback_trans(thd, all);
129- error= 1;
130+ error= commit_one_phase_low(thd, all, trans, is_real_trans);
131+ DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
132 goto end;
133 }
134- DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_SUICIDE(););
135- }
136- error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
137- DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_SUICIDE(););
138- if (cookie)
139+
140+ cookie= tc_log->log_and_order(thd, xid, all, need_commit_ordered);
141+ if (!cookie)
142+ goto err;
143+
144+ DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT(););
145+
146+ error= commit_one_phase_low(thd, all, trans, is_real_trans) ? 2 : 0;
147+ DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
148+ if (is_real_trans) /* userstat.patch */
149+ thd->diff_commit_trans++; /* userstat.patch */
150+ RUN_HOOK(transaction, after_commit, (thd, FALSE));
151+
152+ DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT(););
153 if(tc_log->unlog(cookie, xid))
154 {
155 error= 2;
156 goto end;
157 }
158- DBUG_EXECUTE_IF("crash_commit_after", DBUG_SUICIDE(););
159- if (is_real_trans)
160- thd->diff_commit_trans++;
161- RUN_HOOK(transaction, after_commit, (thd, FALSE));
162+
163+ DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
164+ goto end;
165+
166+ /* Come here if error and we need to rollback. */
167+err:
168+ error= 1; /* Transaction was rolled back */
169+ ha_rollback_trans(thd, all);
170+
171 end:
172 if (rw_trans && mdl_request.ticket)
173 {
174@@ -1260,9 +1293,6 @@
175 thd->mdl_context.release_lock(mdl_request.ticket);
176 }
177 }
178- /* Free resources and perform other cleanup even for 'empty' transactions. */
179- else if (is_real_trans)
180- thd->transaction.cleanup();
181 DBUG_RETURN(error);
182 }
183
184@@ -1279,7 +1309,6 @@
185
186 int ha_commit_one_phase(THD *thd, bool all)
187 {
188- int error=0;
189 THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
190 /*
191 "real" is a nick name for a transaction for which a commit will
192@@ -1295,8 +1324,16 @@
193 transaction.all.ha_list, see why in trans_register_ha()).
194 */
195 bool is_real_trans=all || thd->transaction.all.ha_list == 0;
196- Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
197 DBUG_ENTER("ha_commit_one_phase");
198+ DBUG_RETURN(commit_one_phase_low(thd, all, trans, is_real_trans));
199+}
200+
201+static int
202+commit_one_phase_low(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans)
203+{
204+ int error= 0;
205+ Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
206+ DBUG_ENTER("commit_one_phase_low");
207
208 if (ha_info)
209 {
210@@ -1894,7 +1931,16 @@
211 {
212 bool warn= true;
213
214+ /*
215+ Holding the LOCK_commit_ordered mutex ensures that we get the same
216+ snapshot for all engines (including the binary log). This allows us
217+ among other things to do backups with
218+ START TRANSACTION WITH CONSISTENT SNAPSHOT and
219+ have a consistent binlog position.
220+ */
221+ mysql_mutex_lock(&LOCK_commit_ordered);
222 plugin_foreach(thd, snapshot_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &warn);
223+ mysql_mutex_unlock(&LOCK_commit_ordered);
224
225 /*
226 Same idea as when one wants to CREATE TABLE in one engine which does not
227--- a/sql/handler.h
228+++ b/sql/handler.h
229@@ -756,6 +756,53 @@
230 and 'real commit' mean the same event.
231 */
232 int (*commit)(handlerton *hton, THD *thd, bool all);
233+ /*
234+ The commit_ordered() method is called prior to the commit() method, after
235+ the transaction manager has decided to commit (not rollback) the
236+ transaction. Unlike commit(), commit_ordered() is called only when the
237+ full transaction is committed, not for each commit of statement
238+ transaction in a multi-statement transaction.
239+
240+ Note that, like prepare(), commit_ordered() is only called when 2-phase
241+ commit takes place. I.e., when no binary log and only a single engine
242+ participates in a transaction, only commit() is called, not
243+ commit_ordered(). So engines must be prepared for this.
244+
245+ The calls to commit_ordered() in multiple parallel transactions are
246+ guaranteed to happen in the same order in every participating
247+ handler. This can be used to ensure the same commit order among multiple
248+ handlers (eg. in table handler and binlog). So if transaction T1 calls
249+ into commit_ordered() of handler A before T2, then T1 will also call
250+ commit_ordered() of handler B before T2.
251+
252+ Engines that implement this method should during this call make the
253+ transaction visible to other transactions, thereby making the order of
254+ transaction commits be defined by the order of commit_ordered() calls.
255+
256+ The intention is that commit_ordered() should do the minimal amount of
257+ work that needs to happen in consistent commit order among handlers. To
258+ preserve ordering, calls need to be serialised on a global mutex, so
259+ doing any time-consuming or blocking operations in commit_ordered() will
260+ limit scalability.
261+
262+ Handlers can rely on commit_ordered() calls to be serialised (no two
263+ calls can run in parallel, so no extra locking on the handler part is
264+ required to ensure this).
265+
266+ Note that commit_ordered() can be called from a different thread than the
267+ one handling the transaction! So it can not do anything that depends on
268+ thread local storage, in particular it can not call my_error() and
269+ friends (instead it can store the error code and delay the call of
270+ my_error() to the commit() method).
271+
272+ Similarly, since commit_ordered() returns void, any return error code
273+ must be saved and returned from the commit() method instead.
274+
275+ The commit_ordered method is optional, and can be left unset if not
276+ needed in a particular handler (then there will be no ordering guarantees
277+ wrt. other engines and binary log).
278+ */
279+ void (*commit_ordered)(handlerton *hton, THD *thd, bool all);
280 int (*rollback)(handlerton *hton, THD *thd, bool all);
281 int (*prepare)(handlerton *hton, THD *thd, bool all);
282 int (*recover)(handlerton *hton, XID *xid_list, uint len);
283--- a/sql/log.cc
284+++ b/sql/log.cc
1bfc1981 285@@ -71,6 +71,25 @@
13ceb006
AM
286 static int binlog_rollback(handlerton *hton, THD *thd, bool all);
287 static int binlog_prepare(handlerton *hton, THD *thd, bool all);
288
289+static LEX_STRING const write_error_msg=
290+ { C_STRING_WITH_LEN("error writing to the binary log") };
291+
292+static my_bool mutexes_inited;
293+mysql_mutex_t LOCK_group_commit_queue;
294+mysql_mutex_t LOCK_commit_ordered;
295+
296+static ulonglong binlog_status_var_num_commits;
297+static ulonglong binlog_status_var_num_group_commits;
298+
299+static SHOW_VAR binlog_status_vars_detail[]=
300+{
301+ {"commits",
302+ (char *)&binlog_status_var_num_commits, SHOW_LONGLONG},
303+ {"group_commits",
304+ (char *)&binlog_status_var_num_group_commits, SHOW_LONGLONG},
305+ {NullS, NullS, SHOW_LONG}
306+};
307+
308 /**
309 purge logs, master and slave sides both, related error code
310 convertor.
1bfc1981 311@@ -167,41 +186,6 @@
13ceb006
AM
312 }
313
314 /*
315- Helper class to hold a mutex for the duration of the
316- block.
317-
318- Eliminates the need for explicit unlocking of mutexes on, e.g.,
319- error returns. On passing a null pointer, the sentry will not do
320- anything.
321- */
322-class Mutex_sentry
323-{
324-public:
325- Mutex_sentry(mysql_mutex_t *mutex)
326- : m_mutex(mutex)
327- {
328- if (m_mutex)
329- mysql_mutex_lock(mutex);
330- }
331-
332- ~Mutex_sentry()
333- {
334- if (m_mutex)
335- mysql_mutex_unlock(m_mutex);
336-#ifndef DBUG_OFF
337- m_mutex= 0;
338-#endif
339- }
340-
341-private:
342- mysql_mutex_t *m_mutex;
343-
344- // It's not allowed to copy this object in any way
345- Mutex_sentry(Mutex_sentry const&);
346- void operator=(Mutex_sentry const&);
347-};
348-
349-/*
350 Helper classes to store non-transactional and transactional data
351 before copying it to the binary log.
352 */
1bfc1981 353@@ -211,7 +195,8 @@
13ceb006
AM
354 binlog_cache_data(): m_pending(0), before_stmt_pos(MY_OFF_T_UNDEF),
355 incident(FALSE), changes_to_non_trans_temp_table_flag(FALSE),
356 saved_max_binlog_cache_size(0), ptr_binlog_cache_use(0),
357- ptr_binlog_cache_disk_use(0)
358+ ptr_binlog_cache_disk_use(0), commit_bin_log_file_pos(0),
359+ using_xa(FALSE), xa_xid(0)
360 { }
361
362 ~binlog_cache_data()
1bfc1981 363@@ -270,6 +255,8 @@
13ceb006
AM
364 variable after truncating the cache.
365 */
366 cache_log.disk_writes= 0;
367+ using_xa= FALSE;
368+ commit_bin_log_file_pos= 0;
369 DBUG_ASSERT(empty());
370 }
371
1bfc1981 372@@ -411,6 +398,20 @@
13ceb006
AM
373
374 binlog_cache_data& operator=(const binlog_cache_data& info);
375 binlog_cache_data(const binlog_cache_data& info);
376+
377+public:
378+ /*
379+ Binlog position after current commit, available to storage engines during
380+ commit_ordered() and commit().
381+ */
382+ ulonglong commit_bin_log_file_pos;
383+
384+ /*
385+ Flag set true if this transaction is committed with log_xid() as part of
386+ XA, false if not.
387+ */
388+ bool using_xa;
389+ my_xid xa_xid;
390 };
391
392 class binlog_cache_mngr {
1bfc1981 393@@ -1624,7 +1625,7 @@
13ceb006
AM
394 */
395 static inline int
396 binlog_flush_cache(THD *thd, binlog_cache_data* cache_data, Log_event *end_evt,
397- bool is_transactional)
398+ bool is_transactional, bool all)
399 {
400 DBUG_ENTER("binlog_flush_cache");
401 int error= 0;
1bfc1981 402@@ -1643,8 +1644,8 @@
13ceb006
AM
403 were, we would have to ensure that we're not ending a statement
404 inside a stored function.
405 */
406- error= mysql_bin_log.write(thd, &cache_data->cache_log, end_evt,
407- cache_data->has_incident());
408+ error= mysql_bin_log.write_transaction_to_binlog(thd, cache_data,
409+ end_evt, all);
410 }
411 cache_data->reset();
412
1bfc1981 413@@ -1663,12 +1664,12 @@
13ceb006
AM
414 */
415 static inline int
416 binlog_commit_flush_stmt_cache(THD *thd,
417- binlog_cache_mngr *cache_mngr)
418+ binlog_cache_mngr *cache_mngr, bool all)
419 {
420 Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
421 FALSE, FALSE, TRUE, 0);
422 return (binlog_flush_cache(thd, &cache_mngr->stmt_cache, &end_evt,
423- FALSE));
424+ FALSE, all));
425 }
426
427 /**
1bfc1981 428@@ -1681,12 +1682,12 @@
13ceb006
AM
429 nonzero if an error pops up when flushing the cache.
430 */
431 static inline int
432-binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr)
433+binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr, bool all)
434 {
435 Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
436 TRUE, FALSE, TRUE, 0);
437 return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
438- TRUE));
439+ TRUE, all));
440 }
441
442 /**
1bfc1981 443@@ -1699,12 +1700,12 @@
13ceb006
AM
444 nonzero if an error pops up when flushing the cache.
445 */
446 static inline int
447-binlog_rollback_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr)
448+binlog_rollback_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr, bool all)
449 {
450 Query_log_event end_evt(thd, STRING_WITH_LEN("ROLLBACK"),
451 TRUE, FALSE, TRUE, 0);
452 return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
453- TRUE));
454+ TRUE, all));
455 }
456
457 /**
1bfc1981 458@@ -1719,11 +1720,11 @@
13ceb006
AM
459 */
460 static inline int
461 binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr,
462- my_xid xid)
463+ my_xid xid, bool all)
464 {
465 Xid_log_event end_evt(thd, xid);
466 return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
467- TRUE));
468+ TRUE, all));
469 }
470
471 /**
1bfc1981 472@@ -1785,7 +1786,7 @@
13ceb006
AM
473 do nothing.
474 just pretend we can do 2pc, so that MySQL won't
475 switch to 1pc.
476- real work will be done in MYSQL_BIN_LOG::log_xid()
477+ real work will be done in MYSQL_BIN_LOG::log_and_order()
478 */
479 return 0;
480 }
1bfc1981 481@@ -1818,7 +1819,7 @@
13ceb006
AM
482
483 if (!cache_mngr->stmt_cache.empty())
484 {
485- error= binlog_commit_flush_stmt_cache(thd, cache_mngr);
486+ error= binlog_commit_flush_stmt_cache(thd, cache_mngr, all);
487 }
488
489 if (cache_mngr->trx_cache.empty())
1bfc1981 490@@ -1837,7 +1838,7 @@
13ceb006
AM
491 Otherwise, we accumulate the changes.
492 */
493 if (!error && ending_trans(thd, all))
494- error= binlog_commit_flush_trx_cache(thd, cache_mngr);
495+ error= binlog_commit_flush_trx_cache(thd, cache_mngr, all);
496
497 /*
498 This is part of the stmt rollback.
1bfc1981 499@@ -1881,7 +1882,7 @@
13ceb006
AM
500 }
501 else if (!cache_mngr->stmt_cache.empty())
502 {
503- error= binlog_commit_flush_stmt_cache(thd, cache_mngr);
504+ error= binlog_commit_flush_stmt_cache(thd, cache_mngr, all);
505 }
506
507 if (cache_mngr->trx_cache.empty())
1bfc1981 508@@ -1929,7 +1930,7 @@
13ceb006
AM
509 (trans_has_updated_non_trans_table(thd) &&
510 ending_single_stmt_trans(thd,all) &&
511 thd->variables.binlog_format == BINLOG_FORMAT_MIXED)))
512- error= binlog_rollback_flush_trx_cache(thd, cache_mngr);
513+ error= binlog_rollback_flush_trx_cache(thd, cache_mngr, all);
514 /*
515 Truncate the cache if:
516 . aborting a single or multi-statement transaction or;
1bfc1981 517@@ -2904,6 +2905,7 @@
13ceb006
AM
518 MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period)
519 :bytes_written(0), prepared_xids(0), file_id(1), open_count(1),
520 need_start_event(TRUE),
521+ group_commit_queue(0), num_commits(0), num_group_commits(0),
522 sync_period_ptr(sync_period),
523 is_relay_log(0), signal_cnt(0),
524 description_event_for_exec(0), description_event_for_queue(0)
29ffd636 525@@ -5361,19 +5363,15 @@
13ceb006
AM
526 SYNOPSIS
527 write_cache()
528 cache Cache to write to the binary log
529- lock_log True if the LOCK_log mutex should be aquired, false otherwise
530- sync_log True if the log should be flushed and synced
531
532 DESCRIPTION
533 Write the contents of the cache to the binary log. The cache will
534 be reset as a READ_CACHE to be able to read the contents from it.
535 */
536
537-int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache,
538- bool lock_log, bool sync_log)
539+int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache)
540 {
541- Mutex_sentry sentry(lock_log ? &LOCK_log : NULL);
542-
543+ mysql_mutex_assert_owner(&LOCK_log);
544 if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
545 return ER_ERROR_ON_WRITE;
546 uint length= my_b_bytes_in_cache(cache), group, carry, hdr_offs;
29ffd636 547@@ -5484,6 +5482,8 @@
13ceb006
AM
548 }
549
550 /* Write data to the binary log file */
551+ DBUG_EXECUTE_IF("fail_binlog_write_1",
552+ errno= 28; return ER_ERROR_ON_WRITE;);
553 if (my_b_write(&log_file, cache->read_pos, length))
554 return ER_ERROR_ON_WRITE;
555 thd->binlog_bytes_written+= length;
29ffd636 556@@ -5492,9 +5492,6 @@
13ceb006
AM
557
558 DBUG_ASSERT(carry == 0);
559
560- if (sync_log)
561- return flush_and_sync(0);
562-
563 return 0; // All OK
564 }
565
29ffd636 566@@ -5535,8 +5532,6 @@
13ceb006
AM
567 if (!is_open())
568 DBUG_RETURN(error);
569
570- LEX_STRING const write_error_msg=
571- { C_STRING_WITH_LEN("error writing to the binary log") };
572 Incident incident= INCIDENT_LOST_EVENTS;
573 Incident_log_event ev(thd, incident, write_error_msg);
574 if (lock)
29ffd636 575@@ -5585,112 +5580,332 @@
13ceb006
AM
576 'cache' needs to be reinitialized after this functions returns.
577 */
578
579-bool MYSQL_BIN_LOG::write(THD *thd, IO_CACHE *cache, Log_event *commit_event,
580- bool incident)
581+bool
582+MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd, binlog_cache_data *cache_data,
583+ Log_event *end_ev, bool all)
1bfc1981 584+{
13ceb006
AM
585+ group_commit_entry entry;
586+ bool ret;
587+ DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_to_binlog");
588+
589+ entry.thd= thd;
590+ entry.cache_data= cache_data;
591+ entry.error= 0;
592+ entry.all= all;
593+
594+ /*
595+ Log "BEGIN" at the beginning of every transaction. Here, a transaction is
596+ either a BEGIN..COMMIT block or a single statement in autocommit mode.
597+
598+ Create the necessary events here, where we have the correct THD (and
599+ thread context).
600+
601+ Due to group commit the actual writing to binlog may happen in a different
602+ thread.
603+ */
604+ Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, FALSE, TRUE, 0);
605+ entry.begin_event= &qinfo;
606+ entry.end_event= end_ev;
607+ if (cache_data->has_incident())
608+ {
609+ Incident_log_event inc_ev(thd, INCIDENT_LOST_EVENTS, write_error_msg);
610+ entry.incident_event= &inc_ev;
611+ ret = write_transaction_to_binlog_events(&entry);
612+ }
613+ else
614+ {
615+ entry.incident_event= NULL;
616+ ret = write_transaction_to_binlog_events(&entry);
617+ }
618+ if (!ret) /* userstat.patch */
619+ thd->binlog_bytes_written += qinfo.data_written; /* userstat.patch */
620+ DBUG_RETURN(ret);
621+}
622+
623+bool
624+MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
1bfc1981
AM
625 {
626- DBUG_ENTER("MYSQL_BIN_LOG::write(THD *, IO_CACHE *, Log_event *)");
13ceb006
AM
627+ /*
628+ To facilitate group commit for the binlog, we first queue up ourselves in
629+ the group commit queue. Then the first thread to enter the queue waits for
630+ the LOCK_log mutex, and commits for everyone in the queue once it gets the
631+ lock. Any other threads in the queue just wait for the first one to finish
632+ the commit and wake them up.
633+ */
634+ entry->thd->clear_wakeup_ready();
635+ mysql_mutex_lock(&LOCK_group_commit_queue);
636+ group_commit_entry *orig_queue= group_commit_queue;
637+ entry->next= orig_queue;
638+ group_commit_queue= entry;
639+ DEBUG_SYNC(entry->thd, "commit_group_commit_queue");
640+ mysql_mutex_unlock(&LOCK_group_commit_queue);
641+
642+ /*
643+ The first in the queue handles group commit for all; the others just wait
644+ to be signalled when group commit is done.
645+ */
646+ if (orig_queue != NULL)
647+ entry->thd->wait_for_wakeup_ready();
648+ else
649+ trx_group_commit_leader(entry);
650+
651+ if (likely(!entry->error))
652+ return 0;
653+
654+ switch (entry->error)
655+ {
656+ case ER_ERROR_ON_WRITE:
657+ my_error(ER_ERROR_ON_WRITE, MYF(ME_NOREFRESH), name, entry->commit_errno);
658+ break;
659+ case ER_ERROR_ON_READ:
660+ my_error(ER_ERROR_ON_READ, MYF(ME_NOREFRESH),
661+ entry->cache_data->cache_log.file_name, entry->commit_errno);
662+ break;
663+ default:
664+ /*
665+ There are not (and should not be) any errors thrown not covered above.
666+ But just in case one is added later without updating the above switch
667+ statement, include a catch-all.
668+ */
669+ my_printf_error(entry->error,
670+ "Error writing transaction to binary log: %d",
671+ MYF(ME_NOREFRESH), entry->error);
672+ }
1bfc1981 673
13ceb006
AM
674+ /*
675+ Since we return error, this transaction XID will not be committed, so
676+ we need to mark it as not needed for recovery (unlog() is not called
677+ for a transaction if log_xid() fails).
678+ */
679+ if (entry->cache_data->using_xa && entry->cache_data->xa_xid)
680+ mark_xid_done();
681+
682+ return 1;
683+}
684+
685+/*
686+ Do binlog group commit as the lead thread.
687+
688+ This must be called when this thread/transaction is queued at the start of
689+ the group_commit_queue. It will wait to obtain the LOCK_log mutex, then group
690+ commit all the transactions in the queue (more may have entered while waiting
691+ for LOCK_log). After commit is done, all other threads in the queue will be
692+ signalled.
693+
694+ */
695+void
696+MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
697+{
698+ DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader");
699+ uint xid_count= 0;
700+ uint write_count= 0;
1bfc1981
AM
701+ bool check_purge= false;
702+ group_commit_entry *current= 0;
13ceb006
AM
703 DBUG_ASSERT(is_open());
704 if (likely(is_open())) // Should always be true
705 {
1bfc1981
AM
706- bool check_purge;
707-
708+ /*
709+ Lock the LOCK_log(), and once we get it, collect any additional writes
710+ that queued up while we were waiting.
711+ */
712 mysql_mutex_lock(&LOCK_log);
713+
714+ DEBUG_SYNC(leader->thd, "commit_after_get_LOCK_log");
715+ mysql_mutex_lock(&LOCK_group_commit_queue);
716+ current= group_commit_queue;
717+ group_commit_queue= NULL;
718+ mysql_mutex_unlock(&LOCK_group_commit_queue);
719+
720+ /* As the queue is in reverse order of entering, reverse it. */
721+ group_commit_entry *queue= NULL;
722+ while (current)
723+ {
724+ group_commit_entry *next= current->next;
725+ current->next= queue;
726+ queue= current;
727+ current= next;
728+ }
729+ DBUG_ASSERT(leader == queue /* the leader should be first in queue */);
13ceb006
AM
730 /*
731- We only bother to write to the binary log if there is anything
732- to write.
733- */
734- if (my_b_tell(cache) > 0)
1bfc1981
AM
735+ Now we have in queue the list of transactions to be committed in order.
736+
13ceb006
AM
737+ Commit every transaction in the queue.
738+
739+ Note that we are doing this in a different thread than the one running
740+ the transaction! So we are limited in the operations we can do. In
741+ particular, we cannot call my_error() on behalf of a transaction, as
742+ that obtains the THD from thread local storage. Instead, we must set
743+ current->error and let the thread do the error reporting itself once
744+ we wake it up.
745+ */
746+ for (current= queue; current != NULL; current= current->next)
747 {
748+ binlog_cache_data *cache_data= current->cache_data;
749+ IO_CACHE *cache= &cache_data->cache_log;
750+
751 /*
752- Log "BEGIN" at the beginning of every transaction. Here, a
753- transaction is either a BEGIN..COMMIT block or a single
754- statement in autocommit mode.
755+ We only bother to write to the binary log if there is anything
756+ to write.
757 */
758- Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, FALSE, TRUE, 0);
759- if (qinfo.write(&log_file))
760- goto err;
761- thd->binlog_bytes_written+= qinfo.data_written;
762- DBUG_EXECUTE_IF("crash_before_writing_xid",
763- {
764- if ((write_error= write_cache(thd, cache, false, true)))
765- DBUG_PRINT("info", ("error writing binlog cache: %d",
766- write_error));
767- DBUG_PRINT("info", ("crashing before writing xid"));
768- DBUG_SUICIDE();
769- });
1bfc1981
AM
770-
771- if ((write_error= write_cache(thd, cache, false, false)))
772- goto err;
773-
774- if (commit_event && commit_event->write(&log_file))
775- goto err;
776- if (commit_event)
777- thd->binlog_bytes_written+= commit_event->data_written;
13ceb006
AM
778+ if (my_b_tell(cache) > 0)
779+ {
780+ if ((current->error= write_transaction(current)))
781+ current->commit_errno= errno;
13ceb006
AM
782+ write_count++;
783+ }
784
1bfc1981 785- if (incident && write_incident(thd, FALSE))
13ceb006 786- goto err;
13ceb006
AM
787+ cache_data->commit_bin_log_file_pos= my_b_write_tell(&log_file);
788+ if (cache_data->using_xa && cache_data->xa_xid)
789+ xid_count++;
790+ }
791
13ceb006
AM
792+ if (write_count > 0)
793+ {
794 bool synced= 0;
795 if (flush_and_sync(&synced))
796- goto err;
797- DBUG_EXECUTE_IF("half_binlogged_transaction", DBUG_SUICIDE(););
798- if (cache->error) // Error on read
799 {
800- sql_print_error(ER(ER_ERROR_ON_READ), cache->file_name, errno);
801- write_error=1; // Don't give more errors
802- goto err;
803+ for (current= queue; current != NULL; current= current->next)
804+ {
805+ if (!current->error)
806+ {
807+ current->error= ER_ERROR_ON_WRITE;
808+ current->commit_errno= errno;
809+ }
810+ }
811+ }
812+ else
813+ {
814+ signal_update();
815 }
816
817 if (RUN_HOOK(binlog_storage, after_flush,
818- (thd, log_file_name, log_file.pos_in_file, synced)))
819+ (leader->thd, log_file_name, log_file.pos_in_file, synced)))
820 {
821 sql_print_error("Failed to run 'after_flush' hooks");
822- write_error=1;
823- goto err;
824+ for (current= queue; current != NULL; current= current->next)
825+ {
826+ if (!current->error)
827+ {
828+ current->error= ER_ERROR_ON_WRITE;
829+ current->commit_errno= errno;
830+ }
831+ }
832 }
833
834- signal_update();
835 }
836
837 /*
838- if commit_event is Xid_log_event, increase the number of
839- prepared_xids (it's decreasd in ::unlog()). Binlog cannot be rotated
840+ if any commit_events are Xid_log_event, increase the number of
841+ prepared_xids (it's decreased in ::unlog()). Binlog cannot be rotated
842 if there're prepared xids in it - see the comment in new_file() for
843 an explanation.
844- If the commit_event is not Xid_log_event (then it's a Query_log_event)
845- rotate binlog, if necessary.
846+ If no Xid_log_events (then it's all Query_log_event) rotate binlog,
847+ if necessary.
848 */
849- if (commit_event && commit_event->get_type_code() == XID_EVENT)
850+ if (xid_count > 0)
851 {
852- mysql_mutex_lock(&LOCK_prep_xids);
853- prepared_xids++;
854- mysql_mutex_unlock(&LOCK_prep_xids);
1bfc1981 855- mysql_mutex_unlock(&LOCK_log);
13ceb006
AM
856+ mark_xids_active(xid_count);
857 }
858 else
1bfc1981
AM
859 {
860 if (rotate(false, &check_purge))
13ceb006 861- goto err;
1bfc1981
AM
862- mysql_mutex_unlock(&LOCK_log);
863- if (check_purge)
864- purge();
13ceb006
AM
865+ {
866+ for (current= queue; current != NULL; current= current->next)
867+ {
868+ if (!current->error)
869+ {
870+ current->error= ER_ERROR_ON_WRITE;
871+ current->commit_errno= errno;
872+ }
873+ }
874+ }
1bfc1981
AM
875 }
876- }
13ceb006
AM
877
878- DBUG_RETURN(0);
1bfc1981
AM
879+ DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered");
880+ mysql_mutex_lock(&LOCK_commit_ordered);
881+ /*
882+ We cannot unlock LOCK_log until we have locked LOCK_commit_ordered;
883+ otherwise scheduling could allow the next group commit to run ahead of us,
884+ messing up the order of commit_ordered() calls. But as soon as
885+ LOCK_commit_ordered is obtained, we can let the next group commit start.
886+ */
887
13ceb006
AM
888-err:
889- if (!write_error)
1bfc1981 890- {
13ceb006
AM
891- write_error= 1;
892- sql_print_error(ER(ER_ERROR_ON_WRITE), name, errno);
1bfc1981 893+ mysql_mutex_unlock(&LOCK_log);
13ceb006 894+
1bfc1981
AM
895+ if (xid_count > 0 && check_purge)
896+ {
897+ purge();
898+ }
899+
900+ DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log");
901+ ++num_group_commits;
13ceb006
AM
902+
903+ /*
1bfc1981
AM
904+ Wakeup each participant waiting for our group commit, first calling the
905+ commit_ordered() methods for any transactions doing 2-phase commit.
13ceb006 906+ */
1bfc1981
AM
907+ current= queue;
908+ while (current != NULL)
909+ {
910+ group_commit_entry *next;
911+
912+ DEBUG_SYNC(leader->thd, "commit_loop_entry_commit_ordered");
913+ ++num_commits;
914+ if (current->cache_data->using_xa && !current->error)
915+ run_commit_ordered(current->thd, current->all);
916+
917+ /*
918+ Be careful not to access current->next after waking up the other thread, as
919+ it may change immediately after wakeup.
920+ */
921+ next= current->next;
922+ if (current != leader) // Don't wake up ourself
923+ current->thd->signal_wakeup_ready();
924+ current= next;
925+ }
926+ DEBUG_SYNC(leader->thd, "commit_after_group_run_commit_ordered");
927+ mysql_mutex_unlock(&LOCK_commit_ordered);
13ceb006
AM
928 }
929- mysql_mutex_unlock(&LOCK_log);
930- DBUG_RETURN(1);
13ceb006
AM
931+
932+ DBUG_VOID_RETURN;
933 }
934
1bfc1981 935
13ceb006
AM
936+int
937+MYSQL_BIN_LOG::write_transaction(group_commit_entry *entry)
938+{
939+ binlog_cache_data *cache_data= entry->cache_data;
940+ IO_CACHE *cache= &cache_data->cache_log;
941+
942+ if (entry->begin_event->write(&log_file))
943+ return ER_ERROR_ON_WRITE;
944+
945+ DBUG_EXECUTE_IF("crash_before_writing_xid",
946+ {
947+ if ((write_cache(entry->thd, cache)))
948+ DBUG_PRINT("info", ("error writing binlog cache"));
949+ else
950+ flush_and_sync(0);
951+
952+ DBUG_PRINT("info", ("crashing before writing xid"));
953+ abort();
954+ });
955+
956+ if (write_cache(entry->thd, cache))
957+ return ER_ERROR_ON_WRITE;
958+
959+ if (entry->end_event->write(&log_file))
960+ return ER_ERROR_ON_WRITE;
961+
962+ if (entry->incident_event && entry->incident_event->write(&log_file))
963+ return ER_ERROR_ON_WRITE;
964+
965+ if (cache->error) // Error on read
966+ return ER_ERROR_ON_READ;
967+
968+ return 0;
969+}
1bfc1981 970+
13ceb006
AM
971 /**
972 Wait until we get a signal that the relay log has been updated.
1bfc1981 973
29ffd636 974@@ -6095,6 +6310,68 @@
13ceb006
AM
975 }
976
977
978+void
979+TC_init()
980+{
981+ mysql_mutex_init(key_LOCK_group_commit_queue, &LOCK_group_commit_queue, MY_MUTEX_INIT_SLOW);
982+ mysql_mutex_init(key_LOCK_commit_ordered, &LOCK_commit_ordered, MY_MUTEX_INIT_SLOW);
983+ mutexes_inited= TRUE;
984+}
985+
986+
987+void
988+TC_destroy()
989+{
990+ if (mutexes_inited)
991+ {
992+ mysql_mutex_destroy(&LOCK_group_commit_queue);
993+ mysql_mutex_destroy(&LOCK_commit_ordered);
994+ mutexes_inited= FALSE;
995+ }
996+}
997+
998+
999+void
1000+TC_LOG::run_commit_ordered(THD *thd, bool all)
1001+{
1002+ Ha_trx_info *ha_info=
1003+ all ? thd->transaction.all.ha_list : thd->transaction.stmt.ha_list;
1004+
1005+ mysql_mutex_assert_owner(&LOCK_commit_ordered);
1006+ for (; ha_info; ha_info= ha_info->next())
1007+ {
1008+ handlerton *ht= ha_info->ht();
1009+ if (!ht->commit_ordered)
1010+ continue;
1011+ ht->commit_ordered(ht, thd, all);
1012+ DEBUG_SYNC(thd, "commit_after_run_commit_ordered");
1013+ }
1014+}
1015+
1016+int TC_LOG_MMAP::log_and_order(THD *thd, my_xid xid, bool all,
1017+ bool need_commit_ordered)
1018+{
1019+ int cookie;
1020+
1021+ cookie= 0;
1022+ if (xid)
1023+ cookie= log_one_transaction(xid);
1024+
1025+ if (need_commit_ordered)
1026+ {
1027+ /* Only run commit_ordered() if log_xid was successful. */
1028+ if (cookie)
1029+ {
1030+ mysql_mutex_lock(&LOCK_commit_ordered);
1031+ run_commit_ordered(thd, all);
1032+ mysql_mutex_unlock(&LOCK_commit_ordered);
1033+ }
1034+ }
1035+
1036+ return cookie;
1037+}
1038+
1039+
1040 /********* transaction coordinator log for 2pc - mmap() based solution *******/
1041
1042 /*
29ffd636 1043@@ -6231,6 +6508,7 @@
13ceb006
AM
1044 mysql_mutex_init(key_LOCK_pool, &LOCK_pool, MY_MUTEX_INIT_FAST);
1045 mysql_cond_init(key_COND_active, &COND_active, 0);
1046 mysql_cond_init(key_COND_pool, &COND_pool, 0);
1047+ mysql_cond_init(key_COND_queue_busy, &COND_queue_busy, 0);
1048
1049 inited=6;
1050
29ffd636 1051@@ -6238,6 +6516,8 @@
13ceb006
AM
1052 active=pages;
1053 pool=pages+1;
1054 pool_last=pages+npages-1;
1055+ commit_ordered_queue= NULL;
1056+ commit_ordered_queue_busy= false;
1057
1058 return 0;
1059
29ffd636 1060@@ -6343,7 +6623,7 @@
13ceb006
AM
1061 to the position in memory where xid was logged to.
1062 */
1063
1064-int TC_LOG_MMAP::log_xid(THD *thd, my_xid xid)
1065+int TC_LOG_MMAP::log_one_transaction(my_xid xid)
1066 {
1067 int err;
1068 PAGE *p;
29ffd636 1069@@ -6482,7 +6762,9 @@
13ceb006
AM
1070 mysql_mutex_destroy(&LOCK_sync);
1071 mysql_mutex_destroy(&LOCK_active);
1072 mysql_mutex_destroy(&LOCK_pool);
1073+ mysql_cond_destroy(&COND_active);
1074 mysql_cond_destroy(&COND_pool);
1075+ mysql_cond_destroy(&COND_queue_busy);
1076 case 5:
1077 data[0]='A'; // garble the first (signature) byte, in case mysql_file_delete fails
1078 case 4:
29ffd636 1079@@ -6692,42 +6974,87 @@
13ceb006
AM
1080 mysql_cond_destroy(&COND_prep_xids);
1081 }
1082
1083-/**
1084- @todo
1085- group commit
1086+/*
1087+ Do a binlog log_xid() for a group of transactions, linked through
1088+ thd->next_commit_ordered.
1089
1090 @retval
1091 0 error
1092 @retval
1093 1 success
1094 */
1095-int TC_LOG_BINLOG::log_xid(THD *thd, my_xid xid)
1096+int TC_LOG_BINLOG::log_and_order(THD *thd, my_xid xid, bool all,
1097+ bool need_commit_ordered __attribute__((unused)))
1098 {
1099- DBUG_ENTER("TC_LOG_BINLOG::log");
1100+ DBUG_ENTER("TC_LOG_BINLOG::log_and_order");
1101 binlog_cache_mngr *cache_mngr=
1102 (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
1103+
1104+ cache_mngr->trx_cache.using_xa= TRUE;
1105+ cache_mngr->trx_cache.xa_xid= xid;
1106 /*
1107 We always commit the entire transaction when writing an XID. Also
1108 note that the return value is inverted.
1109 */
1110- DBUG_RETURN(!binlog_commit_flush_stmt_cache(thd, cache_mngr) &&
1111- !binlog_commit_flush_trx_cache(thd, cache_mngr, xid));
1112+ DBUG_RETURN(!binlog_commit_flush_stmt_cache(thd, cache_mngr, all) &&
1113+ !binlog_commit_flush_trx_cache(thd, cache_mngr, xid, all));
1114 }
1115
1116-int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
1117+/*
1118+ After an XID is logged, we need to hold on to the current binlog file until
1119+ it is fully committed in the storage engine. The reason is that crash
1120+ recovery only looks at the latest binlog, so we must make sure there are no
1121+ outstanding prepared (but not committed) transactions before rotating the
1122+ binlog.
1123+
1124+ To handle this, we keep a count of outstanding XIDs. This function is used
1125+ to increase this count when committing one or more transactions to the
1126+ binary log.
1127+*/
1128+void
1129+TC_LOG_BINLOG::mark_xids_active(uint xid_count)
1130 {
1131- DBUG_ENTER("TC_LOG_BINLOG::unlog");
1132+ DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active");
1133+ DBUG_PRINT("info", ("xid_count=%u", xid_count));
1134+ mysql_mutex_lock(&LOCK_prep_xids);
1135+ prepared_xids+= xid_count;
1136+ mysql_mutex_unlock(&LOCK_prep_xids);
1137+ DBUG_VOID_RETURN;
1138+}
1139+
1140+/*
1141+ Once an XID is committed, it is safe to rotate the binary log, as it can no
1142+ longer be needed during crash recovery.
1143+
1144+ This function is called to mark an XID this way. It needs to decrease the
1145+ count of pending XIDs, and signal the log rotator thread when it reaches zero.
1146+*/
1147+void
1148+TC_LOG_BINLOG::mark_xid_done()
1149+{
1150+ my_bool send_signal;
1151+
1152+ DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done");
1153 mysql_mutex_lock(&LOCK_prep_xids);
1154 // prepared_xids can be 0 if the transaction had ignorable errors.
1155 DBUG_ASSERT(prepared_xids >= 0);
1156 if (prepared_xids > 0)
1157 prepared_xids--;
1158- if (prepared_xids == 0) {
1159+ send_signal= (prepared_xids == 0);
1160+ mysql_mutex_unlock(&LOCK_prep_xids);
1161+ if (send_signal) {
1162 DBUG_PRINT("info", ("prepared_xids=%lu", prepared_xids));
1163 mysql_cond_signal(&COND_prep_xids);
1164 }
1165- mysql_mutex_unlock(&LOCK_prep_xids);
1166- DBUG_RETURN(rotate_and_purge(0)); // as ::write() did not rotate
1167+ DBUG_VOID_RETURN;
1168+}
1169+
1170+int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
1171+{
1172+ DBUG_ENTER("TC_LOG_BINLOG::unlog");
1173+ if (xid)
1174+ mark_xid_done();
1175+ DBUG_RETURN(rotate_and_purge(0));
1176 }
1177
1178 int TC_LOG_BINLOG::recover(IO_CACHE *log, Format_description_log_event *fdle)
29ffd636 1179@@ -6796,9 +7123,67 @@
13ceb006
AM
1180 {
1181 return (ulonglong) mysql_bin_log.get_log_file()->pos_in_file;
1182 }
1183+/*
1184+ Get the current position of the MySQL binlog for transaction currently being
1185+ committed.
1186+
1187+ This is valid to call from within storage engine commit_ordered() and
1188+ commit() methods only.
1189+
1190+ Since it stores the position inside THD, it is safe to call without any
1191+ locking.
1192+
1193+ Note that currently the binlog file name is not stored inside THD, but this
1194+ is still safe as it can only change when the log is rotated, and we never
1195+ rotate the binlog while commits are pending inside storage engines.
1196+*/
1197+extern "C"
1198+void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file)
1199+{
1200+ binlog_cache_mngr *cache_mngr;
1201+ if (binlog_hton->state == SHOW_OPTION_YES
1202+ && (cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton)))
1203+ {
1204+ *out_pos= cache_mngr->trx_cache.commit_bin_log_file_pos;
1205+ *out_file= mysql_bin_log.get_log_fname();
1206+ }
1207+ else
1208+ {
3d3ecf24 1209+ *out_pos= 0ULL;
13ceb006
AM
1210+ *out_file= NULL;
1211+ }
1212+}
1213 #endif /* INNODB_COMPATIBILITY_HOOKS */
1214
1215
1216+static int show_binlog_vars(THD *thd, SHOW_VAR *var, char *buff)
1217+{
1218+ mysql_bin_log.set_status_variables();
1219+ var->type= SHOW_ARRAY;
1220+ var->value= (char *)&binlog_status_vars_detail;
1221+ return 0;
1222+}
1223+
1224+static SHOW_VAR binlog_status_vars_top[]= {
1225+ {"binlog", (char *) &show_binlog_vars, SHOW_FUNC},
1226+ {NullS, NullS, SHOW_LONG}
1227+};
1228+
1229+/*
1230+ Copy out current values of status variables, for SHOW STATUS or
1231+ information_schema.global_status.
1232+
1233+ This is called only under LOCK_status, so we can fill in a static array.
1234+*/
1235+void
1236+TC_LOG_BINLOG::set_status_variables()
1237+{
1238+ mysql_mutex_lock(&LOCK_commit_ordered);
1239+ binlog_status_var_num_commits= this->num_commits;
1240+ binlog_status_var_num_group_commits= this->num_group_commits;
1241+ mysql_mutex_unlock(&LOCK_commit_ordered);
1242+}
1243+
1244 struct st_mysql_storage_engine binlog_storage_engine=
1245 { MYSQL_HANDLERTON_INTERFACE_VERSION };
1246
29ffd636 1247@@ -6813,7 +7198,7 @@
13ceb006
AM
1248 binlog_init, /* Plugin Init */
1249 NULL, /* Plugin Deinit */
1250 0x0100 /* 1.0 */,
1251- NULL, /* status variables */
1252+ binlog_status_vars_top, /* status variables */
1253 NULL, /* system variables */
1254 NULL, /* config options */
1255 0, /* flags */
1256--- a/sql/log.h
1257+++ b/sql/log.h
1258@@ -44,17 +44,42 @@
1259
1260 virtual int open(const char *opt_name)=0;
1261 virtual void close()=0;
1262- virtual int log_xid(THD *thd, my_xid xid)=0;
1263+ virtual int log_and_order(THD *thd, my_xid xid, bool all,
1264+ bool need_commit_ordered)=0;
1265 virtual int unlog(ulong cookie, my_xid xid)=0;
1266+
1267+ protected:
1268+ void run_commit_ordered(THD *thd, bool all);
1269 };
1270
1271+/*
1272+ Locks used to ensure serialised execution of
1273+ TC_LOG::run_commit_ordered(), or any other code that calls handler
1274+ commit_ordered() methods.
1275+*/
1276+extern mysql_mutex_t LOCK_group_commit_queue;
1277+extern mysql_mutex_t LOCK_commit_ordered;
1278+
1279+extern void TC_init();
1280+extern void TC_destroy();
1281+
1282 class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging
1283 {
1284 public:
1285 TC_LOG_DUMMY() {}
1286 int open(const char *opt_name) { return 0; }
1287 void close() { }
1288- int log_xid(THD *thd, my_xid xid) { return 1; }
1289+ /*
1290+ TC_LOG_DUMMY is only used when there are <= 1 XA-capable engines, and we
1291+ only use internal XA during commit when >= 2 XA-capable engines
1292+ participate.
1293+ */
1294+ int log_and_order(THD *thd, my_xid xid, bool all,
1295+ bool need_commit_ordered)
1296+ {
1297+ DBUG_ASSERT(0 /* Internal error - TC_LOG_DUMMY::log_and_order() called */);
1298+ return 1;
1299+ }
1300 int unlog(ulong cookie, my_xid xid) { return 0; }
1301 };
1302
1303@@ -80,6 +105,13 @@
1304 mysql_cond_t cond; // to wait for a sync
1305 } PAGE;
1306
1307+ /* List of THDs for which to invoke commit_ordered(), in order. */
1308+ struct commit_entry
1309+ {
1310+ struct commit_entry *next;
1311+ THD *thd;
1312+ };
1313+
1314 char logname[FN_REFLEN];
1315 File fd;
1316 my_off_t file_length;
1317@@ -94,16 +126,38 @@
1318 */
1319 mysql_mutex_t LOCK_active, LOCK_pool, LOCK_sync;
1320 mysql_cond_t COND_pool, COND_active;
1321+ /*
1322+ Queue of threads that need to call commit_ordered().
1323+ Access to this queue must be protected by LOCK_group_commit_queue
1324+ */
1325+ commit_entry *commit_ordered_queue;
1326+ /*
1327+ This flag and condition are used to reserve the queue while threads in it
1328+ each run the commit_ordered() methods one after the other. Only once the
1329+ last commit_ordered() in the queue is done can we start on a new queue
1330+ run.
1331+
1332+ Since we start this process in the first thread in the queue and finish in
1333+ the last (and possibly different) thread, we need a condition variable for
1334+ this (we cannot unlock a mutex in a different thread than the one who
1335+ locked it).
1336+
1337+ The condition is used together with the LOCK_group_commit_queue mutex.
1338+ */
1339+ my_bool commit_ordered_queue_busy;
1340+ mysql_cond_t COND_queue_busy;
1341
1342 public:
1343 TC_LOG_MMAP(): inited(0) {}
1344 int open(const char *opt_name);
1345 void close();
1346- int log_xid(THD *thd, my_xid xid);
1347+ int log_and_order(THD *thd, my_xid xid, bool all,
1348+ bool need_commit_ordered);
1349 int unlog(ulong cookie, my_xid xid);
1350 int recover();
1351
1352 private:
1353+ int log_one_transaction(my_xid xid);
1354 void get_active_from_pool();
1355 int sync();
1356 int overflow();
1357@@ -271,9 +325,31 @@
1358 time_t last_time;
1359 };
1360
1361+class binlog_cache_data;
1362 class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
1363 {
1364 private:
1365+ struct group_commit_entry
1366+ {
1367+ struct group_commit_entry *next;
1368+ THD *thd;
1369+ binlog_cache_data *cache_data;
1370+ /*
1371+ Extra events (BEGIN, COMMIT/ROLLBACK/XID, and possibly INCIDENT) to be
1372+ written during group commit. The incident_event is only valid if
1373+ trx_data->has_incident() is true.
1374+ */
1375+ Log_event *begin_event;
1376+ Log_event *end_event;
1377+ Log_event *incident_event;
1378+ /* Set during group commit to record any per-thread error. */
1379+ int error;
1380+ int commit_errno;
1381+ /* This is the `all' parameter for ha_commit_ordered(). */
1382+ bool all;
1383+ /* True if we come in through XA log_and_order(), false otherwise. */
1384+ };
1385+
1386 #ifdef HAVE_PSI_INTERFACE
1387 /** The instrumentation key to use for @ LOCK_index. */
1388 PSI_mutex_key m_key_LOCK_index;
1389@@ -325,6 +401,12 @@
1390 In 5.0 it's 0 for relay logs too!
1391 */
1392 bool no_auto_events;
1393+ /* Queue of transactions queued up to participate in group commit. */
1394+ group_commit_entry *group_commit_queue;
1395+ /* Total number of committed transactions. */
1396+ ulonglong num_commits;
1397+ /* Number of group commits done. */
1398+ ulonglong num_group_commits;
1399
1400 /* pointer to the sync period variable, for binlog this will be
1401 sync_binlog_period, for relay log this will be
1402@@ -346,6 +428,11 @@
1403 */
1404 int new_file_without_locking();
1405 int new_file_impl(bool need_lock);
1406+ int write_transaction(group_commit_entry *entry);
1407+ bool write_transaction_to_binlog_events(group_commit_entry *entry);
1408+ void trx_group_commit_leader(group_commit_entry *leader);
1409+ void mark_xid_done();
1410+ void mark_xids_active(uint xid_count);
1411
1412 public:
1413 MYSQL_LOG::generate_name;
1414@@ -387,7 +474,8 @@
1415
1416 int open(const char *opt_name);
1417 void close();
1418- int log_xid(THD *thd, my_xid xid);
1419+ int log_and_order(THD *thd, my_xid xid, bool all,
1420+ bool need_commit_ordered);
1421 int unlog(ulong cookie, my_xid xid);
1422 int recover(IO_CACHE *log, Format_description_log_event *fdle);
1423 #if !defined(MYSQL_CLIENT)
1424@@ -434,11 +522,11 @@
1425 int new_file();
1426
1427 bool write(Log_event* event_info); // binary log write
1428- bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event, bool incident);
1429+ bool write_transaction_to_binlog(THD *thd, binlog_cache_data *cache_data,
1430+ Log_event *end_ev, bool all);
1431 bool write_incident(THD *thd, bool lock);
1432
1433- int write_cache(THD *thd, IO_CACHE *cache,
1434- bool lock_log, bool flush_and_sync);
1435+ int write_cache(THD *thd, IO_CACHE *cache);
1436 void set_write_error(THD *thd, bool is_transactional);
1437 bool check_write_error(THD *thd);
1438
1bfc1981 1439@@ -509,6 +597,7 @@
13ceb006
AM
1440 inline void unlock_index() { mysql_mutex_unlock(&LOCK_index);}
1441 inline IO_CACHE *get_index_file() { return &index_file;}
1442 inline uint32 get_open_count() { return open_count; }
1443+ void set_status_variables();
1444 };
1445
1446 class Log_event_handler
1447--- a/sql/mysqld.cc
1448+++ b/sql/mysqld.cc
29ffd636 1449@@ -1495,6 +1495,7 @@
13ceb006
AM
1450 ha_end();
1451 if (tc_log)
1452 tc_log->close();
1453+ TC_destroy();
1454 delegates_destroy();
1455 xid_cache_free();
1456 table_def_free();
29ffd636 1457@@ -3911,6 +3912,8 @@
13ceb006
AM
1458 query_response_time_init();
1459 #endif // HAVE_RESPONSE_TIME_DISTRIBUTION
1460 /* We have to initialize the storage engines before CSV logging */
1461+ TC_init();
1462+
1463 init_global_table_stats();
1464 init_global_index_stats();
1465
29ffd636 1466@@ -7872,6 +7875,7 @@
13ceb006
AM
1467 key_LOCK_error_messages, key_LOG_INFO_lock, key_LOCK_thread_count,
1468 key_PARTITION_LOCK_auto_inc;
1469 PSI_mutex_key key_RELAYLOG_LOCK_index;
1470+PSI_mutex_key key_LOCK_wakeup_ready, key_LOCK_group_commit_queue, key_LOCK_commit_ordered;
1471
1472 static PSI_mutex_info all_server_mutexes[]=
1473 {
29ffd636 1474@@ -7892,6 +7896,7 @@
13ceb006
AM
1475 { &key_delayed_insert_mutex, "Delayed_insert::mutex", 0},
1476 { &key_hash_filo_lock, "hash_filo::lock", 0},
1477 { &key_LOCK_active_mi, "LOCK_active_mi", PSI_FLAG_GLOBAL},
1478+ { &key_LOCK_commit_ordered, "LOCK_commit_ordered", PSI_FLAG_GLOBAL},
1479 { &key_LOCK_connection_count, "LOCK_connection_count", PSI_FLAG_GLOBAL},
1480 { &key_LOCK_crypt, "LOCK_crypt", PSI_FLAG_GLOBAL},
1481 { &key_LOCK_delayed_create, "LOCK_delayed_create", PSI_FLAG_GLOBAL},
29ffd636 1482@@ -7907,6 +7912,7 @@
13ceb006
AM
1483 "LOCK_global_index_stats", PSI_FLAG_GLOBAL},
1484 { &key_LOCK_gdl, "LOCK_gdl", PSI_FLAG_GLOBAL},
1485 { &key_LOCK_global_system_variables, "LOCK_global_system_variables", PSI_FLAG_GLOBAL},
1486+ { &key_LOCK_group_commit_queue, "LOCK_group_commit_queue", PSI_FLAG_GLOBAL},
1487 { &key_LOCK_manager, "LOCK_manager", PSI_FLAG_GLOBAL},
1488 { &key_LOCK_prepared_stmt_count, "LOCK_prepared_stmt_count", PSI_FLAG_GLOBAL},
1489 { &key_LOCK_rpl_status, "LOCK_rpl_status", PSI_FLAG_GLOBAL},
29ffd636 1490@@ -7918,6 +7924,7 @@
13ceb006
AM
1491 { &key_LOCK_temporary_tables, "THD::LOCK_temporary_tables", 0},
1492 { &key_LOCK_user_conn, "LOCK_user_conn", PSI_FLAG_GLOBAL},
1493 { &key_LOCK_uuid_generator, "LOCK_uuid_generator", PSI_FLAG_GLOBAL},
1494+ { &key_LOCK_wakeup_ready, "THD::LOCK_wakeup_ready", 0},
1495 { &key_LOG_LOCK_log, "LOG::LOCK_log", 0},
1496 { &key_master_info_data_lock, "Master_info::data_lock", 0},
1497 { &key_master_info_run_lock, "Master_info::run_lock", 0},
29ffd636 1498@@ -7965,6 +7972,7 @@
13ceb006
AM
1499 key_TABLE_SHARE_cond, key_user_level_lock_cond,
1500 key_COND_thread_count, key_COND_thread_cache, key_COND_flush_thread_cache;
1501 PSI_cond_key key_RELAYLOG_update_cond;
1502+PSI_cond_key key_COND_wakeup_ready, key_COND_queue_busy;
1503
1504 static PSI_cond_info all_server_conds[]=
1505 {
29ffd636 1506@@ -7981,8 +7989,10 @@
13ceb006
AM
1507 { &key_RELAYLOG_update_cond, "MYSQL_RELAY_LOG::update_cond", 0},
1508 { &key_COND_cache_status_changed, "Query_cache::COND_cache_status_changed", 0},
1509 { &key_COND_manager, "COND_manager", PSI_FLAG_GLOBAL},
1510+ { &key_COND_queue_busy, "COND_queue_busy", PSI_FLAG_GLOBAL},
1511 { &key_COND_rpl_status, "COND_rpl_status", PSI_FLAG_GLOBAL},
1512 { &key_COND_server_started, "COND_server_started", PSI_FLAG_GLOBAL},
1513+ { &key_COND_wakeup_ready, "THD::COND_wakeup_ready", 0},
1514 { &key_delayed_insert_cond, "Delayed_insert::cond", 0},
1515 { &key_delayed_insert_cond_client, "Delayed_insert::cond_client", 0},
1516 { &key_item_func_sleep_cond, "Item_func_sleep::cond", 0},
1517--- a/sql/mysqld.h
1518+++ b/sql/mysqld.h
29ffd636 1519@@ -274,6 +274,7 @@
13ceb006
AM
1520 key_structure_guard_mutex, key_TABLE_SHARE_LOCK_ha_data,
1521 key_LOCK_error_messages, key_LOCK_thread_count, key_PARTITION_LOCK_auto_inc;
1522 extern PSI_mutex_key key_RELAYLOG_LOCK_index;
1523+extern PSI_mutex_key key_LOCK_wakeup_ready, key_LOCK_group_commit_queue, key_LOCK_commit_ordered;
1524
1525 extern PSI_rwlock_key key_rwlock_LOCK_grant, key_rwlock_LOCK_logger,
1526 key_rwlock_LOCK_sys_init_connect, key_rwlock_LOCK_sys_init_slave,
29ffd636 1527@@ -294,6 +295,7 @@
13ceb006
AM
1528 key_TABLE_SHARE_cond, key_user_level_lock_cond,
1529 key_COND_thread_count, key_COND_thread_cache, key_COND_flush_thread_cache;
1530 extern PSI_cond_key key_RELAYLOG_update_cond;
1531+extern PSI_cond_key key_COND_wakeup_ready, key_COND_queue_busy;
1532
1533 extern PSI_thread_key key_thread_bootstrap, key_thread_delayed_insert,
1534 key_thread_handle_manager, key_thread_kill_server, key_thread_main,
1535--- a/sql/sql_class.cc
1536+++ b/sql/sql_class.cc
1bfc1981 1537@@ -1005,6 +1005,8 @@
13ceb006
AM
1538 mysql_mutex_init(key_LOCK_thd_data, &LOCK_thd_data, MY_MUTEX_INIT_FAST);
1539 mysql_mutex_init(key_LOCK_temporary_tables, &LOCK_temporary_tables,
1540 MY_MUTEX_INIT_FAST);
1541+ mysql_mutex_init(key_LOCK_wakeup_ready, &LOCK_wakeup_ready, MY_MUTEX_INIT_FAST);
1542+ mysql_cond_init(key_COND_wakeup_ready, &COND_wakeup_ready, NULL);
1543
1544 /* Variables with default values */
1545 proc_info="login";
1bfc1981 1546@@ -1609,6 +1611,8 @@
13ceb006
AM
1547 my_free(db);
1548 db= NULL;
1549 free_root(&transaction.mem_root,MYF(0));
1550+ mysql_cond_destroy(&COND_wakeup_ready);
1551+ mysql_mutex_destroy(&LOCK_wakeup_ready);
1552 mysql_mutex_destroy(&LOCK_thd_data);
1553 mysql_mutex_destroy(&LOCK_temporary_tables);
1554 #ifndef DBUG_OFF
29ffd636 1555@@ -5297,6 +5301,24 @@
13ceb006
AM
1556 DBUG_RETURN(0);
1557 }
1558
1559+void
1560+THD::wait_for_wakeup_ready()
1561+{
1562+ mysql_mutex_lock(&LOCK_wakeup_ready);
1563+ while (!wakeup_ready)
1564+ mysql_cond_wait(&COND_wakeup_ready, &LOCK_wakeup_ready);
1565+ mysql_mutex_unlock(&LOCK_wakeup_ready);
1566+}
1567+
1568+void
1569+THD::signal_wakeup_ready()
1570+{
1571+ mysql_mutex_lock(&LOCK_wakeup_ready);
1572+ wakeup_ready= true;
1573+ mysql_mutex_unlock(&LOCK_wakeup_ready);
1574+ mysql_cond_signal(&COND_wakeup_ready);
1575+}
1576+
1577 bool Discrete_intervals_list::append(ulonglong start, ulonglong val,
1578 ulonglong incr)
1579 {
1580--- a/sql/sql_class.h
1581+++ b/sql/sql_class.h
1bfc1981 1582@@ -3078,6 +3078,14 @@
13ceb006
AM
1583 LEX_STRING get_invoker_user() { return invoker_user; }
1584 LEX_STRING get_invoker_host() { return invoker_host; }
1585 bool has_invoker() { return invoker_user.length > 0; }
1586+ void clear_wakeup_ready() { wakeup_ready= false; }
1587+ /*
1588+ Sleep waiting for others to wake us up with signal_wakeup_ready().
1589+ Must call clear_wakeup_ready() before waiting.
1590+ */
1591+ void wait_for_wakeup_ready();
1592+ /* Wake this thread up from wait_for_wakeup_ready(). */
1593+ void signal_wakeup_ready();
1594 private:
1595
1596 /** The current internal error handler for this thread, or NULL. */
1bfc1981 1597@@ -3120,6 +3128,16 @@
13ceb006
AM
1598 */
1599 LEX_STRING invoker_user;
1600 LEX_STRING invoker_host;
1601+ /*
1602+ Flag, mutex and condition for a thread to wait for a signal from another
1603+ thread.
1604+
1605+ Currently used to wait for group commit to complete, can also be used for
1606+ other purposes.
1607+ */
1608+ bool wakeup_ready;
1609+ mysql_mutex_t LOCK_wakeup_ready;
1610+ mysql_cond_t COND_wakeup_ready;
1611 };
1612
1613 /* Returns string as 'IP' for the client-side of the connection represented by
1614--- a/sql/sql_parse.cc
1615+++ b/sql/sql_parse.cc
1616@@ -889,6 +889,10 @@
1617 DBUG_ENTER("dispatch_command");
1618 DBUG_PRINT("info",("packet: '%*.s'; command: %d", packet_length, packet, command));
1619
1620+ DBUG_EXECUTE_IF("crash_dispatch_command_before",
1621+ { DBUG_PRINT("crash_dispatch_command_before", ("now"));
1622+ DBUG_ABORT(); });
1623+
1624 #if defined(ENABLED_PROFILING)
1625 thd->profiling.start_new_query();
1626 #endif
1627--- a/mysql-test/suite/perfschema/r/dml_setup_instruments.result
1628+++ b/mysql-test/suite/perfschema/r/dml_setup_instruments.result
1629@@ -11,9 +11,9 @@
1630 wait/synch/mutex/sql/HA_DATA_PARTITION::LOCK_auto_inc YES YES
1631 wait/synch/mutex/sql/LOCK_active_mi YES YES
1632 wait/synch/mutex/sql/LOCK_audit_mask YES YES
1633+wait/synch/mutex/sql/LOCK_commit_ordered YES YES
1634 wait/synch/mutex/sql/LOCK_connection_count YES YES
1635 wait/synch/mutex/sql/LOCK_crypt YES YES
1636-wait/synch/mutex/sql/LOCK_delayed_create YES YES
1637 select * from performance_schema.setup_instruments
1638 where name like 'Wait/Synch/Rwlock/sql/%'
1639 and name not in ('wait/synch/rwlock/sql/CRYPTO_dynlock_value::lock')
1640@@ -38,6 +38,7 @@
1641 NAME ENABLED TIMED
1642 wait/synch/cond/sql/COND_flush_thread_cache YES YES
1643 wait/synch/cond/sql/COND_manager YES YES
1644+wait/synch/cond/sql/COND_queue_busy YES YES
1645 wait/synch/cond/sql/COND_queue_state YES YES
1646 wait/synch/cond/sql/COND_rpl_status YES YES
1647 wait/synch/cond/sql/COND_server_started YES YES
1648@@ -45,7 +46,6 @@
1649 wait/synch/cond/sql/COND_thread_count YES YES
1650 wait/synch/cond/sql/Delayed_insert::cond YES YES
1651 wait/synch/cond/sql/Delayed_insert::cond_client YES YES
1652-wait/synch/cond/sql/Event_scheduler::COND_state YES YES
1653 select * from performance_schema.setup_instruments
1654 where name='Wait';
1655 select * from performance_schema.setup_instruments
1656--- a/storage/innobase/handler/ha_innodb.cc
1657+++ b/storage/innobase/handler/ha_innodb.cc
1658@@ -375,6 +375,9 @@
1659 static INNOBASE_SHARE *get_share(const char *table_name);
1660 static void free_share(INNOBASE_SHARE *share);
1661 static int innobase_close_connection(handlerton *hton, THD* thd);
1662+#ifdef EXTENDED_FOR_COMMIT_ORDERED
1663+static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all);
1664+#endif
1665 static int innobase_commit(handlerton *hton, THD* thd, bool all);
1666 static int innobase_rollback(handlerton *hton, THD* thd, bool all);
1667 static int innobase_rollback_to_savepoint(handlerton *hton, THD* thd,
1668@@ -1699,7 +1702,10 @@
1669 trx_t* trx) /*!< in/out: InnoDB transaction handle */
1670 {
1671 DBUG_ENTER("innobase_trx_init");
1672+#ifndef EXTENDED_FOR_COMMIT_ORDERED
1673+ /* used by innobase_commit_ordered */
1674 DBUG_ASSERT(EQ_CURRENT_THD(thd));
1675+#endif
1676 DBUG_ASSERT(thd == trx->mysql_thd);
1677
1678 trx->check_foreigns = !thd_test_options(
1679@@ -1760,7 +1766,10 @@
1680 {
1681 trx_t*& trx = thd_to_trx(thd);
1682
1683+#ifndef EXTENDED_FOR_COMMIT_ORDERED
1684+ /* used by innobase_commit_ordered */
1685 ut_ad(EQ_CURRENT_THD(thd));
1686+#endif
1687
1688 if (trx == NULL) {
1689 trx = innobase_trx_allocate(thd);
1690@@ -1846,6 +1855,7 @@
1691 {
1692 trx->is_registered = 0;
1693 trx->owns_prepare_mutex = 0;
1694+ trx->called_commit_ordered = 0;
1695 }
1696
1697 /*********************************************************************//**
1698@@ -1861,6 +1871,29 @@
1699 }
1700
1701 /*********************************************************************//**
1702+*/
1703+static inline
1704+void
1705+trx_called_commit_ordered_set(
1706+/*==========================*/
1707+ trx_t* trx)
1708+{
1709+ ut_a(trx_is_registered_for_2pc(trx));
1710+ trx->called_commit_ordered = 1;
1711+}
1712+
1713+/*********************************************************************//**
1714+*/
1715+static inline
1716+bool
1717+trx_called_commit_ordered(
1718+/*======================*/
1719+ const trx_t* trx)
1720+{
1721+ return(trx->called_commit_ordered == 1);
1722+}
1723+
1724+/*********************************************************************//**
1725 Check if transaction is started.
1726 @reutrn true if transaction is in state started */
1727 static
1728@@ -2435,6 +2468,9 @@
1729 innobase_hton->savepoint_set=innobase_savepoint;
1730 innobase_hton->savepoint_rollback=innobase_rollback_to_savepoint;
1731 innobase_hton->savepoint_release=innobase_release_savepoint;
1732+#ifdef EXTENDED_FOR_COMMIT_ORDERED
1733+ innobase_hton->commit_ordered=innobase_commit_ordered;
1734+#endif
1735 innobase_hton->commit=innobase_commit;
1736 innobase_hton->rollback=innobase_rollback;
1737 innobase_hton->prepare=innobase_xa_prepare;
1738@@ -3187,6 +3223,126 @@
1739 DBUG_RETURN(0);
1740 }
1741
1742+#ifdef EXTENDED_FOR_COMMIT_ORDERED
1743+/* MEMO:
1744+ InnoDB is coded with intention that always trx is accessed by the owner thd.
1745+ (not protected by any mutex/lock)
1746+ So, the caller of innobase_commit_ordered() should be conscious of
1747+ cache coherency between multi CPU about the trx, if called from another thd.
1748+
1749+ MariaDB's first implementation about it seems the cherency is protected by
1750+ the pthread_mutex LOCK_wakeup_ready. So, no problem for now.
1751+
1752+ But we should be aware the importance of the coherency.
1753+ */
1754+/*****************************************************************//**
1755+low function function innobase_commit_ordered().*/
1756+static
1757+void
1758+innobase_commit_ordered_low(
1759+/*========================*/
1760+ trx_t* trx, /*!< in: Innodb transaction */
1761+ THD* thd) /*!< in: MySQL thread handle */
1762+{
1763+ ulonglong tmp_pos;
1764+ DBUG_ENTER("innobase_commit_ordered");
1765+
1766+ /* This part was from innobase_commit() */
1767+
1768+ /* We need current binlog position for ibbackup to work.
1769+ Note, the position is current because commit_ordered is guaranteed
1770+ to be called in same sequenece as writing to binlog. */
1771+retry:
1772+ if (innobase_commit_concurrency > 0) {
1773+ mysql_mutex_lock(&commit_cond_m);
1774+ commit_threads++;
1775+
1776+ if (commit_threads > innobase_commit_concurrency) {
1777+ commit_threads--;
1778+ mysql_cond_wait(&commit_cond,
1779+ &commit_cond_m);
1780+ mysql_mutex_unlock(&commit_cond_m);
1781+ goto retry;
1782+ }
1783+ else {
1784+ mysql_mutex_unlock(&commit_cond_m);
1785+ }
1786+ }
1787+
1788+ mysql_bin_log_commit_pos(thd, &tmp_pos, &(trx->mysql_log_file_name));
1789+ trx->mysql_log_offset = (ib_int64_t) tmp_pos;
1790+
1791+ /* Don't do write + flush right now. For group commit
1792+ to work we want to do the flush in the innobase_commit()
1793+ method, which runs without holding any locks. */
1794+ trx->flush_log_later = TRUE;
1795+ innobase_commit_low(trx);
1796+ trx->flush_log_later = FALSE;
1797+
1798+ if (innobase_commit_concurrency > 0) {
1799+ mysql_mutex_lock(&commit_cond_m);
1800+ commit_threads--;
1801+ mysql_cond_signal(&commit_cond);
1802+ mysql_mutex_unlock(&commit_cond_m);
1803+ }
1804+
1805+ DBUG_VOID_RETURN;
1806+}
1807+
1808+/*****************************************************************//**
1809+Perform the first, fast part of InnoDB commit.
1810+
1811+Doing it in this call ensures that we get the same commit order here
1812+as in binlog and any other participating transactional storage engines.
1813+
1814+Note that we want to do as little as really needed here, as we run
1815+under a global mutex. The expensive fsync() is done later, in
1816+innobase_commit(), without a lock so group commit can take place.
1817+
1818+Note also that this method can be called from a different thread than
1819+the one handling the rest of the transaction. */
1820+static
1821+void
1822+innobase_commit_ordered(
1823+/*====================*/
1824+ handlerton *hton, /*!< in: Innodb handlerton */
1825+ THD* thd, /*!< in: MySQL thread handle of the user for whom
1826+ the transaction should be committed */
1827+ bool all) /*!< in: TRUE - commit transaction
1828+ FALSE - the current SQL statement ended */
1829+{
1830+ trx_t* trx;
1831+ DBUG_ENTER("innobase_commit_ordered");
1832+ DBUG_ASSERT(hton == innodb_hton_ptr);
1833+
1834+ trx = check_trx_exists(thd);
1835+
1836+ /* Since we will reserve the kernel mutex, we have to release
1837+ the search system latch first to obey the latching order. */
1838+
1839+ if (trx->has_search_latch) {
1840+ trx_search_latch_release_if_reserved(trx);
1841+ }
1842+
1843+ if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
1844+ /* We cannot throw error here; instead we will catch this error
1845+ again in innobase_commit() and report it from there. */
1846+ DBUG_VOID_RETURN;
1847+ }
1848+
1849+ /* commit_ordered is only called when committing the whole transaction
1850+ (or an SQL statement when autocommit is on). */
1851+ DBUG_ASSERT(all ||
1852+ (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)));
1853+
1854+ innobase_commit_ordered_low(trx, thd);
1855+
1856+ trx_called_commit_ordered_set(trx);
1857+
1858+ DBUG_VOID_RETURN;
1859+}
1860+#endif /* EXTENDED_FOR_COMMIT_ORDERED */
1861+
1862 /*****************************************************************//**
1863 Commits a transaction in an InnoDB database or marks an SQL statement
1864 ended.
1865@@ -3238,6 +3394,16 @@
1866 /* We were instructed to commit the whole transaction, or
1867 this is an SQL statement end and autocommit is on */
1868
1869+#ifdef EXTENDED_FOR_COMMIT_ORDERED
1870+ ut_ad(!trx_has_prepare_commit_mutex(trx));
1871+
1872+ /* Run the fast part of commit if we did not already. */
1873+ if (!trx_called_commit_ordered(trx)) {
1874+ innobase_commit_ordered_low(trx, thd);
1875+ }
1876+#else
1877+ ut_ad(!trx_called_commit_ordered(trx));
1878+
1879 /* We need current binlog position for ibbackup to work.
1880 Note, the position is current because of
1881 prepare_commit_mutex */
1882@@ -3292,6 +3458,7 @@
1883
1884 mysql_mutex_unlock(&prepare_commit_mutex);
1885 }
1886+#endif /* EXTENDED_FOR_COMMIT_ORDERED */
1887
1888 trx_deregister_from_2pc(trx);
1889
29ffd636 1890@@ -10981,6 +11148,7 @@
13ceb006
AM
1891
1892 srv_active_wake_master_thread();
1893
1894+#ifndef EXTENDED_FOR_COMMIT_ORDERED
1895 if (thd_sql_command(thd) != SQLCOM_XA_PREPARE
1896 && (all
1897 || !thd_test_options(
29ffd636 1898@@ -11007,6 +11175,7 @@
13ceb006
AM
1899 mysql_mutex_lock(&prepare_commit_mutex);
1900 trx_owns_prepare_commit_mutex_set(trx);
1901 }
1902+#endif /* ifndef EXTENDED_FOR_COMMIT_ORDERED */
1903
1904 return(error);
1905 }
1906--- a/storage/innobase/handler/ha_innodb.h
1907+++ b/storage/innobase/handler/ha_innodb.h
1908@@ -240,6 +240,12 @@
1909 struct charset_info_st *thd_charset(MYSQL_THD thd);
1910 LEX_STRING *thd_query_string(MYSQL_THD thd);
1911
1912+#ifdef EXTENDED_FOR_COMMIT_ORDERED
1913+/** Get the file name and position of the MySQL binlog corresponding to the
1914+ * current commit.
1915+ */
1916+void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file);
1917+#else
1918 /** Get the file name of the MySQL binlog.
1919 * @return the name of the binlog file
1920 */
1921@@ -249,6 +255,7 @@
1922 * @return byte offset from the beginning of the binlog
1923 */
1924 ulonglong mysql_bin_log_file_pos(void);
1925+#endif
1926
1927 /**
1928 Check if a user thread is a replication slave thread
1929--- a/storage/innobase/include/trx0trx.h
1930+++ b/storage/innobase/include/trx0trx.h
1931@@ -494,6 +494,7 @@
1932 this is set to 1 then registered should
1933 also be set to 1. This is used in the
1934 XA code */
1935+ unsigned called_commit_ordered:1;/* 1 if innobase_commit_ordered has run. */
1936 /*------------------------------*/
1937 ulint isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */
1938 ulint check_foreigns; /* normally TRUE, but if the user
1939--- a/storage/innobase/trx/trx0trx.c
1940+++ b/storage/innobase/trx/trx0trx.c
1941@@ -111,6 +111,7 @@
1942
1943 trx->is_registered = 0;
1944 trx->owns_prepare_mutex = 0;
1945+ trx->called_commit_ordered = 0;
1946
1947 trx->start_time = ut_time();
1948
1949--- /dev/null
1950+++ b/mysql-test/r/group_commit.result
1951@@ -0,0 +1,63 @@
1952+CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
1953+SELECT variable_value INTO @commits FROM information_schema.global_status
1954+WHERE variable_name = 'binlog_commits';
1955+SELECT variable_value INTO @group_commits FROM information_schema.global_status
1956+WHERE variable_name = 'binlog_group_commits';
1957+SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group1_running WAIT_FOR group2_queued";
1958+INSERT INTO t1 VALUES ("con1");
1959+set DEBUG_SYNC= "now WAIT_FOR group1_running";
1960+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con2";
1961+SET DEBUG_SYNC= "commit_after_release_LOCK_log WAIT_FOR group3_committed";
1962+SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
1963+INSERT INTO t1 VALUES ("con2");
1964+SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
1965+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con3";
1966+INSERT INTO t1 VALUES ("con3");
1967+SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
1968+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con4";
1969+INSERT INTO t1 VALUES ("con4");
1970+SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
1971+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
1972+SELECT * FROM t1 ORDER BY a;
1973+a
1974+SET DEBUG_SYNC= "now SIGNAL group2_queued";
1975+SELECT * FROM t1 ORDER BY a;
1976+a
1977+con1
1978+SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
1979+SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con5_leader WAIT_FOR con6_queued";
1980+INSERT INTO t1 VALUES ("con5");
1981+SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
1982+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con6_queued";
1983+INSERT INTO t1 VALUES ("con6");
1984+SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
1985+SELECT * FROM t1 ORDER BY a;
1986+a
1987+con1
1988+SET DEBUG_SYNC= "now SIGNAL group3_committed";
1989+SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
1990+SELECT * FROM t1 ORDER BY a;
1991+a
1992+con1
1993+con2
1994+con3
1995+con4
1996+SET DEBUG_SYNC= "now SIGNAL group2_checked";
1997+SELECT * FROM t1 ORDER BY a;
1998+a
1999+con1
2000+con2
2001+con3
2002+con4
2003+con5
2004+con6
2005+SELECT variable_value - @commits FROM information_schema.global_status
2006+WHERE variable_name = 'binlog_commits';
2007+variable_value - @commits
2008+6
2009+SELECT variable_value - @group_commits FROM information_schema.global_status
2010+WHERE variable_name = 'binlog_group_commits';
2011+variable_value - @group_commits
2012+3
2013+SET DEBUG_SYNC= 'RESET';
2014+DROP TABLE t1;
2015--- /dev/null
2016+++ b/mysql-test/r/group_commit_binlog_pos.result
2017@@ -0,0 +1,35 @@
2018+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2019+INSERT INTO t1 VALUES (0);
2020+SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con1_waiting WAIT_FOR con3_queued";
2021+SET DEBUG_SYNC= "commit_loop_entry_commit_ordered SIGNAL con1_loop WAIT_FOR con1_loop_cont EXECUTE 3";
2022+INSERT INTO t1 VALUES (1);
2023+SET DEBUG_SYNC= "now WAIT_FOR con1_waiting";
2024+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con2_queued";
2025+INSERT INTO t1 VALUES (2);
2026+SET DEBUG_SYNC= "now WAIT_FOR con2_queued";
2027+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con3_queued";
2028+INSERT INTO t1 VALUES (3);
2029+SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2030+SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2031+SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2032+SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2033+SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2034+SELECT * FROM t1 ORDER BY a;
2035+a
2036+0
2037+1
2038+2
2039+SET SESSION debug="+d,crash_dispatch_command_before";
2040+SELECT 1;
2041+Got one of the listed errors
2042+Got one of the listed errors
2043+Got one of the listed errors
2044+SELECT * FROM t1 ORDER BY a;
2045+a
2046+0
2047+1
2048+2
2049+3
2050+InnoDB: Last MySQL binlog file position 0 768, file name ./master-bin.000001
2051+SET DEBUG_SYNC= 'RESET';
2052+DROP TABLE t1;
2053--- /dev/null
2054+++ b/mysql-test/r/group_commit_crash.result
2055@@ -0,0 +1,120 @@
2056+CREATE TABLE t1(a CHAR(255),
2057+b CHAR(255),
2058+c CHAR(255),
2059+d CHAR(255),
2060+id INT AUTO_INCREMENT,
2061+PRIMARY KEY(id)) ENGINE=InnoDB;
2062+create table t2 like t1;
2063+create procedure setcrash(IN i INT)
2064+begin
2065+CASE i
2066+WHEN 1 THEN SET SESSION debug="d,crash_commit_after_prepare";
2067+WHEN 2 THEN SET SESSION debug="d,crash_commit_after_log";
2068+WHEN 3 THEN SET SESSION debug="d,crash_commit_before_unlog";
2069+WHEN 4 THEN SET SESSION debug="d,crash_commit_after";
2070+WHEN 5 THEN SET SESSION debug="d,crash_commit_before";
2071+ELSE BEGIN END;
2072+END CASE;
2073+end //
2074+FLUSH TABLES;
2075+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2076+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2077+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2078+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2079+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2080+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2081+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2082+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2083+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2084+INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2085+RESET MASTER;
2086+START TRANSACTION;
2087+insert into t1 select * from t2;
2088+call setcrash(5);
2089+COMMIT;
2090+Got one of the listed errors
2091+SELECT * FROM t1 ORDER BY id;
2092+a b c d id
2093+SHOW BINLOG EVENTS LIMIT 2,1;
2094+Log_name Pos Event_type Server_id End_log_pos Info
2095+delete from t1;
2096+RESET MASTER;
2097+START TRANSACTION;
2098+insert into t1 select * from t2;
2099+call setcrash(4);
2100+COMMIT;
2101+Got one of the listed errors
2102+SELECT * FROM t1 ORDER BY id;
2103+a b c d id
2104+a b c d 1
2105+a b c d 2
2106+a b c d 3
2107+a b c d 4
2108+a b c d 5
2109+a b c d 6
2110+a b c d 7
2111+a b c d 8
2112+a b c d 9
2113+a b c d 10
2114+SHOW BINLOG EVENTS LIMIT 2,1;
2115+Log_name Pos Event_type Server_id End_log_pos Info
2116+master-bin.000001 175 Query 1 269 use `test`; insert into t1 select * from t2
2117+delete from t1;
2118+RESET MASTER;
2119+START TRANSACTION;
2120+insert into t1 select * from t2;
2121+call setcrash(3);
2122+COMMIT;
2123+Got one of the listed errors
2124+SELECT * FROM t1 ORDER BY id;
2125+a b c d id
2126+a b c d 1
2127+a b c d 2
2128+a b c d 3
2129+a b c d 4
2130+a b c d 5
2131+a b c d 6
2132+a b c d 7
2133+a b c d 8
2134+a b c d 9
2135+a b c d 10
2136+SHOW BINLOG EVENTS LIMIT 2,1;
2137+Log_name Pos Event_type Server_id End_log_pos Info
2138+master-bin.000001 175 Query 1 269 use `test`; insert into t1 select * from t2
2139+delete from t1;
2140+RESET MASTER;
2141+START TRANSACTION;
2142+insert into t1 select * from t2;
2143+call setcrash(2);
2144+COMMIT;
2145+Got one of the listed errors
2146+SELECT * FROM t1 ORDER BY id;
2147+a b c d id
2148+a b c d 1
2149+a b c d 2
2150+a b c d 3
2151+a b c d 4
2152+a b c d 5
2153+a b c d 6
2154+a b c d 7
2155+a b c d 8
2156+a b c d 9
2157+a b c d 10
2158+SHOW BINLOG EVENTS LIMIT 2,1;
2159+Log_name Pos Event_type Server_id End_log_pos Info
2160+master-bin.000001 175 Query 1 269 use `test`; insert into t1 select * from t2
2161+delete from t1;
2162+RESET MASTER;
2163+START TRANSACTION;
2164+insert into t1 select * from t2;
2165+call setcrash(1);
2166+COMMIT;
2167+Got one of the listed errors
2168+SELECT * FROM t1 ORDER BY id;
2169+a b c d id
2170+SHOW BINLOG EVENTS LIMIT 2,1;
2171+Log_name Pos Event_type Server_id End_log_pos Info
2172+delete from t1;
2173+DROP TABLE t1;
2174+DROP TABLE t2;
2175+DROP PROCEDURE setcrash;
2176--- /dev/null
2177+++ b/mysql-test/r/xa_binlog.result
2178@@ -0,0 +1,32 @@
2179+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB;
2180+SET binlog_format= mixed;
2181+RESET MASTER;
2182+XA START 'xatest';
2183+INSERT INTO t1 VALUES (1);
2184+XA END 'xatest';
2185+XA PREPARE 'xatest';
2186+XA COMMIT 'xatest';
2187+XA START 'xatest';
2188+INSERT INTO t1 VALUES (2);
2189+XA END 'xatest';
2190+XA COMMIT 'xatest' ONE PHASE;
2191+BEGIN;
2192+INSERT INTO t1 VALUES (3);
2193+COMMIT;
2194+SELECT * FROM t1 ORDER BY a;
2195+a
2196+1
2197+2
2198+3
2199+SHOW BINLOG EVENTS LIMIT 1,9;
2200+Log_name Pos Event_type Server_id End_log_pos Info
2201+master-bin.000001 # Query 1 # BEGIN
2202+master-bin.000001 # Query 1 # use `test`; INSERT INTO t1 VALUES (1)
2203+master-bin.000001 # Query 1 # COMMIT
2204+master-bin.000001 # Query 1 # BEGIN
2205+master-bin.000001 # Query 1 # use `test`; INSERT INTO t1 VALUES (2)
2206+master-bin.000001 # Xid 1 # COMMIT /* xid=XX */
2207+master-bin.000001 # Query 1 # BEGIN
2208+master-bin.000001 # Query 1 # use `test`; INSERT INTO t1 VALUES (3)
2209+master-bin.000001 # Xid 1 # COMMIT /* xid=XX */
2210+DROP TABLE t1;
2211--- /dev/null
2212+++ b/mysql-test/suite/binlog/r/binlog_ioerr.result
2213@@ -0,0 +1,28 @@
2214+CALL mtr.add_suppression("Error writing file 'master-bin'");
2215+RESET MASTER;
2216+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2217+INSERT INTO t1 VALUES(0);
2218+SET SESSION debug='+d,fail_binlog_write_1';
2219+INSERT INTO t1 VALUES(1);
2220+ERROR HY000: Error writing file 'master-bin' (errno: 28)
2221+INSERT INTO t1 VALUES(2);
2222+ERROR HY000: Error writing file 'master-bin' (errno: 28)
2223+SET SESSION debug='';
2224+INSERT INTO t1 VALUES(3);
2225+SELECT * FROM t1;
2226+a
2227+0
2228+3
2229+SHOW BINLOG EVENTS;
2230+Log_name Pos Event_type Server_id End_log_pos Info
2231+BINLOG POS Format_desc 1 ENDPOS Server ver: #, Binlog ver: #
2232+BINLOG POS Query 1 ENDPOS use `test`; CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb
2233+BINLOG POS Query 1 ENDPOS BEGIN
2234+BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(0)
2235+BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
2236+BINLOG POS Query 1 ENDPOS BEGIN
2237+BINLOG POS Query 1 ENDPOS BEGIN
2238+BINLOG POS Query 1 ENDPOS BEGIN
2239+BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(3)
2240+BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
2241+DROP TABLE t1;
2242--- /dev/null
2243+++ b/mysql-test/suite/binlog/t/binlog_ioerr.test
2244@@ -0,0 +1,30 @@
2245+source include/have_debug.inc;
2246+source include/have_innodb.inc;
2247+source include/have_log_bin.inc;
2248+source include/have_binlog_format_mixed_or_statement.inc;
2249+
2250+CALL mtr.add_suppression("Error writing file 'master-bin'");
2251+
2252+RESET MASTER;
2253+
2254+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2255+INSERT INTO t1 VALUES(0);
2256+SET SESSION debug='+d,fail_binlog_write_1';
2257+--error ER_ERROR_ON_WRITE
2258+INSERT INTO t1 VALUES(1);
2259+--error ER_ERROR_ON_WRITE
2260+INSERT INTO t1 VALUES(2);
2261+SET SESSION debug='';
2262+INSERT INTO t1 VALUES(3);
2263+SELECT * FROM t1;
2264+
2265+# Actually the output from this currently shows a bug.
2266+# The injected IO error leaves partially written transactions in the binlog in
2267+# the form of stray "BEGIN" events.
2268+# These should disappear from the output if binlog error handling is improved
2269+# (see MySQL Bug#37148 and WL#1790).
2270+--replace_regex /\/\* xid=.* \*\//\/* XID *\// /Server ver: .*, Binlog ver: .*/Server ver: #, Binlog ver: #/ /table_id: [0-9]+/table_id: #/
2271+--replace_column 1 BINLOG 2 POS 5 ENDPOS
2272+SHOW BINLOG EVENTS;
2273+
2274+DROP TABLE t1;
2275--- /dev/null
2276+++ b/mysql-test/t/group_commit.test
2277@@ -0,0 +1,115 @@
2278+--source include/have_debug_sync.inc
2279+--source include/have_innodb.inc
2280+--source include/have_log_bin.inc
2281+
2282+# Test some group commit code paths by using debug_sync to do controlled
2283+# commits of 6 transactions: first 1 alone, then 3 as a group, then 2 as a
2284+# group.
2285+#
2286+# Group 3 is allowed to race as far as possible ahead before group 2 finishes
2287+# to check some edge case for concurrency control.
2288+
2289+CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
2290+
2291+SELECT variable_value INTO @commits FROM information_schema.global_status
2292+ WHERE variable_name = 'binlog_commits';
2293+SELECT variable_value INTO @group_commits FROM information_schema.global_status
2294+ WHERE variable_name = 'binlog_group_commits';
2295+
2296+connect(con1,localhost,root,,);
2297+connect(con2,localhost,root,,);
2298+connect(con3,localhost,root,,);
2299+connect(con4,localhost,root,,);
2300+connect(con5,localhost,root,,);
2301+connect(con6,localhost,root,,);
2302+
2303+# Start group1 (with one thread) doing commit, waiting for
2304+# group2 to queue up before finishing.
2305+
2306+connection con1;
2307+SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group1_running WAIT_FOR group2_queued";
2308+send INSERT INTO t1 VALUES ("con1");
2309+
2310+# Make group2 (with three threads) queue up.
2311+# Make sure con2 is the group commit leader for group2.
2312+# Make group2 wait with running commit_ordered() until group3 has committed.
2313+
2314+connection con2;
2315+set DEBUG_SYNC= "now WAIT_FOR group1_running";
2316+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con2";
2317+SET DEBUG_SYNC= "commit_after_release_LOCK_log WAIT_FOR group3_committed";
2318+SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
2319+send INSERT INTO t1 VALUES ("con2");
2320+connection con3;
2321+SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
2322+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con3";
2323+send INSERT INTO t1 VALUES ("con3");
2324+connection con4;
2325+SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
2326+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL group2_con4";
2327+send INSERT INTO t1 VALUES ("con4");
2328+
2329+# When group2 is queued, let group1 continue and queue group3.
2330+
2331+connection default;
2332+SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
2333+
2334+# At this point, transaction 1 is still not visible as commit_ordered() has not
2335+# been called yet.
2336+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
2337+SELECT * FROM t1 ORDER BY a;
2338+
2339+SET DEBUG_SYNC= "now SIGNAL group2_queued";
2340+connection con1;
2341+reap;
2342+
2343+# Now transaction 1 is visible.
2344+connection default;
2345+SELECT * FROM t1 ORDER BY a;
2346+
2347+connection con5;
2348+SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
2349+SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con5_leader WAIT_FOR con6_queued";
2350+send INSERT INTO t1 VALUES ("con5");
2351+
2352+connection con6;
2353+SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
2354+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con6_queued";
2355+send INSERT INTO t1 VALUES ("con6");
2356+
2357+connection default;
2358+SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
2359+# Still only transaction 1 visible, as group2 has not yet run commit_ordered().
2360+SELECT * FROM t1 ORDER BY a;
2361+SET DEBUG_SYNC= "now SIGNAL group3_committed";
2362+SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
2363+# Now transactions 1-4 visible.
2364+SELECT * FROM t1 ORDER BY a;
2365+SET DEBUG_SYNC= "now SIGNAL group2_checked";
2366+
2367+connection con2;
2368+reap;
2369+
2370+connection con3;
2371+reap;
2372+
2373+connection con4;
2374+reap;
2375+
2376+connection con5;
2377+reap;
2378+
2379+connection con6;
2380+reap;
2381+
2382+connection default;
2383+# Check all transactions finally visible.
2384+SELECT * FROM t1 ORDER BY a;
2385+
2386+SELECT variable_value - @commits FROM information_schema.global_status
2387+ WHERE variable_name = 'binlog_commits';
2388+SELECT variable_value - @group_commits FROM information_schema.global_status
2389+ WHERE variable_name = 'binlog_group_commits';
2390+
2391+SET DEBUG_SYNC= 'RESET';
2392+DROP TABLE t1;
2393--- /dev/null
2394+++ b/mysql-test/t/group_commit_binlog_pos-master.opt
2395@@ -0,0 +1 @@
2396+--skip-stack-trace --skip-core-file
2397--- /dev/null
2398+++ b/mysql-test/t/group_commit_binlog_pos.test
2399@@ -0,0 +1,89 @@
2400+--source include/have_debug_sync.inc
2401+--source include/have_innodb.inc
2402+--source include/have_log_bin.inc
2403+--source include/have_binlog_format_mixed_or_statement.inc
2404+
2405+# Need DBUG to crash the server intentionally
2406+--source include/have_debug.inc
2407+# Don't test this under valgrind, memory leaks will occur as we crash
2408+--source include/not_valgrind.inc
2409+
2410+# The test case currently uses grep and tail, which may be unavailable on
2411+# some Windows systems. But see MWL#191 for how to remove the need for grep.
2412+--source include/not_windows.inc
2413+
2414+# XtraDB stores the binlog position corresponding to the last commit, and
2415+# prints it during crash recovery.
2416+# Test that we get the correct position when we group commit several
2417+# transactions together.
2418+
2419+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
2420+INSERT INTO t1 VALUES (0);
2421+
2422+connect(con1,localhost,root,,);
2423+connect(con2,localhost,root,,);
2424+connect(con3,localhost,root,,);
2425+
2426+# Queue up three commits for group commit.
2427+
2428+connection con1;
2429+SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con1_waiting WAIT_FOR con3_queued";
2430+SET DEBUG_SYNC= "commit_loop_entry_commit_ordered SIGNAL con1_loop WAIT_FOR con1_loop_cont EXECUTE 3";
2431+send INSERT INTO t1 VALUES (1);
2432+
2433+connection con2;
2434+SET DEBUG_SYNC= "now WAIT_FOR con1_waiting";
2435+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con2_queued";
2436+send INSERT INTO t1 VALUES (2);
2437+
2438+connection con3;
2439+SET DEBUG_SYNC= "now WAIT_FOR con2_queued";
2440+SET DEBUG_SYNC= "commit_group_commit_queue SIGNAL con3_queued";
2441+send INSERT INTO t1 VALUES (3);
2442+
2443+connection default;
2444+SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2445+# At this point, no transactions are committed.
2446+SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2447+SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2448+# At this point, 1 transaction is committed.
2449+SET DEBUG_SYNC= "now SIGNAL con1_loop_cont";
2450+SET DEBUG_SYNC= "now WAIT_FOR con1_loop";
2451+
2452+# At this point, 2 transactions are committed.
2453+SELECT * FROM t1 ORDER BY a;
2454+
2455+connection con2;
2456+reap;
2457+
2458+# Now crash the server with 1+2 in-memory committed, 3 only prepared.
2459+connection default;
2460+system echo wait-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
2461+SET SESSION debug="+d,crash_dispatch_command_before";
2462+--error 2006,2013
2463+SELECT 1;
2464+
2465+connection con1;
2466+--error 2006,2013
2467+reap;
2468+connection con3;
2469+--error 2006,2013
2470+reap;
2471+
2472+system echo restart-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
2473+
2474+connection default;
2475+--enable_reconnect
2476+--source include/wait_until_connected_again.inc
2477+
2478+# Crash recovery should recover all three transactions.
2479+SELECT * FROM t1 ORDER BY a;
2480+
2481+# Check that the binlog position reported by InnoDB is the correct one
2482+# for the end of the second transaction (as can be checked with
2483+# mysqlbinlog).
2484+let $MYSQLD_DATADIR= `SELECT @@datadir`;
2485+--exec grep 'InnoDB: Last MySQL binlog file position' $MYSQLD_DATADIR/../../log/mysqld.1.err | tail -1
2486+
2487+SET DEBUG_SYNC= 'RESET';
2488+DROP TABLE t1;
2489--- /dev/null
2490+++ b/mysql-test/t/group_commit_crash-master.opt
2491@@ -0,0 +1 @@
2492+--skip-stack-trace --skip-core-file
2493--- /dev/null
2494+++ b/mysql-test/t/group_commit_crash.test
2495@@ -0,0 +1,80 @@
2496+# Testing group commit by crashing a few times.
2497+# Test adapted from the Facebook patch: lp:mysqlatfacebook
2498+--source include/not_embedded.inc
2499+# Don't test this under valgrind, memory leaks will occur
2500+--source include/not_valgrind.inc
2501+
2502+# Binary must be compiled with debug for crash to occur
2503+--source include/have_debug.inc
2504+--source include/have_innodb.inc
2505+--source include/have_log_bin.inc
2506+
2507+let $innodb_file_format_max_orig=`select @@innodb_file_format_max`;
2508+CREATE TABLE t1(a CHAR(255),
2509+ b CHAR(255),
2510+ c CHAR(255),
2511+ d CHAR(255),
2512+ id INT AUTO_INCREMENT,
2513+ PRIMARY KEY(id)) ENGINE=InnoDB;
2514+create table t2 like t1;
2515+delimiter //;
2516+create procedure setcrash(IN i INT)
2517+begin
2518+ CASE i
2519+ WHEN 1 THEN SET SESSION debug="d,crash_commit_after_prepare";
2520+ WHEN 2 THEN SET SESSION debug="d,crash_commit_after_log";
2521+ WHEN 3 THEN SET SESSION debug="d,crash_commit_before_unlog";
2522+ WHEN 4 THEN SET SESSION debug="d,crash_commit_after";
2523+ WHEN 5 THEN SET SESSION debug="d,crash_commit_before";
2524+ ELSE BEGIN END;
2525+ END CASE;
2526+end //
2527+delimiter ;//
2528+# Avoid getting a crashed mysql.proc table.
2529+FLUSH TABLES;
2530+
2531+let $numtests = 5;
2532+
2533+let $numinserts = 10;
2534+while ($numinserts)
2535+{
2536+ dec $numinserts;
2537+ INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd');
2538+}
2539+
2540+--enable_reconnect
2541+
2542+while ($numtests)
2543+{
2544+ RESET MASTER;
2545+
2546+ START TRANSACTION;
2547+ insert into t1 select * from t2;
2548+ # Write file to make mysql-test-run.pl expect crash
2549+ --exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
2550+
2551+ eval call setcrash($numtests);
2552+
2553+ # Run the crashing query
2554+ --error 2006,2013
2555+ COMMIT;
2556+
2557+ # Poll the server waiting for it to be back online again.
2558+ --source include/wait_until_connected_again.inc
2559+
2560+ # table and binlog should be in sync.
2561+ SELECT * FROM t1 ORDER BY id;
2562+ SHOW BINLOG EVENTS LIMIT 2,1;
2563+
2564+ delete from t1;
2565+
2566+ dec $numtests;
2567+}
2568+
2569+# final cleanup
2570+DROP TABLE t1;
2571+DROP TABLE t2;
2572+DROP PROCEDURE setcrash;
2573+--disable_query_log
2574+eval SET GLOBAL innodb_file_format_max=$innodb_file_format_max_orig;
2575+--enable_query_log
2576--- /dev/null
2577+++ b/mysql-test/t/xa_binlog.test
2578@@ -0,0 +1,32 @@
2579+--source include/have_innodb.inc
2580+--source include/have_log_bin.inc
2581+
2582+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB;
2583+
2584+# Fix binlog format (otherwise SHOW BINLOG EVENTS will fluctuate).
2585+SET binlog_format= mixed;
2586+
2587+RESET MASTER;
2588+
2589+XA START 'xatest';
2590+INSERT INTO t1 VALUES (1);
2591+XA END 'xatest';
2592+XA PREPARE 'xatest';
2593+XA COMMIT 'xatest';
2594+
2595+XA START 'xatest';
2596+INSERT INTO t1 VALUES (2);
2597+XA END 'xatest';
2598+XA COMMIT 'xatest' ONE PHASE;
2599+
2600+BEGIN;
2601+INSERT INTO t1 VALUES (3);
2602+COMMIT;
2603+
2604+SELECT * FROM t1 ORDER BY a;
2605+
2606+--replace_column 2 # 5 #
2607+--replace_regex /xid=[0-9]+/xid=XX/
2608+SHOW BINLOG EVENTS LIMIT 1,9;
2609+
2610+DROP TABLE t1;
This page took 0.523133 seconds and 4 git commands to generate.