diff -aruN postgresql-8.2.4/GNUmakefile.in pgcluster-1.7.0rc7/GNUmakefile.in --- postgresql-8.2.4/GNUmakefile.in 2006-08-18 21:58:05.000000000 +0200 +++ pgcluster-1.7.0rc7/GNUmakefile.in 2007-02-18 22:52:16.000000000 +0100 @@ -63,13 +63,13 @@ ########################################################################## -distdir := postgresql-$(VERSION) +distdir := pgcluster-$(PGCLUSTER_VERSION) dummy := =install= -garbage := =* "#"* ."#"* *~* *.orig *.rej core postgresql-* +garbage := =* "#"* ."#"* *~* *.orig *.rej core pgcluster-* dist: $(distdir).tar.gz ifeq ($(split-dist), yes) -dist: postgresql-base-$(VERSION).tar.gz postgresql-docs-$(VERSION).tar.gz postgresql-opt-$(VERSION).tar.gz postgresql-test-$(VERSION).tar.gz +dist: pgcluster-base-$(PGCLUSTER_VERSION).tar.gz pgcluster-docs-$(PGCLUSTER_VERSION).tar.gz pgcluster-opt-$(PGCLUSTER_VERSION).tar.gz pgcluster-test-$(PGCLUSTER_VERSION).tar.gz endif dist: -rm -rf $(distdir) @@ -81,19 +81,19 @@ src/tools src/tutorial \ $(addprefix src/pl/, plperl plpython tcl) -docs_files := doc/postgres.tar.gz doc/src doc/TODO.detail +docs_files := doc/pgcluster.tar.gz doc/src doc/TODO.detail -postgresql-base-$(VERSION).tar: distdir +pgcluster-base-$(PGCLUSTER_VERSION).tar: distdir $(TAR) -c $(addprefix --exclude $(distdir)/, $(docs_files) $(opt_files) src/test) \ -f $@ $(distdir) -postgresql-docs-$(VERSION).tar: distdir +pgcluster-docs-$(PGCLUSTER_VERSION).tar: distdir $(TAR) cf $@ $(addprefix $(distdir)/, $(docs_files)) -postgresql-opt-$(VERSION).tar: distdir +pgcluster-opt-$(PGCLUSTER_VERSION).tar: distdir $(TAR) cf $@ $(addprefix $(distdir)/, $(opt_files)) -postgresql-test-$(VERSION).tar: distdir +pgcluster-test-$(PGCLUSTER_VERSION).tar: distdir $(TAR) cf $@ $(distdir)/src/test distdir: diff -aruN postgresql-8.2.4/INSTALL_PGCLUSTER pgcluster-1.7.0rc7/INSTALL_PGCLUSTER --- postgresql-8.2.4/INSTALL_PGCLUSTER 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/INSTALL_PGCLUSTER 2007-02-19 00:59:13.000000000 +0100 @@ -0,0 
+1,392 @@ +PGCluster Installation Instructions + +============================================================= +1. Installation +============================================================= + +1-1. Install Cluster DB Server, Replication Server & Load Balancer +---------------------------------------------------------------- +$ cd $source_dir +$ ./configure +$ gmake +$ su +# gmake install +# chown -R postgres /usr/local/pgsql +---------------------------------------------------------------- + +============================================================= +2. Initialize DB +============================================================= +$ su +# adduser postgres +# mkdir /usr/local/pgsql/data +# chown postgres /usr/local/pgsql/data +# su - postgres +$ /usr/local/pgsql/bin/initdb -D /usr/local/pgsql/data + + +============================================================= +3. Configuration +============================================================= +(EX.System Composition) + + | + ((Load Balance Server)) + ( hostname: lb.pgcluster.org) + ( receive port:5432 ) + ( recovery port:6001 ) + | +----------+-------------+------------+---------- + | | + (( Cluster DB 1 )) (( Cluster DB 2 )) + ( hostname:c1.pgcluster.org) ( hostname:c2.pgcluster.org) + ( receive port: 5432 ) ( receive port:5432 ) + ( recovery port:7001 ) ( recovery port 7002 ) + | | +----------+-------------+------------+---------- + | + ((Replication Server)) + ( hostname:pgr.pgcluster.org) + ( receive port:8001 ) + ( recovery port:8101 ) + + +3-1. Load Balance Server + +The setup file of load balance server is copied from the sample file and edited. 
+(the sample file is installed '/usr/local/pgsql/share' in default) +---------------------------------------------------------------- +$cd /usr/local/pgsql/share +$cp pglb.conf.sample pglb.conf +---------------------------------------------------------------- + +In the case of the above system composition example, +the setup example of pglb.conf file is as the following + +#============================================================ +# Load Balance Server configuration file +#------------------------------------------------------------- +# file: pglb.conf +#------------------------------------------------------------- +# This file controls: +# o which hosts are db cluster server +# o which port use connect to db cluster server +# o how many connections are allowed on each DB server +#============================================================ +#------------------------------------------------------------- +# set cluster DB server information +# o Host_Name : hostname +# o Port : Connection for postmaster +# o Max_Connection : Maximum number of connection to postmaster +#------------------------------------------------------------- + + c1.pgcluster.org + 5432 + 32 + + + c2.pgcluster.org + 5432 + 32 + +#------------------------------------------------------------- +# set Load Balance server information +# o Host_Name : The host name of this load balance server. +# -- please write a host name by FQDN or IP address. 
+# o Backend_Socket_Dir : Unix domain socket path for the backend +# o Receive_Port : Connection from client +# o Recovery_Port : Connection for recovery process +# o Max_Cluster_Num : Maximum number of cluster DB servers +# o Use_Connection_Pooling : Use connection pool [yes/no] +# o Lifecheck_Timeout : Timeout of the lifecheck response +# o Lifecheck_Interval : Interval time of the lifecheck +# (range 1s - 1h) +# 10s -- 10 seconds +# 10min -- 10 minutes +# 1h -- 1 hours +#------------------------------------------------------------- + lb.pgcluster.org + /tmp + 5432 + 6001 + 128 + no + 3s + 15s +#------------------------------------------------------------- +# A setup of a log files +# +# o File_Name : Log file name with full path +# o File_Size : Maximum size of each log files +# Please specify in a number and unit(K or M) +# 10 -- 10 Byte +# 10K -- 10 KByte +# 10M -- 10 MByte +# o Rotate : Rotation times +# If specified 0, old versions are removed. +#------------------------------------------------------------- + + /tmp/pglb.log + 1M + 3 + + +3-2. Cluster DB Server + +The Cluster DB server needs two configuration files to be edited +('pg_hba.conf' and 'cluster.conf'). +These files are created under the $PG_DATA directory after 'initdb'. + +A. pg_hba.conf +Permission to connect to the DB via IP connections is needed for this system. + +B. 
cluster.conf +In the case of the above system composition example, +the setup example of cluster.conf file is as the following + +#============================================================ +# Cluster DB Server configuration file +#------------------------------------------------------------- +# file: cluster.conf +#------------------------------------------------------------- +# This file controls: +# o which hosts & port are replication server +# o which port use for replication request to replication server +# o which command use for recovery function +# +#============================================================ +#------------------------------------------------------------- +# set cluster DB server information +# o Host_Name : hostname +# o Port : Connection port for postmaster +# o Recovery_Port : Connection for recovery process +#------------------------------------------------------------- + + pgr.pgcluster.org + 8001 + 8101 + +#------------------------------------------------------------- +# set Cluster DB Server information +# o Host_Name : Host name which connect with replication server +# o Recovery_Port : Connection port for recovery +# o Rsync_Path : Path of rsync command +# o Rsync_Option : File transfer option for rsync +# o Rsync_Compress : Use compression option for rsync +# [yes/no]. default : yes +# o Pg_Dump_Path : path of pg_dump +# o When_Stand_Alone : When all replication servers fell, +# you can set up two kinds of permission, +# "read_only" or "read_write". 
+# o Replication_Timeout : Timeout of each replication request +# o Lifecheck_Timeout : Timeout of the lifecheck response +# o Lifecheck_Interval : Interval time of the lifecheck +# (range 1s - 1h) +# 10s -- 10 seconds +# 10min -- 10 minutes +# 1h -- 1 hours +#------------------------------------------------------------- + c1.pgcluster.org + 7001 + /usr/bin/rsync + ssh -1 + yes + /usr/local/pgsql/bin/pg_dump + read_only + 1min + 3s + 11s +#------------------------------------------------------------- +# set partitional replicate control information +# set DB name and Table name to stop replication +# o DB_Name : DB name +# o Table_Name : Table name +#------------------------------------------------------------- +# +# test_db +# log_table +# + +3-3. Replication Server + +The setup file of replication server is copied from the sample file and edited. +(the sample file is installed in '/usr/local/pgsql/share' by default) +---------------------------------------------------------------- +$cd /usr/local/pgsql/share +$cp pgreplicate.conf.sample pgreplicate.conf +---------------------------------------------------------------- +In the case of the above system composition example, +the setup example of pgreplicate.conf file is as the following + +#============================================================ +# PGReplicate configuration file +#------------------------------------------------------------- +# file: pgreplicate.conf +#------------------------------------------------------------- +# This file controls: +# o which hosts & port are cluster server +# o which port use for replication request from cluster server +#============================================================ +#------------------------------------------------------------- +# set cluster DB server information +# o Host_Name : hostname +# o Port : Connection port for postmaster +# o Recovery_Port : Connection port for recovery +#------------------------------------------------------------- + + 
c1.pgcluster.org + 5432 + 7001 + + + c2.pgcluster.org + 5432 + 7002 + +#------------------------------------------------------------- +# set Load Balance server information +# o Host_Name : hostname +# o Recovery_Port : Connection port for recovery +#------------------------------------------------------------- + + lb.pgcluster.org + 6001 + +#------------------------------------------------------------ +# A setup of the cascade connection between replication servers. +# When you do not use RLOG recovery, you can skip this setup +# +# o Host_Name : The host name of the upper replication server. +# Please write a host name by FQDN or IP address. +# o Port : The connection port with postmaster. +# o Recovery_Port : The connection port at the time of +# a recovery sequence. +#------------------------------------------------------------ +# +# upper_replicate.pgcluster.org +# 8002 +# 8102 +# +# +#------------------------------------------------------------- +# A setup of a replication server +# +# o Host_Name : The host name of this replication server. +# Please write a host name by FQDN or IP address. +# o Replicate_Port : Connection port for replication +# o Recovery_Port : Connection port for recovery +# o RLOG_Port : Connection port for replication log +# o Response_mode : Timing which returns a response +# - normal -- return result of DB which received the query +# - reliable -- return result after waiting for response of +# all Cluster DBs. +# o Use_Replication_Log : Use replication log +# [yes/no]. 
default : no +# o Replication_Timeout : Timeout of each replication response +# o Lifecheck_Timeout : Timeout of the lifecheck response +# o Lifecheck_Interval : Interval time of the lifecheck +# (range 1s - 1h) +# 10s -- 10 seconds +# 10min -- 10 minutes +# 1h -- 1 hours +#------------------------------------------------------------- + pgr.pgcluster.org + 8001 + 8101 + 8301 + normal + no + 1min + 3s + 15s +#------------------------------------------------------------- +# A setup of a log files +# +# o File_Name : Log file name with full path +# o File_Size : maximum size of each log files +# Please specify in a number and unit(K or M) +# 10 -- 10 Byte +# 10K -- 10 KByte +# 10M -- 10 MByte +# o Rotate : Rotation times +# If specified 0, old versions are removed. +#------------------------------------------------------------- + + /tmp/pgreplicate.log + 1M + 3 + + +============================================================= +4. Start Up / Stop +============================================================= + +4-1. replication server + +A. Start replication server +---------------------------------------------------------------- +$ /usr/local/pgsql/bin/pgreplicate -D /usr/local/pgsql/share +---------------------------------------------------------------- + +B. Stop replication server +---------------------------------------------------------------- +$ /usr/local/pgsql/bin/pgreplicate -D /usr/local/pgsql/share stop +---------------------------------------------------------------- + +usage: pgreplicate [-D path_of_config_file] [-W path_of_work_files] +[-w wait time before fork process][-U login user][-l][-n][-v][-h][stop] + -l: print error logs in the log file. + -n: don't run in daemon mode. + -v: debug mode. need '-n' flag + -h: print this help + stop: stop pgreplicate +(config file default path: ./pgreplicate.conf) + +4-2. cluster DB server +$PG_HOME = /usr/local/pgsql +$PG_DATA = /usr/local/pgsql/data + +A. 
Start cluster DB server +---------------------------------------------------------------- +$ /usr/local/pgsql/bin/pg_ctl start -D /usr/local/pgsql/data +---------------------------------------------------------------- + +B. Stop cluster DB server +---------------------------------------------------------------- +$ /usr/local/pgsql/bin/pg_ctl stop -D /usr/local/pgsql/data +---------------------------------------------------------------- + +C-1. RE start (recovery) cluster DB server with backup +---------------------------------------------------------------- +$ /usr/local/pgsql/bin/pg_ctl start -D /usr/local/pgsql/data -o "-R" +---------------------------------------------------------------- + +C-2. RE start (recovery) cluster DB server without backup +---------------------------------------------------------------- +$ /usr/local/pgsql/bin/pg_ctl start -D /usr/local/pgsql/data -o "-r" +---------------------------------------------------------------- + +D. Upgrade cluster DB server with pg_dump +---------------------------------------------------------------- +$ /usr/local/pgsql/bin/pg_ctl start -D /usr/local/pgsql/data -o "-U" +---------------------------------------------------------------- + +4-3. load balance server + +A. Start load balance server +---------------------------------------------------------------- +$ /usr/local/pgsql/bin/pglb -D /usr/local/pgsql/share +---------------------------------------------------------------- + +B. Stop load balance server +---------------------------------------------------------------- +$ /usr/local/pgsql/bin/pglb -D /usr/local/pgsql/share stop +---------------------------------------------------------------- + +usage: pglb [-D path_of_config_file] [-W path_of_work_files] [-n][-v][-h][stop] + -l: print error logs in the log file. + -n: don't run in daemon mode. + -v: debug mode. 
need '-n' flag + -h: print this help + stop: stop pglb + (config file default path: ./pglb.conf) diff -aruN postgresql-8.2.4/README_PGCLUSTER pgcluster-1.7.0rc7/README_PGCLUSTER --- postgresql-8.2.4/README_PGCLUSTER 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/README_PGCLUSTER 2007-02-19 01:00:40.000000000 +0100 @@ -0,0 +1,118 @@ +PGCluster: Multi-Master Synchronous Replication System for PostgreSQL +=========================================================== + +PGCluster is a multi-master and synchronous replication system that supports load balancing of PostgreSQL. + +Changed: + $INSTALL_DIR/GNUmakefile.in + $INSTALL_DIR/INSTALL_PGCLUSTER + $INSTALL_DIR/README_PGCLUSTER + $INSTALL_DIR/configure + $INSTALL_DIR/configure.in + $INSTALL_DIR/pgcluster.sh.tmpl + $INSTALL_DIR/src/Makefile + $INSTALL_DIR/src/Makefile.global.in + $INSTALL_DIR/src/backend/Makefile + $INSTALL_DIR/src/backend/access/transam/clog.c + $INSTALL_DIR/src/backend/access/transam/xact.c + $INSTALL_DIR/src/backend/catalog/catalog.c + $INSTALL_DIR/src/backend/commands/analyze.c + $INSTALL_DIR/src/backend/commands/copy.c + $INSTALL_DIR/src/backend/commands/sequence.c + $INSTALL_DIR/src/backend/executor/functions.c + $INSTALL_DIR/src/backend/libpq/Makefile + $INSTALL_DIR/src/backend/libpq/be-fsstubs.c + $INSTALL_DIR/src/backend/libpq/cluster.conf.sample + $INSTALL_DIR/src/backend/libpq/recovery.c + $INSTALL_DIR/src/backend/libpq/lifecheck.c + $INSTALL_DIR/src/backend/libpq/replicate.c + $INSTALL_DIR/src/backend/libpq/replicate_com.c + $INSTALL_DIR/src/backend/main/main.c + $INSTALL_DIR/src/backend/parser/gram.y + $INSTALL_DIR/src/backend/parser/keywords.c + $INSTALL_DIR/src/backend/parser/parse_clause.c + $INSTALL_DIR/src/backend/parser/parse_relation.c + $INSTALL_DIR/src/backend/postmaster/postmaster.c + $INSTALL_DIR/src/backend/storage/large_object/inv_api.c + $INSTALL_DIR/src/backend/storage/lmgr/deadlock.c + $INSTALL_DIR/src/backend/storage/lmgr/lmgr.c + 
$INSTALL_DIR/src/backend/storage/lmgr/lock.c + $INSTALL_DIR/src/backend/storage/lmgr/proc.c + $INSTALL_DIR/src/backend/tcop/postgres.c + $INSTALL_DIR/src/backend/tcop/pquery.c + $INSTALL_DIR/src/backend/tcop/utility.c + $INSTALL_DIR/src/backend/utils/adt/float.c + $INSTALL_DIR/src/backend/utils/adt/nabstime.c + $INSTALL_DIR/src/backend/utils/adt/ri_triggers.c + $INSTALL_DIR/src/backend/utils/adt/timestamp.c + $INSTALL_DIR/src/backend/utils/error/assert.c + $INSTALL_DIR/src/backend/utils/error/elog.c + $INSTALL_DIR/src/backend/utils/fmgr/fmgr.c + $INSTALL_DIR/src/backend/utils/mb/mbutils.c + $INSTALL_DIR/src/backend/utils/misc/guc.c + $INSTALL_DIR/src/backend/utils/misc/postgresql.conf.sample + $INSTALL_DIR/src/bin/initdb/initdb.c + $INSTALL_DIR/src/bin/pg_dump/pg_dump.c + $INSTALL_DIR/src/bin/pg_dump/pg_dumpall.c + $INSTALL_DIR/src/include/pg_config.h.in + $INSTALL_DIR/src/include/replicate.h + $INSTALL_DIR/src/include/replicate_com.h + $INSTALL_DIR/src/include/storage/lmgr.h + $INSTALL_DIR/src/include/storage/proc.h + $INSTALL_DIR/src/interfaces/libpq/Makefile + $INSTALL_DIR/src/makefiles/Makefile.aix + $INSTALL_DIR/src/makefiles/Makefile.freebsd + $INSTALL_DIR/src/makefiles/Makefile.hpux + $INSTALL_DIR/src/makefiles/Makefile.linux + $INSTALL_DIR/src/makefiles/Makefile.netbsd + $INSTALL_DIR/src/makefiles/Makefile.openbsd + $INSTALL_DIR/src/makefiles/Makefile.solaris + $INSTALL_DIR/src/makefiles/Makefile.sunos4 +Added: + $INSTALL_DIR/src/pgcluster/Makefile + $INSTALL_DIR/src/pgcluster/libpgc/Makefile + $INSTALL_DIR/src/pgcluster/libpgc/libpgc.h + $INSTALL_DIR/src/pgcluster/libpgc/sem.c + $INSTALL_DIR/src/pgcluster/libpgc/show.c + $INSTALL_DIR/src/pgcluster/libpgc/signal.c + $INSTALL_DIR/src/pgcluster/pglb/AUTHORS + $INSTALL_DIR/src/pgcluster/pglb/COPYING + $INSTALL_DIR/src/pgcluster/pglb/Makefile + $INSTALL_DIR/src/pgcluster/pglb/child.c + $INSTALL_DIR/src/pgcluster/pglb/cluster_table.c + $INSTALL_DIR/src/pgcluster/pglb/lifecheck.c + 
$INSTALL_DIR/src/pgcluster/pglb/load_balance.c + $INSTALL_DIR/src/pgcluster/pglb/main.c + $INSTALL_DIR/src/pgcluster/pglb/pglb.conf.sample + $INSTALL_DIR/src/pgcluster/pglb/pglb.h + $INSTALL_DIR/src/pgcluster/pglb/pool_auth.c + $INSTALL_DIR/src/pgcluster/pglb/pool_connection_pool.c + $INSTALL_DIR/src/pgcluster/pglb/pool_params.c + $INSTALL_DIR/src/pgcluster/pglb/pool_process_query.c + $INSTALL_DIR/src/pgcluster/pglb/pool_stream.c + $INSTALL_DIR/src/pgcluster/pglb/recovery.c + $INSTALL_DIR/src/pgcluster/pglb/socket.c + $INSTALL_DIR/src/pgcluster/pgrp/AUTHORS + $INSTALL_DIR/src/pgcluster/pgrp/COPYING + $INSTALL_DIR/src/pgcluster/pgrp/Makefile + $INSTALL_DIR/src/pgcluster/pgrp/cascade.c + $INSTALL_DIR/src/pgcluster/pgrp/conf.c + $INSTALL_DIR/src/pgcluster/pgrp/lifecheck.c + $INSTALL_DIR/src/pgcluster/pgrp/main.c + $INSTALL_DIR/src/pgcluster/pgrp/pgreplicate.conf.sample + $INSTALL_DIR/src/pgcluster/pgrp/pgreplicate.h + $INSTALL_DIR/src/pgcluster/pgrp/pqformat.c + $INSTALL_DIR/src/pgcluster/pgrp/recovery.c + $INSTALL_DIR/src/pgcluster/pgrp/replicate.c + $INSTALL_DIR/src/pgcluster/pgrp/rlog.c + $INSTALL_DIR/src/pgcluster/tool/Makefile + $INSTALL_DIR/src/pgcluster/tool/README.jp + $INSTALL_DIR/src/pgcluster/tool/pgcbench.c + $INSTALL_DIR/src/pgcluster/tool/pgcbench.sh + $INSTALL_DIR/src/pgcluster/tool/tpc-b_like.sql + +The latest version of this software may be obtained at +http://pgfoundry.org/projects/pgcluster/ + +For more information look at pgFoundry web site located at +http://pgcluster.projects.postgresql.org/ diff -aruN postgresql-8.2.4/configure pgcluster-1.7.0rc7/configure --- postgresql-8.2.4/configure 2007-02-07 04:48:58.000000000 +0100 +++ pgcluster-1.7.0rc7/configure 2007-03-01 16:27:35.000000000 +0100 @@ -275,6 +275,8 @@ PACKAGE_STRING='PostgreSQL 8.2.4' PACKAGE_BUGREPORT='pgsql-bugs@postgresql.org' +PGCLUSTER_VERSION='1.7.0rc7' + ac_unique_file="src/backend/access/common/heaptuple.c" ac_default_prefix=/usr/local/pgsql # Factoring default headers for most 
tests. @@ -314,7 +316,7 @@ # include #endif" -ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS configure_args build build_cpu build_vendor build_os host host_cpu host_vendor host_os PORTNAME docdir enable_nls WANTED_LANGUAGES default_port enable_shared enable_rpath enable_debug DTRACE DTRACEFLAGS enable_dtrace CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT CPP GCC TAS autodepend INCLUDES enable_thread_safety with_tcl with_perl with_python with_krb5 krb_srvtab with_pam with_ldap with_bonjour with_openssl with_zlib EGREP ELF_SYS LDFLAGS_SL AWK FLEX FLEXFLAGS LN_S LD with_gnu_ld ld_R_works RANLIB ac_ct_RANLIB TAR STRIP ac_ct_STRIP STRIP_STATIC_LIB STRIP_SHARED_LIB YACC YFLAGS PERL perl_archlibexp perl_privlibexp perl_useshrplib perl_embed_ldflags PYTHON python_version python_configdir python_includespec python_libdir python_libspec python_additional_libs HAVE_IPV6 LIBOBJS acx_pthread_config PTHREAD_CC PTHREAD_LIBS PTHREAD_CFLAGS LDAP_LIBS_FE LDAP_LIBS_BE HAVE_POSIX_SIGNALS MSGFMT MSGMERGE XGETTEXT localedir TCLSH TCL_CONFIG_SH TCL_INCLUDE_SPEC TCL_LIB_FILE TCL_LIBS TCL_LIB_SPEC TCL_SHARED_BUILD TCL_SHLIB_LD_LIBS NSGMLS JADE have_docbook DOCBOOKSTYLE COLLATEINDEX SGMLSPL vpath_build LTLIBOBJS' +ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS configure_args build build_cpu build_vendor build_os host host_cpu host_vendor host_os PORTNAME docdir enable_nls WANTED_LANGUAGES default_port 
enable_shared enable_rpath enable_debug DTRACE DTRACEFLAGS enable_dtrace CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT CPP GCC TAS autodepend INCLUDES enable_thread_safety with_tcl with_perl with_python with_krb5 krb_srvtab with_pam with_ldap with_bonjour with_openssl with_zlib EGREP ELF_SYS LDFLAGS_SL AWK FLEX FLEXFLAGS LN_S LD with_gnu_ld ld_R_works RANLIB ac_ct_RANLIB TAR STRIP ac_ct_STRIP STRIP_STATIC_LIB STRIP_SHARED_LIB YACC YFLAGS PERL perl_archlibexp perl_privlibexp perl_useshrplib perl_embed_ldflags PYTHON python_version python_configdir python_includespec python_libdir python_libspec python_additional_libs HAVE_IPV6 LIBOBJS acx_pthread_config PTHREAD_CC PTHREAD_LIBS PTHREAD_CFLAGS LDAP_LIBS_FE LDAP_LIBS_BE HAVE_POSIX_SIGNALS MSGFMT MSGMERGE XGETTEXT localedir TCLSH TCL_CONFIG_SH TCL_INCLUDE_SPEC TCL_LIB_FILE TCL_LIBS TCL_LIB_SPEC TCL_SHARED_BUILD TCL_SHLIB_LD_LIBS NSGMLS JADE have_docbook DOCBOOKSTYLE COLLATEINDEX SGMLSPL vpath_build LTLIBOBJS PGCLUSTER_VERSION' ac_subst_files='' # Initialize some variables set by options. @@ -1241,6 +1243,10 @@ #define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" _ACEOF +cat >>confdefs.h <<_ACEOF +#define PGCLUSTER_VERSION "$PGCLUSTER_VERSION" +_ACEOF + # Let the site file select an alternate cache file if it wants to. # Prefer explicitly selected file to automatically selected ones. 
@@ -23555,6 +23561,7 @@ s,@host_os@,$host_os,;t t s,@PORTNAME@,$PORTNAME,;t t s,@docdir@,$docdir,;t t +s,@PGCLUSTER_VERSION@,$PGCLUSTER_VERSION,;t t s,@enable_nls@,$enable_nls,;t t s,@WANTED_LANGUAGES@,$WANTED_LANGUAGES,;t t s,@default_port@,$default_port,;t t diff -aruN postgresql-8.2.4/configure.in pgcluster-1.7.0rc7/configure.in --- postgresql-8.2.4/configure.in 2007-02-07 04:48:58.000000000 +0100 +++ pgcluster-1.7.0rc7/configure.in 2007-02-18 22:52:16.000000000 +0100 @@ -27,6 +27,7 @@ AC_SUBST(configure_args, [$ac_configure_args]) AC_DEFINE_UNQUOTED(PG_VERSION, "$PACKAGE_VERSION", [PostgreSQL version as a string]) +AC_DEFINE_UNQUOTED(PGCLUSTER_VERSION, "$PGCLUSTER_VERSION", [PGCluster version]) AC_CANONICAL_HOST diff -aruN postgresql-8.2.4/pgcluster.sh.tmpl pgcluster-1.7.0rc7/pgcluster.sh.tmpl --- postgresql-8.2.4/pgcluster.sh.tmpl 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/pgcluster.sh.tmpl 2007-02-18 22:52:16.000000000 +0100 @@ -0,0 +1,56 @@ +#!/bin/sh +# +# $FreeBSD: ports/databases/pgcluster/files/pgcluster.sh.tmpl,v 1.1 2004/01/26 09:02:45 kuriyama Exp $ +# +# PROVIDE: pgcluster +# REQUIRE: DAEMON +# BEFORE: pgreplicate +# KEYWORD: FreeBSD +# +# Add the following line to /etc/rc.conf to enable pgcluster: +# +# pgcluster_enable="YES" +# # optional +# pgcluster_data="/home/pgsql/data" +# pgcluster_flags="-w -s" +# + +pgcluster_enable="NO" +pgcluster_data="%%PREFIX%%/pgsql/data" +pgcluster_flags="-w -s" + +. 
%%RC_SUBR%% + +load_rc_config pgcluster + +name=pgcluster +command=%%PREFIX%%/bin/pg_ctl +pgcluster_user=pgsql +extra_commands="initdb recover" +initdb_cmd="pgcluster_initdb" +recover_cmd="pgcluster_recover" +start_cmd="pgcluster_start" +stop_cmd="pgcluster_stop" + +pgcluster_flags="${pgcluster_flags} -D ${pgcluster_data}" +pidfile="${pgcluster_data}/postmaster.pid" + +pgcluster_start() +{ + su -m ${pgcluster_user} -c "exec ${command} start ${pgcluster_flags} -o '-i'" +} +pgcluster_stop() +{ + su -m ${pgcluster_user} -c "exec ${command} stop ${pgcluster_flags} -m i" +} +pgcluster_recover() +{ + su -m ${pgcluster_user} -c "exec ${command} start ${pgcluster_flags} -o '-i -R'" +} +pgcluster_initdb() +{ + su -m ${pgcluster_user} -c "exec %%PREFIX%%/bin/initdb -D ${pgcluster_data}" +} + +load_rc_config $name +run_rc_command "$1" diff -aruN postgresql-8.2.4/src/Makefile pgcluster-1.7.0rc7/src/Makefile --- postgresql-8.2.4/src/Makefile 2006-06-23 01:50:35.000000000 +0200 +++ pgcluster-1.7.0rc7/src/Makefile 2007-02-18 22:52:16.000000000 +0100 @@ -16,14 +16,15 @@ all install installdirs uninstall dep depend distprep: $(MAKE) -C port $@ $(MAKE) -C timezone $@ + $(MAKE) -C interfaces $@ $(MAKE) -C backend $@ $(MAKE) -C backend/utils/mb/conversion_procs $@ $(MAKE) -C include $@ - $(MAKE) -C interfaces $@ $(MAKE) -C bin $@ $(MAKE) -C pl $@ $(MAKE) -C makefiles $@ $(MAKE) -C test/regress $@ + $(MAKE) -C pgcluster $@ install: install-local @@ -44,6 +45,7 @@ rm -f $(addprefix '$(DESTDIR)$(pgxsdir)/$(subdir)'/, Makefile.global Makefile.port Makefile.shlib nls-global.mk) clean: + $(MAKE) -C pgcluster $@ $(MAKE) -C port $@ $(MAKE) -C timezone $@ $(MAKE) -C backend $@ @@ -57,6 +59,7 @@ $(MAKE) -C test/thread $@ distclean maintainer-clean: + -$(MAKE) -C pgcluster $@ -$(MAKE) -C port $@ -$(MAKE) -C timezone $@ -$(MAKE) -C backend $@ diff -aruN postgresql-8.2.4/src/Makefile.global.in pgcluster-1.7.0rc7/src/Makefile.global.in --- postgresql-8.2.4/src/Makefile.global.in 2006-10-08 
19:15:33.000000000 +0200 +++ pgcluster-1.7.0rc7/src/Makefile.global.in 2007-02-18 22:52:16.000000000 +0100 @@ -31,6 +31,9 @@ # PostgreSQL version number VERSION = @PACKAGE_VERSION@ +# PGCluster version number +PGCLUSTER_VERSION = @PGCLUSTER_VERSION@ + # Support for VPATH builds vpath_build = @vpath_build@ abs_top_srcdir = @abs_top_srcdir@ @@ -207,6 +210,7 @@ GCC = @GCC@ CFLAGS = @CFLAGS@ +CFLAGS += -DUSE_REPLICATION -DPRINT_DEBUG # Kind-of compilers YACC = @YACC@ diff -aruN postgresql-8.2.4/src/backend/Makefile pgcluster-1.7.0rc7/src/backend/Makefile --- postgresql-8.2.4/src/backend/Makefile 2006-10-08 19:15:33.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/Makefile 2007-02-18 22:52:16.000000000 +0100 @@ -39,7 +39,7 @@ ifneq ($(PORTNAME), win32) ifneq ($(PORTNAME), aix) -postgres: $(OBJS) +postgres: $(OBJS) $(libpq_srcdir)/libpq.a $(CC) $(CFLAGS) $(LDFLAGS) $(export_dynamic) $^ $(LIBS) -o $@ endif @@ -169,6 +169,7 @@ $(INSTALL_DATA) $(srcdir)/libpq/pg_ident.conf.sample '$(DESTDIR)$(datadir)/pg_ident.conf.sample' $(INSTALL_DATA) $(srcdir)/utils/misc/postgresql.conf.sample '$(DESTDIR)$(datadir)/postgresql.conf.sample' $(INSTALL_DATA) $(srcdir)/access/transam/recovery.conf.sample '$(DESTDIR)$(datadir)/recovery.conf.sample' + $(INSTALL_DATA) $(srcdir)/libpq/cluster.conf.sample $(DESTDIR)$(datadir)/cluster.conf.sample install-bin: postgres $(POSTGRES_IMP) installdirs $(INSTALL_PROGRAM) postgres$(X) '$(DESTDIR)$(bindir)/postgres$(X)' @@ -221,8 +222,9 @@ $(MAKE) -C catalog uninstall-data rm -f '$(DESTDIR)$(datadir)/pg_hba.conf.sample' \ '$(DESTDIR)$(datadir)/pg_ident.conf.sample' \ - '$(DESTDIR)$(datadir)/postgresql.conf.sample' \ - '$(DESTDIR)$(datadir)/recovery.conf.sample' + '$(DESTDIR)$(datadir)/postgresql.conf.sample' \ + '$(DESTDIR)$(datadir)/recovery.conf.sample' \ + '$(DESTDIR)$(datadir)/cluster.conf.sample' ########################################################################## diff -aruN postgresql-8.2.4/src/backend/access/transam/clog.c 
pgcluster-1.7.0rc7/src/backend/access/transam/clog.c --- postgresql-8.2.4/src/backend/access/transam/clog.c 2006-11-05 23:42:07.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/access/transam/clog.c 2007-02-18 22:52:16.000000000 +0100 @@ -57,6 +57,9 @@ #define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE) #define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE) +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* * Link to shared-memory data structures for CLOG control @@ -335,7 +338,16 @@ /* Check to see if there's any files that could be removed */ if (!SlruScanDirectory(ClogCtl, cutoffPage, false)) +#ifdef USE_REPLICATION + { + /* Perform a forced CHECKPOINT */ + /* CreateCheckPoint(false, true); */ + RequestCheckpoint(true, false); +#endif /* USE_REPLICATION */ return; /* nothing to remove */ +#ifdef USE_REPLICATION + } +#endif /* USE_REPLICATION */ /* Write XLOG record and flush XLOG to disk */ WriteTruncateXlogRec(cutoffPage); diff -aruN postgresql-8.2.4/src/backend/access/transam/xact.c pgcluster-1.7.0rc7/src/backend/access/transam/xact.c --- postgresql-8.2.4/src/backend/access/transam/xact.c 2006-11-23 02:14:59.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/access/transam/xact.c 2007-02-18 22:52:16.000000000 +0100 @@ -44,6 +44,9 @@ #include "utils/relcache.h" #include "utils/guc.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* * User-tweakable parameters @@ -4335,3 +4338,11 @@ else appendStringInfo(buf, "UNKNOWN"); } + +#ifdef USE_REPLICATION +void +PGR_Reload_Start_Time(void) +{ + xactStartTimestamp = GetCurrentTimestamp(); +} +#endif /* USE_REPLICATION */ diff -aruN postgresql-8.2.4/src/backend/catalog/catalog.c pgcluster-1.7.0rc7/src/backend/catalog/catalog.c --- postgresql-8.2.4/src/backend/catalog/catalog.c 2006-10-04 02:29:50.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/catalog/catalog.c 2007-02-18 22:52:16.000000000 +0100 
@@ -38,6 +38,9 @@ #include "utils/fmgroids.h" #include "utils/relcache.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ #define OIDCHARS 10 /* max chars printed by %u */ @@ -360,7 +363,7 @@ Oid GetNewOidWithIndex(Relation relation, Relation indexrel) { - Oid newOid; + Oid newOid = 0; IndexScanDesc scan; ScanKeyData key; bool collides; @@ -368,8 +371,18 @@ /* Generate new OIDs until we find one not in the table */ do { +#ifdef USE_REPLICATION + if (PGR_Is_Sync_OID == true) + { + newOid = PGRGetNewObjectId(newOid); + } + else + { + newOid = GetNewObjectId(); + } +#else newOid = GetNewObjectId(); - +#endif /* USE_REPLICATION */ ScanKeyInit(&key, (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, @@ -454,3 +467,4 @@ return rnode.relNode; } + diff -aruN postgresql-8.2.4/src/backend/commands/analyze.c pgcluster-1.7.0rc7/src/backend/commands/analyze.c --- postgresql-8.2.4/src/backend/commands/analyze.c 2006-11-05 23:42:08.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/commands/analyze.c 2007-02-18 22:52:16.000000000 +0100 @@ -36,6 +36,9 @@ #include "utils/syscache.h" #include "utils/tuplesort.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* Data structure for Algorithm S from Knuth 3.4.2 */ typedef struct @@ -934,7 +937,11 @@ static double random_fract(void) { +#ifdef USE_REPLICATION + return ((double) PGR_Random() + 1) / ((double) MAX_RANDOM_VALUE + 2); +#else return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2); +#endif /* USE_REPLICATION */ } /* diff -aruN postgresql-8.2.4/src/backend/commands/copy.c pgcluster-1.7.0rc7/src/backend/commands/copy.c --- postgresql-8.2.4/src/backend/commands/copy.c 2006-10-06 19:13:58.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/commands/copy.c 2007-02-18 22:52:16.000000000 +0100 @@ -41,6 +41,9 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ #define ISOCTAL(c) (((c) 
>= '0') && ((c) <= '7')) #define OCTVALUE(c) ((c) - '0') @@ -488,6 +491,9 @@ CopyGetData(CopyState cstate, void *databuf, int minread, int maxread) { int bytesread = 0; +#ifdef USE_REPLICATION + char * ptr = (char *)databuf; +#endif switch (cstate->copy_dest) { @@ -578,6 +584,9 @@ } break; } +#ifdef USE_REPLICATION + PGR_Set_Copy_Data(PGRCopyData,ptr,bytesread,0); +#endif /* USE_REPLICATION */ return bytesread; } @@ -2093,6 +2102,13 @@ } } +#ifdef USE_REPLICATION + if (done) + { + PGR_Set_Copy_Data(PGRCopyData,(char *)NULL,0,1); + } +#endif /* USE_REPLICATION */ + /* Done, clean up */ error_context_stack = errcontext.previous; @@ -2201,6 +2217,11 @@ break; } } +#ifdef USE_REPLICATION + /* + PGR_Set_Copy_Data(PGRCopyData,cstate->line_buf.data,cstate->line_buf.len,0); + */ +#endif /* Done reading the line. Convert it to server encoding. */ if (cstate->need_transcoding) diff -aruN postgresql-8.2.4/src/backend/commands/prepare.c pgcluster-1.7.0rc7/src/backend/commands/prepare.c --- postgresql-8.2.4/src/backend/commands/prepare.c 2006-10-04 02:29:51.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/commands/prepare.c 2007-02-18 22:52:16.000000000 +0100 @@ -29,6 +29,9 @@ #include "utils/builtins.h" #include "utils/memutils.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* * The hash table in which prepared queries are stored. 
This is @@ -793,3 +796,27 @@ result = construct_array(tmp_ary, len, REGTYPEOID, 4, true, 'i'); return PointerGetDatum(result); } + + +#ifdef USE_REPLICATION +bool +PGR_is_select_prepared_statement(PrepareStmt *stmt) +{ + PreparedStatement *entry; + if ((stmt == NULL) || (stmt->name == NULL)) + { + return false; + } + entry = FetchPreparedStatement(stmt->name, true); + if (entry == NULL) + { + return false; + } + if (!strcmp(entry->commandTag,"SELECT")) + { + return true; + } + return false; +} +#endif /* USE_REPLICATION */ + diff -aruN postgresql-8.2.4/src/backend/commands/sequence.c pgcluster-1.7.0rc7/src/backend/commands/sequence.c --- postgresql-8.2.4/src/backend/commands/sequence.c 2006-10-06 19:13:58.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/commands/sequence.c 2007-02-18 22:52:16.000000000 +0100 @@ -31,6 +31,9 @@ #include "utils/resowner.h" #include "utils/syscache.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* * We don't want to log each fetching of a value from a sequence, @@ -396,6 +399,9 @@ RangeVar *sequence; Oid relid; +#ifdef USE_REPLICATION + Xlog_Check_Replicate(CMD_UTILITY); +#endif /* USE_REPLICATION */ sequence = makeRangeVarFromNameList(textToQualifiedNameList(seqin)); relid = RangeVarGetRelid(sequence, false); @@ -622,6 +628,10 @@ SeqTable elm; Relation seqrel; +#ifdef USE_REPLICATION + Xlog_Check_Replicate(CMD_UTILITY); +#endif /* USE_REPLICATION */ + /* open and AccessShareLock sequence */ init_sequence(relid, &elm, &seqrel); diff -aruN postgresql-8.2.4/src/backend/executor/functions.c pgcluster-1.7.0rc7/src/backend/executor/functions.c --- postgresql-8.2.4/src/backend/executor/functions.c 2007-02-02 01:03:17.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/executor/functions.c 2007-02-18 22:52:16.000000000 +0100 @@ -30,6 +30,9 @@ #include "utils/syscache.h" #include "utils/typcache.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* * We have an execution_state 
record for each query in a function. Each @@ -454,6 +457,13 @@ Datum value; MemoryContext oldcontext; +#ifdef USE_REPLICATION + if ((es != NULL) && (es->qd != NULL)) + { + Xlog_Check_Replicate(es->qd->operation); + } +#endif /* USE_REPLICATION */ + if (es->status == F_EXEC_START) postquel_start(es, fcache); diff -aruN postgresql-8.2.4/src/backend/libpq/Makefile pgcluster-1.7.0rc7/src/backend/libpq/Makefile --- postgresql-8.2.4/src/backend/libpq/Makefile 2003-11-29 20:51:49.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/libpq/Makefile 2007-02-18 22:52:16.000000000 +0100 @@ -15,7 +15,8 @@ # be-fsstubs is here for historical reasons, probably belongs elsewhere OBJS = be-fsstubs.o be-secure.o auth.o crypt.o hba.o ip.o md5.o pqcomm.o \ - pqformat.o pqsignal.o + pqformat.o pqsignal.o \ + replicate.o replicate_com.o recovery.o lifecheck.o all: SUBSYS.o diff -aruN postgresql-8.2.4/src/backend/libpq/auth.c pgcluster-1.7.0rc7/src/backend/libpq/auth.c --- postgresql-8.2.4/src/backend/libpq/auth.c 2006-11-06 02:27:52.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/libpq/auth.c 2007-02-18 22:52:16.000000000 +0100 @@ -31,6 +31,9 @@ #include "libpq/pqformat.h" #include "storage/ipc.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ static void sendAuthRequest(Port *port, AuthRequest areq); static void auth_failed(Port *port, int status); @@ -888,6 +891,12 @@ { StringInfoData buf; +#ifdef USE_REPLICATION + if (PGR_password == NULL) + { + return NULL; + } +#endif /* USE_REPLICATION */ if (PG_PROTOCOL_MAJOR(port->proto) >= 3) { /* Expect 'p' message type */ @@ -939,6 +948,19 @@ ereport(DEBUG5, (errmsg("received password packet"))); +#ifdef USE_REPLICATION + if (strncmp(buf.data,"md5",3) == 0) + { + char * ptr = NULL; + ptr = strchr(buf.data,'('); + if (ptr != NULL) + { + PGR_get_md5salt(PGR_password->md5Salt,ptr); + *ptr='\0'; + } + } + strncpy(PGR_password->password,buf.data, PASSWORD_MAX_LENGTH ); +#endif /* USE_REPLICATION */ /* * Return the 
received string. Note we do not attempt to do any * character-set conversion on it; since we don't yet know the client's diff -aruN postgresql-8.2.4/src/backend/libpq/be-fsstubs.c pgcluster-1.7.0rc7/src/backend/libpq/be-fsstubs.c --- postgresql-8.2.4/src/backend/libpq/be-fsstubs.c 2006-09-07 17:37:25.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/libpq/be-fsstubs.c 2007-02-18 22:52:16.000000000 +0100 @@ -49,6 +49,9 @@ #include "storage/large_object.h" #include "utils/memutils.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /*#define FSDB 1*/ #define BUFSIZE 8192 @@ -93,6 +96,19 @@ LargeObjectDesc *lobjDesc; int fd; +#ifdef USE_REPLICATION + if ((PGR_Stand_Alone != NULL) && + (PGR_lo_open(lobjId,mode) != STATUS_OK)) + { + if ((mode & INV_WRITE) && + (PGR_Is_Stand_Alone() == true) && + (PGR_Stand_Alone->permit == PERMIT_READ_ONLY) ) + { + elog(WARNING, "This query is not permitted when all replication servers fell down "); + PG_RETURN_INT32(-1); + } + } +#endif /* USE_REPLICATION */ #if FSDB elog(DEBUG4, "lo_open(%u,%d)", lobjId, mode); #endif @@ -126,6 +142,9 @@ errmsg("invalid large-object descriptor: %d", fd))); PG_RETURN_INT32(-1); } +#ifdef USE_REPLICATION + PGR_lo_close(fd); +#endif #if FSDB elog(DEBUG4, "lo_close(%d)", fd); #endif @@ -183,6 +202,18 @@ errmsg("large object descriptor %d was not opened for writing", fd))); +#ifdef USE_REPLICATION + if ((PGR_Stand_Alone != NULL) && + (PGR_lo_write(fd, buf, len) != STATUS_OK)) + { + if ((PGR_Is_Stand_Alone() == true) && + (PGR_Stand_Alone->permit == PERMIT_READ_ONLY) ) + { + elog(WARNING, "This query is not permitted when all replication servers fell down "); + return -1; + } + } +#endif status = inv_write(cookies[fd], buf, len); return status; @@ -205,6 +236,10 @@ PG_RETURN_INT32(-1); } +#ifdef USE_REPLICATION + PGR_lo_lseek(fd, offset, whence); +#endif /* USE_REPLICATION */ + status = inv_seek(cookies[fd], offset, whence); PG_RETURN_INT32(status); @@ -221,6 +256,18 @@ */ 
CreateFSContext(); +#ifdef USE_REPLICATION + if ((PGR_Stand_Alone != NULL) && + (PGR_lo_create(InvalidOid) != STATUS_OK)) + { + if ((PGR_Is_Stand_Alone() == true) && + (PGR_Stand_Alone->permit == PERMIT_READ_ONLY) ) + { + elog(WARNING, "This query is not permitted when all replication servers fell down "); + PG_RETURN_INT32(-1); + } + } +#endif /* USE_REPLICATION */ lobjId = inv_create(InvalidOid); PG_RETURN_OID(lobjId); @@ -231,6 +278,18 @@ { Oid lobjId = PG_GETARG_OID(0); +#ifdef USE_REPLICATION + if ((PGR_Stand_Alone != NULL) && + (PGR_lo_create(lobjId) != STATUS_OK)) + { + if ((PGR_Is_Stand_Alone() == true) && + (PGR_Stand_Alone->permit == PERMIT_READ_ONLY) ) + { + elog(WARNING, "This query is not permitted when all replication servers fell down "); + PG_RETURN_INT32(-1); + } + } +#endif /* USE_REPLICATION */ /* * We don't actually need to store into fscxt, but create it anyway to * ensure that AtEOXact_LargeObject knows there is state to clean up @@ -263,6 +322,18 @@ { Oid lobjId = PG_GETARG_OID(0); +#ifdef USE_REPLICATION + if ((PGR_Stand_Alone != NULL) && + (PGR_lo_unlink(lobjId) != STATUS_OK)) + { + if ((PGR_Is_Stand_Alone() == true) && + (PGR_Stand_Alone->permit == PERMIT_READ_ONLY) ) + { + elog(WARNING, "This query is not permitted when all replication servers fell down "); + return -1; + } + } +#endif /* USE_REPLICATION */ /* * If there are any open LO FDs referencing that ID, close 'em. 
*/ @@ -360,6 +431,19 @@ nbytes = MAXPGPATH - 1; memcpy(fnamebuf, VARDATA(filename), nbytes); fnamebuf[nbytes] = '\0'; + +#ifdef USE_REPLICATION + if ((PGR_Stand_Alone != NULL) && + (PGR_lo_import((char*)fnamebuf) != STATUS_OK)) + { + if ((PGR_Is_Stand_Alone() == true) && + (PGR_Stand_Alone->permit == PERMIT_READ_ONLY) ) + { + elog(WARNING, "This query is not permitted when all replication servers fell down "); + return -1; + } + } +#endif fd = PathNameOpenFile(fnamebuf, O_RDONLY | PG_BINARY, 0666); if (fd < 0) ereport(ERROR, @@ -372,6 +456,7 @@ */ lobjOid = inv_create(InvalidOid); + /* * read in from the filesystem and write to the inversion object */ diff -aruN postgresql-8.2.4/src/backend/libpq/cluster.conf.sample pgcluster-1.7.0rc7/src/backend/libpq/cluster.conf.sample --- postgresql-8.2.4/src/backend/libpq/cluster.conf.sample 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/libpq/cluster.conf.sample 2007-02-18 22:52:16.000000000 +0100 @@ -0,0 +1,71 @@ +#============================================================ +# Cluster DB Server configuration file +#------------------------------------------------------------ +# file: cluster.conf +#------------------------------------------------------------ +# This file controls: +# o which hosts & port are replication server +# o which port use for replication request to replication server +# o which command use for recovery function +#============================================================ +#------------------------------------------------------------ +# set Replication Server information +# o Host_Name : hostname +# o Port : Connection port for postmaster +# o Recovery_Port : Connection port for recovery process +#------------------------------------------------------------ + + replicate1.pgcluster.org + 8001 + 8101 + +# +# replicate2.pgcluster.org +# 8002 +# 8102 +# +# +# replicate3.pgcluster.org +# 8003 +# 8103 +# +#------------------------------------------------------------- +# set 
Cluster DB Server information +# o Host_Name : Host name which connect with replication server +# o Recovery_Port : Connection port for recovery +# o Rsync_Path : Path of rsync command +# o Rsync_Option : File transfer option for rsync +# o Rsync_Compress : Use compression option for rsync +# [yes/no]. default : yes +# o Pg_Dump_Path : Path of pg_dump +# o When_Stand_Alone : When all replication servers fell, +# you can set up two kinds of permission, +# "read_only" or "read_write". +# o Replication_Timeout : Timeout of each replication request +# o Lifecheck_Timeout : Timeout of the lifecheck response +# o Lifecheck_Interval : Interval time of the lifecheck +# (range 1s - 1h) +# 10s -- 10 seconds +# 10min -- 10 minutes +# 1h -- 1 hour +#------------------------------------------------------------- + cluster1.pgcluster.org + 7001 + /usr/bin/rsync + ssh -1 + yes + /usr/local/pgsql/bin/pg_dump + read_only + 1 min + 3s + 11s +#------------------------------------------------------------- +# set partitional replicate control information +# set DB name and Table name to stop replication +# o DB_Name : DB name +# o Table_Name : Table name +#------------------------------------------------------------- +# +# test_db +# log_table +# diff -aruN postgresql-8.2.4/src/backend/libpq/crypt.c pgcluster-1.7.0rc7/src/backend/libpq/crypt.c --- postgresql-8.2.4/src/backend/libpq/crypt.c 2006-07-14 16:52:19.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/libpq/crypt.c 2007-02-18 22:52:16.000000000 +0100 @@ -23,6 +23,9 @@ #include "libpq/crypt.h" #include "libpq/md5.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ int md5_crypt_verify(const Port *port, const char *role, char *client_pass) @@ -72,13 +75,34 @@ if (isMD5(shadow_pass)) { /* stored password already encrypted, only do salt */ - if (!pg_md5_encrypt(shadow_pass + strlen("md5"), - (char *) port->md5Salt, +#ifdef USE_REPLICATION + if ((PGR_password != NULL) && + ((PGR_password->md5Salt[0] | + 
PGR_password->md5Salt[1] | + PGR_password->md5Salt[2] | + PGR_password->md5Salt[3]) != 0 )) + { + if (!pg_md5_encrypt(shadow_pass + strlen("md5"), + (char *) PGR_password->md5Salt, sizeof(port->md5Salt), crypt_pwd)) + { + pfree(crypt_pwd); + return STATUS_ERROR; + } + } + else { - pfree(crypt_pwd); - return STATUS_ERROR; +#endif /* USE_REPLICATION */ + if (!pg_md5_encrypt(shadow_pass + strlen("md5"), + (char *) port->md5Salt, + sizeof(port->md5Salt), crypt_pwd)) + { + pfree(crypt_pwd); + return STATUS_ERROR; + } +#ifdef USE_REPLICATION } +#endif /* USE_REPLICATION */ } else { @@ -134,6 +158,16 @@ if (strcmp(crypt_client_pass, crypt_pwd) == 0) { +#ifdef USE_REPLICATION + /* + if (*(PGR_password->password) != '\0') + { + memset(PGR_password->password,0,PASSWORD_MAX_LENGTH); + memset(PGR_password->md5Salt,0,sizeof(PGR_password->md5Salt)); + memset(PGR_password->cryptSalt,0,sizeof(PGR_password->cryptSalt)); + } + */ +#endif /* USE_REPLICATION */ /* * Password OK, now check to be sure we are not past valuntil */ diff -aruN postgresql-8.2.4/src/backend/libpq/lifecheck.c pgcluster-1.7.0rc7/src/backend/libpq/lifecheck.c --- postgresql-8.2.4/src/backend/libpq/lifecheck.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/libpq/lifecheck.c 2007-03-01 16:27:15.000000000 +0100 @@ -0,0 +1,281 @@ +/*-------------------------------------------------------------------- + * FILE: + * lifecheck.c + * + * NOTE: + * This file is composed of the functions to call with the source + * at backend for the lifecheck. + * Low level I/O functions that called by in these functions are + * contained in 'replicate_com.c'. 
+ * + *-------------------------------------------------------------------- + */ + +#ifdef USE_REPLICATION + +#include "postgres.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libpq/pqsignal.h" +#include "utils/guc.h" +#include "miscadmin.h" +#include "nodes/nodes.h" +#include "nodes/parsenodes.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "tcop/tcopprot.h" +#include "postmaster/postmaster.h" + +#include "replicate.h" + +#ifdef WIN32 +#include "win32.h" +#else +#ifdef HAVE_NETINET_TCP_H +#include +#endif +#include +#endif + +#ifndef HAVE_STRDUP +#include "strdup.h" +#endif +#ifdef HAVE_CRYPT_H +#include +#endif + +#ifdef MULTIBYTE +#include "mb/pg_wchar.h" +#endif + +static void set_replication_server_status(int status); +static int send_lifecheck(int sock); +static int recv_lifecheck(int sock); +static void set_timeout(SIGNAL_ARGS); +static void exit_lifecheck(SIGNAL_ARGS); + +ReplicateServerInfo * PGR_Replicator_4_Lifecheck = NULL; + +int +PGR_Lifecheck_Main(void) +{ + int status = STATUS_OK; + int sock = -1; + int pid = 0; + + if ((pid = fork()) != 0 ) + { + return pid; + } + + pqsignal(SIGHUP, exit_lifecheck); + pqsignal(SIGTERM, exit_lifecheck); + pqsignal(SIGINT, exit_lifecheck); + pqsignal(SIGQUIT, exit_lifecheck); + pqsignal(SIGALRM, set_timeout); + PG_SETMASK(&UnBlockSig); + + for (;;) + { + + PGR_Replicator_4_Lifecheck = PGR_check_replicate_server_info(); + if (PGR_Replicator_4_Lifecheck == NULL) + { + alarm(0); + sleep(PGR_Lifecheck_Interval); + continue; + } + /* get replication server information */ + PGR_Replicator_4_Lifecheck = PGR_get_replicate_server_info(); + if (PGR_Replicator_4_Lifecheck == NULL) + { + if (Debug_pretty_print) + { + elog(DEBUG1,"not found replication server"); + } + return STATUS_ERROR; + } + sock = 
PGR_get_replicate_server_socket( PGR_Replicator_4_Lifecheck , PGR_QUERY_SOCKET ); + if (sock < 0) + { + set_replication_server_status(DATA_ERR); + if (Debug_pretty_print) + elog(DEBUG1,"get_replicate_server_socket failed"); + continue; + } + + /* set alarm as lifecheck timeout */ + alarm(PGR_Lifecheck_Timeout * 2); + + /* send lifecheck to replication server */ + status = send_lifecheck(sock); + if (status != STATUS_OK) + { + set_replication_server_status(DATA_ERR); + close(sock); + sock = -1; + if (Debug_pretty_print) + elog(DEBUG1,"send life check failed"); + continue; + } + + /* receive lifecheck response */ + status = recv_lifecheck(sock); + if (status != STATUS_OK) + { + set_replication_server_status(DATA_ERR); + close(sock); + sock = -1; + if (Debug_pretty_print) + elog(DEBUG1,"receive life check failed"); + continue; + } + + /* stop alarm */ + alarm(0); + set_replication_server_status(DATA_USE); + + /* wait next lifecheck as interval */ + sleep(PGR_Lifecheck_Interval); + } +} + +static void +set_replication_server_status(int status) +{ + if (status == DATA_ERR) + { + PGR_Replicator_4_Lifecheck->retry_count ++; + if (PGR_Replicator_4_Lifecheck->retry_count > MAX_RETRY_TIMES) + { + PGR_Set_Replication_Server_Status(PGR_Replicator_4_Lifecheck, status); + } + } + else + { + PGR_Replicator_4_Lifecheck->retry_count = 0; + PGR_Set_Replication_Server_Status(PGR_Replicator_4_Lifecheck, status); + } +} + +static int +send_lifecheck(int sock) +{ + ReplicateHeader header; + fd_set wmask; + struct timeval timeout; + int send_size = 0; + int buf_size = 0; + char * send_ptr = (char *)&header; + int s = 0; + int rtn = 0; + + timeout.tv_sec = PGR_Lifecheck_Timeout; + timeout.tv_usec = 0; + + memset(&header,0,sizeof(ReplicateHeader)); + header.cmdSys = CMD_SYS_LIFECHECK; + header.cmdSts = CMD_STS_CLUSTER; + buf_size = sizeof(ReplicateHeader); + + for (;;) + { + FD_ZERO(&wmask); + FD_SET(sock,&wmask); + rtn = select(sock+1, (fd_set *)NULL, &wmask, (fd_set *)NULL, &timeout); + 
if (rtn < 0) + { + if (errno == EINTR) + { + return STATUS_OK; + } + else + { + elog(DEBUG1, "send_lifecheck():select() failed"); + return STATUS_ERROR; + } + } + else if (rtn && FD_ISSET(sock, &wmask)) + { + s = send(sock,send_ptr + send_size,buf_size - send_size ,0); + if (s < 0){ + if (errno == EINTR) + { + return STATUS_OK; + } + if (errno == EAGAIN) + { + continue; + } + elog(DEBUG1, "send_replicate_packet():send error"); + + /* EPIPE || ENCONNREFUSED || ENSOCK || EHOSTUNREACH */ + return STATUS_ERROR; + } else if (s == 0) { + elog(DEBUG1, "send_lifecheck():unexpected EOF"); + return STATUS_ERROR; + } else /*if (s > 0)*/ { + send_size += s; + if (send_size == buf_size) + { + return STATUS_OK; + } + } + } + } +} + +static int +recv_lifecheck(int sock) +{ + int status = STATUS_OK; + char result[PGR_MESSAGE_BUFSIZE]; + + memset(result,0,PGR_MESSAGE_BUFSIZE); + status = PGR_recv_replicate_result(sock,result, PGR_Lifecheck_Timeout); + return ((status >= 0) ?STATUS_OK:STATUS_ERROR); +} + +static void +set_timeout(SIGNAL_ARGS) +{ + if (PGR_Replicator_4_Lifecheck != NULL) + { + set_replication_server_status(DATA_ERR); + if (Debug_pretty_print) + elog(DEBUG1,"time out is occured in life check"); + } +} + +static void +exit_lifecheck(SIGNAL_ARGS) +{ + fprintf(stderr,"lifecheck stopped\n"); + exit(0); +} + +#endif /* USE_REPLICATION */ diff -aruN postgresql-8.2.4/src/backend/libpq/recovery.c pgcluster-1.7.0rc7/src/backend/libpq/recovery.c --- postgresql-8.2.4/src/backend/libpq/recovery.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/libpq/recovery.c 2007-02-18 22:52:16.000000000 +0100 @@ -0,0 +1,1566 @@ +/*-------------------------------------------------------------------- + * FILE: + * recovery.c + * + * NOTE: + * This file is composed of the functions to call with the source + * at backend for the recovery. + * Low level I/O functions that called by in these functions are + * contained in 'replicate_com.c'. 
+ * + *-------------------------------------------------------------------- + */ + +/*-------------------------------------- + * INTERFACE ROUTINES + * + * I/O call: + * PGR_recovery_finish_send + * master module: + * PGR_Master_Main(void); + * recovery module: + * PGR_Recovery_Main + *------------------------------------- + */ +#ifdef USE_REPLICATION + +#include "postgres.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libpq/pqsignal.h" +#include "utils/guc.h" +#include "miscadmin.h" +#include "nodes/nodes.h" +#include "nodes/parsenodes.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "tcop/tcopprot.h" +#include "postmaster/postmaster.h" + +#include "../interfaces/libpq/libpq-fe.h" +#include "../interfaces/libpq/libpq-int.h" +#include "../interfaces/libpq/fe-auth.h" + +#include "replicate.h" + +#ifdef WIN32 +#include "win32.h" +#else +#ifdef HAVE_NETINET_TCP_H +#include +#endif +#include +#endif + +#ifndef HAVE_STRDUP +#include "strdup.h" +#endif +#ifdef HAVE_CRYPT_H +#include +#endif + +#ifdef MULTIBYTE +#include "mb/pg_wchar.h" +#endif + +#define RECOVERY_LOOP_END (0) +#define RECOVERY_LOOP_CONTINUE (1) +#define RECOVERY_LOOP_FAIL (2) +char Local_Host_Name[HOSTNAME_MAX_LENGTH]; +int PGR_Recovery_Mode = 0; + +static int read_packet(int sock,RecoveryPacket * packet); +static int send_recovery_packet(int sock, RecoveryPacket * packet); +static int send_packet(int * sock, RecoveryPacket * packet ); +static void master_loop(int fd); +static int start_recovery_send(int * sock, ReplicateServerInfo * host); +static int stop_recovery_send(int * sock, ReplicateServerInfo * host); +static int rsync_pg_data(char * src , char * dest); +static int remove_dir(char * dir_name); +static int clear_bkup_dir(char * dir_name); +static int bkup_dir(char * 
dir_name); +static int restore_dir(char * dir_name); +static int rsync_global_dir(char * src, char * dest, int stage); +static int first_recovery(char * src, char * dest, char * dir); +static int second_recovery(char * src, char * dest, char * dir); +static int recovery_rsync(char * src , char * dest, int stage); +static int recovery_loop(int fd, int mode); +static void show_recovery_packet(RecoveryPacket * packet); +static int direct_send_packet(int packet_no); +static void set_recovery_packet(RecoveryPacket * packet, int packet_no); +static int cold_recovery(char * src, RecoveryPacket *packet, bool need_sync_table_space, int stage); +static int hot_recovery(RecoveryPacket *packet, int stage); +static int restore_from_dumpall( char * hostName, uint16_t portNum, char * userName); +static int restore_from_dump( char * hostName, uint16_t portNum, char * userName, char * dbName); +static int restore_from_each_dump( char * hostName, uint16_t portNum, char * userName); +static PGresult * get_dbName(char * hostName, uint16_t portNum, char * userName); + +static int sync_table_space(char * hostName, uint16_t portNum, char * userName, int stage); +static PGresult * get_table_space_location(char * hostName, uint16_t portNum, char * userName); +static int rsync_table_space(char * hostName, char * location, int stage); + +int PGR_recovery_error_send(void); +int PGR_recovery_finish_send(void); +int PGR_recovery_queue_data_req(void); +int PGR_Master_Main(void); +int PGR_Recovery_Main(int mode); + +static int +read_packet(int sock,RecoveryPacket * packet) /* read exactly one RecoveryPacket from sock; returns its size, or -1 on error/EOF */ +{ + int r; + char * read_ptr; + int read_size = 0; + int packet_size = 0; + + read_ptr = (char*)packet; + packet_size = sizeof(RecoveryPacket); + + for (;;){ + r = recv(sock,read_ptr + read_size ,packet_size - read_size, MSG_WAITALL); /* request only the bytes still missing, so a short read cannot overrun *packet */ + if (r < 0) { + if (errno == EINTR || errno == EAGAIN) { + continue; + } else { + elog(DEBUG1, "read_packet():recv failed"); + return -1; + } + } else if (r == 0) { + elog(DEBUG1, "read_packet():unexpected 
EOF"); + return -1; + } else /*if (r > 0)*/ { + read_size += r; + if (read_size == packet_size) { + show_recovery_packet(packet); + return read_size; + } + } + } + return -1; +} + +static int +send_recovery_packet(int sock, RecoveryPacket * packet) +{ + char * send_ptr; + int send_size= 0; + int buf_size = 0; + int s; + int rtn; + fd_set wmask; + struct timeval timeout; + + timeout.tv_sec = RECOVERY_TIMEOUT; + timeout.tv_usec = 0; + + /* + * Wait for something to happen. + */ + rtn = 1; + while (rtn) + { + for (;;) + { + timeout.tv_sec = RECOVERY_TIMEOUT; + timeout.tv_usec = 0; + + FD_ZERO(&wmask); + FD_SET(sock,&wmask); + rtn = select(sock+1, (fd_set *)NULL, &wmask, (fd_set *)NULL, &timeout); + + if (rtn < 0) + { + if (errno == EINTR || errno == EAGAIN) + { + continue; + } + else + { + rtn = 0; + break; + } + } + else if (rtn && FD_ISSET(sock, &wmask)) + { + send_ptr = (char *)packet; + buf_size = sizeof(RecoveryPacket); + + s = send(sock,send_ptr + send_size,buf_size - send_size ,0); + if (s < 0) { + if (errno == EINTR || errno == EAGAIN) { + continue; + } + elog(DEBUG1, "send_recovery_packet():send error"); + + /* EPIPE || ENCONNREFUSED || ENSOCK || EHOSTUNREACH */ + return STATUS_ERROR; + } else if (s == 0) { + elog(DEBUG1, "send_recovery_packet():unexpected EOF"); + return STATUS_ERROR; + } else /*if (s > 0)*/ { + send_size += s; + if (send_size == buf_size) + { + return STATUS_OK; + } + } + } + } + } + return STATUS_ERROR; +} + +static int +send_packet(int * sock, RecoveryPacket * packet ) +{ + int count = 0; + ReplicateServerInfo * host = NULL; + + host = PGR_get_replicate_server_info(); + if (host == (ReplicateServerInfo*)NULL) + { + return STATUS_ERROR; + } + count = 0; + while (send_recovery_packet(*sock,packet) != STATUS_OK) + { + if (count < MAX_RETRY_TIMES ) + { + count ++; + continue; + } + count = 0; + close(*sock); + PGR_Set_Replication_Server_Status(host,DATA_ERR); + host = PGR_get_replicate_server_info(); + if (host == (ReplicateServerInfo*)NULL) 
+ { + return STATUS_ERROR; + } + PGR_Set_Replication_Server_Status(host,DATA_USE); + PGR_Create_Socket_Connect(sock, host->hostName , host->recoveryPortNumber); + } + return STATUS_OK; +} + +static void +master_loop(int fd) +{ + int count; + int sock; + int status = STATUS_OK; + RecoveryPacket packet; + int r_size = 0; + bool loop_end = false; + + count = 0; + while ((status = PGR_Create_Acception(fd,&sock,"",RecoveryPortNumber)) != STATUS_OK) + { + PGR_Close_Sock(&sock); + sock = -1; + if ( count > MAX_RETRY_TIMES) + { + return; + } + count ++; + } + for(;;) + { + int rtn; + fd_set rmask; + struct timeval timeout; + + timeout.tv_sec = RECOVERY_TIMEOUT; + timeout.tv_usec = 0; + + /* + * Wait for something to happen. + */ + FD_ZERO(&rmask); + FD_SET(sock,&rmask); + memset(&packet,0,sizeof(RecoveryPacket)); + rtn = select(sock+1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout); + if (rtn && FD_ISSET(sock, &rmask)) + { + r_size = read_packet(sock,&packet); + if (r_size == 0) + { + continue; + } + else if (r_size < 0) + { + loop_end=true; + break; + } + } + else + { + continue; + } + switch (ntohs(packet.packet_no)) + { + case RECOVERY_PGDATA_REQ : + /* + * PGDATA information request + */ + /* + * get master server information + */ + memset(&packet,0,sizeof(packet)); + set_recovery_packet(&packet, RECOVERY_PGDATA_ANS) ; + status = send_packet(&sock,&packet); + PGR_Set_Cluster_Status(STATUS_RECOVERY); + break; + case RECOVERY_FSYNC_REQ : + /* + * get master server information + */ + memset(&packet,0,sizeof(packet)); + set_recovery_packet(&packet, RECOVERY_FSYNC_ANS ); + status = send_packet(&sock,&packet); + PGR_Set_Cluster_Status(STATUS_RECOVERY); + loop_end = true; + break; + case RECOVERY_ERROR_TARGET_ONLY: + memset(&packet,0,sizeof(packet)); + set_recovery_packet(&packet, RECOVERY_ERROR_ANS ); + status = send_packet(&sock,&packet); + PGR_Set_Cluster_Status(STATUS_REPLICATED); + break; + case RECOVERY_ERROR_CONNECTION: + memset(&packet,0,sizeof(packet)); + 
set_recovery_packet(&packet, RECOVERY_ERROR_ANS ); + status = send_packet(&sock,&packet); + PGR_Set_Cluster_Status(STATUS_REPLICATED); + /** + * kill broken cluster db. + * FIXME: missing MyProcPid here. It must be postmaster's pid. + * but here's a bug MyProcPid doesn't initialized properly , so MyProcPid = postmaster's pid. + * To fix this, define variable to set posmaster's pid. + */ + kill(MyProcPid,SIGQUIT); + loop_end = true; + break; + case RECOVERY_ERROR_ANS: + /* TODO: recovery failed. close this postmaster */ + loop_end = true; + break; + case RECOVERY_FINISH: + PGR_Set_Cluster_Status(STATUS_REPLICATED); + loop_end = true; + break; + default: + loop_end = true; + break; + } + if (loop_end) + { + break; + } + } + PGR_Close_Sock(&sock); +} + +int +PGR_Master_Main(void) +{ + int status; + int fd = -1; + int rtn; + int pid; + + if ((pid = fork()) != 0 ) + { + return pid; + } + + memset(Local_Host_Name,0,sizeof(Local_Host_Name)); + gethostname(Local_Host_Name,sizeof(Local_Host_Name)); + pqsignal(SIGHUP, authdie); + pqsignal(SIGTERM, authdie); + pqsignal(SIGINT, authdie); + pqsignal(SIGQUIT, authdie); + pqsignal(SIGALRM, authdie); + PG_SETMASK(&UnBlockSig); + + status = STATUS_ERROR; + status = PGR_Create_Socket_Bind(&fd, "", RecoveryPortNumber); + + if (status != STATUS_OK) + { + return pid; + } + for (;;) + { + fd_set rmask; + struct timeval timeout; + + timeout.tv_sec = 60; + timeout.tv_usec = 0; + + /* + * Wait for something to happen. 
+ */ + FD_ZERO(&rmask); + FD_SET(fd,&rmask); + rtn = select(fd+1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout); + if (rtn && FD_ISSET(fd, &rmask)) + { + master_loop(fd); + } + } + return pid; +} + +static int +start_recovery_send(int * sock, ReplicateServerInfo * host) +{ + int status; + RecoveryPacket packet; + status = PGR_Create_Socket_Connect(sock, host->hostName, host->recoveryPortNumber); + if (status != STATUS_OK) + { + if (Debug_pretty_print) + { + elog(DEBUG1,"connection error to replication server"); + } + return STATUS_ERROR; + } + + memset(&packet,0,sizeof(packet)); + set_recovery_packet(&packet, RECOVERY_PREPARE_REQ ); + status = send_packet(sock,&packet); + + return status; +} + +static int +stop_recovery_send(int * sock, ReplicateServerInfo * host) +{ + int status; + RecoveryPacket packet; + + memset(&packet,0,sizeof(packet)); + set_recovery_packet(&packet, RECOVERY_ERROR_ANS ); + status = send_packet(sock,&packet); + return status; +} + +static int +direct_send_packet(int packet_no) +{ + + int status; + int fd = -1; + ReplicateServerInfo * host; + RecoveryPacket packet; + + host = PGR_get_replicate_server_info(); + if (host == NULL) + { + return STATUS_ERROR; + } + status = PGR_Create_Socket_Connect(&fd, host->hostName, host->recoveryPortNumber); + if (status != STATUS_OK) + { + PGR_Set_Replication_Server_Status(host,DATA_ERR); + return STATUS_ERROR; + } + + memset(&packet,0,sizeof(packet)); + set_recovery_packet(&packet, packet_no ); + status = send_packet(&fd,&packet); + + close(fd); + + return status; +} + +int +PGR_recovery_error_send(void) +{ + return direct_send_packet(RECOVERY_ERROR_ANS); +} + +int +PGR_recovery_finish_send(void) +{ + return direct_send_packet(RECOVERY_FINISH); +} + +int +PGR_recovery_queue_data_req(void) +{ + int status = STATUS_OK; + int r_size = 0; + int rtn = STATUS_OK; + int fd = -1; + ReplicateServerInfo * host = NULL; + RecoveryPacket packet; + + host = PGR_get_replicate_server_info(); + if (host == NULL) + { + 
return STATUS_ERROR; + } + status = PGR_Create_Socket_Connect(&fd, host->hostName, host->recoveryPortNumber); + if (status != STATUS_OK) + { + PGR_Set_Replication_Server_Status(host,DATA_ERR); + PGR_Set_Cluster_Status(STATUS_REPLICATED); + close(fd); + return STATUS_ERROR; + } + + memset(&packet,0,sizeof(packet)); + PGRset_recovery_packet_no(&packet, RECOVERY_QUEUE_DATA_REQ ); + status = send_packet(&fd,&packet); + if (status != STATUS_OK) + { + status = stop_recovery_send(&fd,host); + PGR_Set_Cluster_Status(STATUS_REPLICATED); + close(fd); + return STATUS_ERROR; + } + memset(&packet,0,sizeof(RecoveryPacket)); + r_size = read_packet(fd,&packet); + if (r_size <= 0) + { + rtn = STATUS_ERROR; + } + switch (ntohs(packet.packet_no)) + { + case RECOVERY_QUEUE_DATA_ANS: + rtn = STATUS_OK; + break; + default: + rtn = STATUS_ERROR; + break; + } + PGR_Set_Cluster_Status(STATUS_REPLICATED); + close(fd); + return rtn; +} + +static int +rsync_pg_data(char * src, char * dest) +{ + int status; + char *args[12]; + int pid, i = 0; + + args[i++] = "rsync"; + args[i++] = "-a"; + args[i++] = "-r"; + if (RsyncCompress) + args[i++] = "-z"; + args[i++] = "--delete"; + args[i++] = "-e"; + args[i++] = RsyncOption; + args[i++] = src; + args[i++] = dest; + args[i++] = NULL; + + pid = fork(); + if (pid == 0) + { + status = execv(RsyncPath,args); + } + else + { + for (;;) + { + int result; + result = wait(&status); + if (result < 0) + { + if (errno == EINTR) + continue; + return STATUS_ERROR; + } + + if (WIFEXITED(status) == 0 || WEXITSTATUS(status) != 0) + return STATUS_ERROR; + else + break; + } + } + return STATUS_OK; +} + +static int +remove_dir(char * dir_name) +{ + DIR * dp = NULL; + struct dirent *dirp = NULL; + char fname[256]; + int status = 0; + + if ((dp = opendir(dir_name)) == NULL) + { + return STATUS_ERROR; + } + while ((dirp = readdir(dp)) != NULL) + { + if ((!strcmp(dirp->d_name,".")) || + (!strcmp(dirp->d_name,".."))) + { + continue; + } + 
sprintf(fname,"%s/%s",dir_name,dirp->d_name); + status = remove(fname); + if (status < 0) + { + remove_dir(fname); + } + } + closedir(dp); + if (remove(dir_name) < 0) + { + return STATUS_ERROR; + } + return STATUS_OK; +} + +static int +clear_bkup_dir(char * dir_name) +{ + char bkp_dir[256]; + pid_t pid = getpid(); + + sprintf(bkp_dir,"%s_%d",dir_name,pid); + return (remove_dir(bkp_dir)); +} + +static int +bkup_dir(char * dir_name) +{ + int status; + char org_dir[256]; + char bkp_dir[256]; + pid_t pid = getpid(); + + sprintf(org_dir,"%s",dir_name); + sprintf(bkp_dir,"%s_%d",dir_name,pid); + status = rename(org_dir,bkp_dir); + if (status < 0) + { + return STATUS_ERROR; + } + return STATUS_OK; +} + +static int +restore_dir(char * dir_name) +{ + int status; + char org_dir[256]; + char bkp_dir[256]; + pid_t pid = getpid(); + + sprintf(org_dir,"%s",dir_name); + sprintf(bkp_dir,"%s_%d",dir_name,pid); + status = rename(bkp_dir,org_dir); + if (status < 0) + { + remove_dir(org_dir); + status = rename(bkp_dir,org_dir); + if (status < 0) + { + return STATUS_ERROR; + } + } + return STATUS_OK; +} + +static int +rsync_global_dir(char * src, char * dest, int stage) +{ + int status; + char control_file[256]; + char org_dir[256]; + char src_dir[256]; + struct stat fstat; + int cnt; + + sprintf(org_dir,"%s/global",dest); + sprintf(control_file,"%s/global/pg_control",dest); + if ((stage == PGR_1ST_RECOVERY) && (PGR_Recovery_Mode != PGR_WITHOUT_BACKUP)) + { + if (bkup_dir(org_dir) != STATUS_OK) + { + return STATUS_ERROR; + } + } + sprintf(src_dir,"%s/global",src); + status = rsync_pg_data(src_dir, dest); + if (status != STATUS_OK ) + { + restore_dir(org_dir); + return STATUS_ERROR; + } + /* check pg_control file */ + cnt = 0; + while (stat(control_file, &fstat) < 0) + { + if (cnt > MAX_RETRY_TIMES ) + { + restore_dir(org_dir); + return STATUS_ERROR; + } + cnt ++; + sleep(1); + } + if ((stage == PGR_2ND_RECOVERY) && (PGR_Recovery_Mode != PGR_WITHOUT_BACKUP)) + { + 
clear_bkup_dir(org_dir); + } + return STATUS_OK; +} + +static int +first_recovery(char * src, char * dest, char * dir) +{ + int status = STATUS_OK; + char src_dir[256]; + char dest_dir[256]; + + memset(src_dir,0,sizeof(src_dir)); + memset(dest_dir,0,sizeof(dest_dir)); + sprintf(src_dir,"%s/%s",src,dir); + sprintf(dest_dir,"%s/%s",dest,dir); + if (PGR_Recovery_Mode != PGR_WITHOUT_BACKUP) + { + status = bkup_dir(dest_dir); + if (status < 0) + { + return STATUS_ERROR; + } + } + status = rsync_pg_data(src_dir, dest); + if (status != STATUS_OK ) + { + restore_dir(dest_dir); + return STATUS_ERROR; + } + return STATUS_OK; +} + +static int +second_recovery(char * src, char * dest, char * dir) +{ + int status = STATUS_OK; + char src_dir[256]; + char dest_dir[256]; + + memset(src_dir,0,sizeof(src_dir)); + memset(dest_dir,0,sizeof(dest_dir)); + sprintf(src_dir,"%s/%s",src,dir); + sprintf(dest_dir,"%s/%s",dest,dir); + + status = rsync_pg_data(src_dir, dest); + if (status != STATUS_OK ) + { + restore_dir(dest_dir); + return STATUS_ERROR; + } + if (PGR_Recovery_Mode != PGR_WITHOUT_BACKUP) + { + clear_bkup_dir(dest_dir); + } + + return STATUS_OK; +} + +static int +recovery_rsync(char * src , char * dest, int stage) +{ + if ((src== NULL) || ( dest == NULL)) + { + return STATUS_ERROR; + } + + /* recovery step of "global" directory */ + fprintf(stderr,"%s recovery step of [global] directory...", + ((stage == 1)?"1st":"2nd")); + if (rsync_global_dir(src, dest, stage) != STATUS_OK) + { + fprintf(stderr,"NG\n"); + return STATUS_ERROR; + } + fprintf(stderr,"OK\n"); + + if (stage == PGR_1ST_RECOVERY) + { + /* 1st recovery step of "base" directory */ + fprintf(stderr,"1st recovery step of [base] directory..."); + if (first_recovery(src,dest,"base") != STATUS_OK) + { + fprintf(stderr,"NG\n"); + return STATUS_ERROR; + } + fprintf(stderr,"OK\n"); + + fprintf(stderr,"1st recovery step of [pg_clog] directory..."); + /* 1st recovery step of "pg_clog" directory */ + if 
(first_recovery(src,dest,"pg_clog") != STATUS_OK) + { + fprintf(stderr,"NG\n"); + return STATUS_ERROR; + } + fprintf(stderr,"OK\n"); + + /* 1st recovery step of "pg_xlog" directory */ + fprintf(stderr,"1st recovery step of [pg_xlog] directory..."); + if (first_recovery(src,dest,"pg_xlog") != STATUS_OK) + { + fprintf(stderr,"NG\n"); + return STATUS_ERROR; + } + fprintf(stderr,"OK\n"); + } + else + { + /* 2nd recovery step of "base" directory */ + fprintf(stderr,"2nd recovery step of [base] directory..."); + if (second_recovery(src,dest,"base") != STATUS_OK) + { + fprintf(stderr,"NG\n"); + return STATUS_ERROR; + } + fprintf(stderr,"OK\n"); + + /* 2nd recovery step of "pg_clog" directory */ + fprintf(stderr,"2nd recovery step of [pg_clog] directory..."); + if (second_recovery(src,dest,"pg_clog") != STATUS_OK) + { + fprintf(stderr,"NG\n"); + return STATUS_ERROR; + } + fprintf(stderr,"OK\n"); + + /* 2nd recovery step of "pg_xlog" directory */ + fprintf(stderr,"2nd recovery step of [pg_xlog] directory..."); + if (second_recovery(src,dest,"pg_xlog") != STATUS_OK) + { + fprintf(stderr,"NG\n"); + return STATUS_ERROR; + } + fprintf(stderr,"OK\n"); + } + + return STATUS_OK; +} + +static int +recovery_loop(int fd, int mode) +{ + + int status = STATUS_OK; + RecoveryPacket packet; + int r_size = 0; + int rtn = RECOVERY_LOOP_END; + char src[256]; + bool need_sync_table_space = false; + + memset(&packet,0,sizeof(RecoveryPacket)); + r_size = read_packet(fd,&packet); + if (r_size <= 0) + { + rtn = RECOVERY_LOOP_FAIL; + } + switch (ntohs(packet.packet_no)) + { + case RECOVERY_PREPARE_ANS : + /* + * get master information + */ + /* + * sync master data before recovery + */ + if (Debug_pretty_print) + { + elog(DEBUG1,"local host : %s master:%s",Local_Host_Name,packet.hostName); + } + if (!strncmp(Local_Host_Name,packet.hostName,strlen(Local_Host_Name))) + { + strcpy(src,packet.pg_data); + need_sync_table_space = false; + } + else + { + 
sprintf(src,"%s:%s",packet.hostName,packet.pg_data); + need_sync_table_space = true; + } + if (PGR_Recovery_Mode == PGR_COLD_RECOVERY) + { + rtn = cold_recovery(src,&packet,need_sync_table_space,PGR_1ST_RECOVERY); + } + else + { + rtn = hot_recovery(&packet,PGR_1ST_RECOVERY); + } + if (rtn != STATUS_OK) + { + rtn = RECOVERY_LOOP_FAIL; + break; + } + + /* + * send recovery start request + */ + PGRset_recovery_packet_no(&packet, RECOVERY_START_REQ ); + status = send_packet(&fd,&packet); + if (status != STATUS_OK) + { + fprintf(stderr,"RECOVERY_START_REQ send error\n"); + rtn = RECOVERY_LOOP_FAIL; + break; + } + rtn = RECOVERY_LOOP_CONTINUE; + break; + case RECOVERY_START_ANS : + /* + * sync master data for recovery + */ + if (!strncmp(Local_Host_Name,packet.hostName,strlen(Local_Host_Name))) + { + strcpy(src,packet.pg_data); + need_sync_table_space = false; + } + else + { + sprintf(src,"%s:%s",packet.hostName,packet.pg_data); + need_sync_table_space = true; + } + if (PGR_Recovery_Mode == PGR_COLD_RECOVERY) + { + rtn = cold_recovery(src,&packet,need_sync_table_space,PGR_2ND_RECOVERY); + } + else + { + rtn = hot_recovery(&packet,PGR_2ND_RECOVERY); + } + + if (rtn == STATUS_OK) + { + fprintf(stderr,"2nd recovery successed\n"); + if (mode == PGR_HOT_RECOVERY) + { + rtn = RECOVERY_LOOP_CONTINUE; + /* + * send recovery queued data request + */ + PGRset_recovery_packet_no(&packet, RECOVERY_QUEUE_DATA_REQ ); + status = send_packet(&fd,&packet); + if (status != STATUS_OK) + { + fprintf(stderr,"RECOVERY_QUEUE_DATA_REQ send error\n"); + rtn = RECOVERY_LOOP_FAIL; + break; + } + } + else + { + rtn = RECOVERY_LOOP_END; + } + } + else + { + fprintf(stderr,"2nd hot recovery failed\n"); + rtn = RECOVERY_LOOP_FAIL; + } + break; + case RECOVERY_QUEUE_DATA_ANS: + rtn = RECOVERY_LOOP_END; + break; + case RECOVERY_ERROR_OCCUPIED: + fprintf(stderr,"already in use for another recovery\n"); + rtn = RECOVERY_LOOP_FAIL; + break; + case RECOVERY_ERROR_CONNECTION: + fprintf(stderr,"connection 
failed\n"); + rtn = RECOVERY_LOOP_FAIL; + break; + default: + fprintf(stderr,"unknown packet received\n"); + rtn = RECOVERY_LOOP_FAIL; + break; + } + + return rtn; +} + +int +PGR_Recovery_Main(int mode) +{ + int status; + int fd = -1; + int rtn; + ReplicateServerInfo * host; + + memset(Local_Host_Name,0,sizeof(Local_Host_Name)); + gethostname(Local_Host_Name,sizeof(Local_Host_Name)); + PGR_Recovery_Mode = mode; + + status = STATUS_ERROR; + +Retry_Start_Recovery: + host = PGR_get_replicate_server_info(); + if (host == NULL) + { + if (Debug_pretty_print) + { + elog(DEBUG1,"not found replication server"); + } + PGR_Set_Cluster_Status(STATUS_REPLICATED); + return STATUS_ERROR; + } + + PGR_Set_Cluster_Status(STATUS_RECOVERY); + status = start_recovery_send(&fd,host); + if (status != STATUS_OK) + { + PGR_Set_Replication_Server_Status(host,DATA_ERR); + close(fd); + if (Debug_pretty_print) + { + elog(DEBUG1,"start recovery packet send error"); + } + goto Retry_Start_Recovery; + } + + for (;;) + { + fd_set rmask; + struct timeval timeout; + + timeout.tv_sec = RECOVERY_TIMEOUT; + timeout.tv_usec = 0; + + /* + * Wait for something to happen. 
+ */ + FD_ZERO(&rmask); + FD_SET(fd,&rmask); + rtn = select(fd+1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout); + if (rtn && FD_ISSET(fd, &rmask)) + { + status = recovery_loop(fd, mode); + if (status == RECOVERY_LOOP_CONTINUE) + { + continue; + } + else if (status == RECOVERY_LOOP_END) + { + close(fd); + break; + } + else if (status == RECOVERY_LOOP_FAIL) + { + status = stop_recovery_send(&fd,host); + PGR_Set_Cluster_Status(STATUS_REPLICATED); + if (status != STATUS_OK) + { + close(fd); + return STATUS_ERROR; + } + close(fd); + return STATUS_ERROR; + } + else + { + close(fd); + PGR_Set_Cluster_Status(STATUS_REPLICATED); + return STATUS_ERROR; + } + } + } + PGR_Set_Cluster_Status(STATUS_REPLICATED); + return STATUS_OK; +} + +static void +show_recovery_packet(RecoveryPacket * packet) +{ + + if (Debug_pretty_print) + { + elog(DEBUG1,"no = %d",ntohs(packet->packet_no)); + elog(DEBUG1,"max_connect = %d",ntohs(packet->max_connect)); + elog(DEBUG1,"port = %d",ntohs(packet->port)); + elog(DEBUG1,"recoveryPort = %d",ntohs(packet->recoveryPort)); + if (packet->hostName != NULL) + elog(DEBUG1,"hostName = %s",packet->hostName); + if (packet->pg_data != NULL) + elog(DEBUG1,"pg_data = %s",packet->pg_data); + } +} + +static void +set_recovery_packet(RecoveryPacket * packet, int packet_no) +{ + struct passwd * pw = NULL; + + if (packet == NULL) + { + return; + } + PGRset_recovery_packet_no(packet, packet_no ); + packet->max_connect = htons(MaxBackends); + packet->port = htons(PostPortNumber); + packet->recoveryPort = htons(RecoveryPortNumber); + gethostname(packet->hostName,sizeof(packet->hostName)); + memcpy(packet->pg_data,DataDir,sizeof(packet->pg_data)); + memset(packet->userName,0,sizeof(packet->userName)); + if ((pw = getpwuid(geteuid())) != NULL) + { + strncpy(packet->userName,pw->pw_name,sizeof(packet->userName)); + } + else + { + cuserid(packet->userName); + } +} + +static int +sync_table_space(char * hostName, uint16_t portNum, char * userName, int stage) +{ + 
PGresult * res = (PGresult *)NULL; + int i = 0; + int row_num = 0; + char * location = NULL; + int rtn = STATUS_OK; + + res = get_table_space_location(hostName, portNum, userName); + if (res == (PGresult *)NULL) + { + return STATUS_ERROR; + } + row_num = PQntuples(res); + for ( i = 0 ; i < row_num ; i ++) + { + location = PQgetvalue(res,i,0); + if (strlen(location) > 0 ) + { + fprintf(stderr,"sync tablespace[%s]...",location); + rtn = rsync_table_space(hostName, location, stage); + fprintf(stderr,"%s\n", (rtn == STATUS_OK)?"OK":"NG"); + } + } + if (res != (PGresult *)NULL) + { + PQclear(res); + } + + return STATUS_OK; +} + +static PGresult * +get_table_space_location(char * hostName, uint16_t portNum, char * userName) +{ + PGresult * res = (PGresult *)NULL; + int cnt = 0; + PGconn * conn = (PGconn *)NULL; + char port[8]; + char *database = "template1"; + char * query = "select spclocation from pg_tablespace where spcname not like 'pg_%'"; + + if ( (hostName == NULL) || + (portNum <= 0) || + (userName == NULL)) + { + return (PGresult *)NULL; + } + snprintf(port,sizeof(port),"%d", portNum); + + /* create connection to master */ + conn = PQsetdbLogin(hostName, port, NULL, NULL, database, userName, NULL); + if (conn == NULL) + { + return (PGresult *)NULL; + } + /* check to see that the backend Connection was successfully made */ + cnt = 0; + while (PQstatus(conn) == CONNECTION_BAD) + { + if (conn != NULL) + { + PQfinish(conn); + } + if (cnt > MAX_RETRY_TIMES ) + { + return (PGresult *)NULL; + } + conn = PQsetdbLogin(hostName, port, NULL, NULL, database, userName, NULL); + cnt ++; + } + res = PQexec(conn , query); + if ((res == NULL) || + (PQresultStatus(res) != PGRES_TUPLES_OK)) + { + PQclear(res); + res = (PGresult *)NULL; + } + if (conn != NULL) + { + PQfinish(conn); + } + + return res; +} + +static int +rsync_table_space(char * hostName, char * location, int stage) +{ + int status = STATUS_OK; + char src_dir[256]; + char dest_dir[256]; + struct stat fstat; + int cnt 
= 0; + + sprintf(src_dir,"%s:%s",hostName,location); + strncpy(dest_dir,location,sizeof(dest_dir)); + + if ((stage == PGR_1ST_RECOVERY) && (PGR_Recovery_Mode != PGR_WITHOUT_BACKUP)) + { + status = bkup_dir(location); + } + status = rsync_pg_data(src_dir, dest_dir); + if (status != STATUS_OK ) + { + restore_dir(location); + return STATUS_ERROR; + } + /* check file status */ + cnt = 0; + while (stat(location,&fstat) < 0) + { + if (cnt > MAX_RETRY_TIMES ) + { + restore_dir(location); + return STATUS_ERROR; + } + cnt ++; + sleep(1); + } + if ((stage == PGR_2ND_RECOVERY) && (PGR_Recovery_Mode != PGR_WITHOUT_BACKUP)) + { + clear_bkup_dir(location); + } + return STATUS_OK; +} + +static int +cold_recovery(char * src, RecoveryPacket *packet, bool need_sync_table_space, int stage) +{ + int status = STATUS_OK; + + status = recovery_rsync(src,DataDir,stage); + if (status != STATUS_OK) + { + if (Debug_pretty_print) + { + elog(DEBUG1,"%s rsync error", + ((stage == PGR_1ST_RECOVERY)?"1st":"2nd")); + } + return STATUS_ERROR; + } + if (need_sync_table_space == true) + { + status = sync_table_space(packet->hostName, ntohs(packet->port), packet->userName, stage); + fprintf(stderr,"%s sync_table_space ", + ((stage == PGR_1ST_RECOVERY)?"1st":"2nd")); + if (status != STATUS_OK) + { + if (Debug_pretty_print) + { + elog(DEBUG1,"%s sync table space error", + ((stage == PGR_1ST_RECOVERY)?"1st":"2nd")); + } + fprintf(stderr,"NG\n"); + return STATUS_ERROR; + } + fprintf(stderr,"OK\n"); + } + return STATUS_OK; +} + +static int +hot_recovery(RecoveryPacket *packet, int stage) +{ + int status = STATUS_OK; + + fprintf(stderr,"%s restore from pg_dump ", + ((stage == PGR_1ST_RECOVERY)?"1st":"2nd")); + if (stage == PGR_1ST_RECOVERY) + { + status = restore_from_dumpall(packet->hostName, ntohs(packet->port), packet->userName ); + } + else + { + status = restore_from_each_dump(packet->hostName, ntohs(packet->port), packet->userName ); + } + if (status != STATUS_OK) + { + if (Debug_pretty_print) + { + 
elog(DEBUG1,"%s sync table space error", + ((stage == PGR_1ST_RECOVERY)?"1st":"2nd")); + } + fprintf(stderr,"->NG\n"); + return STATUS_ERROR; + } + fprintf(stderr,"->OK\n"); + return STATUS_OK; +} + +static int +restore_from_dumpall( char * hostName, uint16_t portNum, char * userName) +{ + int status; + char exec_command[512]; + int pid; + char pg_dumpall[256]; + char psql[256]; + char *p=NULL; + + /* set pg_dumpall path */ + memset(pg_dumpall, 0, sizeof(pg_dumpall)); + strncpy(pg_dumpall, PgDumpPath, sizeof(pg_dumpall)); + p = strrchr(pg_dumpall,'/'); + if (p == NULL) + { + return STATUS_ERROR; + } + p++; + strcpy(p,"pg_dumpall"); + + /* set psql path */ + p = NULL; + memset(psql, 0, sizeof(psql)); + strncpy(psql, PgDumpPath, sizeof(psql)); + p = strrchr(psql,'/'); + if (p == NULL) + { + return STATUS_ERROR; + } + p++; + strcpy(p,"psql"); + p+=4; + *p = '\0'; + + snprintf(exec_command,sizeof(exec_command),"%s -i -o -c -h %s -p %d -U %s | %s -p %d template1", + pg_dumpall, + hostName, + portNum, + userName, + psql, + PostPortNumber + ); + fprintf(stderr,"1st exec:[%s]\n",exec_command); + + pid = fork(); + if (pid == 0) + { + system(exec_command); + exit(0); + } + else + { + for (;;) + { + int result; + result = wait(&status); + if (result < 0) + { + if (errno == EINTR) + continue; + return STATUS_ERROR; + } + + if (WIFEXITED(status) == 0 || WEXITSTATUS(status) != 0) + return STATUS_ERROR; + else + break; + } + } + return STATUS_OK; +} + +static int +restore_from_dump( char * hostName, uint16_t portNum, char * userName, char * dbName) +{ + int status; + char exec_command[512]; + int pid= 0; + char pg_restore[256]; + char *p=NULL; + + /* set pq_restore path */ + p = NULL; + memset(pg_restore, 0, sizeof(pg_restore)); + strncpy(pg_restore, PgDumpPath, sizeof(pg_restore)); + p = strrchr(pg_restore,'/'); + if (p == NULL) + { + return STATUS_ERROR; + } + p++; + strcpy(p,"pg_restore"); + + snprintf(exec_command,sizeof(exec_command),"%s -i -Fc -o -b -h %s -p %d -U %s %s | 
%s -i -c -p %d -d %s", + PgDumpPath, + hostName, + portNum, + userName, + dbName, + pg_restore, + PostPortNumber, + dbName + ); + + fprintf(stderr,"2nd exec:[%s]\n",exec_command); + pid = fork(); + if (pid == 0) + { + system(exec_command); + exit(0); + } + else + { + for (;;) + { + int result; + result = wait(&status); + if (result < 0) + { + if (errno == EINTR) + continue; + return STATUS_ERROR; + } + + if (WIFEXITED(status) == 0 || WEXITSTATUS(status) != 0) + return STATUS_ERROR; + else + break; + } + } + return STATUS_OK; +} + +static int +restore_from_each_dump( char * hostName, uint16_t portNum, char * userName) +{ + PGresult * res = (PGresult *)NULL; + int i = 0; + int row_num = 0; + char * dbName = NULL; + int rtn = STATUS_OK; + + res = get_dbName(hostName, portNum, userName); + if (res == (PGresult *)NULL) + { + return STATUS_ERROR; + } + row_num = PQntuples(res); + for ( i = 0 ; i < row_num ; i ++) + { + dbName = PQgetvalue(res,i,0); + if (strlen(dbName) > 0 ) + { + if ((strcmp("template0",dbName)) && + (strcmp("template1",dbName))) + { + rtn = restore_from_dump(hostName, portNum, userName, dbName); + fprintf(stderr,"."); + } + } + } + if (res != (PGresult *)NULL) + { + PQclear(res); + } + + return STATUS_OK; +} + +static PGresult * +get_dbName(char * hostName, uint16_t portNum, char * userName) +{ + PGresult * res = (PGresult *)NULL; + int cnt = 0; + PGconn * conn = (PGconn *)NULL; + char port[8]; + char *database = "template1"; + char * query = "SELECT datname FROM pg_database"; + + if ( (hostName == NULL) || + (portNum <= 0) || + (userName == NULL)) + { + return (PGresult *)NULL; + } + snprintf(port,sizeof(port),"%d", portNum); + + /* create connection to master */ + conn = PQsetdbLogin(hostName, port, NULL, NULL, database, userName, NULL); + if (conn == NULL) + { + return (PGresult *)NULL; + } + /* check to see that the backend Connection was successfully made */ + cnt = 0; + while (PQstatus(conn) == CONNECTION_BAD) + { + if (conn != NULL) + { + 
PQfinish(conn); + } + if (cnt > MAX_RETRY_TIMES ) + { + return (PGresult *)NULL; + } + conn = PQsetdbLogin(hostName, port, NULL, NULL, database, userName, NULL); + cnt ++; + } + res = PQexec(conn , query); + if ((res == NULL) || + (PQresultStatus(res) != PGRES_TUPLES_OK)) + { + PQclear(res); + res = (PGresult *)NULL; + } + if (conn != NULL) + { + PQfinish(conn); + } + + return res; +} + +#endif /* USE_REPLICATION */ diff -aruN postgresql-8.2.4/src/backend/libpq/replicate.c pgcluster-1.7.0rc7/src/backend/libpq/replicate.c --- postgresql-8.2.4/src/backend/libpq/replicate.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/libpq/replicate.c 2007-02-18 22:52:16.000000000 +0100 @@ -0,0 +1,4021 @@ +/*-------------------------------------------------------------------- + * FILE: + * replicate.c + * + * NOTE: + * This file is composed of the functions to call with the source + * at backend for the replication. + * Low level I/O functions that called by in these functions are + * contained in 'replicate_com.c'. 
+ * + *-------------------------------------------------------------------- + */ + +/*-------------------------------------- + * INTERFACE ROUTINES + * + * setup/teardown: + * PGR_Init_Replicate_Server_Data + * PGR_Set_Replicate_Server_Socket + * PGR_delete_shm + * I/O call: + * PGR_Send_Replicate_Command + * table handling: + * PGR_get_replicate_server_info + * status distinction: + * PGR_Is_Replicated_Command + * Xlog_Check_Replicatec + * replicateion main: + * PGR_replication + *------------------------------------- + */ +#ifdef USE_REPLICATION + +#include "postgres.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_NETINET_TCP_H +#include +#endif +#include +#include +#include + +#include "access/transam.h" +#include "bootstrap/bootstrap.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "commands/prepare.h" +#include "nodes/nodes.h" +#include "nodes/print.h" +#include "utils/guc.h" +#include "parser/parser.h" +#include "access/xact.h" +#include "storage/proc.h" +#include "tcop/tcopprot.h" +#include "tcop/utility.h" +#include "postmaster/postmaster.h" +#include "replicate.h" + +/* the source of this value is 'access/transam/varsup.c' */ +#define VAR_OID_PREFETCH (8192) + +PGR_ReplicationLog_Info ReplicationLog_Info; +bool pgr_skip_in_prepared_query = false; + +/*-------------------------------------- + * PROTOTYPE DECLARATION + *-------------------------------------- + */ +static int set_command_args(char argv[PGR_CMD_ARG_NUM][256],char *str); +static bool is_same_replication_server(ReplicateServerInfo * sp1, ReplicateServerInfo * sp2 ); +static ReplicateServerInfo * search_new_replication_server ( ReplicateServerInfo * sp , int socket_type ); + +static int close_replicate_server_socket ( ReplicateServerInfo * sp , int socket_type ); +static int recv_message(int sock,char * buf,int 
flag); +static int send_replicate_packet(int sock,ReplicateHeader * header, char * query_string); +static bool is_copy_from(char * query); +static int get_words( char words[MAX_WORDS][MAX_WORD_LETTERS] ,char * string,int length,int upper); +static int get_table_name(char * table_name, char * query, int position ); +static bool is_not_replication_query(char * query_string, int query_len, char cmdType); +static int Comp_Not_Replicate(PGR_Not_Replicate_Type * nrp1,PGR_Not_Replicate_Type* nrp2); +static bool is_serial_control_query(char cmdType,char * query); +static bool is_select_into_query(char cmdType,char * query); +static int send_response_to_replication_server(const char * notice); +static bool do_not_replication_command(const char * commandTag); +static bool is_create_temp_table(char * query); +static int add_replication_server(char * hostname,char * port, char * recovery_port); +static int change_replication_server(char * hostname,char * port, char * recovery_port); +static int get_new_replication_socket( ReplicateServerInfo * base, ReplicateServerInfo * sp, int socket_type); +static char * get_hostName(char * str); +static void set_response_mode(char * mode); +static void PGR_Set_Current_Replication_Query_ID(char *id); +#ifdef CONTROL_LOCK_CONFLICT +static int wait_lock_answer(void); +static int read_trigger(char * result, int buf_size); +#endif /* CONTROL_LOCK_CONFLICT */ +static int check_conf_data(void); + +static unsigned int get_next_request_id(void); +static bool is_this_query_replicated(char * id); +static int set_replication_id(char * id); +static int return_current_oid(void); +static int sync_oid(char * oid); +static bool is_concerned_with_prepared_select(char cmdType, char * query_string); +static int skip_non_blank(char * ptr, int max); +static int skip_blank(char * ptr, int max); +static int parse_message(char * query_string); +static bool is_prepared_as_select(char * query_string); +static bool is_statement_as_select(char * query_string); + 
+extern ssize_t secure_read(Port *, void *, size_t); +/*-------------------------------------------------------------------- + * SYMBOL + * PGR_Init_Replicate_Server_Data() + * NOTES + * Read Configuration file and create ReplicateServerData table + * ARGS + * void + * RETURN + * OK: STATUS_OK + * NG: STATUS_ERROR + *-------------------------------------------------------------------- + */ +int +PGR_Init_Replicate_Server_Data(void) +{ + int table_size,str_size; + ReplicateServerInfo *sp; + PGR_Not_Replicate_Type * nrp; + ConfDataType * conf; + int rec_no,cnt; + unsigned int ip; + char HostName[HOSTNAME_MAX_LENGTH]; + + memset (HostName,0,sizeof(HostName)); + if (ConfData_Top == (ConfDataType *)NULL) + { + return STATUS_ERROR; + } + + /* allocate replication server information table */ + table_size = sizeof(ReplicateServerInfo) * MAX_SERVER_NUM; + ReplicateServerShmid = shmget(IPC_PRIVATE,table_size,IPC_CREAT | IPC_EXCL | 0600); + if (ReplicateServerShmid < 0) + { + return STATUS_ERROR; + } + ReplicateServerData = (ReplicateServerInfo *)shmat(ReplicateServerShmid,0,0); + if (ReplicateServerData == (ReplicateServerInfo *)-1) + { + return STATUS_ERROR; + } + memset(ReplicateServerData,0,table_size); + sp = ReplicateServerData; + + /* allocate cluster db information table */ + ClusterDBShmid = shmget(IPC_PRIVATE,sizeof(ClusterDBInfo),IPC_CREAT | IPC_EXCL | 0600); + if (ClusterDBShmid < 0) + { + return STATUS_ERROR; + } + ClusterDBData = (ClusterDBInfo *)shmat(ClusterDBShmid,0,0); + if (ClusterDBData == (ClusterDBInfo *)-1) + { + return STATUS_ERROR; + } + memset(ClusterDBData,0,sizeof(ClusterDBInfo)); + PGR_Set_Cluster_Status(STATUS_REPLICATED); + + /* allocate partial replicate table */ + table_size = sizeof(PGR_Not_Replicate_Type) * MAX_SERVER_NUM; + PGR_Not_Replicate = malloc(table_size); + if (PGR_Not_Replicate == (PGR_Not_Replicate_Type*)NULL) + { + return STATUS_ERROR; + } + memset(PGR_Not_Replicate, 0, table_size); + nrp = PGR_Not_Replicate; + cnt = 0; + conf = 
ConfData_Top; + while ((conf != (ConfDataType *)NULL) && (cnt < MAX_SERVER_NUM)) + { + /* set replication server table */ + if (!strcmp(conf->table,REPLICATION_SERVER_INFO_TAG)) + { + rec_no = conf->rec_no; + cnt = rec_no; + if (!strcmp(conf->key,HOST_NAME_TAG)) + { + strncpy((sp + rec_no)->hostName,conf->value,sizeof(sp->hostName)); + conf = (ConfDataType *)conf->next; + continue; + } + if (!strcmp(conf->key,PORT_TAG)) + { + (sp + rec_no)->portNumber = atoi(conf->value); + (sp + rec_no)->sock = -1; + if ((sp + rec_no)->useFlag != DATA_USE) + { + PGR_Set_Replication_Server_Status((sp+rec_no), DATA_INIT); + } + memset((sp + rec_no + 1)->hostName,0,sizeof(sp->hostName)); + (sp + rec_no + 1)->useFlag = DATA_END; + conf = (ConfDataType *)conf->next; + continue; + } + if (!strcmp(conf->key,RECOVERY_PORT_TAG)) + { + (sp + rec_no)->recoveryPortNumber = atoi(conf->value); + if ((sp + rec_no)->useFlag != DATA_USE) + { + PGR_Set_Replication_Server_Status((sp+rec_no), DATA_INIT); + } + memset((sp + rec_no + 1)->hostName,0,sizeof(sp->hostName)); + (sp + rec_no + 1)->useFlag = DATA_END; + conf = (ConfDataType *)conf->next; + continue; + } + } + /* set part replication table */ + if (!strcmp(conf->table,NOT_REPLICATE_INFO_TAG)) + { + rec_no = conf->rec_no; + cnt = rec_no; + if (PGR_Not_Replicate_Rec_Num < rec_no +1) + { + PGR_Not_Replicate_Rec_Num = rec_no +1; + } + if (!strcmp(conf->key,DB_NAME_TAG)) + { + strncpy((nrp + rec_no)->db_name,conf->value,sizeof(nrp->db_name)); + conf = (ConfDataType *)conf->next; + continue; + } + if (!strcmp(conf->key,TABLE_NAME_TAG)) + { + strncpy((nrp + rec_no)->table_name,conf->value,sizeof(nrp->table_name)); + conf = (ConfDataType *)conf->next; + continue; + } + } + if (!strcmp(conf->key,HOST_NAME_TAG)) + { + str_size = sizeof(HostName) ; + memset(HostName,0,str_size); + strncpy(HostName,conf->value,str_size-1); + } + else if (!strcmp(conf->key,RECOVERY_PORT_TAG)) + { + RecoveryPortNumber = atoi(conf->value); + } + else if 
(!strcmp(conf->key,RSYNC_PATH_TAG)) + { + str_size = strlen(conf->value) ; + RsyncPath = malloc(str_size + 1); + if (RsyncPath == NULL) + { + return STATUS_ERROR; + } + memset(RsyncPath,0,str_size + 1); + strncpy(RsyncPath,conf->value,str_size); + } + else if (!strcmp(conf->key,RSYNC_OPTION_TAG)) + { + str_size = strlen(conf->value) ; + RsyncOption = malloc(str_size + 1); + if (RsyncOption == NULL) + { + return STATUS_ERROR; + } + memset(RsyncOption,0,str_size + 1); + strncpy(RsyncOption,conf->value,str_size); + } + else if (!strcmp(conf->key,RSYNC_COMPRESS_TAG)) + { + if (!strcmp(conf->value, "yes")) + RsyncCompress = true; + else if (!strcmp(conf->value, "no")) + RsyncCompress = false; + } + else if (!strcmp(conf->key,PG_DUMP_PATH_TAG)) + { + str_size = strlen(conf->value) ; + PgDumpPath = malloc(str_size + 1); + if (PgDumpPath == NULL) + { + return STATUS_ERROR; + } + memset(PgDumpPath,0,str_size + 1); + strncpy(PgDumpPath,conf->value,str_size); + } + else if (!strcmp(conf->key,STAND_ALONE_TAG)) + { + PGR_Stand_Alone = (PGR_Stand_Alone_Type*)malloc(sizeof(PGR_Stand_Alone_Type)); + if (PGR_Stand_Alone == (PGR_Stand_Alone_Type *)NULL) + { + return STATUS_ERROR; + } + PGR_Stand_Alone->is_stand_alone = false; + if (!strcmp(conf->value,READ_WRITE_IF_STAND_ALONE)) + { + PGR_Stand_Alone->permit = PERMIT_READ_WRITE; + } + else + { + PGR_Stand_Alone->permit = PERMIT_READ_ONLY; + } + } + else if (!strcmp(conf->key,TIMEOUT_TAG)) + { + /* get repliaction timeout */ + PGR_Replication_Timeout = PGRget_time_value(conf->value); + if ((PGR_Replication_Timeout < 1) || (PGR_Replication_Timeout > 3600)) + { + fprintf(stderr,"%s is out of range. It should be between 1sec-1hr.\n",TIMEOUT_TAG); + return STATUS_ERROR; + } + } + else if (!strcmp(conf->key,LIFECHECK_TIMEOUT_TAG)) + { + /* get lifecheck timeout */ + PGR_Lifecheck_Timeout = PGRget_time_value(conf->value); + if ((PGR_Lifecheck_Timeout < 1) || (PGR_Lifecheck_Timeout > 3600)) + { + fprintf(stderr,"%s is out of range. 
It should be between 1sec-1hr.\n",LIFECHECK_TIMEOUT_TAG); + return STATUS_ERROR; + } + } + else if (!strcmp(conf->key,LIFECHECK_INTERVAL_TAG)) + { + /* get lifecheck interval */ + PGR_Lifecheck_Interval = PGRget_time_value(conf->value); + if ((PGR_Lifecheck_Interval < 1) || (PGR_Lifecheck_Interval > 3600)) + { + fprintf(stderr,"%s is out of range. It should between 1sec-1hr.\n",LIFECHECK_INTERVAL_TAG); + return STATUS_ERROR; + } + } + conf = (ConfDataType *)conf->next; + } + TransactionSock = -1; + ReplicateCurrentTime = (ReplicateNow *)malloc(sizeof(ReplicateNow)); + if (ReplicateCurrentTime == (ReplicateNow *)NULL) + { + return STATUS_ERROR; + } + memset(ReplicateCurrentTime,0,sizeof(ReplicateNow)); + + PGRCopyData = (CopyData *)malloc(sizeof(CopyData)); + if (PGRCopyData == (CopyData *)NULL) + { + return STATUS_ERROR; + } + memset(PGRCopyData,0,sizeof(CopyData)); + + if (PGR_Not_Replicate_Rec_Num == 0) + { + free(PGR_Not_Replicate); + PGR_Not_Replicate = NULL; + } + else + { + qsort((char *)PGR_Not_Replicate,PGR_Not_Replicate_Rec_Num,sizeof(PGR_Not_Replicate_Type), (int (*)(const void*,const void*))Comp_Not_Replicate); + } + + PGRSelfHostName = malloc(HOSTNAME_MAX_LENGTH); + if (PGRSelfHostName == NULL) + { + return STATUS_ERROR; + } + memset(PGRSelfHostName,0,HOSTNAME_MAX_LENGTH); + + PGR_password = malloc(sizeof(PGR_Password_Info)); + if (PGR_password == NULL) + { + return STATUS_ERROR; + } + memset(PGR_password,0,sizeof(PGR_Password_Info)); + PGR_password->password = malloc(PASSWORD_MAX_LENGTH); + if (PGR_password->password == NULL) + { + return STATUS_ERROR; + } + memset(PGR_password->password,0,PASSWORD_MAX_LENGTH); + + if (HostName[0] == 0) + { + if (gethostname(HostName,HOSTNAME_MAX_LENGTH) < 0) + { + return STATUS_ERROR; + } + } + ip=PGRget_ip_by_name(HostName); + + sprintf(PGRSelfHostName, + "%d.%d.%d.%d", + (ip ) & 0xff , + (ip >> 8) & 0xff , + (ip >> 16) & 0xff , + (ip >> 24) & 0xff ); + if (RsyncPath == NULL) + { + RsyncPath = strdup(DEFAULT_RSYNC); 
+ } + if (PgDumpPath == NULL) + { + PgDumpPath = strdup(DEFAULT_PG_DUMP); + } + + return (check_conf_data()); +}
+
+/*--------------------------------------------------------------------
+ * SYMBOL
+ *     check_conf_data()
+ * NOTES
+ *     Validate the values gathered from the configuration: every entry
+ *     of ReplicateServerData up to the DATA_END marker, followed by the
+ *     settings of this Cluster DB itself.  The first check that fails
+ *     is reported on stderr and stops the validation.
+ * ARGS
+ *     void
+ * RETURN
+ *     OK: STATUS_OK
+ *     NG: STATUS_ERROR
+ *--------------------------------------------------------------------
+ */
+static int
+check_conf_data(void)
+{
+	ReplicateServerInfo *server;
+
+	/* per replication server checks; the table ends at DATA_END */
+	for (server = ReplicateServerData; server->useFlag != DATA_END; server++)
+	{
+		if (server->hostName[0] == 0)
+		{
+			fprintf(stderr,"Hostname of replication server is not valid.\n");
+			return STATUS_ERROR;
+		}
+		if (server->portNumber < 1024)
+		{
+			fprintf(stderr,"Replication Port of replication server is not valid. It's required larger than 1024.\n");
+			return STATUS_ERROR;
+		}
+		if (server->recoveryPortNumber < 1024)
+		{
+			fprintf(stderr,"RecoveryPort of replication server is not valid. It's required larger than 1024.\n");
+			return STATUS_ERROR;
+		}
+		if (server->portNumber == server->recoveryPortNumber)
+		{
+			fprintf(stderr,"Replication Port and RecoveryPort is conflicted.\n");
+			return STATUS_ERROR;
+		}
+	}
+
+	/* checks on the settings of this Cluster DB */
+	if (RecoveryPortNumber < 1024)
+	{
+		fprintf(stderr,"RecoveryPort of Cluster DB is not valid. It's required larger than 1024.\n");
+		return STATUS_ERROR;
+	}
+	if (PGR_Stand_Alone == NULL)
+	{
+		fprintf(stderr,"Stand Alone Mode is not specified.\n");
+		return STATUS_ERROR;
+	}
+	if (RsyncOption == NULL)
+	{
+		fprintf(stderr,"Option of rsync command is not specified.\n");
+		return STATUS_ERROR;
+	}
+	/* test the pointer before the content so a missing buffer is an error, not a crash */
+	if ((PGRSelfHostName == NULL) || (PGRSelfHostName[0] == '\0'))
+	{
+		fprintf(stderr,"Hostname of Cluster DB is not valid.\n");
+		return STATUS_ERROR;
+	}
+	if (PGR_Lifecheck_Timeout > PGR_Lifecheck_Interval)
+	{
+		fprintf(stderr,"The lifecheck timeout(%d) should be shorter than interval(%d).\n",PGR_Lifecheck_Timeout,PGR_Lifecheck_Interval);
+		return STATUS_ERROR;
+	}
+	return STATUS_OK;
+}
+
+/*--------------------------------------------------------------------
+ * SYMBOL
+ *     PGR_Set_Replicate_Server_Socket()
+ * NOTES
+ *     Create new socket and set ReplicateServerData table
+ *     (the result of each connection attempt is not checked here)
+ * ARGS
+ *     void
+ * RETURN
+ *     OK: STATUS_OK
+ *     NG: STATUS_ERROR
+ *--------------------------------------------------------------------
+ */
+int
+PGR_Set_Replicate_Server_Socket(void)
+{
+	ReplicateServerInfo * sp;
+
+	if (ReplicateServerData == NULL)
+	{
+		return STATUS_ERROR;
+	}
+	for (sp = ReplicateServerData; sp->useFlag != DATA_END; sp++)
+	{
+		/* start from "no socket" so a failed connect leaves a known value */
+		sp->sock = -1;
+		PGR_Create_Socket_Connect(&(sp->sock),sp->hostName,sp->portNumber);
+	}
+	return STATUS_OK;
+}
+
+/*-------------------------------------------------------------------- + * SYMBOL + * PGR_get_replicate_server_socket() + * NOTES + * search or create a socket to connect with the replication server + * ARGS + * ReplicateServerInfo * sp: replication server data (I) + * int socket_type: socket type (I) + * -PGR_TRANSACTION_SOCKET: + * -PGR_QUERY_SOCKET: + * RETURN + * OK: >0(socket) + * NG: -1 + *-------------------------------------------------------------------- + */ +int +PGR_get_replicate_server_socket ( ReplicateServerInfo * sp , int socket_type ) +{ + ReplicateServerInfo * tmp; + tmp = sp; + if (tmp == (ReplicateServerInfo *) NULL) + { + return
-1; + } + if (tmp->hostName[0] == '\0') + { + return -1; + } + + if (TransactionSock != -1) + { + return TransactionSock; + } + + while(PGR_Create_Socket_Connect(&TransactionSock,tmp->hostName,tmp->portNumber) != STATUS_OK) + { + close(TransactionSock); + TransactionSock = -1; + PGR_Set_Replication_Server_Status(tmp, DATA_ERR); + usleep(20); + tmp = PGR_get_replicate_server_info(); + if (tmp == (ReplicateServerInfo *)NULL) + { + return -1; + } + PGR_Set_Replication_Server_Status(tmp, DATA_USE); + usleep(10); + } + return TransactionSock; +} + +/*-------------------------------------------------------------------- + * SYMBOL + * close_replicate_server_socket() + * NOTES + * close the socket connected with the replication server + * ARGS + * ReplicateServerInfo * sp: replication server data (I) + * int socket_type: socket type (I) + * -PGR_TRANSACTION_SOCKET: + * -PGR_QUERY_SOCKET: + * RETURN + * OK: STATUS_OK + * NG: STATUS_ERROR + *-------------------------------------------------------------------- + */ +static int +close_replicate_server_socket ( ReplicateServerInfo * sp , int socket_type ) +{ + if (sp == (ReplicateServerInfo *)NULL ) + { + return STATUS_ERROR; + } + if (sp->hostName[0] == '\0') + { + return STATUS_ERROR; + } + if (TransactionSock != -1) + { + PGR_Close_Sock(&(TransactionSock)); + TransactionSock = -1; + } + switch (socket_type) + { + case PGR_TRANSACTION_SOCKET: + if (TransactionSock != -1) + { + PGR_Close_Sock(&(TransactionSock)); + } + TransactionSock = -1; + sp->sock = -1; + break; + case PGR_QUERY_SOCKET: + if (sp->sock != -1) + { + PGR_Close_Sock(&(sp->sock)); + } + sp->sock = -1; + break; + } + PGR_Set_Replication_Server_Status(sp, DATA_INIT); + return STATUS_OK; +} + +static bool +is_same_replication_server(ReplicateServerInfo * sp1, ReplicateServerInfo * sp2 ) +{ + if ((sp1 == NULL) || (sp2 == NULL)) + { + return false; + } + if ((!strcmp(sp1->hostName,sp2->hostName)) && + (sp1->portNumber == sp2->portNumber) && + 
(sp1->recoveryPortNumber == sp2->recoveryPortNumber)) + { + return true; + } + return false; +} + +static ReplicateServerInfo * +search_new_replication_server ( ReplicateServerInfo * sp , int socket_type ) +{ + ReplicateHeader dummy_header; + ReplicateServerInfo * rs_tbl; + char command[256]; + int sock = -1; + int cnt = 0; + + if ((ReplicateServerData == NULL) || ( sp == NULL)) + { + return NULL; + } + rs_tbl = sp; + close_replicate_server_socket ( sp , socket_type); + sp ++; + while (is_same_replication_server(sp,rs_tbl) != true) + { + if (sp->useFlag == DATA_END) + { + sp = ReplicateServerData; + } + sock = PGR_get_replicate_server_socket( sp , socket_type); + if (sock < 0 ) + { + if (is_same_replication_server(sp,rs_tbl) == true) + { + return NULL; + } + else + { + sp++; + } + continue; + } + memset(&dummy_header, 0, sizeof(ReplicateHeader)); + memset(command,0,sizeof(command)); + snprintf(command,sizeof(command)-1,"SELECT %s(%d,%s,%d,%d)", + PGR_SYSTEM_COMMAND_FUNC, + PGR_CHANGE_REPLICATION_SERVER_FUNC_NO, + sp->hostName, + sp->portNumber, + sp->recoveryPortNumber); + dummy_header.cmdSys = CMD_SYS_CALL; + dummy_header.cmdSts = CMD_STS_NOTICE; + dummy_header.query_size = htonl(strlen(command)); + if (send_replicate_packet(sock,&dummy_header,command) != STATUS_OK) + { + cnt ++; + close_replicate_server_socket ( sp , socket_type); + PGR_Set_Replication_Server_Status(sp, DATA_ERR); + } + else + { + PGR_Set_Replication_Server_Status(sp, DATA_USE); + return sp; + } + if (cnt > MAX_RETRY_TIMES ) + { + sp++; + cnt = 0; + } + else + { + continue; + } + } + return NULL; +} + +static int +get_table_name(char * table_name, char * query, int position ) +{ + + int i,wc; + char * p; + char * sp; + int length; + + if ((table_name == NULL) || (query == NULL) || (position < 1)) + { + return STATUS_ERROR; + } + length = strlen(query); + p = query; + wc = 1; + sp = table_name; + for (i = 0 ; i < length ; i ++) + { + while(isspace(*p)) + { + p++; + i++; + } + while((*p != '\0') && 
(! isspace(*p))) + { + if ((*p == ';') || (*p == '(')) + break; + if (wc == position) + { + *sp = *p; + sp++; + } + p++; + i++; + } + if (wc == position) + { + *sp = '\0'; + break; + } + wc++; + } + return STATUS_OK; +} + +static bool +is_not_replication_query(char * query_string, int query_len, char cmdType) +{ + PGR_Not_Replicate_Type key; + PGR_Not_Replicate_Type * ptr = NULL; + + if (PGR_Not_Replicate_Rec_Num <= 0) + return false; + if (query_string == NULL) + return true; + memset(&key,0,sizeof(PGR_Not_Replicate_Type)); + strncpy(key.db_name ,(char *)(MyProcPort->database_name),sizeof(key.db_name)-1); + switch (cmdType) + { + case CMD_TYPE_INSERT: + get_table_name(key.table_name,query_string,3); + break; + case CMD_TYPE_UPDATE: + get_table_name(key.table_name,query_string,2); + break; + case CMD_TYPE_DELETE: + get_table_name(key.table_name,query_string,3); + break; + case CMD_TYPE_COPY: + get_table_name(key.table_name,query_string,2); + break; + default: + return false; + } + ptr = (PGR_Not_Replicate_Type*)bsearch((void*)&key,(void*)PGR_Not_Replicate,PGR_Not_Replicate_Rec_Num,sizeof(PGR_Not_Replicate_Type), (int (*)(const void*,const void*))Comp_Not_Replicate); + if (ptr == NULL) + { + return false; + } + return true; + +} + +/*-------------------------------------------------------------------- + * SYMBOL + * PGR_Send_Replicate_Command() + * NOTES + * create new socket + * ARGS + * char * query_string: query strings (I) + * char cmdSts: + * char cmdType: + * RETURN + * OK: result + * NG: NULL + *-------------------------------------------------------------------- + */ +char * +PGR_Send_Replicate_Command(char * query_string, int query_len, char cmdSts ,char cmdType) +{ + int sock = -1; + int cnt = 0; + ReplicateHeader header; + char * serverName = NULL; + int portNumber=0; + char * result = NULL; + ReplicateServerInfo * sp = NULL; + ReplicateServerInfo * base = NULL; + int socket_type = 0; + char argv[ PGR_CMD_ARG_NUM ][256]; + int argc = 0; + int func_no = 0; 
+ int check_flag =0; + bool in_transaction = false; + + + /* + * check query string + */ + if ((query_string == NULL) || + (query_len < 0)) + { + return NULL; + } + /* check not replication query */ + if (is_not_replication_query(query_string, query_len, cmdType) == true) + { + PGR_Copy_Data_Need_Replicate = false; + return NULL; + } + + if ((cmdSts == CMD_STS_TRANSACTION ) || + (cmdSts == CMD_STS_SET_SESSION_AUTHORIZATION ) || + (cmdSts == CMD_STS_TEMP_TABLE )) + { + socket_type = PGR_TRANSACTION_SOCKET ; + } + else + { + socket_type = PGR_QUERY_SOCKET ; + } + + if(cmdSts==CMD_STS_TRANSACTION + && (cmdType!=CMD_TYPE_BEGIN && cmdType!=CMD_TYPE_ROLLBACK)) + { + in_transaction = true; + } + + sp = PGR_get_replicate_server_info(); + if (sp == NULL) + { + if (Debug_pretty_print) + elog(DEBUG1,"PGR_get_replicate_server_info get error"); + return NULL; + } + sock = PGR_get_replicate_server_socket( sp , socket_type); + if (sock < 0) + { + if (Debug_pretty_print) + elog(DEBUG1,"PGR_get_replicate_server_socket fail"); + return NULL; + } + result = malloc(PGR_MESSAGE_BUFSIZE + 4); + if (result == NULL) + { + return NULL; + } + + serverName = sp->hostName; + portNumber = (int)sp->portNumber; + memset(&header,0,sizeof(ReplicateHeader)); + + header.cmdSts = cmdSts; + header.cmdType = cmdType; + header.port = htons(PostPortNumber); + header.pid = htons(getpid()); + header.query_size = htonl(query_len); + strncpy(header.dbName ,(char *)(MyProcPort->database_name),sizeof(header.dbName)-1); + strncpy(header.userName , (char *)(MyProcPort->user_name),sizeof(header.userName)-1); + strncpy(header.password , PGR_password->password, PASSWORD_MAX_LENGTH ); + memcpy(header.md5Salt ,MyProcPort->md5Salt, sizeof(header.md5Salt)); + memcpy(header.cryptSalt ,MyProcPort->cryptSalt, sizeof(header.cryptSalt)); + header.request_id = htonl(get_next_request_id()); + header.rlog = 0; + + if (PGRSelfHostName != NULL) + { + strncpy(header.from_host, PGRSelfHostName, HOSTNAME_MAX_LENGTH); + } + + base = 
sp; + PGR_Sock_To_Replication_Server = sock; + +retry_send_prereplicate_packet: + + memset(result,0,PGR_MESSAGE_BUFSIZE + 4); + cnt = 0; + header.cmdSys=CMD_SYS_PREREPLICATE; + + while (send_replicate_packet(sock,&header,query_string) != STATUS_OK) + { + cnt++; + if (cnt >= MAX_RETRY_TIMES ) + { + sock = get_new_replication_socket( base, sp, socket_type); + if (sock < 0) + { + if (Debug_pretty_print) + elog(DEBUG1,"all replication servers may be down"); + PGR_Stand_Alone->is_stand_alone = true; + if (cmdSts == CMD_STS_TRANSACTION ) + { + strcpy(result,PGR_REPLICATION_ABORT_MSG); + return result; + } + free(result); + result = NULL; + return NULL; + + } + if(in_transaction) + { + elog(ERROR,"replicate server down during replicating transaction. aborted."); + free(result); + return NULL; + } + PGR_Sock_To_Replication_Server = sock; + cnt = 0; + } + } + + memset(result,0,PGR_MESSAGE_BUFSIZE); + if (PGR_recv_replicate_result(sock,result,0) < 0) + { + + sock = get_new_replication_socket( base, sp, socket_type); + if (sock < 0) + { + if (Debug_pretty_print) + elog(DEBUG1,"all replication servers may be down"); + PGR_Stand_Alone->is_stand_alone = true; + + if (cmdSts == CMD_STS_TRANSACTION ) + { + strcpy(result,PGR_REPLICATION_ABORT_MSG); + return result; + } + if(result!=NULL) { + free(result); + result = NULL; + } + return NULL; + } + PGR_Sock_To_Replication_Server = sock; + /* replication server should be down */ + + if(in_transaction) + { + elog(ERROR,"replicate server down during replicating transaction. 
aborted."); + free(result); + return NULL; + } + + goto retry_send_prereplicate_packet; + } + + + argc = set_command_args(argv,result); + func_no=atoi(argv[0]); + if(func_no==0) { + /* this server is not primary replicate server*/ + sock=-1; + goto retry_send_prereplicate_packet; + } +retry_send_replicate_packet: + + memset(result,0,PGR_MESSAGE_BUFSIZE + 4); + cnt = 0; + header.cmdSys = CMD_SYS_REPLICATE; + while (send_replicate_packet(sock,&header,query_string) != STATUS_OK) + { + if (cnt > MAX_RETRY_TIMES ) + { + sock = get_new_replication_socket( base, sp, socket_type); + if (sock < 0) + { + if (Debug_pretty_print) + elog(DEBUG1,"all replication servers may be down"); + PGR_Stand_Alone->is_stand_alone = true; + if (cmdSts == CMD_STS_TRANSACTION ) + { + strcpy(result,PGR_REPLICATION_ABORT_MSG); + return result; + } + free(result); + result = NULL; + return NULL; + + } + PGR_Sock_To_Replication_Server = sock; + header.rlog = CONNECTION_SUSPENDED_TYPE; + cnt = 0; + } + cnt ++; + } + + memset(result,0,PGR_MESSAGE_BUFSIZE); + if (PGR_recv_replicate_result(sock,result,0) < 0) + { + /* replication server should be down */ + sock = get_new_replication_socket( base, sp, socket_type); + if (sock < 0) + { + if (Debug_pretty_print) + elog(DEBUG1,"all replication servers may be down"); + PGR_Stand_Alone->is_stand_alone = true; + + if (cmdSts == CMD_STS_TRANSACTION ) + { + strcpy(result,PGR_REPLICATION_ABORT_MSG); + return result; + } + if(result!=NULL) { + free(result); + result = NULL; + } + return NULL; + } + PGR_Sock_To_Replication_Server = sock; + header.rlog = CONNECTION_SUSPENDED_TYPE; + + goto retry_send_replicate_packet; + } + + argc = set_command_args(argv,result); + if (argc >= 1) + { + func_no = atoi(argv[0]); + if (func_no == PGR_SET_CURRENT_TIME_FUNC_NO) + { + if(! 
in_transaction) + PGR_Set_Current_Time(argv[1],argv[2]); + set_replication_id(argv[3]); + set_response_mode(argv[4]); + PGR_Set_Current_Replication_Query_ID(argv[5]); + } + else if (func_no == PGR_NOTICE_DEADLOCK_DETECTION_FUNC_NO) + { + memset(result,0,PGR_MESSAGE_BUFSIZE); + strcpy(result,PGR_DEADLOCK_DETECTION_MSG); + } + else if (func_no == PGR_SET_CURRENT_REPLICATION_QUERY_ID_NO) + { + PGR_Set_Current_Replication_Query_ID(argv[1]); + } + else if (func_no == PGR_QUERY_CONFIRM_ANSWER_FUNC_NO) + { + check_flag = atoi(argv[1]); + if (check_flag == PGR_ALREADY_COMMITTED ) + { + if(! in_transaction) + PGR_Set_Current_Time(argv[2],argv[3]); + set_replication_id(argv[4]); + } + else + { + if(! in_transaction) + PGR_Set_Current_Time(argv[1],argv[2]); + set_replication_id(argv[3]); + /* this query is not replicated */ + /* + free(result); + return NULL; + */ + } + } + } + return result; +} + +uint32_t +PGRget_replication_id(void) +{ + return (ReplicationLog_Info.PGR_Replicate_ID); +} + +static int +set_replication_id(char * id) +{ + uint32_t rid=0; + uint32_t saved_id; + if (id == NULL) + { + return STATUS_ERROR; + } + + rid=(uint32_t)atol(id); + if(rid==0) + return STATUS_OK; + + needToUpdateReplicateIdOnNextQueryIsDone=true; + saved_id=ReplicationLog_Info.PGR_Replicate_ID; + + ReplicationLog_Info.PGR_Replicate_ID =rid; + + + /*set replicate id in this process */ + + + if (CurrentReplicateServer == NULL) + { + PGR_get_replicate_server_info(); + } + if (CurrentReplicateServer != NULL) + { + /* set replicate id in this system */ + saved_id=CurrentReplicateServer->replicate_id; + elog(DEBUG1, "replication id set from %d to %d", saved_id, rid); + + CurrentReplicateServer->replicate_id = (uint32_t)(atol(id)); + } + + return STATUS_OK; +} + + +static unsigned int +get_next_request_id(void) +{ + if (ReplicationLog_Info.PGR_Request_ID +1 < PGR_MAX_COUNTER) + { + ReplicationLog_Info.PGR_Request_ID ++; + } + else + { + ReplicationLog_Info.PGR_Request_ID = 0; + } + return 
ReplicationLog_Info.PGR_Request_ID ; + +} + +static bool +is_this_query_replicated(char * id) +{ + uint32_t replicate_id = 0; + uint32_t saved_id = 0; + int32_t diff=0; + ReplicateServerInfo * replicate_server_info = NULL; + + if (id == NULL) + { + return false; + } + replicate_id = (uint32_t)atol(id); + elog(DEBUG1, "check for replication id , input=%u", replicate_id); + + if (CurrentReplicateServer == NULL) + { + PGR_get_replicate_server_info(); + } + + if (CurrentReplicateServer != NULL) + { + replicate_server_info = CurrentReplicateServer; + } + else if (LastReplicateServer != NULL) + { + replicate_server_info = LastReplicateServer; + } + if (replicate_server_info != NULL) + { + + saved_id=replicate_server_info->replicate_id; + saved_id = saved_id < ReplicationLog_Info.PGR_Replicate_ID + ? ReplicationLog_Info.PGR_Replicate_ID + : saved_id; + + elog(DEBUG1, "check for replication id , now=%u", saved_id); + /* check replicate_id < saved_id logically + * + * see also: + * backend/transam/transam.c#TransactionIdPrecedes + */ + + diff = (int32) (saved_id-replicate_id); + return (diff > 0); + } + elog(DEBUG1, "check for replication id check failed. 
no replication server"); + return false; +}
+
+
+/*--------------------------------------------------------------------
+ * SYMBOL
+ *     get_new_replication_socket()
+ * NOTES
+ *     Give up on the replication server "sp": close its socket, mark it
+ *     DATA_ERR, then search for another server starting from "base" and
+ *     return a socket connected to it.  When no server is found the
+ *     stand alone flag is switched on.
+ * ARGS
+ *     ReplicateServerInfo * base: server the search starts from (I)
+ *     ReplicateServerInfo * sp: server that just failed (I)
+ *     int socket_type: socket type (I)
+ * RETURN
+ *     OK: socket
+ *     NG: -1
+ *--------------------------------------------------------------------
+ */
+static int
+get_new_replication_socket( ReplicateServerInfo * base, ReplicateServerInfo * sp, int socket_type)
+{
+	if (( base == NULL) || ( sp == NULL))
+	{
+		return -1;
+	}
+	close_replicate_server_socket ( sp , socket_type);
+	PGR_Set_Replication_Server_Status(sp, DATA_ERR);
+	sp = search_new_replication_server(base, socket_type);
+	if (sp == NULL)
+	{
+		if (Debug_pretty_print)
+			elog(DEBUG1,"all replication servers may be down");
+		PGR_Stand_Alone->is_stand_alone = true;
+		return -1;
+	}
+	return PGR_get_replicate_server_socket( sp , socket_type);
+}
+
+
+/*--------------------------------------------------------------------
+ * SYMBOL
+ *     PGR_recv_replicate_result()
+ * NOTES
+ *     Wait until "sock" becomes readable, then read one reply of
+ *     PGR_MESSAGE_BUFSIZE bytes into "result" with recv_message().
+ * ARGS
+ *     int sock: socket connected with the replication server (I)
+ *     char * result: reply buffer (O)
+ *     int user_timeout: wait limit in seconds; 0 selects
+ *                       PGR_Replication_Timeout (I)
+ * RETURN
+ *     OK: number of bytes read
+ *     NG: -1 (bad argument, select() failure, timeout or recv failure)
+ *--------------------------------------------------------------------
+ */
+int
+PGR_recv_replicate_result(int sock,char * result,int user_timeout)
+{
+	fd_set rmask;
+	struct timeval timeout;
+	int rtn;
+
+	/* FD_SET() on a negative descriptor is undefined, so reject it here */
+	if ((result == NULL) || (sock < 0))
+	{
+		return -1;
+	}
+
+	/*
+	 * Wait for something to happen.
+	 */
+	for (;;)
+	{
+		/* select() may modify the timeout, so it is rebuilt on every pass */
+		timeout.tv_sec = (user_timeout == 0) ? PGR_Replication_Timeout : user_timeout;
+		timeout.tv_usec = 0;
+
+		FD_ZERO(&rmask);
+		FD_SET(sock,&rmask);
+		rtn = select(sock+1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout);
+		if (rtn < 0)
+		{
+			/* an interrupted wait is simply restarted */
+			if (errno == EINTR)
+				continue;
+			return -1;
+		}
+		if (rtn == 0)
+		{
+			/*
+			 * Timed out.  errno is not set in this case, so it must not
+			 * be consulted: a stale EINTR would make us wait forever.
+			 */
+			return -1;
+		}
+		if (FD_ISSET(sock, &rmask))
+		{
+			return (recv_message(sock, result,0));
+		}
+	}
+	return -1;
+}
+
+/*--------------------------------------------------------------------
+ * SYMBOL
+ *     recv_message()
+ * NOTES
+ *     Read from "sock" until exactly PGR_MESSAGE_BUFSIZE bytes have been
+ *     stored in "buf".  EINTR and EAGAIN are retried.
+ * ARGS
+ *     int sock: socket (I)
+ *     char * buf: buffer of at least PGR_MESSAGE_BUFSIZE bytes (O)
+ *     int flag: flags handed to recv() (I)
+ * RETURN
+ *     OK: PGR_MESSAGE_BUFSIZE
+ *     NG: -1 (recv failure or connection closed before the full reply)
+ *--------------------------------------------------------------------
+ */
+static int
+recv_message(int sock,char * buf,int flag)
+{
+	int received = 0;
+	int r = 0;
+
+	for (;;)
+	{
+		r = recv(sock,buf + received ,PGR_MESSAGE_BUFSIZE - received, flag);
+		if (r < 0)
+		{
+			if (errno == EINTR || errno == EAGAIN)
+			{
+				continue;
+			}
+			elog(DEBUG1, "recv_message():recv failed");
+			return -1;
+		}
+		if (r == 0)
+		{
+			elog(DEBUG1, "recv_message():unexpected EOF");
+			return -1;
+		}
+		received += r;
+		if (received == PGR_MESSAGE_BUFSIZE)
+		{
+			return received;
+		}
+	}
+	return -1;
+}
+
+/*--------------------------------------------------------------------
+ * SYMBOL
+ *     send_replicate_packet()
+ * NOTES
+ *     Send "header" followed by the query text to the replication
+ *     server.  The two parts are assembled in one temporary buffer,
+ *     which is released on every exit path.
+ * ARGS
+ *     int sock: socket connected with the replication server (I)
+ *     ReplicateHeader * header: packet header; query_size holds the
+ *                               query length in network byte order (I)
+ *     char * query_string: query text, may be NULL (I)
+ * RETURN
+ *     OK: STATUS_OK (whole packet sent)
+ *     NG: STATUS_ERROR
+ *--------------------------------------------------------------------
+ */
+static int
+send_replicate_packet(int sock,ReplicateHeader * header, char * query_string)
+{
+	char * buf = NULL;
+	int header_size = 0;
+	int query_size = 0;
+	int packet_size = 0;
+	int sent = 0;
+	int s = 0;
+	int rtn = 0;
+	int status = STATUS_ERROR;
+	fd_set wmask;
+	struct timeval timeout;
+
+	/* check parameter */
+	if ((sock < 0) || (header == NULL))
+	{
+		return STATUS_ERROR;
+	}
+
+	query_size = ntohl(header->query_size);
+	if (query_size < 0)
+	{
+		/* a length that does not fit in an int cannot be sent */
+		return STATUS_ERROR;
+	}
+	header_size = sizeof(ReplicateHeader);
+	packet_size = header_size + query_size;
+
+	/* 4 spare bytes of zero padding are allocated but never sent */
+	buf = malloc(packet_size + 4);
+	if (buf == NULL)
+	{
+		return STATUS_ERROR;
+	}
+	memset(buf,0,packet_size + 4);
+	memcpy(buf,header,header_size);
+	if (query_string != NULL)
+	{
+		/*
+		 * Only query_size bytes go on the wire, so only that many are
+		 * read from the caller's buffer; the byte after them is already
+		 * zero because of the memset above.
+		 */
+		memcpy(buf + header_size,query_string,query_size);
+	}
+
+	/*
+	 * Wait for something to happen.
+	 */
+	for (;;)
+	{
+		timeout.tv_sec = PGR_Replication_Timeout;
+		timeout.tv_usec = 0;
+
+		FD_ZERO(&wmask);
+		FD_SET(sock,&wmask);
+		rtn = select(sock+1, (fd_set *)NULL, &wmask, (fd_set *)NULL, &timeout);
+		if (rtn < 0)
+		{
+			if (errno == EINTR)
+				continue;
+			elog(DEBUG1, "send_replicate_packet():select() failed");
+			break;
+		}
+		if ((rtn == 0) || !FD_ISSET(sock, &wmask))
+		{
+			/*
+			 * Not writable yet: keep waiting, as before.
+			 * NOTE(review): a peer that never becomes writable keeps
+			 * this loop running; confirm whether a timeout should be
+			 * reported as an error instead.
+			 */
+			continue;
+		}
+
+		s = send(sock,buf + sent,packet_size - sent ,0);
+		if (s < 0)
+		{
+			if (errno == EINTR || errno == EAGAIN)
+			{
+				continue;
+			}
+			/* EPIPE || ENCONNREFUSED || ENSOCK || EHOSTUNREACH */
+			elog(DEBUG1, "send_replicate_packet():send error");
+			break;
+		}
+		if (s == 0)
+		{
+			elog(DEBUG1, "send_replicate_packet():unexpected EOF");
+			break;
+		}
+		sent += s;
+		if (sent == packet_size)
+		{
+			status = STATUS_OK;
+			break;
+		}
+	}
+
+	/* single exit: the buffer is released on success and on every failure */
+	free(buf);
+	return status;
+}
+
+/*
+ * PGR_Is_Replicated_Command()
+ *     Report whether "query" is a replicated command; this is decided
+ *     entirely by PGR_Is_System_Command().
+ */
+bool
+PGR_Is_Replicated_Command(char * query)
+{
+	return (PGR_Is_System_Command(query));
+}
+
+int +Xlog_Check_Replicate(int operation) +{ + if (PGR_Get_Cluster_Status() ==
STATUS_RECOVERY) + { + return STATUS_OK; + /* elog(WARNING, "This query is not permitted while recovery db "); */ + } + else if ((operation == CMD_UTILITY ) || + (operation == CMD_INSERT ) || + (operation == CMD_UPDATE ) || + (operation == CMD_DELETE )) + { + return (PGR_Replicate_Function_Call()); + } + return STATUS_OK; +}
+
+/*--------------------------------------------------------------------
+ * SYMBOL
+ *     PGR_Replicate_Function_Call()
+ * NOTES
+ *     Send the query remembered in Query_String to the replication
+ *     server as a CMD_STS_QUERY / CMD_TYPE_SELECT request and clear
+ *     Query_String afterwards.  Nothing is sent during recovery, when
+ *     stand alone mode is not configured, or when no query is pending.
+ * ARGS
+ *     void
+ * RETURN
+ *     STATUS_OK: nothing to do, or the request was answered
+ *     STATUS_DEADLOCK_DETECT: the answer starts with
+ *                             PGR_DEADLOCK_DETECTION_MSG
+ *     STATUS_ERROR: read only stand alone mode, or no answer
+ *--------------------------------------------------------------------
+ */
+int
+PGR_Replicate_Function_Call(void)
+{
+	char *result = NULL;
+	int status = STATUS_OK;
+
+	if ((PGR_Get_Cluster_Status() == STATUS_RECOVERY) ||
+		(PGR_Stand_Alone == NULL))
+	{
+		return STATUS_OK;
+	}
+	if (Query_String == NULL)
+	{
+		return STATUS_OK;
+	}
+
+	/* a stand alone node that only permits reads rejects the query */
+	if ((PGR_Is_Stand_Alone() == true) &&
+		(PGR_Stand_Alone->permit == PERMIT_READ_ONLY))
+	{
+		Query_String = NULL;
+		return STATUS_ERROR;
+	}
+
+	PGR_Need_Notice = true;
+	PGR_Check_Lock.check_lock_conflict = true;
+	result = PGR_Send_Replicate_Command(Query_String,strlen(Query_String), CMD_STS_QUERY,CMD_TYPE_SELECT);
+	if (result == NULL)
+	{
+		status = STATUS_ERROR;
+	}
+	else
+	{
+		PGR_Reload_Start_Time();
+		if (!strncmp(result,PGR_DEADLOCK_DETECTION_MSG,strlen(PGR_DEADLOCK_DETECTION_MSG)))
+		{
+			status = STATUS_DEADLOCK_DETECT;
+		}
+		/* the answer buffer belongs to the caller of PGR_Send_Replicate_Command */
+		free(result);
+		result = NULL;
+	}
+	Query_String = NULL;
+	return status;
+}
+
+/*--------------------------------------------------------------------
+ * SYMBOL
+ *     PGR_delete_shm()
+ * NOTES
+ *     Release what the replication set up allocated: detach and remove
+ *     the two shared memory segments, close the transaction socket and
+ *     free the heap copies of the configuration values.  Every pointer
+ *     is reset to NULL (the socket to -1) after it is released.
+ * ARGS
+ *     void
+ * RETURN
+ *     void
+ *--------------------------------------------------------------------
+ */
+void
+PGR_delete_shm(void)
+{
+	if (ReplicateServerData != NULL)
+	{
+		shmdt(ReplicateServerData);
+		ReplicateServerData = NULL;
+		shmctl(ReplicateServerShmid,IPC_RMID,(struct shmid_ds *)NULL);
+	}
+	if (ClusterDBData != NULL)
+	{
+		shmdt(ClusterDBData);
+		ClusterDBData = NULL;
+		shmctl(ClusterDBShmid,IPC_RMID,(struct shmid_ds *)NULL);
+	}
+
+	if (TransactionSock != -1)
+	{
+		close(TransactionSock);
+		/* forget the descriptor so it is never closed or reused again */
+		TransactionSock = -1;
+	}
+
+	if (RsyncPath != NULL)
+	{
+		free(RsyncPath);
+		RsyncPath = NULL;
+	}
+	if (RsyncOption != NULL)
+	{
+		free(RsyncOption);
+		RsyncOption = NULL;
+	}
+	/* PgDumpPath is heap allocated by the configuration code like RsyncPath */
+	if (PgDumpPath != NULL)
+	{
+		free(PgDumpPath);
+		PgDumpPath = NULL;
+	}
+
+	if (ReplicateCurrentTime != NULL)
+	{
+		free(ReplicateCurrentTime);
+		ReplicateCurrentTime = NULL;
+	}
+
+	if (PGRCopyData != NULL)
+	{
+		free (PGRCopyData);
+		PGRCopyData = NULL;
+	}
+
+	if (PGR_Stand_Alone != NULL)
+	{
+		free(PGR_Stand_Alone);
+		PGR_Stand_Alone = NULL;
+	}
+
+	if (PGR_Not_Replicate != NULL)
+	{
+		free(PGR_Not_Replicate);
+		PGR_Not_Replicate = NULL;
+	}
+	if (PGRSelfHostName != NULL)
+	{
+		free(PGRSelfHostName);
+		PGRSelfHostName = NULL;
+	}
+	if (PGR_password != NULL)
+	{
+		/* the password buffer is owned by the structure: free it first */
+		if (PGR_password->password != NULL)
+		{
+			free(PGR_password->password);
+			PGR_password->password = NULL;
+		}
+		free(PGR_password);
+		PGR_password = NULL;
+	}
+}
+
+/*--------------------------------------------------------------------
+ * SYMBOL
+ *     PGR_get_replicate_server_info()
+ * NOTES
+ *     Choose the replication server to talk to.  An entry already
+ *     marked DATA_USE is preferred; otherwise the first entry that is
+ *     not DATA_ERR is marked DATA_USE and chosen.  The previous choice
+ *     is remembered in LastReplicateServer and the new one is stored in
+ *     CurrentReplicateServer.  When no entry qualifies, the stand alone
+ *     flag is switched on and CurrentReplicateServer becomes NULL.
+ * ARGS
+ *     void
+ * RETURN
+ *     OK: the chosen entry of ReplicateServerData
+ *     NG: NULL
+ *--------------------------------------------------------------------
+ */
+ReplicateServerInfo *
+PGR_get_replicate_server_info(void)
+{
+	ReplicateServerInfo * sp;
+
+	if (ReplicateServerData == NULL)
+	{
+		return (ReplicateServerInfo *)NULL;
+	}
+	/* check current using replication server */
+	sp = PGR_check_replicate_server_info();
+	if (sp != NULL)
+	{
+		if (CurrentReplicateServer != NULL)
+		{
+			LastReplicateServer = CurrentReplicateServer;
+			/*
+			 * NOTE(review): both pointers refer to the same entry here,
+			 * so this assignment copies a value onto itself; confirm
+			 * whether the id was meant to be carried over to "sp".
+			 */
+			CurrentReplicateServer->replicate_id = LastReplicateServer->replicate_id;
+		}
+		CurrentReplicateServer = sp;
+		return sp;
+	}
+	/* there is no used replication server */
+	/* however it may exist still in initial status */
+	for (sp = ReplicateServerData; sp->useFlag != DATA_END; sp++)
+	{
+		if (sp->useFlag == DATA_ERR )
+		{
+			continue;
+		}
+		if (CurrentReplicateServer != NULL)
+		{
+			LastReplicateServer = CurrentReplicateServer;
+			/* NOTE(review): self assignment, see the note above */
+			CurrentReplicateServer->replicate_id = LastReplicateServer->replicate_id;
+		}
+		CurrentReplicateServer = sp;
+		PGR_Set_Replication_Server_Status(sp, DATA_USE);
+		return sp;
+	}
+	/* every server is in error: fall back to stand alone operation */
+	if (PGR_Stand_Alone != NULL)
+	{
+		PGR_Stand_Alone->is_stand_alone = true;
+	}
+	if (CurrentReplicateServer != NULL)
+	{
+		LastReplicateServer = CurrentReplicateServer;
+		/* NOTE(review): self assignment, see the note above */
+		CurrentReplicateServer->replicate_id = LastReplicateServer->replicate_id;
+	}
+	CurrentReplicateServer = NULL;
+	return (ReplicateServerInfo *)NULL;
+}
+
+ReplicateServerInfo * +PGR_check_replicate_server_info(void) +{ + + ReplicateServerInfo * sp; + + if (ReplicateServerData == NULL) + { + return (ReplicateServerInfo *)NULL; + } + sp = ReplicateServerData; + while (sp->useFlag != DATA_END) + { + if (sp->useFlag == DATA_USE ) + { +
return sp; + } + sp++; + } + return NULL; +} + +int +PGR_Send_Copy(CopyData * copy,int end ) +{ + + char cmdSts,cmdType; + char * p = NULL; + char *result = NULL; + char term[8]; + /*int status = 0; */ + + if (copy == NULL) + { + return STATUS_ERROR; + } + + cmdSts = CMD_STS_COPY; + + if (Transaction_Mode > 0) + { + cmdSts = CMD_STS_TRANSACTION ; + } + if (Session_Authorization_Mode) + { + cmdSts = CMD_STS_SET_SESSION_AUTHORIZATION ; + } + cmdType = CMD_TYPE_COPY_DATA; + + copy->copy_data[copy->cnt] = '\0'; + if (end) + { + memset(term,0,sizeof(term)); + term[0]='\\'; + term[1]='.'; + term[2]='\n'; + + cmdType = CMD_TYPE_COPY_DATA_END; + p = NULL; + if (copy->cnt > 0) + { + copy->copy_data[copy->cnt] = '\0'; + p = strstr(copy->copy_data,term); + if (p == NULL) + { + p = &(copy->copy_data[copy->cnt-1]); + copy->cnt--; + } + else + { + p = NULL; + } + } + if (p != NULL) + { + strncpy(p,term,sizeof(term)); + copy->cnt += 4; + } + } + result = PGR_Send_Replicate_Command(copy->copy_data, copy->cnt, cmdSts, cmdType); + memset(copy,0,sizeof(CopyData)); + + if (result != NULL) + { + PGR_Reload_Start_Time(); + free(result); + result = NULL; + return STATUS_OK; + } + else + { + return STATUS_ERROR; + } +} + +CopyData * +PGR_Set_Copy_Data(CopyData * copy, char *str, int len,int end) +{ + CopyData save; + int save_len = 0; + int read_index = 0; + int send_size = 0; + int buf_size = 0; + int rest_len = 0; + int rest_buf_size = 0; + int status = STATUS_OK; + char * ep = NULL; + char term[4]; + + #define BUFF_OFFSET (8) + + if ((PGR_Copy_Data_Need_Replicate == false) || + (copy == NULL)) + { + return (CopyData *)NULL; + } + memset(term,0,sizeof(term)); + term[0]='\n'; + term[1]='\\'; + term[2]='.'; + buf_size = COPYBUFSIZ - BUFF_OFFSET; + read_index = 0; + rest_len = len; + rest_buf_size = buf_size - copy->cnt; + while ((rest_len > 0) && (rest_buf_size > 0)) + { + if (rest_buf_size < rest_len) + { + send_size = rest_buf_size; + rest_len -= send_size; + } + else + { + send_size = 
rest_len; + rest_len = 0; + } + memcpy(&(copy->copy_data[copy->cnt]) ,str + read_index ,send_size); + copy->cnt += send_size; + read_index += send_size; + rest_buf_size = buf_size - copy->cnt; + if (strstr(copy->copy_data,term) != NULL) + { + break; + } + if (rest_buf_size <= 0) + { + ep = strrchr(copy->copy_data,'\n'); + if (ep != NULL) + { + *ep = '\0'; + save_len = copy->cnt - strlen(copy->copy_data) -1; + copy->cnt -= save_len ; + memset(&save,0,sizeof(CopyData)); + memcpy(save.copy_data,(ep+1),save_len+1); + save.cnt = save_len; + *ep = '\n'; + *(ep+1) = '\0'; + status = PGR_Send_Copy(copy,0); + memset(copy,0,sizeof(CopyData)); + if (save_len > 0) + { + memcpy(copy,&save,sizeof(CopyData)); + } + rest_buf_size = buf_size - copy->cnt; + + } + else + { + /* one record is bigger than COPYBUFSIZ */ + /* buffer would be over flow*/ + status = PGR_Send_Copy(copy,0); + memset(copy,0,sizeof(CopyData)); + rest_buf_size = buf_size - copy->cnt; + } + } + } + if (end) + { + status = PGR_Send_Copy(copy,end); + memset(copy,0,sizeof(CopyData)); + } + if (status != STATUS_OK) + { + return (CopyData *)NULL; + } + return copy; +} + +int +PGR_replication(char * query_string, CommandDest dest, Node *parsetree, const char * commandTag) +{ + char *result = NULL; + char cmdSts = CMD_STS_OTHER; + char cmdType = CMD_TYPE_OTHER; + int query_len = 0; + + if ((query_string == NULL) || + (commandTag == NULL)) + { + return STATUS_ERROR; + } + + Query_String = NULL; + query_len = strlen(query_string); + + /* save query data for retry */ + PGR_Retry_Query.query_string = query_string; + PGR_Retry_Query.query_len = query_len; + PGR_Retry_Query.cmdSts = cmdSts; + PGR_Retry_Query.cmdType = cmdType; + PGR_Retry_Query.useFlag = DATA_USE; + /* set cmdType */ + if (!strcmp(commandTag,"BEGIN")) cmdType = CMD_TYPE_BEGIN ; + else if (!strcmp(commandTag,"COMMIT")) cmdType = CMD_TYPE_COMMIT ; + else if (!strcmp(commandTag,"SELECT")) cmdType = CMD_TYPE_SELECT ; + else if (!strcmp(commandTag,"INSERT")) 
cmdType = CMD_TYPE_INSERT ; + else if (!strcmp(commandTag,"UPDATE")) cmdType = CMD_TYPE_UPDATE ; + else if (!strcmp(commandTag,"DELETE")) cmdType = CMD_TYPE_DELETE ; + else if (!strcmp(commandTag,"VACUUM")) cmdType = CMD_TYPE_VACUUM ; + else if (!strcmp(commandTag,"ANALYZE")) cmdType = CMD_TYPE_ANALYZE ; + else if (!strcmp(commandTag,"REINDEX")) cmdType = CMD_TYPE_REINDEX ; + else if (!strcmp(commandTag,"ROLLBACK")) cmdType = CMD_TYPE_ROLLBACK ; + else if (!strcmp(commandTag,"RESET")) cmdType = CMD_TYPE_RESET ; + else if (!strcmp(commandTag,"START TRANSACTION")) cmdType = CMD_TYPE_BEGIN ; + + /* only "replication_server" statement-name is replicated for SHOW. */ + /* see CreateCommandTag() @ backend/tcop/postgres.c */ + + else if (!strcmp(commandTag,"COPY")) + { + cmdType = CMD_TYPE_COPY ; + if (is_copy_from(query_string)) + { + PGR_Copy_Data_Need_Replicate = true; + } + else + { + PGR_Copy_Data_Need_Replicate = false; + return STATUS_NOT_REPLICATE; + } + } + else if (!strcmp(commandTag,"SET")) + { + cmdType = CMD_TYPE_SET; + /* + VariableSetStmt *stmt = (VariableSetStmt *)parsetree; + if (strcmp(stmt->name, "TRANSACTION ISOLATION LEVEL") && + strcmp(stmt->name, "datestyle") && + strcmp(stmt->name, "autocommit") && + strcmp(stmt->name, "client_encoding") && + strcmp(stmt->name, "password_encryption") && + strcmp(stmt->name, "search_path") && + strcmp(stmt->name, "session_authorization") && + strcmp(stmt->name, "timezone")) + + return STATUS_NOT_REPLICATE; + */ + if (strstr(query_string,SYS_QUERY_1) != NULL) + { + return STATUS_NOT_REPLICATE; + } + } + else if (!strcmp(commandTag,"CREATE TABLE")) + { + if (is_create_temp_table(query_string)) + { + Create_Temp_Table_Mode = true; + } + } + if (Create_Temp_Table_Mode) + { + cmdSts = CMD_STS_TEMP_TABLE ; + } + if (Transaction_Mode > 0) + { + cmdSts = CMD_STS_TRANSACTION ; + } + else + { + if ((cmdType == CMD_TYPE_COMMIT ) || + (cmdType == CMD_TYPE_ROLLBACK )) + { + cmdSts = CMD_STS_TRANSACTION ; + if 
(ReplicateCurrentTime != NULL) + { + ReplicateCurrentTime->useFlag = DATA_INIT; + ReplicateCurrentTime->use_seed = 0; + } + } + } + if (Session_Authorization_Mode) + { + cmdSts = CMD_STS_SET_SESSION_AUTHORIZATION ; + if (cmdType == CMD_TYPE_SESSION_AUTHORIZATION_END) + { + Session_Authorization_Mode = false; + } + } + if ((cmdSts == CMD_STS_TRANSACTION ) || + (cmdSts == CMD_STS_SET_SESSION_AUTHORIZATION ) || + (cmdSts == CMD_STS_TEMP_TABLE )) + { + /* check partitional replication table */ + if (is_not_replication_query(query_string, query_len, cmdType)== true ) + { + PGR_Copy_Data_Need_Replicate = false; + return STATUS_NOT_REPLICATE; + } + Query_String = NULL; + if (( do_not_replication_command(commandTag) == true) && + (strcmp(commandTag,"SELECT"))) + { + return STATUS_NOT_REPLICATE; + } + + if (Debug_pretty_print) + elog(DEBUG1,"transaction query send :%s",(char *)query_string); + PGR_Retry_Query.cmdSts = cmdSts; + PGR_Retry_Query.cmdType = cmdType; + result = PGR_Send_Replicate_Command(query_string,query_len, cmdSts,cmdType); + if (result != NULL) + { + if (!strncmp(result,PGR_DEADLOCK_DETECTION_MSG,strlen(PGR_DEADLOCK_DETECTION_MSG))) + { + /* + PGR_Send_Message_To_Frontend(result); + */ + free(result); + result = NULL; + return STATUS_DEADLOCK_DETECT; + } + else if (!strncmp(result,PGR_REPLICATION_ABORT_MSG,strlen(PGR_REPLICATION_ABORT_MSG))) + { + free(result); + result = NULL; + return STATUS_REPLICATION_ABORT; + } + free(result); + result = NULL; + return STATUS_CONTINUE; + } + else + { + return STATUS_ERROR; + } + } + else + { + cmdSts = CMD_STS_QUERY ; + if ( do_not_replication_command(commandTag) == false) + { + Query_String = NULL; + /* check partitional replication table */ + if (is_not_replication_query(query_string, query_len, cmdType)== true ) + { + PGR_Copy_Data_Need_Replicate = false; + return STATUS_NOT_REPLICATE; + } + result = PGR_Send_Replicate_Command(query_string,query_len,cmdSts,cmdType); + if (result != NULL) + { + if 
(!strncmp(result,PGR_DEADLOCK_DETECTION_MSG,strlen(PGR_DEADLOCK_DETECTION_MSG)))
+				{
+					free(result);
+					result = NULL;
+					return STATUS_DEADLOCK_DETECT;
+				}
+				else if (!strncmp(result,PGR_REPLICATION_ABORT_MSG,strlen(PGR_REPLICATION_ABORT_MSG)))
+				{
+					free(result);
+					result = NULL;
+					return STATUS_REPLICATION_ABORT;
+				}
+				/*
+				PGR_Send_Message_To_Frontend(result);
+				*/
+				free(result);
+				result = NULL;
+				return STATUS_CONTINUE;
+			}
+			else
+			{
+				return STATUS_ERROR;
+			}
+		}
+		else
+		{
+			if (( is_serial_control_query(cmdType,query_string) == true) ||
+				( is_select_into_query(cmdType,query_string) == true))
+			{
+				Query_String = NULL;
+				PGR_Need_Notice = true;
+				PGR_Check_Lock.check_lock_conflict = true;
+				result = PGR_Send_Replicate_Command(query_string,query_len,cmdSts,cmdType);
+				if (result != NULL)
+				{
+					/*
+					PGR_Send_Message_To_Frontend(result);
+					*/
+					if (!strncmp(result,PGR_DEADLOCK_DETECTION_MSG,strlen(PGR_DEADLOCK_DETECTION_MSG)))
+					{
+						free(result);
+						return STATUS_DEADLOCK_DETECT;
+					}
+					free(result);
+					result = NULL;
+					return STATUS_CONTINUE;
+				}
+				else
+				{
+					return STATUS_ERROR;
+				}
+			}
+			else
+			{
+				Query_String = query_string;
+				/*PGR_Sock_To_Replication_Server = -1;*/
+			}
+			return STATUS_CONTINUE_SELECT;
+		}
+	}
+	return STATUS_CONTINUE;
+}
+
+
+/*
+ * PGR_Is_System_Command
+ *	Return true when 'query' contains PGR_SYSTEM_COMMAND_FUNC followed
+ *	(anywhere later in the string) by an opening parenthesis.
+ *	A NULL query yields false.
+ */
+bool
+PGR_Is_System_Command(char * query)
+{
+	char * ptr;
+
+	if (query == NULL)
+	{
+		return false;
+	}
+	ptr = strstr(query,PGR_SYSTEM_COMMAND_FUNC);
+	if (ptr != NULL)
+	{
+		ptr = strchr(ptr,'(');
+		if (ptr == NULL)
+			return false;
+		return true;
+	}
+	return false;
+}
+
+/*
+ * set_command_args
+ *	Split 'str' at every ',' or ')' and copy the pieces into
+ *	argv[0 .. PGR_CMD_ARG_NUM-1], each as a NUL terminated string.
+ *
+ *	Each slot holds 256 bytes.  A field longer than 255 characters is
+ *	truncated instead of being written past the end of its slot; the
+ *	original loop had no such bound and overran the caller's array.
+ *
+ *	Returns 0 for a NULL 'str'; otherwise the number of separators seen
+ *	plus one for the trailing field.
+ */
+static int
+set_command_args(char argv[ PGR_CMD_ARG_NUM ][256],char *str)
+{
+	int i,j,cnt,len;
+	char * ptr = str;
+
+	if (str == NULL)
+	{
+		return 0;
+	}
+	len = strlen(str);
+	cnt = j = 0;
+	for ( i = 0 ; i < len ; i++,ptr++)
+	{
+		if (cnt >= PGR_CMD_ARG_NUM)
+			break;
+		if (( *ptr == ',') || (*ptr == ')'))
+		{
+			argv[cnt][j] = '\0';
+			cnt ++;
+			j = 0;
+			continue;
+		}
+		/* keep the last byte of the slot free for the terminator */
+		if (j < (int) sizeof(argv[cnt]) - 1)
+		{
+			argv[cnt][j] = *ptr;
+			j++;
+		}
+	}
+	if (cnt < PGR_CMD_ARG_NUM)
+
argv[cnt][j] = '\0'; + cnt ++; + + return cnt; +} + +static int +add_replication_server(char * hostname,char * port, char * recovery_port) +{ + int cnt; + int portNumber; + int recoveryPortNumber; + ReplicateServerInfo * sp; + + if ((hostname == NULL) || + (port == NULL ) || + (recovery_port == NULL )) + { + return STATUS_ERROR; + } + if (ReplicateServerData == NULL) + { + return STATUS_ERROR; + } + portNumber = atoi(port); + recoveryPortNumber = atoi(recovery_port); + cnt = 0; + sp = ReplicateServerData; + while (sp->useFlag != DATA_END){ + if((!strncmp(sp->hostName,hostname,sizeof(sp->hostName))) && + (sp->portNumber == portNumber) && + (sp->recoveryPortNumber == recoveryPortNumber)) + { + if (sp->useFlag != DATA_USE) + { + PGR_Set_Replication_Server_Status(sp, DATA_INIT); + } + return STATUS_OK; + } + sp ++; + cnt ++; + } + if (cnt < MAX_SERVER_NUM) + { + strncpy(sp->hostName,hostname,sizeof(sp->hostName)); + sp->portNumber = portNumber; + sp->recoveryPortNumber = recoveryPortNumber; + PGR_Set_Replication_Server_Status(sp, DATA_INIT); + memset((sp+1),0,sizeof(ReplicateServerInfo)); + (sp + 1)->useFlag = DATA_END; + } + else + { + return STATUS_ERROR; + } + return STATUS_OK; +} + +static int +change_replication_server(char * hostname,char * port, char * recovery_port) +{ + int cnt; + int portNumber; + int recoveryPortNumber; + ReplicateServerInfo * sp; + + if ((hostname == NULL) || + (port == NULL ) || + (recovery_port == NULL )) + { + return STATUS_ERROR; + } + if (ReplicateServerData == NULL) + { + return STATUS_ERROR; + } + portNumber = atoi(port); + recoveryPortNumber = atoi(recovery_port); + cnt = 0; + sp = ReplicateServerData; + while (sp->useFlag != DATA_END){ + if((!strcmp(sp->hostName,hostname)) && + (sp->portNumber == portNumber) && + (sp->recoveryPortNumber == recoveryPortNumber)) + { + PGR_Set_Replication_Server_Status(sp, DATA_USE); + } + else + { + if (sp->useFlag == DATA_USE) + { + PGR_Set_Replication_Server_Status(sp, DATA_INIT); + } + } + sp ++; 
+ cnt ++; + } + return STATUS_OK; +} + +int +PGR_Set_Current_Time(char * sec, char * usec) +{ + int rtn = 0; + struct timeval local_tp; + struct timezone local_tpz; + struct timeval tv; + + if ((sec == NULL) || + (usec == NULL)) + { + return STATUS_ERROR; + } + rtn = gettimeofday(&local_tp, &local_tpz); + tv.tv_sec = atol(sec); + tv.tv_usec = atol(usec); + ReplicateCurrentTime->offset_sec = local_tp.tv_sec - tv.tv_sec; + ReplicateCurrentTime->offset_usec = local_tp.tv_usec - tv.tv_usec; + ReplicateCurrentTime->tp.tv_sec = tv.tv_sec; + ReplicateCurrentTime->tp.tv_usec = tv.tv_usec; + ReplicateCurrentTime->useFlag = DATA_USE; + ReplicateCurrentTime->use_seed = 0; + + return STATUS_OK; +} + +static void +PGR_Set_Current_Replication_Query_ID(char *id) { + MyProc->replicationId=atol(id); + return; +} + +static void +set_response_mode(char * mode) +{ + int response_mode = 0; + + if (mode == NULL) + return; + response_mode = atoi(mode); + if (response_mode < 0) + return; + if (CurrentReplicateServer == NULL) + { + PGR_get_replicate_server_info(); + if (CurrentReplicateServer == NULL) + { + return; + } + } + if (CurrentReplicateServer->response_mode != response_mode) + { + CurrentReplicateServer->response_mode = response_mode; + } +} + +int +PGR_Call_System_Command(char * command) +{ + char * ptr; + char * args; + char argv[ PGR_CMD_ARG_NUM ][256]; + int argc = 0; + int func_no; + char * hostName = NULL; + + if ((command == NULL) || (ReplicateCurrentTime == NULL)) + { + return STATUS_ERROR; + } + ptr = strstr(command,PGR_SYSTEM_COMMAND_FUNC); + if (ptr == NULL) + return STATUS_ERROR; + ptr = strchr(ptr,'('); + if (ptr == NULL) + return STATUS_ERROR; + args = ptr+1; + ptr = strchr(ptr,')'); + if (ptr == NULL) + return STATUS_ERROR; + *ptr = '\0'; + argc = set_command_args(argv,args); + if (argc < 1) + return STATUS_ERROR; + func_no = atoi(argv[0]); + switch (func_no) + { + /* set current system time */ + case PGR_SET_CURRENT_TIME_FUNC_NO: + if (atol(argv[1]) == 0) + { + 
CreateCheckPoint(false,true); + } + else + { + /* + if ((atoi(argv[3]) > 0) && + (is_this_query_replicated(argv[3]) == true)) + { + return STATUS_SKIP_QUERY; + } + */ + PGR_Set_Current_Time(argv[1],argv[2]); + set_replication_id(argv[3]); + set_response_mode(argv[4]); + PGR_Set_Current_Replication_Query_ID(argv[5]); + + } + break; + /* add new replication server data */ + case PGR_STARTUP_REPLICATION_SERVER_FUNC_NO: + hostName = get_hostName(argv[1]); + add_replication_server(hostName,argv[2],argv[3]); + break; + /* change new replication server */ + case PGR_CHANGE_REPLICATION_SERVER_FUNC_NO: + hostName = get_hostName(argv[1]); + change_replication_server(hostName,argv[2],argv[3]); + break; + case PGR_SET_CURRENT_REPLICATION_QUERY_ID_NO: + PGR_Set_Current_Replication_Query_ID(argv[1]); + break; + case PGR_QUERY_CONFIRM_ANSWER_FUNC_NO: + if ((atoi(argv[3]) > 0) && + (is_this_query_replicated(argv[3]) == true)) + { + /* skip this query */ + return STATUS_SKIP_QUERY; + } + else + { + PGR_Set_Current_Time(argv[1],argv[2]); + set_replication_id(argv[3]); + } + break; + /* get current oid */ + case PGR_GET_OID_FUNC_NO: + return_current_oid(); + break; + /* set current oid */ + case PGR_SET_OID_FUNC_NO: + sync_oid(argv[1]); + break; + /* set noticed session abort */ + case PGR_NOTICE_ABORT_FUNC_NO: + PGR_Noticed_Abort = true; + break; + } + return STATUS_OK; +} + +int +PGR_GetTimeOfDay(struct timeval *tp, struct timezone *tpz) +{ + + int rtn; + + rtn = gettimeofday(tp, tpz); + if (ReplicateCurrentTime == NULL) + { + return rtn; + } + if (ReplicateCurrentTime->useFlag == DATA_USE) + { + if (ReplicateCurrentTime->use_seed != 0) + { + tp->tv_sec -= ReplicateCurrentTime->offset_sec; + if (tp->tv_usec < ReplicateCurrentTime->offset_usec) + { + tp->tv_usec += (1000000 - ReplicateCurrentTime->offset_usec); + tp->tv_sec -= 1; + } + else + { + tp->tv_usec -= ReplicateCurrentTime->offset_usec; + } + } + else + { + tp->tv_sec = ReplicateCurrentTime->tp.tv_sec; + tp->tv_usec = 
ReplicateCurrentTime->tp.tv_usec;
+		}
+		rtn = 0;
+	}
+	return rtn;
+}
+
+/*
+ * PGR_Random
+ *	Wrapper around random().  The first call after the replicated
+ *	timestamp has been (re)set -- use_seed == 0 -- seeds the generator
+ *	with that timestamp's microsecond part and marks the seed as used.
+ *
+ *	random() takes its seed from srandom(), not srand(); the original
+ *	call to srand() therefore never influenced the values returned here.
+ */
+long
+PGR_Random(void)
+{
+	long rtn;
+	if (ReplicateCurrentTime != NULL)
+	{
+		if ( ReplicateCurrentTime->use_seed == 0)
+		{
+			srandom( (unsigned int) ReplicateCurrentTime->tp.tv_usec );
+			ReplicateCurrentTime->use_seed = 1;
+		}
+	}
+	rtn = random();
+	return rtn;
+}
+
+/*
+ * PGR_scan_terminate
+ *	Return a pointer to the first ';' in 'str' that is outside single
+ *	quotes, double quotes and a '$tag$' section, or NULL when there is
+ *	none (or 'str' is NULL).  Text from "--" or "//" up to the end of
+ *	the line is skipped, and a backslash hides the character after it.
+ *
+ *	The scan never moves past the terminating NUL: a trailing backslash
+ *	or a '$' at the very end of the string simply ends the scan.
+ */
+char *
+PGR_scan_terminate( char * str)
+{
+	char * p;
+	int sflag = 0;
+	int dflag = 0;
+	int lflag = 0;
+	int i = 0;
+	char tag[256];
+
+	if (str == NULL)
+		return NULL;
+	p = str;
+	memset(tag,0,sizeof(tag));
+	while ( *p != '\0' )
+	{
+		if ((!strncmp(p,"--",2)) ||
+			(!strncmp(p,"//",2)))
+		{
+			while (( *p != '\n') && (*p != '\0'))
+			{
+				p++;
+			}
+			continue;
+		}
+
+		switch (*p)
+		{
+			case '\'':
+				sflag ^= 1;
+				break;
+			case '\"':
+				dflag ^= 1;
+				break;
+			case '$':
+				i = 0;
+				p++;
+				while (( *p != '\n') && (*p != '\0'))
+				{
+					if (isalnum((unsigned char) *p) == 0)
+					{
+						if (*p == '$')
+						{
+							lflag ^= 1;
+						}
+						break;
+					}
+					else
+					{
+						if (i >= (int) sizeof(tag))
+							break;
+						if (lflag == 0)
+						{
+							tag[i] = *p;
+						}
+						else
+						{
+							if (tag[i] != *p)
+							{
+								break;
+							}
+						}
+						i++;
+					}
+					p++;
+				}
+				/*
+				 * Only the '$' that closes the tag is consumed below.
+				 * Anything else (NUL, newline, ';', a quote ...) goes
+				 * back to the main loop so it is not stepped over.
+				 */
+				if (*p != '$')
+					continue;
+				break;
+			case '\\':
+				/* skip the escaped character, but never the NUL */
+				p++;
+				if (*p != '\0')
+					p++;
+				continue;
+			case ';':
+				if ((!sflag) && (!dflag) && (!lflag))
+					return p;
+				break;
+		}
+		p++;
+	}
+	return NULL;
+}
+
+static bool
+is_copy_from(char * query)
+{
+	char * p;
+	int i;
+	char buf[12];
+	int c_flag = 0;
+	if (query == NULL)
+		return false;
+	p = query;
+	for ( i = 0 ; i <= 1 ; i ++)
+	{
+		/* get 'copy table_name' string */
+		while(isspace(*p))
+			p++;
+		while ((*p != '\0') && (*p != '(') && (!isspace(*p)))
+			p++;
+	}
+	while(isspace(*p))
+		p++;
+	/* skip table column */
+	if (*p == '(')
+	{
+		c_flag = 1;
+		p++;
+		while (*p != '\0')
+		{
+			if (*p == '(')
+				c_flag ++;
+			if (*p == ')')
+				c_flag --;
+			if (c_flag == 0)
+			{
+				p++;
+				break;
+			}
+			p++;
+		}
+		while(isspace(*p))
+			p++;
+	}
+	/* get 'from' or 'to' */
+	i = 0;
+	memset(buf,0,sizeof(buf));
+	while ((*p != '\0') && (!isspace(*p)) && ( i < sizeof(buf)-1))
+	{
+		buf[i] =
(char)toupper(*p); + p++; + i++; + } + if (!strcmp(buf,"FROM")) + { + return true; + } + else + { + return false; + } +} + +static bool +is_create_temp_table(char * query) +{ + int len,wc; + char buf[MAX_WORDS][MAX_WORD_LETTERS]; + + if (query == NULL) + return false; + len = strlen(query); + wc = get_words(buf,query,len,1); + if (wc < 4) + return false; + if ((!strncmp(buf[0],"CREATE", strlen("CREATE"))) && + (!strncmp(buf[1],"TEMP",strlen("TEMP"))) && + (!strncmp(buf[2],"TABLE",strlen("TABLE")))) + { + return true; + } + return false; +} + +static int +get_words( char words[MAX_WORDS][MAX_WORD_LETTERS] ,char * string,int length,int upper) +{ + int i,wc,lc; + char * p = NULL; + char * buf = NULL; + + if (string == NULL) + return STATUS_ERROR; + buf = malloc(length); + if (buf == NULL) + return STATUS_ERROR; + + memset(buf,0,length); + p = string; + wc = 0; + for (i = 0 ; i < length ; i ++) + { + if ((*p == '\0') || (wc >= MAX_WORDS)) + break; + while (isspace(*p)) + { + p++; + i++; + } + lc = 0; + while ((*p != '\0') && (! 
isspace(*p)))
+		{
+			if (upper)
+				*(buf+lc) = (char)toupper(*p);
+			else
+				*(buf+lc) = *p;
+
+			p++;
+			i++;
+			lc++;
+		}
+		/*
+		 * A word may be longer than one slot of words[]; copy at most
+		 * MAX_WORD_LETTERS - 1 bytes so the slot stays NUL terminated
+		 * and the neighbouring slot is not overwritten.
+		 */
+		if (lc > MAX_WORD_LETTERS - 1)
+			lc = MAX_WORD_LETTERS - 1;
+		memset(words[wc],0,MAX_WORD_LETTERS);
+		memcpy(words[wc],buf,lc);
+		memset(buf,0,length);
+		wc++;
+	}
+	free(buf);
+	buf = NULL;
+	return wc;
+}
+
+/*
+ * Comp_Not_Replicate
+ *	Ordering function for two PGR_Not_Replicate_Type entries: compares
+ *	table_name first and db_name second, both case-insensitively.
+ *	Returns 0 when either pointer is NULL.
+ */
+static int
+Comp_Not_Replicate(PGR_Not_Replicate_Type * nrp1,PGR_Not_Replicate_Type* nrp2)
+{
+	int rtn;
+
+	if ((nrp1 == NULL) ||
+		(nrp2 == NULL))
+	{
+		return 0;
+	}
+	rtn = strcasecmp(nrp1->table_name,nrp2->table_name);
+	if (rtn == 0)
+	{
+		rtn = strcasecmp(nrp1->db_name,nrp2->db_name);
+	}
+	return rtn;
+}
+
+/*
+ * PGR_Is_Stand_Alone
+ *	Returns true when PGR_Stand_Alone is not set up, or when the
+ *	stand-alone flag is set and PGR_get_replicate_server_info() finds
+ *	no replication server; false otherwise.
+ */
+bool
+PGR_Is_Stand_Alone(void)
+{
+	ReplicateServerInfo * sp = NULL;
+
+	if (PGR_Stand_Alone == NULL)
+		return true;
+	if (PGR_Stand_Alone->is_stand_alone == true)
+	{
+		sp = PGR_get_replicate_server_info();
+		if (sp == NULL)
+		{
+			return true;
+		}
+	}
+	return false;
+}
+
+/*
+ * PGR_Send_Message_To_Frontend
+ *	Send 'msg' to the client as an 'N' message with severity "NOTICE",
+ *	using the field layout matching the frontend protocol version.
+ *	A NULL 'msg' is replaced by "missing error text".
+ */
+void
+PGR_Send_Message_To_Frontend(char * msg)
+{
+	StringInfoData msgbuf;
+
+	pq_beginmessage(&msgbuf, 'N');
+
+	if (PG_PROTOCOL_MAJOR(FrontendProtocol) >= 3)
+	{
+		/* New style with separate fields */
+		char tbuf[12];
+		int ssval;
+		int i;
+
+		pq_sendbyte(&msgbuf, PG_DIAG_SEVERITY);
+		pq_sendstring(&msgbuf, "NOTICE" );
+
+		/* unpack MAKE_SQLSTATE code */
+		ssval = ERRCODE_WARNING ;
+		for (i = 0; i < 5; i++)
+		{
+			tbuf[i] = PGUNSIXBIT(ssval);
+			ssval >>= 6;
+		}
+		tbuf[i] = '\0';
+
+		pq_sendbyte(&msgbuf, PG_DIAG_SQLSTATE);
+		pq_sendstring(&msgbuf, tbuf);
+
+		/* M field is required per protocol, so always send something */
+		pq_sendbyte(&msgbuf, PG_DIAG_MESSAGE_PRIMARY);
+		if (msg)
+			pq_sendstring(&msgbuf, msg);
+		else
+			pq_sendstring(&msgbuf, _("missing error text"));
+
+		pq_sendbyte(&msgbuf, '\0'); /* terminator */
+	}
+	else
+	{
+		/* Old style --- gin up a backwards-compatible message */
+		StringInfoData buf;
+
+		initStringInfo(&buf);
+
+		appendStringInfo(&buf, "%s: ", "NOTICE");
+
+		if (msg)
+			appendStringInfoString(&buf, msg);
+		else
+			appendStringInfoString(&buf, _("missing error text"));
+
+
appendStringInfoChar(&buf, '\n'); + + pq_sendstring(&msgbuf, buf.data); + + pfree(buf.data); + } + + pq_endmessage(&msgbuf); + + /* + * This flush is normally not necessary, since postgres.c will flush out + * waiting data when control returns to the main loop. But it seems best + * to leave it here, so that the client has some clue what happened if the + * backend dies before getting back to the main loop ... error/notice + * messages should not be a performance-critical path anyway, so an extra + * flush won't hurt much ... + */ + pq_flush(); +} + +static bool +is_serial_control_query(char cmdType,char * query) +{ + char * buf = NULL; + int len = 0; + int i = 0; + char * p = NULL; + + if ((cmdType != CMD_TYPE_SELECT ) || + ( query == NULL)) + { + return false; + } + + p = query; + len = strlen(query) +1; + buf = malloc(len); + if (buf == NULL) + return false; + + memset(buf,0,len); + for ( i = 0 ; i < len ; i ++) + { + *(buf+i) = toupper(*(query+i)); + } + if ((strstr(buf,"NEXTVAL") != NULL) || + (strstr(buf,"SETVAL") != NULL)) + { + free(buf); + buf = NULL; + return true; + } + free(buf); + buf = NULL; + return false; +} + +static bool +is_select_into_query(char cmdType,char * query) +{ + char * buf = NULL; + int len = 0; + int i = 0; + char * p = NULL; + + if ((cmdType != CMD_TYPE_SELECT ) || + ( query == NULL)) + { + return false; + } + + p = query; + len = strlen(query) +1; + buf = malloc(len); + if (buf == NULL) + return false; + + memset(buf,0,len); + for ( i = 0 ; i < len ; i ++) + { + *(buf+i) = toupper(*(query+i)); + } + if (strstr(buf,"INTO") != NULL) + { + free(buf); + buf = NULL; + return true; + } + if (strstr(buf,"CREATE") != NULL) + { + free(buf); + buf = NULL; + return true; + } + free(buf); + buf = NULL; + return false; +} + +static int +send_response_to_replication_server(const char * notice) +{ + ReplicateHeader header; + int status; + + if (PGR_Lock_Noticed) + { + return STATUS_OK; + } + if ((notice == NULL) || + 
(PGR_Sock_To_Replication_Server < 0)) + { + return STATUS_ERROR; + } + + memset(&header,0,sizeof(ReplicateHeader)); + header.cmdSys = CMD_SYS_CALL; + header.cmdSts = CMD_STS_RESPONSE; + if (!strcmp(notice,PGR_QUERY_ABORTED_NOTICE_CMD)) + { + header.cmdType = CMD_TYPE_FRONTEND_CLOSED; + } + header.query_size = htonl(strlen(notice)); + status = send_replicate_packet(PGR_Sock_To_Replication_Server,&header,(char *)notice); + return status; +} + +void +PGR_Notice_Transaction_Query_Done(void) +{ + send_response_to_replication_server(PGR_QUERY_DONE_NOTICE_CMD); +} + +void +PGR_Notice_Transaction_Query_Aborted(void) +{ + send_response_to_replication_server(PGR_QUERY_ABORTED_NOTICE_CMD); +} + +int +PGR_Notice_Conflict(void) +{ + const char * msg = NULL ; + int rtn = STATUS_OK; + + msg = PGR_LOCK_CONFLICT_NOTICE_CMD ; + if (PGR_Check_Lock.deadlock == true) + { + msg = PGR_DEADLOCK_DETECT_NOTICE_CMD ; + } + if (PGR_Check_Lock.dest == TO_FRONTEND) + { + ReadyForQuery(DestRemote); + EndCommand(msg,DestRemote); +#ifdef CONTROL_LOCK_CONFLICT + rtn = wait_lock_answer(); +#endif /* CONTROL_LOCK_CONFLICT */ + } + else + { + send_response_to_replication_server(msg); +#ifdef CONTROL_LOCK_CONFLICT + rtn = PGR_Recv_Trigger (PGR_Replication_Timeout); +#endif /* CONTROL_LOCK_CONFLICT */ + } + return rtn; +} + +#ifdef CONTROL_LOCK_CONFLICT +static int +wait_lock_answer(void) +{ + char result[PGR_MESSAGE_BUFSIZE+4]; + int rtn = 0; + + memset(result,0,sizeof(result)); + rtn = read_trigger(result, PGR_MESSAGE_BUFSIZE); + if (rtn < 0) + return STATUS_ERROR; + return STATUS_OK; +} + +static int +read_trigger(char * result, int buf_size) +{ + int i = 0; + char c; + int r = 0; + + if ((result == NULL) || (buf_size <= 0 )) + { + return EOF; + } + /* + pq_getbytes(result,buf_size); + */ + while ((r = pq_getbytes(&c,1)) == 0) + { + if (i < buf_size -1) + { + *(result + i) = c; + } + else + { + break; + } + if (c == '\0') + break; + i++; + } + + return r; +} +#endif /* CONTROL_LOCK_CONFLICT */ + +int 
+PGR_Recv_Trigger (int user_timeout) +{ + char result[PGR_MESSAGE_BUFSIZE]; + int rtn = 0; + int func_no = 0; + + + if (PGR_Lock_Noticed) + { + return STATUS_OK; + } + if (PGR_Sock_To_Replication_Server < 0) + return STATUS_ERROR; + memset(result,0,sizeof(result)); + rtn = PGR_recv_replicate_result(PGR_Sock_To_Replication_Server,result,user_timeout); + if (rtn > 0) + { + func_no = atoi(result); + if (func_no <= 0) + { + func_no = STATUS_OK; + } + return func_no; + } + else + { + if (user_timeout == 0) + { + PGR_Set_Replication_Server_Status(CurrentReplicateServer, DATA_ERR); + } + return STATUS_ERROR; + } + return STATUS_OK; +} + + +int +PGR_Set_Transaction_Mode(int mode,const char * commandTag) +{ + if (commandTag == NULL) + { + return mode; + } + if ((!strcmp(commandTag,"BEGIN")) || + (!strcmp(commandTag,"START TRANSACTION")) ) + { + return (++mode); + } + if (mode > 0) + { + if ((!strncmp(commandTag,"COMMIT",strlen("COMMIT"))) || + (!strncmp(commandTag,"ROLLBACK",strlen("ROLLBACK")))) + { + return (--mode); + } + } + return mode; +} + +static bool +do_not_replication_command(const char * commandTag) +{ + if (commandTag == NULL) + { + return true; + } + if ((!strcmp(commandTag,"SELECT")) || + (!strcmp(commandTag,"CLOSE CURSOR")) || + (!strcmp(commandTag,"MOVE")) || + (!strcmp(commandTag,"FETCH")) || + (!strcmp(commandTag,"EXPLAIN"))) + { + return true; + } + else + { + return false; + } +} + +void +PGR_Set_Replication_Server_Status( ReplicateServerInfo * sp, int status) +{ + if (sp == NULL) + { + return; + } + if (sp->useFlag != status) + { + sp->useFlag = status; + } +} + +int +PGR_Is_Skip_Replication(char * query) +{ + char skip_2[256]; + + if ((query == NULL) || + (MyProcPort == NULL)) + { + return -1; + } + snprintf(skip_2,sizeof(skip_2),SKIP_QUERY_2,MyProcPort->user_name); + if ((strncmp(query,SKIP_QUERY_1,strlen(SKIP_QUERY_1)) == 0) || + (strncmp(query,skip_2,strlen(skip_2)) == 0)) + { + return 3; + } + if ((strncmp(query,SKIP_QUERY_3,strlen(SKIP_QUERY_3)) 
== 0) || + (strncmp(query,SKIP_QUERY_4,strlen(SKIP_QUERY_4)) == 0)) + { + return 1; + } + return 0; +} + +bool +PGR_Did_Commit_Transaction(void) +{ + + int sock = -1; + int cnt = 0; + ReplicateHeader header; + char * serverName = NULL; + int portNumber=0; + char * result = NULL; + ReplicateServerInfo * sp = NULL; + ReplicateServerInfo * base = NULL; + int socket_type = 0; + char argv[ PGR_CMD_ARG_NUM ][256]; + int argc = 0; + int func_no = 0; + + if (ReplicateCurrentTime->useFlag != DATA_USE) + { + return false; + } + sp = PGR_get_replicate_server_info(); + if (sp == NULL) + { + if (Debug_pretty_print) + elog(DEBUG1,"PGR_get_replicate_server_info get error"); + return false; + } + sock = PGR_get_replicate_server_socket( sp , PGR_QUERY_SOCKET); + if (sock < 0) + { + if (Debug_pretty_print) + elog(DEBUG1,"PGR_get_replicate_server_socket fail"); + return false; + } + result = malloc(PGR_MESSAGE_BUFSIZE); + if (result == NULL) + { + return false; + } + memset(result,0,PGR_MESSAGE_BUFSIZE); + + serverName = sp->hostName; + portNumber = (int)sp->portNumber; + header.cmdSys = CMD_SYS_CALL; + header.cmdSts = CMD_STS_TRANSACTION_ABORT; + header.cmdType = CMD_TYPE_COMMIT_CONFIRM; + header.port = htons(PostPortNumber); + header.pid = htons(getpid()); + header.tv.tv_sec = htonl(ReplicateCurrentTime->tp.tv_sec); + header.tv.tv_usec = htonl(ReplicateCurrentTime->tp.tv_usec); + header.query_size = htonl(0); + strncpy(header.dbName ,(char *)(MyProcPort->database_name),sizeof(header.dbName)-1); + strncpy(header.userName , (char *)(MyProcPort->user_name),sizeof(header.userName)-1); + strncpy(header.password , PGR_password->password, PASSWORD_MAX_LENGTH ); + memcpy(header.md5Salt ,MyProcPort->md5Salt, sizeof(header.md5Salt)); + memcpy(header.cryptSalt ,MyProcPort->cryptSalt, sizeof(header.cryptSalt)); + if (PGRSelfHostName != NULL) + { + strncpy(header.from_host, PGRSelfHostName, HOSTNAME_MAX_LENGTH); + } + header.replicate_id = htonl(ReplicationLog_Info.PGR_Replicate_ID); + 
header.request_id = 0; + + base = sp; + PGR_Sock_To_Replication_Server = sock; + + cnt = 0; + while (send_replicate_packet(sock,&header,"") != STATUS_OK) + { + if (cnt > MAX_RETRY_TIMES ) + { + sock = get_new_replication_socket( base, sp, socket_type); + if (sock < 0) + { + if (Debug_pretty_print) + elog(DEBUG1,"all replication servers may be down"); + PGR_Stand_Alone->is_stand_alone = true; + free(result); + result = NULL; + return false; + } + PGR_Sock_To_Replication_Server = sock; + cnt = 0; + } + cnt ++; + } + + if (PGR_recv_replicate_result(sock,result,6) < 0) + { + free(result); + result = NULL; + return false; + } + /* read answer */ + argc = set_command_args(argv,result); + if (argc >= 1) + { + func_no = atoi(argv[0]); + if (func_no == PGR_TRANSACTION_CONFIRM_ANSWER_FUNC_NO) + { + /* the transaction was commited in other server */ + if (atoi(argv[1]) == PGR_ALREADY_COMMITTED) + { + free(result); + result = NULL; + return true; + } + } + } + free(result); + result = NULL; + return false; +} + +int +PGRsend_system_command(char cmdSts, char cmdType) +{ + ReplicateServerInfo * sp = NULL; + int sock = -1; + int socket_type = 0; + char * result = NULL; + char * serverName = NULL; + int portNumber=0; + ReplicateHeader header; + int cnt = 0; + ReplicateServerInfo * base = NULL; + + sp = PGR_get_replicate_server_info(); + if (sp == NULL) + { + if (Debug_pretty_print) + elog(DEBUG1,"PGR_get_replicate_server_info get error"); + return STATUS_ERROR; + } + sock = PGR_get_replicate_server_socket( sp , PGR_QUERY_SOCKET); + if (sock < 0) + { + if (Debug_pretty_print) + elog(DEBUG1,"PGR_get_replicate_server_socket fail"); + return STATUS_ERROR; + } + result = malloc(PGR_MESSAGE_BUFSIZE); + if (result == NULL) + { + return STATUS_ERROR; + } + memset(result,0,PGR_MESSAGE_BUFSIZE); + + serverName = sp->hostName; + portNumber = (int)sp->portNumber; + header.cmdSys = CMD_SYS_CALL; + header.cmdSts = cmdSts; + header.cmdType = cmdType; + header.port = htons(PostPortNumber); + 
header.pid = htons(getpid());
+	header.tv.tv_sec = htonl(ReplicateCurrentTime->tp.tv_sec);
+	header.tv.tv_usec = htonl(ReplicateCurrentTime->tp.tv_usec);
+	header.query_size = htonl(0);
+	strncpy(header.dbName ,(char *)(MyProcPort->database_name),sizeof(header.dbName)-1);
+	strncpy(header.userName , (char *)(MyProcPort->user_name),sizeof(header.userName)-1);
+	strncpy(header.password , PGR_password->password, PASSWORD_MAX_LENGTH );
+	memcpy(header.md5Salt ,MyProcPort->md5Salt, sizeof(header.md5Salt));
+	memcpy(header.cryptSalt ,MyProcPort->cryptSalt, sizeof(header.cryptSalt));
+	if (PGRSelfHostName != NULL)
+	{
+		strncpy(header.from_host, PGRSelfHostName, HOSTNAME_MAX_LENGTH);
+	}
+	header.replicate_id = htonl(ReplicationLog_Info.PGR_Replicate_ID);
+	header.request_id = 0;
+
+	base = sp;
+	PGR_Sock_To_Replication_Server = sock;
+	cnt = 0;
+	while (send_replicate_packet(sock,&header,"") != STATUS_OK)
+	{
+		if (cnt > MAX_RETRY_TIMES )
+		{
+			sock = get_new_replication_socket( base, sp, socket_type);
+			if (sock < 0)
+			{
+				if (Debug_pretty_print)
+					elog(DEBUG1,"all replication servers may be down");
+				PGR_Stand_Alone->is_stand_alone = true;
+				free(result);
+				result = NULL;
+				return STATUS_ERROR;
+			}
+			PGR_Sock_To_Replication_Server = sock;
+			cnt = 0;
+		}
+		cnt ++;
+	}
+	free(result);
+	result = NULL;
+	return STATUS_OK;
+}
+
+/*
+ * get_hostName
+ *	Extract the text between single quotes from 'str', in place.
+ *	Every quote character is overwritten with NUL; the return value
+ *	points at the character following the first quote, or is NULL when
+ *	'str' is NULL or contains no quote.
+ *
+ *	After a quote the pointer has already been advanced once, so the
+ *	loop re-tests that position instead of advancing again.  The
+ *	original second increment stepped over the terminating NUL whenever
+ *	the closing quote was the last character of the string.
+ */
+static char *
+get_hostName(char * str)
+{
+	char * top = NULL;
+	char * p = NULL;
+
+	if (str == NULL)
+		return NULL;
+	p = str;
+	while ( *p != '\0')
+	{
+		if (*p == '\'')
+		{
+			*p = '\0';
+			p++;
+			if (top == NULL)
+			{
+				top = p;
+			}
+			continue;
+		}
+		p++;
+	}
+	return top;
+}
+
+/*
+ * PGR_Remove_Comment
+ *	Skip leading white space and leading "--" / "//" comment lines and
+ *	return a pointer to the first remaining character of 'str' (which
+ *	may be the terminating NUL).  A NULL 'str' is returned unchanged.
+ *
+ *	strncmp() is used for the two-byte markers because, unlike
+ *	memcmp(), it stops at the terminator and cannot read beyond it.
+ */
+char *
+PGR_Remove_Comment(char * str)
+{
+	char * p = NULL;
+
+	if (str == NULL)
+		return NULL;
+	p = str;
+	while( *p != '\0')
+	{
+		while(isspace((unsigned char) *p))
+		{
+			p++;
+		}
+		if ((!strncmp(p,"--",2)) ||
+			(!strncmp(p,"//",2)))
+		{
+			while((*p != '\n') && (*p != '\0'))
+			{
+				p++;
+			}
+			continue;
+		}
+		break;
+	}
+	return p;
+}
+
+void
+PGR_Force_Replicate_Query(void)
+{
+	if (PGR_Retry_Query.useFlag == DATA_USE)
+	{
+
PGR_Send_Replicate_Command(PGR_Retry_Query.query_string, + PGR_Retry_Query.query_len, + PGR_Retry_Query.cmdSts, + PGR_Retry_Query.cmdType); + } +} + +void +PGR_Notice_DeadLock(void) +{ + ReplicateHeader header; + + memset(&header,0,sizeof(ReplicateHeader)); + header.cmdSys = CMD_SYS_CALL; + header.cmdSts = CMD_STS_NOTICE; + header.cmdType = CMD_TYPE_DEADLOCK_DETECT; + header.query_size = 0; + send_replicate_packet(PGR_Sock_To_Replication_Server,&header,(char *)NULL); +} + +void +PGR_Set_Cluster_Status(int status) +{ + if (ClusterDBData != NULL) + { + if (ClusterDBData->status != status) + { + ClusterDBData->status = status; + } + } +} + +int +PGR_Get_Cluster_Status(void) +{ + if (ClusterDBData != NULL) + { + return (ClusterDBData->status); + } + return 0; +} + +int +PGR_Check_Replicate_Server_Status(ReplicateServerInfo * sp) +{ + ReplicateHeader header; + char * result = NULL; + int status; + int fdP; + + result = malloc(PGR_MESSAGE_BUFSIZE + 4); + if (result == NULL) + { + if (Debug_pretty_print) + elog(DEBUG1,"malloc failed in PGR_Check_Replicate_Server_Status()"); + return STATUS_ERROR; + } + + memset(&header, 0, sizeof(ReplicateHeader)); + memset(result, 0, PGR_MESSAGE_BUFSIZE + 4); + + header.cmdSys = CMD_SYS_PREREPLICATE; + header.cmdSts = CMD_STS_OTHER; + header.cmdType = CMD_TYPE_OTHER; + header.port = htons(PostPortNumber); + header.pid = htons(getpid()); + header.query_size = 0; + strncpy(header.dbName ,(char *)(MyProcPort->database_name),sizeof(header.dbName)-1); + strncpy(header.userName , (char *)(MyProcPort->user_name),sizeof(header.userName)-1); + strncpy(header.password , PGR_password->password, PASSWORD_MAX_LENGTH ); + memcpy(header.md5Salt ,MyProcPort->md5Salt, sizeof(header.md5Salt)); + memcpy(header.cryptSalt ,MyProcPort->cryptSalt, sizeof(header.cryptSalt)); + header.request_id = htonl(get_next_request_id()); + header.rlog = 0; + if (PGRSelfHostName != NULL) { + strncpy(header.from_host, PGRSelfHostName, HOSTNAME_MAX_LENGTH); + } + + /* open a 
new socket for lifecheck */
+	if ((status = PGR_Create_Socket_Connect(&fdP, sp->hostName, sp->portNumber)) == STATUS_ERROR) {
+		if (Debug_pretty_print) {
+			elog(DEBUG1,"create socket failed in PGR_Check_Replicate_Server_Status()");
+		}
+
+		/* status = STATUS_OK */
+	} else {
+		if ((status = send_replicate_packet(fdP, &header, (char *)NULL)) == STATUS_OK) {
+			/* receive result to check for possible deadlock */
+			/*
+			 * NOTE(review): a return value <= 0 is mapped to STATUS_OK
+			 * here, while PGR_Recv_Trigger() treats > 0 as success --
+			 * confirm this inversion is intended for the life check.
+			 */
+			status = (0 >= PGR_recv_replicate_result(fdP, result ,0))
+				? STATUS_OK : STATUS_ERROR;
+		}
+	}
+
+	free(result);
+	PGR_Close_Sock(&fdP);
+
+	return status;
+}
+
+/*
+ * return_current_oid
+ *	Report this node's next OID as a decimal string, either to the
+ *	frontend ('C' message) or to the replication server, depending on
+ *	PGR_Check_Lock.dest.  Before reporting, nextOid is raised to
+ *	FirstBootstrapObjectId if it is below it, and a new batch of
+ *	VAR_OID_PREFETCH OIDs is logged when none are left.
+ *
+ *	The value is copied while OidGenLock is still held so the reported
+ *	number is the one this function actually observed.
+ */
+static int
+return_current_oid(void)
+{
+	char msg[PGR_MESSAGE_BUFSIZE];
+	Oid current_oid;
+
+	LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
+
+	if (ShmemVariableCache->nextOid < ((Oid) FirstBootstrapObjectId))
+	{
+		ShmemVariableCache->nextOid = FirstBootstrapObjectId;
+		ShmemVariableCache->oidCount = 0;
+	}
+
+	if (ShmemVariableCache->oidCount == 0)
+	{
+		XLogPutNextOid(ShmemVariableCache->nextOid + VAR_OID_PREFETCH);
+		ShmemVariableCache->oidCount = VAR_OID_PREFETCH;
+	}
+	current_oid = ShmemVariableCache->nextOid;
+	LWLockRelease(OidGenLock);
+
+	memset(msg,0,sizeof(msg));
+	snprintf(msg, sizeof(msg), "%u", current_oid);
+	if (PGR_Check_Lock.dest == TO_FRONTEND)
+	{
+		pq_puttextmessage('C',msg);
+		pq_flush();
+	}
+	else
+	{
+		send_response_to_replication_server(msg);
+	}
+	return STATUS_OK;
+}
+
+/*
+ * sync_oid
+ *	Advance this node's OID counter so that the next OID handed out is
+ *	the one after 'oid' (decimal string).  Returns STATUS_ERROR when
+ *	'oid' is NULL, parses to 0, or is not ahead of the local counter.
+ *
+ *	The argument is parsed before OidGenLock is taken, and the lock is
+ *	released on the remaining error path; the original returned with
+ *	the lock still held on both error paths.
+ */
+static int
+sync_oid(char * oid)
+{
+	uint32_t next_oid = 0;
+	int offset = 0;
+	char msg[PGR_MESSAGE_BUFSIZE];
+
+	if (oid == NULL)
+		return STATUS_ERROR;
+	next_oid = strtoul(oid, NULL, 10);
+	if (next_oid <= 0)
+		return STATUS_ERROR;
+	next_oid ++;
+
+	LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
+
+	offset = next_oid - ShmemVariableCache->nextOid ;
+	if (offset <= 0)
+	{
+		LWLockRelease(OidGenLock);
+		return STATUS_ERROR;
+	}
+
+	if (next_oid < FirstBootstrapObjectId)
+	{
+		ShmemVariableCache->nextOid = FirstBootstrapObjectId;
+		ShmemVariableCache->oidCount = 0;
+	}
+
+	/* If we run out of logged for use oids then we must log more */
+	while (ShmemVariableCache->oidCount - offset <= 0)
+	{
+		offset -= (ShmemVariableCache->oidCount)
; + (ShmemVariableCache->nextOid) += (ShmemVariableCache->oidCount); + XLogPutNextOid(ShmemVariableCache->nextOid + VAR_OID_PREFETCH); + ShmemVariableCache->oidCount = VAR_OID_PREFETCH; + } + + (ShmemVariableCache->nextOid) += offset; + (ShmemVariableCache->oidCount) -= offset; + + LWLockRelease(OidGenLock); + + memset(msg,0,sizeof(msg)); + snprintf(msg, sizeof(msg), "%u", ShmemVariableCache->nextOid); + if (PGR_Check_Lock.dest == TO_FRONTEND) + { + pq_puttextmessage('C',msg); + pq_flush(); + } + else + { + send_response_to_replication_server(msg); + } + return STATUS_OK; +} + +int +PGR_lo_import(char * filename) +{ + char * result = NULL; + LOArgs *lo_args; + int len = 0; + int buf_size = 0; + + if ((PGR_Is_Replicated_Query == true) || + (PGR_Retry_Query.cmdSts == CMD_STS_TRANSACTION)) + { + return STATUS_OK; + } + if ((PGR_Retry_Query.cmdSts != CMD_STS_QUERY) || + (PGR_Retry_Query.cmdType != CMD_TYPE_SELECT)) + { + return STATUS_OK; + } + + len = strlen(filename); + buf_size = sizeof(LOArgs) + len; + lo_args = (LOArgs *)malloc(buf_size + 4); + if (lo_args == (LOArgs *)NULL) + { + return STATUS_ERROR; + } + memset(lo_args, 0, buf_size + 4); + lo_args->arg1 = htonl((uint32_t)len); + memcpy(lo_args->buf, filename, len); + + result = PGR_Send_Replicate_Command((char *)lo_args, + buf_size, + CMD_STS_LARGE_OBJECT, + CMD_TYPE_LO_IMPORT); + + free(lo_args); + if (result != NULL) + { + free(result); + return STATUS_OK; + } + + return STATUS_ERROR; +} + +int +PGR_lo_create(int flags) +{ + char * result = NULL; + LOArgs lo_args; + + if ((PGR_Is_Replicated_Query == true) || + (PGR_Retry_Query.cmdSts == CMD_STS_TRANSACTION)) + { + return STATUS_OK; + } + if ((PGR_Retry_Query.cmdSts != CMD_STS_QUERY) || + (PGR_Retry_Query.cmdType != CMD_TYPE_SELECT)) + { + return STATUS_OK; + } + memset(&lo_args, 0, sizeof(LOArgs)); + lo_args.arg1 = htonl(flags); + + result = PGR_Send_Replicate_Command((char *)&lo_args, + sizeof(LOArgs), + CMD_STS_LARGE_OBJECT, + CMD_TYPE_LO_CREATE); + + if 
(result != NULL) + { + free(result); + return STATUS_OK; + } + + return STATUS_ERROR; +} + +int +PGR_lo_open(Oid lobjId,int32 mode) +{ + char * result = NULL; + LOArgs lo_args; + + if ((PGR_Is_Replicated_Query == true) || + (PGR_Retry_Query.cmdSts == CMD_STS_TRANSACTION)) + { + return STATUS_OK; + } + if ((PGR_Retry_Query.cmdSts != CMD_STS_QUERY) || + (PGR_Retry_Query.cmdType != CMD_TYPE_SELECT)) + { + return STATUS_OK; + } + memset(&lo_args, 0, sizeof(LOArgs)); + lo_args.arg1 = htonl((uint32_t)lobjId); + lo_args.arg2 = htonl((uint32_t)mode); + + result = PGR_Send_Replicate_Command((char *)&lo_args, + sizeof(LOArgs), + CMD_STS_LARGE_OBJECT, + CMD_TYPE_LO_OPEN); + + if (result != NULL) + { + free(result); + return STATUS_OK; + } + + return STATUS_ERROR; +} + +int +PGR_lo_close(int32 fd) +{ + char * result = NULL; + LOArgs lo_args; + + if ((PGR_Is_Replicated_Query == true) || + (PGR_Retry_Query.cmdSts == CMD_STS_TRANSACTION)) + { + return STATUS_OK; + } + if ((PGR_Retry_Query.cmdSts != CMD_STS_QUERY) || + (PGR_Retry_Query.cmdType != CMD_TYPE_SELECT)) + { + return STATUS_OK; + } + memset(&lo_args, 0, sizeof(LOArgs)); + lo_args.arg1 = htonl((uint32_t)fd); + + result = PGR_Send_Replicate_Command((char *)&lo_args, + sizeof(LOArgs), + CMD_STS_LARGE_OBJECT, + CMD_TYPE_LO_CLOSE); + + if (result != NULL) + { + free(result); + return STATUS_OK; + } + + return STATUS_ERROR; +} + +int +PGR_lo_write(int fd, char *buf, int len) +{ + char * result = NULL; + LOArgs *lo_args = NULL; + int buf_size = 0; + + if ((PGR_Is_Replicated_Query == true) || + (PGR_Retry_Query.cmdSts == CMD_STS_TRANSACTION)) + { + return STATUS_OK; + } + if ((PGR_Retry_Query.cmdSts != CMD_STS_QUERY) || + (PGR_Retry_Query.cmdType != CMD_TYPE_SELECT)) + { + return STATUS_OK; + } + buf_size = sizeof(LOArgs) + len; + lo_args = malloc(buf_size + 4); + if (lo_args == (LOArgs *)NULL) + { + return STATUS_ERROR; + } + memset(lo_args, 0, buf_size + 4); + lo_args->arg1 = htonl((uint32_t)fd); + lo_args->arg2 = 
htonl((uint32_t)len); + memcpy(lo_args->buf, buf, len); + result = PGR_Send_Replicate_Command((char *)lo_args, + buf_size, + CMD_STS_LARGE_OBJECT, + CMD_TYPE_LO_WRITE); + + free(lo_args); + if (result != NULL) + { + free(result); + return STATUS_OK; + } + + return STATUS_ERROR; +} + +int +PGR_lo_lseek(int32 fd, int32 offset, int32 whence) +{ + char * result = NULL; + LOArgs lo_args; + + if ((PGR_Is_Replicated_Query == true) || + (PGR_Retry_Query.cmdSts == CMD_STS_TRANSACTION)) + { + return STATUS_OK; + } + if ((PGR_Retry_Query.cmdSts != CMD_STS_QUERY) || + (PGR_Retry_Query.cmdType != CMD_TYPE_SELECT)) + { + return STATUS_OK; + } + memset(&lo_args, 0, sizeof(LOArgs)); + lo_args.arg1 = htonl((uint32_t)fd); + lo_args.arg2 = htonl((uint32_t)offset); + lo_args.arg3 = htonl((uint32_t)whence); + + result = PGR_Send_Replicate_Command((char *)&lo_args, + sizeof(LOArgs), + CMD_STS_LARGE_OBJECT, + CMD_TYPE_LO_LSEEK); + + if (result != NULL) + { + free(result); + return STATUS_OK; + } + + return STATUS_ERROR; +} + +int +PGR_lo_unlink(Oid lobjId) +{ + char * result = NULL; + LOArgs lo_args; + + if ((PGR_Is_Replicated_Query == true) || + (PGR_Retry_Query.cmdSts == CMD_STS_TRANSACTION)) + { + return STATUS_OK; + } + if ((PGR_Retry_Query.cmdSts != CMD_STS_QUERY) || + (PGR_Retry_Query.cmdType != CMD_TYPE_SELECT)) + { + return STATUS_OK; + } + memset(&lo_args, 0, sizeof(LOArgs)); + lo_args.arg1 = htonl((uint32_t)lobjId); + + result = PGR_Send_Replicate_Command((char *)&lo_args, + sizeof(LOArgs), + CMD_STS_LARGE_OBJECT, + CMD_TYPE_LO_UNLINK); + + if (result != NULL) + { + free(result); + return STATUS_OK; + } + + return STATUS_ERROR; +} + +Oid +PGRGetNewObjectId(Oid last_id) +{ + Oid newId = 0; + + if (last_id == 0) + { + newId = (Oid)PGRget_replication_id(); + } + else + { + newId = last_id + 1; + } + return newId; +} + +int +PGR_Send_Input_Message(char cmdType,StringInfo input_message) +{ + int len = 0; + char * ptr = NULL; + char * result = NULL; + + if (input_message == NULL) + { 
+		return STATUS_ERROR;
+	}
+	if (PGR_Is_Replicated_Query == true)
+	{
+		return STATUS_OK;
+	}
+	len = input_message->len+1;
+	ptr = input_message->data;
+
+	/* check setting of configuration value */
+	if ( PGRnotReplicatePreparedSelect == true)
+	{
+		if (is_concerned_with_prepared_select(cmdType, ptr+1) == true)
+		{
+			return STATUS_OK;
+		}
+	}
+	result = PGR_Send_Replicate_Command(ptr,len, CMD_STS_PREPARE,cmdType);
+	if (result != NULL)
+	{
+		PGR_Reload_Start_Time();
+		free(result);
+		result = NULL;
+		return STATUS_OK;
+	}
+	else
+	{
+		return STATUS_ERROR;
+	}
+}
+
+/*
+ * is_concerned_with_prepared_select
+ *
+ * Decide whether an extended-protocol message belongs to a prepared
+ * SELECT that should be skipped.  On a CMD_TYPE_P_PARSE message the query
+ * text is classified and pgr_skip_in_prepared_query is set when it is a
+ * SELECT, a PREPARE ... AS SELECT, or an EXECUTE/DEALLOCATE of a SELECT
+ * statement.  While the flag is set every message returns true; the flag
+ * is cleared again when a CMD_TYPE_P_SYNC message is seen.
+ */
+static bool
+is_concerned_with_prepared_select(char cmdType, char * query_string)
+{
+	if (cmdType == CMD_TYPE_P_PARSE)
+	{
+		switch (parse_message(query_string))
+		{
+			case PGR_MESSAGE_SELECT:
+				pgr_skip_in_prepared_query = true;
+				break;
+			case PGR_MESSAGE_PREPARE:
+				if (is_prepared_as_select(query_string) == true)
+				{
+					pgr_skip_in_prepared_query = true;
+				}
+				break;
+			case PGR_MESSAGE_EXECUTE:
+			case PGR_MESSAGE_DEALLOCATE:
+				if (is_statement_as_select(query_string) == true)
+				{
+					pgr_skip_in_prepared_query = true;
+				}
+				break;
+		}
+		if (pgr_skip_in_prepared_query == true)
+		{
+			return true;
+		}
+	}
+	if (pgr_skip_in_prepared_query == true)
+	{
+		if (cmdType == CMD_TYPE_P_SYNC)
+		{
+			pgr_skip_in_prepared_query = false;
+		}
+		return true;
+	}
+	return false;
+}
+
+/*
+ * skip_non_blank
+ *
+ * Return the number of characters from 'ptr' up to (not including) the
+ * first whitespace character or parenthesis.  Returns -1 when more than
+ * 'max' characters were scanned without finding one.
+ */
+static int
+skip_non_blank(char * ptr, int max)
+{
+	int i= 0;
+	/* the unsigned char cast keeps isspace() defined for bytes >= 0x80 */
+	while(!isspace((unsigned char) *(ptr+i)))
+	{
+		/*
+		 * Test the character at the scan position.  The previous code
+		 * tested *(ptr+1) on every iteration, so a '(' attached to a
+		 * word ("name(int, text)") was never seen.
+		 */
+		if ((*(ptr+i) == '(') || (*(ptr+i) == ')'))
+		{
+			return i;
+		}
+		i++;
+		if (i > max)
+			return -1;
+	}
+	return i;
+}
+
+/*
+ * skip_blank
+ *
+ * Return the number of leading whitespace characters at 'ptr' (0 when
+ * there are none).  Returns -1 when more than 'max' were scanned.
+ */
+static int
+skip_blank(char * ptr, int max)
+{
+	int i = 0;
+	while(isspace((unsigned char) *(ptr+i)))
+	{
+		i++;
+		if (i > max)
+			return -1;
+	}
+	return i;
+}
+
+static int
+parse_message(char * query_string)
+{
+	char * ptr =NULL;
+	int rtn = 0;
+	int i = 0;
+	int len = 0;
+	if (query_string == NULL)
+	{
+		return PGR_MESSAGE_OTHER;
+	}
+	len = strlen (query_string);
+	if (len <= 0)
+	{
+		return PGR_MESSAGE_OTHER;
+	}
+	ptr = (char
*)query_string; + i = 0; + /* skip space */ + rtn = skip_blank(ptr+i, len-i); + if (rtn < 0) + return PGR_MESSAGE_OTHER; + i += rtn; + + if (!strncasecmp(ptr+i,"SELECT",strlen("SELECT"))) + { + return PGR_MESSAGE_SELECT; + } + if (!strncasecmp(ptr+i,"PREPARE",strlen("PREPARE"))) + { + return PGR_MESSAGE_PREPARE; + } + if (!strncasecmp(ptr+i,"EXECUTE",strlen("EXECUTE"))) + { + return PGR_MESSAGE_EXECUTE; + } + if (!strncasecmp(ptr+i,"DEALLOCATE",strlen("DEALLOCATE"))) + { + return PGR_MESSAGE_DEALLOCATE; + } + return PGR_MESSAGE_OTHER; +} + +static bool +is_prepared_as_select(char * query_string) +{ + char * ptr =NULL; + int rtn = 0; + int i = 0; + int len = 0; + int args =0; + if (query_string == NULL) + { + return false; + } + ptr = (char *)query_string; + len = strlen (query_string); + i = 0; + /* skip "PREPARE" word */ + rtn = skip_non_blank(ptr+i, len-i); + if (rtn < 0) + return false; + i += rtn; + /* skip space */ + rtn = skip_blank(ptr+i, len-i); + if (rtn < 0) + return false; + i += rtn; + /* skip plan_name */ + rtn = skip_non_blank(ptr+i, len-i); + if (rtn < 0) + return false; + i += rtn; + /* skip space */ + rtn = skip_blank(ptr+i, len-i); + if (rtn < 0) + return false; + i += rtn; + /* skip args */ + args = 0; + if (*(ptr+i) == '(') + { + args ++; + i++; + while(args > 0) + { + if (*(ptr+i) == ')') + args --; + else if (*(ptr+i) == '(') + args ++; + i++; + if (i >= len) + return false; + } + /* skip space */ + rtn = skip_blank(ptr+i, len-i); + if (rtn < 0) + return false; + i += rtn; + } + /* skip "AS" word */ + i += strlen("AS"); + if (i >= len) + return false; + /* skip space */ + rtn = skip_blank(ptr+i, len-i); + if (rtn < 0) + return false; + i += rtn; + /* check "SELECT" word */ + if (len-i < strlen("SELECT")) + return false; + if (!strncasecmp(ptr+i,"SELECT",strlen("SELECT"))) + { + return true; + } + return false; + +} + +static bool +is_statement_as_select(char * query_string) +{ + char * ptr =NULL; + int rtn = 0; + int i = 0; + int j = 0; + int 
len = 0;
+	bool result = false;
+	PrepareStmt stmt;
+	char * name = NULL;
+	if (query_string == NULL)
+	{
+		return false;
+	}
+	ptr = (char *)query_string;
+	len = strlen (query_string);
+	i = 0;
+	/* skip "EXECUTE" or "DEALLOCATE" word */
+	rtn = skip_non_blank(ptr+i, len-i);
+	if (rtn < 0)
+		return false;
+	i += rtn;
+	/* skip space */
+	rtn = skip_blank(ptr+i, len-i);
+	if (rtn < 0)
+		return false;
+	i += rtn;
+	/* one extra byte so the copied name always stays NUL-terminated */
+	if ((name = malloc(len + 1)) == NULL)
+		return false;
+	memset(name,0,len + 1);
+	j = 0;
+	/*
+	 * NOTE(review): isalnum() stops at '_' and '$', so a statement name
+	 * such as "my_plan" is looked up as "my" -- confirm whether that is
+	 * intended.
+	 */
+	while(isalnum((unsigned char) *(ptr+i)))
+	{
+		*(name+j) = *(ptr+i);
+		i++;
+		j++;
+		if (i > len)
+		{
+			/* do not leak the name buffer on the early exit */
+			free(name);
+			return false;
+		}
+	}
+	stmt.name = name;
+	result = PGR_is_select_prepared_statement(&stmt);
+	free(name);
+	return result;
+}
+
+/*
+ * PGR_is_select_prepare_query
+ *
+ * Return true when debug_query_string is set and is_prepared_as_select()
+ * accepts it.
+ */
+bool
+PGR_is_select_prepare_query(void)
+{
+	if (debug_query_string == NULL)
+	{
+		return false;
+	}
+	return (is_prepared_as_select((char *)debug_query_string));
+}
+
+/*
+ * PGR_get_md5salt
+ *
+ * Decode a salt written as a sequence of parenthesised decimal numbers,
+ * e.g. "(12)(34)(56)(78)": the n-th "(number)" group is converted with
+ * atoi() and stored in md5Salt[n-1].  Returns md5Salt.
+ *
+ * NOTE(review): the number of groups is not limited here, so md5Salt must
+ * be large enough for every "(...)" group in 'string' -- confirm at the
+ * callers.
+ */
+char *
+PGR_get_md5salt(char * md5Salt, char * string)
+{
+	char buf[24];
+	char * ptr = NULL;
+	int len = 0;
+	int i = 0;
+	int cnt = 0;
+	int index = 0;
+	bool set_flag = false;
+
+	if ((md5Salt == NULL) || (string == NULL))
+	{
+		return md5Salt;
+	}
+	ptr = (char *)md5Salt;
+	len = strlen(string);
+	for ( i = 0 ; i < len ; i ++)
+	{
+		if (*(string+i) == ')')
+		{
+			/* keep the terminator inside buf even after repeated ')' */
+			if (index >= (int) sizeof(buf))
+				index = (int) sizeof(buf) - 1;
+			buf[index++] = '\0';
+			*ptr = (char)atoi(buf);
+			set_flag = false;
+		}
+		if (set_flag)
+		{
+			/* drop characters that would overflow buf; leave room for '\0' */
+			if (index < (int) sizeof(buf) - 1)
+				buf[index++] = *(string+i);
+		}
+		if (*(string+i) == '(')
+		{
+			set_flag = true;
+			index = 0;
+			ptr = (char *)(md5Salt + cnt);
+			cnt++;
+		}
+	}
+	return md5Salt;
+}
+
+#endif /* USE_REPLICATION */
diff -aruN postgresql-8.2.4/src/backend/libpq/replicate_com.c pgcluster-1.7.0rc7/src/backend/libpq/replicate_com.c
--- postgresql-8.2.4/src/backend/libpq/replicate_com.c	1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/backend/libpq/replicate_com.c	2007-02-18 22:52:16.000000000 +0100
@@ -0,0 +1,675 @@
+/*--------------------------------------------------------------------
+ * FILE:
+ *     replicate_com.c
+ *
+ * NOTE:
+ *     This file is composed of the functions to call with the source
+ *     at backend for the
replication. + * Low level I/O functions that called by in these functions are + * contained in 'replicate_com.c'. + * + *-------------------------------------------------------------------- + */ + +/*-------------------------------------- + * INTERFACE ROUTINES + * + * setup/teardown: + * PGR_Close_Sock + * PGR_Free_Conf_Data + * I/O call: + * PGR_Create_Socket_Connect + * PGR_Create_Socket_Bind + * PGR_Create_Acception + * table handling: + * PGR_Get_Conf_Data + *------------------------------------- + */ +#ifdef USE_REPLICATION + +#include "postgres.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_NETINET_TCP_H +#include +#endif +#include +#include +#include + +#include "libpq/libpq.h" +#include "miscadmin.h" +#include "nodes/print.h" +#include "utils/guc.h" +#include "parser/parser.h" +#include "access/xact.h" +#include "replicate_com.h" + +int PGR_Create_Socket_Connect(int * fdP, char * hostName , unsigned short portNumber); +void PGR_Close_Sock(int * sock); +int PGR_Create_Socket_Bind(int * fdP, char * hostName , unsigned short portNumber); +int PGR_Create_Acception(int fd, int * sockP, char * hostName , unsigned short portNumber); +int PGR_Free_Conf_Data(void); +int PGR_Get_Conf_Data(char * dir , char * fname); +void PGRset_recovery_packet_no(RecoveryPacket * packet, int packet_no); +unsigned int PGRget_ip_by_name(char * host); +int PGRget_time_value(char *str); + +static char * get_string(char * buf); +static bool is_start_tag(char * ptr); +static bool is_end_tag(char * ptr); +static void init_conf_data(ConfDataType *conf); +static int get_key(char * key, char * str); +static int get_conf_key_value(char * key, char * value , char * str); +static int add_conf_data(char *table,int rec_no, char *key,char * value); +static int get_table_data(FILE * fp,char * table, int rec_no); +static int get_single_data(char * str); +static int get_conf_file(char * 
fname);
+
+/*--------------------------------------------------------------------
+ * SYMBOL
+ *     PGR_Create_Socket_Connect()
+ * NOTES
+ *     create a new TCP socket and connect it to hostName:portNumber
+ * ARGS
+ *     int * fdP: receives the connected socket; set to -1 on failure
+ *     char * hostName: host to connect to (resolved with gethostbyname);
+ *                      must not be NULL or empty
+ *     unsigned short portNumber: port to connect to; values below 1000
+ *                      are rejected
+ * RETURN
+ *     OK: STATUS_OK
+ *     NG: STATUS_ERROR
+ *--------------------------------------------------------------------
+ */
+int
+PGR_Create_Socket_Connect(int * fdP, char * hostName , unsigned short portNumber)
+{
+
+	socklen_t len = 0;
+	struct sockaddr_in addr;
+	int one = 1;
+
+	if (fdP == NULL)
+	{
+		return STATUS_ERROR;
+	}
+	/* check for NULL before dereferencing hostName */
+	if ((hostName == NULL) || (*hostName == '\0') || (portNumber < 1000))
+	{
+		* fdP = -1;
+		return STATUS_ERROR;
+	}
+	if ((*fdP = socket(AF_INET, SOCK_STREAM, 0)) < 0)
+	{
+		* fdP = -1;
+		return STATUS_ERROR;
+	}
+	if ((setsockopt(*fdP, SOL_SOCKET, SO_REUSEADDR, (char *) &one, sizeof(one))) == -1)
+	{
+		PGR_Close_Sock(fdP);
+		return STATUS_ERROR;
+	}
+	if (setsockopt(*fdP, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)) < 0)
+	{
+		PGR_Close_Sock(fdP);
+		return STATUS_ERROR;
+	}
+
+	/* start from a zeroed address so no field is left uninitialized */
+	memset(&addr, 0, sizeof(addr));
+	addr.sin_family = AF_INET;
+	if ((hostName == NULL ) || (hostName[0] == '\0'))
+		addr.sin_addr.s_addr = htonl(INADDR_ANY);
+	else
+	{
+		struct hostent *hp;
+
+		hp = gethostbyname(hostName);
+		if ((hp == NULL) || (hp->h_addrtype != AF_INET))
+		{
+			PGR_Close_Sock(fdP);
+			return STATUS_ERROR;
+		}
+		memmove((char *) &(addr.sin_addr), (char *) hp->h_addr, hp->h_length);
+	}
+
+	addr.sin_port = htons(portNumber);
+	len = sizeof(struct sockaddr_in);
+
+	if (connect(*fdP,(struct sockaddr*)&addr,len) < 0)
+	{
+		PGR_Close_Sock(fdP);
+		return STATUS_ERROR;
+	}
+
+	return STATUS_OK;
+}
+
+int
+PGR_Create_Socket_Bind(int * fdP, char * hostName , unsigned short portNumber)
+{
+
+	int err;
+	size_t len = 0;
+	struct sockaddr_in addr;
+	int one = 1;
+
+	if ((*fdP = socket(AF_INET, SOCK_STREAM, 0)) < 0)
+	{
+		return STATUS_ERROR;
+	}
+	if ((setsockopt(*fdP, SOL_SOCKET, SO_REUSEADDR, (char *) &one, sizeof(one))) == -1)
+	{
+		PGR_Close_Sock(fdP);
+		return STATUS_ERROR;
+	}
+	addr.sin_family =
AF_INET;
+	if ((hostName == NULL ) || (hostName[0] == '\0'))
+		addr.sin_addr.s_addr = htonl(INADDR_ANY);
+	else
+	{
+		struct hostent *hp;
+
+		hp = gethostbyname(hostName);
+		if ((hp == NULL) || (hp->h_addrtype != AF_INET))
+		{
+			PGR_Close_Sock(fdP);
+			return STATUS_ERROR;
+		}
+		memmove((char *) &(addr.sin_addr), (char *) hp->h_addr, hp->h_length);
+	}
+
+	addr.sin_port = htons(portNumber);
+	len = sizeof(struct sockaddr_in);
+
+	err = bind(*fdP, (struct sockaddr *) & addr, len);
+	if (err < 0)
+	{
+		PGR_Close_Sock(fdP);
+		return STATUS_ERROR;
+	}
+	err = listen(*fdP, MAX_SOCKET_QUEUE );
+	if (err < 0)
+	{
+		PGR_Close_Sock(fdP);
+		return STATUS_ERROR;
+	}
+	return STATUS_OK;
+}
+
+/*
+ * PGR_Create_Acception
+ *
+ * Accept one connection on the listening socket 'fd' and enable
+ * TCP_NODELAY and SO_KEEPALIVE on it.  On success the new socket is
+ * stored in *sockP and STATUS_OK is returned.  On any failure *sockP is
+ * set to -1, an already accepted socket is closed, and STATUS_ERROR is
+ * returned.  hostName and portNumber are not used.
+ */
+int
+PGR_Create_Acception(int fd, int * sockP, char * hostName , unsigned short portNumber)
+{
+	int sock;
+	struct sockaddr addr;
+	socklen_t len = 0;	/* accept() takes a socklen_t *, not a size_t * */
+	int one = 1;
+
+	len = sizeof(struct sockaddr);
+	if ((sock = accept(fd, &addr, &len)) < 0)
+	{
+		*sockP = -1;
+		return STATUS_ERROR;
+	}
+
+	if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)) < 0)
+	{
+		/* the caller never sees this descriptor, so close it here */
+		close(sock);
+		*sockP = -1;
+		return STATUS_ERROR;
+	}
+	if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *) &one, sizeof(one)) < 0)
+	{
+		close(sock);
+		*sockP = -1;
+		return STATUS_ERROR;
+	}
+	*sockP = sock;
+
+	return STATUS_OK;
+}
+
+/*
+ * PGR_Close_Sock
+ *
+ * Close the socket stored in *sock, if any, and reset *sock to -1.
+ */
+void
+PGR_Close_Sock(int * sock)
+{
+	if (sock == NULL)
+	{
+		return;
+	}
+	/* skip close() for a descriptor that is already marked invalid */
+	if (*sock >= 0)
+	{
+		close( (int)*sock);
+	}
+	*sock = -1;
+}
+
+/*
+ * get_string
+ *
+ * Clean up one configuration line in place: leading whitespace is
+ * dropped, the line is cut at the first '#', and whitespace is no longer
+ * copied once as many non-blank characters have been copied as the whole
+ * line contains.  Returns buf.
+ */
+static char *
+get_string(char * buf)
+{
+	int i,len1,len2,start_flag;
+	char *readp, *writep;
+
+	writep = readp = buf;
+	i = len1 = 0;
+	/* len1: number of non-blank characters in the whole line */
+	while (*(readp +i) != '\0')
+	{
+		if (!isspace(*(readp+ i)))
+		{
+			len1 ++;
+		}
+		i++;
+	}
+	start_flag = len2 = 0;
+	while (*readp != '\0')
+	{
+		if (*readp == '#')
+		{
+			*writep = '\0';
+			break;
+		}
+		if (isspace(*readp))
+		{
+			if ((len2 >= len1) || (!start_flag))
+			{
+				readp++;
+				continue;
+			}
+			*writep = *readp;
+		}
+		else
+		{
+			start_flag = 1;
+			*writep = *readp;
+			len2 ++;
+		}
+		readp ++;
+		writep ++;
+	}
+	*writep = '\0';
+	return buf;
+}
+
+static bool
+is_start_tag(char * ptr)
+{
+	if ((*ptr == '<') && (*(ptr+1) !=
'/')) + { + return true; + } + return false; +} + +static bool +is_end_tag(char * ptr) +{ + if ((*ptr == '<') && (*(ptr+1) == '/')) + { + return true; + } + return false; +} + +static void +init_conf_data(ConfDataType *conf) +{ + memset(conf->table,0,sizeof(conf->table)); + memset(conf->key,0,sizeof(conf->key)); + memset(conf->value,0,sizeof(conf->value)); + conf->rec_no = 0; + conf->last = NULL; + conf->next = NULL; +} + +static int +get_key(char * key, char * str) +{ + int offset = 1; + char * ptr_s,*ptr_e; + + ptr_s = strchr(str,'<'); + if (ptr_s == NULL) + { + return STATUS_ERROR; + } + if (*(ptr_s+1) == '/') + { + offset = 2; + } + ptr_e = strchr(str,'>'); + if (ptr_e == NULL) + { + return STATUS_ERROR; + } + *ptr_e = '\0'; + strcpy(key,ptr_s + offset); + *ptr_e = '>'; + return STATUS_OK; +} + +static int +get_conf_key_value(char * key, char * value , char * str) +{ + int i; + int len1,len2,start_flag; + char * ptr_s,*ptr_e; + + if(get_key(key,str) == STATUS_ERROR) + { + return STATUS_ERROR; + } + ptr_e = strchr(str,'>'); + if (ptr_e == NULL) + { + return STATUS_ERROR; + } + ptr_s = ptr_e + 1; + + len1 = 0; + while ((*ptr_s != '<') && (*ptr_s != '\0')) + { + if (! 
isspace(*ptr_s)) + { + len1 ++; + } + ptr_s ++; + } + ptr_s = ptr_e + 1; + i = len2 = start_flag = 0; + while ((*ptr_s != '<') && (*ptr_s != '\0')) + { + if (isspace(*ptr_s)) + { + if ((len2 >= len1) || (!start_flag)) + { + ptr_s ++; + continue; + } + *(value + i) = *ptr_s; + } + else + { + start_flag = 1; + *(value + i) = *ptr_s; + len2 ++; + } + i++; + ptr_s ++; + } + *(value + i) = '\0'; + return STATUS_OK; +} + +static int +add_conf_data(char *table,int rec_no, char *key,char * value) +{ + ConfDataType * conf_data; + + conf_data = (ConfDataType *)malloc(sizeof(ConfDataType)); + if (conf_data == NULL) + { + return STATUS_ERROR; + } + init_conf_data(conf_data); + if (table != NULL) + { + memcpy(conf_data->table,table,sizeof(conf_data->table)); + } + else + { + memset(conf_data->table,0,sizeof(conf_data->table)); + } + memcpy(conf_data->key,key,sizeof(conf_data->key)); + memcpy(conf_data->value,value,sizeof(conf_data->value)); + conf_data->rec_no = rec_no; + if (ConfData_Top == (ConfDataType *)NULL) + { + ConfData_Top = conf_data; + conf_data->last = (char *)NULL; + } + if (ConfData_End == (ConfDataType *)NULL) + { + conf_data->last = (char *)NULL; + } + else + { + conf_data->last = (char *)ConfData_End; + ConfData_End->next = (char *)conf_data; + } + ConfData_End = conf_data; + conf_data->next = (char *)NULL; + return STATUS_OK; +} + +static int +get_table_data(FILE * fp,char * table, int rec_no) +{ + char buf[1024]; + char key_buf[1024]; + char value_buf[1024]; + int len = 0; + char * ptr; + + while (fgets(buf,sizeof(buf),fp) != NULL) + { + /* + * pic up a data string + */ + ptr = get_string(buf); + len = strlen(ptr); + if (len == 0) + { + continue; + } + if (is_end_tag(ptr)) + { + if(get_key(key_buf,ptr) == STATUS_ERROR) + { + return STATUS_ERROR; + } + if (!strcmp(key_buf,table)) + { + return STATUS_OK; + } + } + if (is_start_tag(ptr)) + { + if(get_conf_key_value(key_buf,value_buf,ptr) == STATUS_ERROR) + { + return STATUS_ERROR; + } + 
add_conf_data(table,rec_no,key_buf,value_buf); + } + } + return STATUS_ERROR; +} + +static int +get_single_data(char * str) +{ + char key_buf[1024]; + char value_buf[1024]; + if(get_conf_key_value(key_buf,value_buf,str) == STATUS_ERROR) + { + return STATUS_ERROR; + } + add_conf_data(NULL,0,key_buf,value_buf); + return STATUS_OK; +} + + +static int +get_conf_file(char * fname) +{ + FILE * fp = NULL; + int len; + char buf[1024]; + char key_buf[1024]; + char last_key_buf[1024]; + char *ptr; + int rec_no = 0; + + /* + * configuration file open + */ + if ((fp = fopen(fname,"r")) == NULL) + { + return STATUS_ERROR; + } + /* + * configuration file read + */ + memset(last_key_buf,0,sizeof(last_key_buf)); + memset(key_buf,0,sizeof(key_buf)); + while (fgets(buf,sizeof(buf),fp) != NULL) + { + /* + * pic up a data string + */ + ptr = get_string(buf); + len = strlen(ptr); + if (len == 0) + { + continue; + } + if (is_start_tag(ptr)) + { + if(get_key(key_buf,ptr) == STATUS_ERROR) + { + fclose(fp); + return STATUS_ERROR; + } + if (strstr(ptr,"next; + free (conf); + conf = nextp; + } + ConfData_Top = ConfData_End = (ConfDataType *)NULL; + return STATUS_OK; +} + +int +PGR_Get_Conf_Data(char * dir , char * fname) +{ + + int status; + + char * conf_file; + if ((dir == NULL) || ( fname == NULL)) + { + return STATUS_ERROR; + } + conf_file = malloc(strlen(dir) + strlen(fname) + 2); + if (conf_file == NULL) + { + return STATUS_ERROR; + } + sprintf(conf_file,"%s/%s",dir,fname); + + ConfData_Top = ConfData_End = (ConfDataType * )NULL; + status = get_conf_file(conf_file); + free (conf_file); + conf_file = NULL; + + return status; +} + +void +PGRset_recovery_packet_no(RecoveryPacket * packet, int packet_no) +{ + if (packet == NULL) + { + return; + } + packet->packet_no = htons(packet_no) ; + +} + +unsigned int +PGRget_ip_by_name(char * host) +{ + struct hostent *hp = NULL; + unsigned int ip = 0; + unsigned char uc = 0; + int i; + + if ((host == NULL) || (*host == '\0')) + { + return 0; + } + 
hp = gethostbyname( host ); + if (hp == NULL) + { + return 0; + } + for (i = 3 ; i>= 0 ; i --) + { + uc = (unsigned char)hp->h_addr_list[0][i]; + ip = ip | uc; + if (i > 0) + ip = ip << 8; + } + return ip; +} + +int +PGRget_time_value(char *str) +{ + int i,len; + char * ptr; + int unit = 1; + + if (str == NULL) + return -1; + + len = strlen(str); + ptr = str; + for (i = 0; i < len ; i ++,ptr++) + { + if ((! isdigit(*ptr)) && (! isspace(*ptr))) + { + switch (*ptr) + { + case 'm': + case 'M': + unit = 60; + break; + case 'h': + case 'H': + unit = 60*60; + break; + } + *ptr = '\0'; + break; + } + } + return (atoi(str) * unit); +} + +#endif /* USE_REPLICATION */ diff -aruN postgresql-8.2.4/src/backend/main/main.c pgcluster-1.7.0rc7/src/backend/main/main.c --- postgresql-8.2.4/src/backend/main/main.c 2007-01-04 01:58:01.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/main/main.c 2007-02-18 22:52:16.000000000 +0100 @@ -316,6 +316,13 @@ printf(_(" -r FILENAME send stdout and stderr to given file\n")); printf(_(" -x NUM internal use\n")); +#ifdef USE_REPLICATION + printf(_("\nOptions for PGCluster only:\n")); + printf(_(" -R recovery startup with rsync\n")); + printf(_(" -u recovery startup with rsync(it is not create backup files.\n")); + printf(_(" -U recovery startup with pg_dump\n")); +#endif /* USE_REPLICATION */ + printf(_("\nPlease read the documentation for the complete list of run-time\n" "configuration settings and how to set them on the command line or in\n" "the configuration file.\n\n" diff -aruN postgresql-8.2.4/src/backend/parser/gram.y pgcluster-1.7.0rc7/src/backend/parser/gram.y --- postgresql-8.2.4/src/backend/parser/gram.y 2006-11-05 23:42:09.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/parser/gram.y 2007-02-18 22:52:16.000000000 +0100 @@ -412,10 +412,10 @@ QUOTE READ REAL REASSIGN RECHECK REFERENCES REINDEX RELATIVE_P RELEASE RENAME - REPEATABLE REPLACE RESET RESTART RESTRICT RETURNING RETURNS REVOKE RIGHT + REPEATABLE REPLACE REPLICATION RESET 
RESTART RESTRICT RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROW ROWS RULE - SAVEPOINT SCHEMA SCROLL SECOND_P SECURITY SELECT SEQUENCE + SAVEPOINT SCHEMA SCROLL SECOND_P SECURITY SELECT SEQUENCE SERVER SERIALIZABLE SESSION SESSION_USER SET SETOF SHARE SHOW SIMILAR SIMPLE SMALLINT SOME STABLE START STATEMENT STATISTICS STDIN STDOUT STORAGE STRICT_P SUBSTRING SUPERUSER_P SYMMETRIC @@ -1224,6 +1224,12 @@ n->name = $2; $$ = (Node *) n; } + | SHOW REPLICATION SERVER + { + VariableShowStmt *n = makeNode(VariableShowStmt); + n->name = "replication_server"; + $$ = (Node *) n; + } | SHOW TIME ZONE { VariableShowStmt *n = makeNode(VariableShowStmt); @@ -8678,6 +8684,7 @@ | RENAME | REPEATABLE | REPLACE + | REPLICATION | RESET | RESTART | RESTRICT @@ -8692,6 +8699,7 @@ | SCROLL | SECOND_P | SECURITY + | SERVER | SEQUENCE | SERIALIZABLE | SESSION diff -aruN postgresql-8.2.4/src/backend/parser/keywords.c pgcluster-1.7.0rc7/src/backend/parser/keywords.c --- postgresql-8.2.4/src/backend/parser/keywords.c 2006-10-07 23:51:02.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/parser/keywords.c 2007-02-18 22:52:16.000000000 +0100 @@ -281,6 +281,7 @@ {"relative", RELATIVE_P}, {"release", RELEASE}, {"rename", RENAME}, + {"replication", REPLICATION}, {"repeatable", REPEATABLE}, {"replace", REPLACE}, {"reset", RESET}, diff -aruN postgresql-8.2.4/src/backend/parser/parse_clause.c pgcluster-1.7.0rc7/src/backend/parser/parse_clause.c --- postgresql-8.2.4/src/backend/parser/parse_clause.c 2006-11-28 13:54:41.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/parser/parse_clause.c 2007-02-18 22:52:16.000000000 +0100 @@ -34,6 +34,9 @@ #include "rewrite/rewriteManip.h" #include "utils/guc.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ #define ORDER_CLAUSE 0 #define GROUP_CLAUSE 1 @@ -154,7 +157,18 @@ * analyze.c will eventually do the corresponding heap_close(), but *not* * release the lock. 
*/ +#ifdef USE_REPLICATION + if (PGRautoLockTable == true) + { + pstate->p_target_relation = heap_openrv(relation, ShareRowExclusiveLock); + } + else + { + pstate->p_target_relation = heap_openrv(relation, RowExclusiveLock); + } +#else pstate->p_target_relation = heap_openrv(relation, RowExclusiveLock); +#endif /* USE_REPLICATION */ /* * Now build an RTE. diff -aruN postgresql-8.2.4/src/backend/parser/parse_relation.c pgcluster-1.7.0rc7/src/backend/parser/parse_relation.c --- postgresql-8.2.4/src/backend/parser/parse_relation.c 2006-10-04 02:29:56.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/parser/parse_relation.c 2007-02-18 22:52:16.000000000 +0100 @@ -30,6 +30,9 @@ #include "utils/lsyscache.h" #include "utils/syscache.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* GUC parameter */ bool add_missing_from; @@ -636,7 +639,14 @@ * to a rel in a statement, be careful to get the right access level * depending on whether we're doing SELECT FOR UPDATE/SHARE. */ +#ifdef USE_REPLICATION + if (PGRautoLockTable == true) + lockmode = isLockedRel(pstate, refname) ? ShareRowExclusiveLock : AccessShareLock; + else + lockmode = isLockedRel(pstate, refname) ? RowShareLock : AccessShareLock; +#else lockmode = isLockedRel(pstate, refname) ? 
RowShareLock : AccessShareLock; +#endif /* USE_REPLICATION */ rel = heap_openrv(relation, lockmode); rte->relid = RelationGetRelid(rel); diff -aruN postgresql-8.2.4/src/backend/postmaster/postmaster.c pgcluster-1.7.0rc7/src/backend/postmaster/postmaster.c --- postgresql-8.2.4/src/backend/postmaster/postmaster.c 2007-01-04 01:58:01.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/postmaster/postmaster.c 2007-02-18 22:52:16.000000000 +0100 @@ -122,6 +122,9 @@ #include "storage/spin.h" #endif +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* * List of active backends (or child processes anyway; we don't actually @@ -363,6 +366,61 @@ #define EXIT_STATUS_0(st) ((st) == 0) #define EXIT_STATUS_1(st) (WIFEXITED(st) && WEXITSTATUS(st) == 1) +#ifdef USE_REPLICATION +char * Query_String = NULL; +ReplicateServerInfo * ReplicateServerData = NULL; +ReplicateServerInfo * CurrentReplicateServer = NULL; +ReplicateServerInfo * LastReplicateServer = NULL; +int ReplicateServerShmid = -1; +int TransactionQuery = 0; +int TransactionSock = -1; +int Transaction_Mode = 0; +bool PGR_Noticed_Abort = false; +bool Session_Authorization_Mode = false; +bool Create_Temp_Table_Mode = false; +ConfDataType * ConfData_Top = (ConfDataType *)NULL; +ConfDataType * ConfData_End = (ConfDataType *)NULL; +int RecoveryPortNumber = 0; +char * RsyncPath = NULL; +char * RsyncOption = NULL; +char * PgDumpPath = NULL; +bool RsyncCompress = true; +ReplicateNow * ReplicateCurrentTime = NULL; +CopyData * PGRCopyData = NULL; +bool PGR_Copy_Data_Need_Replicate = false; +PGR_Stand_Alone_Type * PGR_Stand_Alone = NULL; +PGR_Not_Replicate_Type * PGR_Not_Replicate = NULL; +int PGR_Not_Replicate_Rec_Num = 0; +bool PGR_Is_Replicated_Query = false; +PGR_Check_Lock_Type PGR_Check_Lock; +int PGR_Sock_To_Replication_Server = -1; +bool PGR_Need_Notice = false; +bool PGR_Lock_Noticed = false; +bool PGR_Recovery_Option = false; +int PGR_recovery_mode = 0; +char * PGRSelfHostName = NULL; +int 
PGR_Pending_Sem_Num = 0; +bool PGR_Reliable_Mode_Wait = true; +PGR_Retry_Query_Type PGR_Retry_Query; +int ClusterDBShmid = -1; +ClusterDBInfo * ClusterDBData = NULL; +PGR_Password_Info * PGR_password = NULL; +int PGR_Replication_Timeout = 60; +int PGR_Lifecheck_Timeout = 3; +int PGR_Lifecheck_Interval = 11; + +/* initialize in utils/misc/guc.c */ +bool PGRforceLoadBalance = false; +bool PGRcheckConstraintWithLock = false; +bool PGRautoLockTable = true; +bool PGRnotReplicatePreparedSelect = false; + +bool needToUpdateReplicateIdOnNextQueryIsDone=false; +bool PGR_Is_Sync_OID = false; + +static int Master_Pid = 0; +static int Lifecheck_Pid = 0; +#endif /* USE_REPLICATION */ /* * Postmaster main entry point @@ -375,6 +433,11 @@ char *userDoption = NULL; int i; +#ifdef USE_REPLICATION + PGR_Check_Lock.check_lock_conflict = false; + PGR_Check_Lock.status_lock_conflict = STATUS_OK; +#endif /* USE REPLICATION */ + MyProcPid = PostmasterPid = getpid(); IsPostmasterEnvironment = true; @@ -420,10 +483,24 @@ * tcop/postgres.c (the option sets should not conflict) * and with the common help() function in main/main.c. 
*/ - while ((opt = getopt(argc, argv, "A:B:c:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:W:-:")) != -1) + while ((opt = getopt(argc, argv, "A:B:c:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:W:-:URu")) != -1) { switch (opt) { +#ifdef USE_REPLICATION + case 'U': + PGR_Recovery_Option = true; + PGR_recovery_mode = PGR_HOT_RECOVERY; + break; + case 'R': + PGR_Recovery_Option = true; + PGR_recovery_mode = PGR_COLD_RECOVERY; + break; + case 'u': + PGR_Recovery_Option = true; + PGR_recovery_mode = PGR_WITHOUT_BACKUP; + break; +#endif /* USE_REPLICATION */ case 'A': SetConfigOption("debug_assertions", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; @@ -696,6 +773,30 @@ */ CreateDataDirLockFile(true); +#ifdef USE_REPLICATION + if (PGR_Get_Conf_Data( DataDir, CLUSTER_CONF_FILE ) == STATUS_OK) + { + if (PGR_Init_Replicate_Server_Data() != STATUS_OK) + { + fprintf(stderr,"PGR_Init_Replicate_Server_Data failed\n"); + ExitPostmaster(0); + } + PGR_Set_Replicate_Server_Socket(); + PGR_Free_Conf_Data(); + if ((PGR_Recovery_Option) && + (PGR_recovery_mode != PGR_HOT_RECOVERY)) + { + fprintf(stderr,"Start in recovery mode! \n"); + fprintf(stderr,"Please wait until a data synchronization finishes from Master DB... \n"); + if (PGR_Recovery_Main(PGR_recovery_mode) != STATUS_OK) + { + fprintf(stderr,"PGR_Recovery_Main() failed with cold recovery\n"); + ExitPostmaster(0); + } + } + } +#endif /* USE_REPLICATION */ + /* * If timezone is not set, determine what the OS uses. 
(In theory this * should be done during GUC initialization, but because it can take as @@ -960,6 +1061,21 @@ */ StartupPID = StartupDataBase(); +#ifdef USE_REPLICATION + Master_Pid = PGR_Master_Main(); + if (Master_Pid < 0) + { + elog(DEBUG1,"PGR_Master_Main failed"); + ExitPostmaster(1); + } + Lifecheck_Pid = PGR_Lifecheck_Main(); + if (Lifecheck_Pid < 0) + { + elog(DEBUG1,"PGR_Lifecheck_Main failed"); + ExitPostmaster(1); + } +#endif /* USE_REPLICATION */ + status = ServerLoop(); /* @@ -1133,6 +1249,60 @@ last_touch_time = time(NULL); nSockets = initMasks(&readmask); +#ifdef USE_REPLICATION + if (PGR_Recovery_Option) + { + int pid = 0; + pid = fork_process(); + if (pid == 0) /* child */ + { + fprintf(stderr,"Start in recovery mode! \n"); + fprintf(stderr,"Please wait until a data synchronization finishes from Master DB... \n"); + IsUnderPostmaster = true; /* we are a postmaster subprocess now */ + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + /* Lose the postmaster's on-exit routines and port connections */ + on_exit_reset(); + /* Release postmaster's working memory context */ + MemoryContextSwitchTo(TopMemoryContext); + MemoryContextDelete(PostmasterContext); + PostmasterContext = NULL; + if (PGR_recovery_mode == PGR_HOT_RECOVERY) + { + if (PGR_Recovery_Main(PGR_recovery_mode) != STATUS_OK) + { + elog(DEBUG1,"PGR_Recovery_Main() failed with hot recovery."); + ExitPostmaster(1); + } + } + else + { + if (PGR_recovery_queue_data_req() != STATUS_OK) + { + elog(DEBUG1,"PGR_recovery_queue_data_req failed"); + ExitPostmaster(1); + } + } + PGR_recovery_finish_send(); + PGR_Recovery_Option = false; + fprintf(stderr,"OK! The data synchronization with Master DB was finished. 
\n"); + + ExitPostmaster(0); + } + else if (pid < 0) + { + ExitPostmaster(1); + } + } + if (PGR_password != NULL) + { + if(PGR_password->password != NULL) + memset(PGR_password->password,0,PASSWORD_MAX_LENGTH); + memset(PGR_password->md5Salt,0,sizeof(PGR_password->md5Salt)); + memset(PGR_password->cryptSalt,0,sizeof(PGR_password->cryptSalt)); + } +#endif /* USE_REPLICATION */ for (;;) { @@ -1591,6 +1761,9 @@ ereport(FATAL, (errcode(ERRCODE_TOO_MANY_CONNECTIONS), errmsg("sorry, too many clients already"))); +#ifdef USE_REPLICATION + return STATUS_ERROR; +#endif break; case CAC_OK: default: @@ -1858,6 +2031,23 @@ (errmsg_internal("postmaster received signal %d", postgres_signal_arg))); +#ifdef USE_REPLICATION + if (PGR_Get_Cluster_Status() == STATUS_RECOVERY) + { + PGR_recovery_error_send(); + PGR_Recovery_Option = false; + } + if (Master_Pid > 0) + { + kill (Master_Pid,postgres_signal_arg); + } + if (Lifecheck_Pid > 0) + { + kill (Lifecheck_Pid,postgres_signal_arg); + } + PGR_delete_shm(); +#endif /* USE_REPLICATION */ + switch (postgres_signal_arg) { case SIGTERM: @@ -3452,6 +3642,16 @@ * MUST -- vadim 05-10-1999 */ +#ifdef USE_REPLICATION + if (PGR_Get_Cluster_Status() == STATUS_RECOVERY) + { + write_stderr("sorry, recovery failed."); + PGR_recovery_error_send(); + PGR_Recovery_Option = false; + } + PGR_delete_shm(); +#endif /* USE_REPLICATION */ + proc_exit(status); } diff -aruN postgresql-8.2.4/src/backend/storage/large_object/inv_api.c pgcluster-1.7.0rc7/src/backend/storage/large_object/inv_api.c --- postgresql-8.2.4/src/backend/storage/large_object/inv_api.c 2006-09-07 17:37:25.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/storage/large_object/inv_api.c 2007-02-18 22:52:16.000000000 +0100 @@ -36,6 +36,10 @@ #include "utils/fmgroids.h" #include "utils/resowner.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ + /* * All accesses to pg_largeobject and its index make use of a single Relation @@ -188,6 +192,9 @@ * use. 
We can use the index on pg_largeobject for checking OID * uniqueness, even though it has additional columns besides OID. */ +#ifdef USE_REPLICATION + PGR_Is_Sync_OID = true; +#endif /* USE_REPLICATION */ if (!OidIsValid(lobjId)) { open_lo_relation(); @@ -206,6 +213,9 @@ */ CommandCounterIncrement(); +#ifdef USE_REPLICATION + PGR_Is_Sync_OID = false; +#endif /* USE_REPLICATION */ return lobjId; } diff -aruN postgresql-8.2.4/src/backend/storage/lmgr/deadlock.c pgcluster-1.7.0rc7/src/backend/storage/lmgr/deadlock.c --- postgresql-8.2.4/src/backend/storage/lmgr/deadlock.c 2006-09-23 01:20:13.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/storage/lmgr/deadlock.c 2007-02-18 22:52:16.000000000 +0100 @@ -30,6 +30,9 @@ #include "storage/proc.h" #include "utils/memutils.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* One edge in the waits-for graph */ typedef struct @@ -217,6 +220,13 @@ if (!FindLockCycle(proc, possibleConstraints, &nSoftEdges)) elog(FATAL, "deadlock seems to have disappeared"); +#ifdef USE_REPLICATION + if (PGR_Notice_Conflict() == STATUS_ERROR) + { + return FALSE; + } + PGR_Lock_Noticed =true; +#endif return true; /* cannot find a non-deadlocked state */ } @@ -426,6 +436,18 @@ int numLockModes, lm; +#ifdef USE_REPLICATION + /* + * In PGCluster mode, conflicts with procs that have a younger rep-id + * (greater than ours) do not matter here; they are handled by the + * younger proc's own CheckDeadLock(). This is necessary so all nodes + * agree on deadlock order: only the youngest (rep-id) process rolls back. + */ + if ( MyProc->replicationId!=0 && + MyProc -> replicationId < checkProc->replicationId) + return false; + +#endif /* * Have we already seen this proc?
*/ diff -aruN postgresql-8.2.4/src/backend/storage/lmgr/lmgr.c pgcluster-1.7.0rc7/src/backend/storage/lmgr/lmgr.c --- postgresql-8.2.4/src/backend/storage/lmgr/lmgr.c 2006-10-04 02:29:57.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/storage/lmgr/lmgr.c 2007-02-18 22:52:16.000000000 +0100 @@ -26,6 +26,9 @@ #include "utils/inval.h" #include "utils/lsyscache.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* * RelationInitLockInfo @@ -476,9 +479,16 @@ SET_LOCKTAG_TRANSACTION(tag, xid); +#ifdef USE_REPLICATION + if (!LockAcquire(&tag, ExclusiveLock, false,false)) + elog(ERROR, "XactLockTableWait: LockAcquire failed"); + + LockRelease(&tag, ExclusiveLock,false); +#else (void) LockAcquire(&tag, ShareLock, false, false); LockRelease(&tag, ShareLock, false); +#endif /* USE_REPLICATION */ if (!TransactionIdIsInProgress(xid)) break; @@ -635,3 +645,37 @@ } return false; /* default case */ } + +#ifdef USE_REPLICATION +/* + * XactLockTableWaitForCluster + * + * Wait for the specified transaction to commit or abort; the lock on 'buffer' is released before waiting and re-taken (BUFFER_LOCK_EXCLUSIVE) afterwards. + */ +void +XactLockTableWaitForCluster(TransactionId xid,Buffer buffer) +{ + LOCKTAG tag; + TransactionId myxid = GetCurrentTransactionId(); + + Assert(!TransactionIdEquals( xid, myxid )); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + SET_LOCKTAG_TRANSACTION(tag, xid); + + if (!LockAcquire(&tag, ExclusiveLock, false,false)) + elog(ERROR, "XactLockTableWait: LockAcquire failed"); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + LockRelease(&tag, ExclusiveLock,false); + + /* + * Transaction was committed/aborted/crashed - we have to update + * pg_clog if transaction is still marked as running.
+ */ + if (!TransactionIdDidCommit(xid) && !TransactionIdDidAbort(xid)) + TransactionIdAbort(xid); +} +#endif /*USE_REPLICATION*/ diff -aruN postgresql-8.2.4/src/backend/storage/lmgr/lock.c pgcluster-1.7.0rc7/src/backend/storage/lmgr/lock.c --- postgresql-8.2.4/src/backend/storage/lmgr/lock.c 2006-10-04 02:29:57.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/storage/lmgr/lock.c 2007-02-18 22:52:16.000000000 +0100 @@ -42,6 +42,10 @@ #include "utils/ps_status.h" #include "utils/resowner.h" +#ifdef USE_REPLICATION +#include "storage/lmgr.h" +#include "replicate.h" +#endif /* USE_REPLICATION */ /* This configuration variable is used to set the lock table size */ int max_locks_per_xact; /* set by guc.c */ @@ -737,6 +741,10 @@ status = LockCheckConflicts(lockMethodTable, lockmode, lock, proclock, MyProc); +#ifdef USE_REPLICATION + PGR_Check_Lock.status_lock_conflict = status; + PGR_Check_Lock.deadlock = false; +#endif /* USE_REPLICATION */ if (status == STATUS_OK) { /* No conflict with held or previously requested locks */ @@ -746,6 +754,17 @@ else { Assert(status == STATUS_FOUND); +#ifdef USE_REPLICATION + if ((PGR_Need_Notice == true) && + (PGR_Check_Lock.check_lock_conflict == true)) + { + if (!PGR_Lock_Noticed && PGR_Notice_Conflict() == STATUS_ERROR) + { + return FALSE; + } + PGR_Lock_Noticed = true; + } +#endif /* USE_REPLICATION */ /* * We can't acquire the lock immediately. 
If caller specified no diff -aruN postgresql-8.2.4/src/backend/storage/lmgr/proc.c pgcluster-1.7.0rc7/src/backend/storage/lmgr/proc.c --- postgresql-8.2.4/src/backend/storage/lmgr/proc.c 2006-11-21 21:59:52.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/storage/lmgr/proc.c 2007-02-18 22:52:16.000000000 +0100 @@ -43,6 +43,9 @@ #include "storage/procarray.h" #include "storage/spin.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* GUC variables */ int DeadlockTimeout = 1000; @@ -263,6 +266,9 @@ MyProc->lwWaitLink = NULL; MyProc->waitLock = NULL; MyProc->waitProcLock = NULL; +#ifdef USE_REPLICATION + MyProc->replicationId = 0; +#endif for (i = 0; i < NUM_LOCK_PARTITIONS; i++) SHMQueueInit(&(MyProc->myProcLocks[i])); @@ -395,6 +401,9 @@ MyProc->lwWaitLink = NULL; MyProc->waitLock = NULL; MyProc->waitProcLock = NULL; +#ifdef USE_REPLICATION + MyProc->replicationId = 0; +#endif for (i = 0; i < NUM_LOCK_PARTITIONS; i++) SHMQueueInit(&(MyProc->myProcLocks[i])); @@ -737,6 +746,17 @@ GrantAwaitedLock(); return STATUS_OK; } +#ifdef USE_REPLICATION + if(proc->replicationId == 0 || + (MyProc->replicationId > proc->replicationId && + proc->heldLocks & aheadRequests) ) { + elog(DEBUG1,"origin's RId = %d , MyProc->RId = %d , skip",proc->replicationId,MyProc->replicationId); + aheadRequests |= (1 << proc->waitLockMode); + proc = (PGPROC *) MAKE_PTR(proc->links.next); + continue; + } + +#endif /* Break out of loop to put myself before him */ break; } @@ -752,8 +772,21 @@ } else { +#ifdef USE_REPLICATION + proc = (PGPROC *) &(waitQueue->links); + for (i = 0; i < waitQueue->size+1; i++){ + elog(DEBUG1,"origin's RId = %d , MyProc->RId = %d",proc->replicationId,MyProc->replicationId); + if(proc->replicationId == 0 || + MyProc->replicationId > proc->replicationId) { + proc= (PGPROC *) MAKE_PTR(proc->links.next); + }else { + break; + } + } +#else /* I hold no locks, so I can't push in front of anyone. 
*/ proc = (PGPROC *) &(waitQueue->links); +#endif /* USE_REPLICATION */ } /* @@ -776,7 +809,11 @@ * CheckDeadLock's recovery code, except that we shouldn't release the * semaphore since we haven't tried to lock it yet. */ +#ifdef USE_REPLICATION + if (early_deadlock && proc->replicationId < MyProc->replicationId) +#else if (early_deadlock) +#endif { RemoveFromWaitQueue(MyProc, hashcode); return STATUS_ERROR; @@ -976,6 +1013,9 @@ CheckDeadLock(void) { int i; +#ifdef USE_REPLICATION + bool pgr_notice = false; +#endif /* USE_REPLICATION */ /* * Acquire exclusive lock on the entire shared lock data structures. Must @@ -1047,6 +1087,10 @@ * such processes. */ +#ifdef USE_REPLICATION + pgr_notice = true; +#endif + /* * Release locks acquired at head of routine. Order is not critical, so * do it back-to-front to avoid waking another CheckDeadLock instance @@ -1055,6 +1099,12 @@ check_done: for (i = NUM_LOCK_PARTITIONS; --i >= 0;) LWLockRelease(FirstLockMgrLock + i); +#ifdef USE_REPLICATION + if (pgr_notice == true) + { + PGR_Notice_DeadLock(); + } +#endif } @@ -1110,6 +1160,15 @@ { TimestampTz fin_time; struct itimerval timeval; +#ifdef USE_REPLICATION + int useFlag = 0; + + if (ReplicateCurrentTime != NULL) + { + useFlag = ReplicateCurrentTime->useFlag; + ReplicateCurrentTime->useFlag = DATA_INIT; + } +#endif /* USE_REPLICATION */ if (is_statement_timeout) { @@ -1154,6 +1213,12 @@ fin_time = GetCurrentTimestamp(); fin_time = TimestampTzPlusMilliseconds(fin_time, delayms); deadlock_timeout_active = true; +#ifdef USE_REPLICATION + if (ReplicateCurrentTime != NULL) + { + ReplicateCurrentTime->useFlag = useFlag; + } +#endif /* USE_REPLICATION */ if (fin_time >= statement_fin_time) return true; } @@ -1167,6 +1232,12 @@ MemSet(&timeval, 0, sizeof(struct itimerval)); timeval.it_value.tv_sec = delayms / 1000; timeval.it_value.tv_usec = (delayms % 1000) * 1000; +#ifdef USE_REPLICATION + if (ReplicateCurrentTime != NULL) + { + ReplicateCurrentTime->useFlag = useFlag; + } +#endif 
/* USE_REPLICATION */ if (setitimer(ITIMER_REAL, &timeval, NULL)) return false; return true; @@ -1232,12 +1303,30 @@ CheckStatementTimeout(void) { TimestampTz now; +#ifdef USE_REPLICATION + int useFlag = 0; +#endif /* USE_REPLICATION */ if (!statement_timeout_active) return true; /* do nothing if not active */ +#ifdef USE_REPLICATION + if (ReplicateCurrentTime != NULL) + { + useFlag = ReplicateCurrentTime->useFlag; + ReplicateCurrentTime->useFlag = DATA_INIT; + } +#endif /* USE_REPLICATION */ + now = GetCurrentTimestamp(); +#ifdef USE_REPLICATION + if (ReplicateCurrentTime != NULL) + { + ReplicateCurrentTime->useFlag = useFlag; + } +#endif /* USE_REPLICATION */ + if (now >= statement_fin_time) { /* Time to die */ diff -aruN postgresql-8.2.4/src/backend/tcop/postgres.c pgcluster-1.7.0rc7/src/backend/tcop/postgres.c --- postgresql-8.2.4/src/backend/tcop/postgres.c 2007-01-04 01:58:01.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/tcop/postgres.c 2007-02-18 22:52:16.000000000 +0100 @@ -68,6 +68,10 @@ #include "pgstat.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ + extern int optind; extern char *optarg; @@ -91,7 +95,9 @@ /* wait N seconds to allow attach from a debugger */ int PostAuthDelay = 0; - +#ifdef USE_REPLICATION +bool PGR_Not_Replication_Query = false; +#endif /* USE_REPLICATION */ /* ---------------- * private variables @@ -753,6 +759,24 @@ bool was_logged = false; char msec_str[32]; +#ifdef USE_REPLICATION + char * query_ptr = NULL; + char * null_ptr = NULL; + int skip_cnt = 0; + int status = 0; + + PGR_Reliable_Mode_Wait = false; + query_ptr = (char *)query_string; + if (PGR_Is_Replicated_Query == false) + { + PGR_Is_Replicated_Query = PGR_Is_Replicated_Command(query_ptr); + } + PGR_Retry_Query.query_string = (char *)query_string; + PGR_Retry_Query.query_len = strlen(query_string); + PGR_Retry_Query.cmdSts = CMD_STS_OTHER; + PGR_Retry_Query.cmdType = CMD_TYPE_OTHER; +#endif /* USE_REPLICATION */ + /* * Report query to 
various monitoring facilities. */ @@ -831,6 +855,18 @@ DestReceiver *receiver; int16 format; +#ifdef USE_REPLICATION + PGR_Not_Replication_Query = false; + PGR_Reliable_Mode_Wait = false; + + PGR_Retry_Query.query_string = NULL; + PGR_Retry_Query.query_len = 0; + PGR_Retry_Query.cmdSts = CMD_STS_OTHER; + PGR_Retry_Query.cmdType = CMD_TYPE_OTHER; + PGR_Retry_Query.useFlag = DATA_INIT; + PGR_Lock_Noticed = false; +#endif /* USE_REPLICATION */ + /* * Get the command name for use in status display (it also becomes the * default completion tag, down inside PortalRun). Set ps_status and @@ -853,10 +889,232 @@ */ if (IsAbortedTransactionBlockState() && !IsTransactionExitStmt(parsetree)) + { +#ifdef USE_REPLICATION + Transaction_Mode = 0; +#endif ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " "commands ignored until end of transaction block"))); + } + +#ifdef USE_REPLICATION + Query_String = NULL; + query_ptr = PGR_Remove_Comment(query_ptr); + PGR_Check_Lock.dest = TO_FRONTEND; + PGR_Need_Notice = false; + PGR_Check_Lock.check_lock_conflict = false; + + /* skip replication during recovery mode runing */ + if (PGR_Get_Cluster_Status() == STATUS_RECOVERY) + { + /* + PGR_Not_Replication_Query = true; + */ + PGR_Is_Replicated_Query = true; + if (!strcmp(commandTag,"SELECT")) + { + if (PGR_Is_System_Command(query_ptr)) + { + status = PGR_Call_System_Command(query_ptr); + if (status == STATUS_SKIP_QUERY) + { + EndCommand(PGR_ALREADY_REPLICATED_NOTICE_CMD,dest); + break; + } + else + { + EndCommand("SYSTEM_COMMAND",dest); + continue; + } + } + } + Transaction_Mode = PGR_Set_Transaction_Mode(Transaction_Mode,commandTag); + if (Transaction_Mode > 0) + { + PGR_Need_Notice = true; + PGR_Check_Lock.check_lock_conflict = true; + } + goto Skip_Replication; + } + + /* + if (!xact_started) + { + start_xact_command(); + xact_started = true; + } + */ + if (skip_cnt == 0) + { + skip_cnt = PGR_Is_Skip_Replication(query_ptr); + } + 
null_ptr = PGR_scan_terminate (query_ptr); + if(null_ptr != NULL) + { + *null_ptr = '\0'; + } + Transaction_Mode = PGR_Set_Transaction_Mode(Transaction_Mode,commandTag); + if ((PGR_Is_Replicated_Query ) || + (skip_cnt != 0)) + { + if (skip_cnt > 0) + { + skip_cnt --; + } + else + { + skip_cnt = 0; + } + PGR_Copy_Data_Need_Replicate = false; + if (!strncmp(commandTag,"SELECT",strlen("SELECT"))) + { + if (PGR_Is_System_Command(query_ptr)) + { + status = PGR_Call_System_Command(query_ptr); + if (status == STATUS_SKIP_QUERY) + { + EndCommand(PGR_ALREADY_REPLICATED_NOTICE_CMD,dest); + break; + } + else + { + EndCommand("SYSTEM_COMMAND",dest); + continue; + } + } + } + PGR_Check_Lock.status_lock_conflict = STATUS_OK; + PGR_Check_Lock.dest = TO_FRONTEND; + } + else + { + PGR_Copy_Data_Need_Replicate = false; + + /* check cluster db status */ + /* + if ((PGR_Get_Cluster_Status() == STATUS_RECOVERY) && + (PGR_Not_Replication_Query == false) && + (Transaction_Mode == 0 ) ) + { + elog(WARNING, "This query is not permitted while recovery db "); + if(null_ptr != NULL) + { + *null_ptr = ';'; + query_ptr = null_ptr +1; + } + continue; + } + */ + if (PGR_Is_Stand_Alone() == true) + { + if (PGR_Stand_Alone->permit == PERMIT_READ_ONLY) + { + if (!strcmp(commandTag, "SHOW")) { + VariableShowStmt *stmt = (VariableShowStmt *)parsetree; + if (!strcmp(stmt->name, "replication_server")) { + PGR_Not_Replication_Query = true; + } + } + + if (PGR_Not_Replication_Query == false) + elog(ERROR, "This query is not permitted when all replication servers fell down "); + } + } + else if ((PGRforceLoadBalance == false) && + ((PGR_Not_Replication_Query == false ) || + (!strcmp(commandTag,"SELECT")))) + { + status = PGR_replication(query_ptr,dest,parsetree,commandTag); + if (status == STATUS_REPLICATED) + { + if (xact_started) + { + finish_xact_command(); + xact_started = false; + } + CommandCounterIncrement(); + continue; + } + else if (status == STATUS_ERROR) + { + if (!strcmp(commandTag, "SHOW")) { 
+ VariableShowStmt *stmt = (VariableShowStmt *)parsetree; + if (!strcmp(stmt->name, "replication_server")) { + PGR_Not_Replication_Query = true; + } + } + else if (PGR_Stand_Alone->permit == PERMIT_READ_ONLY) + { + elog(ERROR, "This query is not permitted when all replication servers fell down "); + } + } + else if (status == STATUS_DEADLOCK_DETECT) + { + PGR_Need_Notice = false; + elog(ERROR, "postmaster deadlock detected"); + continue; + } + else if (status == STATUS_REPLICATION_ABORT) + { + PGR_Need_Notice = false; + elog(ERROR, "replication server should be down, transaction aborted."); + continue; + } + else if (status != STATUS_CONTINUE) + { + PGR_Check_Lock.dest = TO_FRONTEND; + } + else + { + PGR_Check_Lock.dest = TO_REPLICATION_SERVER; + PGR_Reliable_Mode_Wait = true; + } + } + } + if(null_ptr != NULL) + { + *null_ptr = ';'; + query_ptr = null_ptr +1; + } + if (!PGR_Is_Replicated_Query ) + { + if ((!strcmp(commandTag,"BEGIN")) || + (!strcmp(commandTag, "START TRANSACTION")) || + (Transaction_Mode == 0 ) ) + { + PGR_Reload_Start_Time(); + } + } + if (((IsA(parsetree, TransactionStmt)) || + (Transaction_Mode > 0) || + (Create_Temp_Table_Mode == true) || + (Session_Authorization_Mode == true)) || + (!strcmp(commandTag,"COPY"))) + { + PGR_Need_Notice = true; + PGR_Check_Lock.check_lock_conflict = true; + } + else + { + if (PGR_Not_Replication_Query == false) + { + PGR_Need_Notice = true; + PGR_Check_Lock.check_lock_conflict = true; + } + else + { + if ((PGR_Is_Replicated_Query ) && + (!strncmp(commandTag, "SELECT",strlen("SELECT")))) + { + PGR_Need_Notice = true; + PGR_Check_Lock.check_lock_conflict = true; + } + } + } +Skip_Replication: +#endif /* USE_REPLICATION */ /* Make sure we are in a transaction command */ start_xact_command(); @@ -983,7 +1241,44 @@ * command the client sent, regardless of rewriting. (But a command * aborted by error will not send an EndCommand report at all.) 
*/ +#ifdef USE_REPLICATION + /* + * In Non-CONTROL LOCK CONFLICT mode, we *MUST NOT* send the command tag + * twice. So, if it was already sent for lock notification, we do not send + * the tag here. The same applies to ReadyForQuery. + */ + if(!(PGR_Is_Replicated_Query && PGR_Lock_Noticed)) +#endif EndCommand(completionTag, dest); + +#ifdef USE_REPLICATION + if(PGR_Is_Replicated_Query && + needToUpdateReplicateIdOnNextQueryIsDone) { + ++(ReplicationLog_Info.PGR_Replicate_ID); + + if (CurrentReplicateServer != NULL) + { + /* set replicate id in this system; log only here, the pointer may be NULL */ + ++(CurrentReplicateServer->replicate_id); + elog(DEBUG1,"increased replicate_id to %d",CurrentReplicateServer->replicate_id); + } + needToUpdateReplicateIdOnNextQueryIsDone=false; + } + + if (PGR_Get_Cluster_Status() != STATUS_RECOVERY) + { + if ((PGR_Need_Notice == true) && + (PGRforceLoadBalance == false)) + { + PGR_Notice_Transaction_Query_Done(); + } + if ((Transaction_Mode == 0) && + (ReplicateCurrentTime != NULL)) + { + ReplicateCurrentTime->use_seed = 1; + } + } +#endif } /* end loop over parsetrees */ /* @@ -1144,11 +1439,15 @@ */ if (IsAbortedTransactionBlockState() && !IsTransactionExitStmt(parsetree)) + { +#ifdef USE_REPLICATION + Transaction_Mode = 0; +#endif ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " "commands ignored until end of transaction block"))); - + } /* * OK to analyze, rewrite, and plan this query. Note that the * originally specified parameter set is not required to be complete, @@ -1382,11 +1681,15 @@ if (IsAbortedTransactionBlockState() && (!IsTransactionExitStmtList(pstmt->query_list) || numParams != 0)) + { +#ifdef USE_REPLICATION + Transaction_Mode = 0; +#endif ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " "commands ignored until end of transaction block"))); - + } /* * Create the portal. Allow silent replacement of an existing portal only * if the unnamed portal is specified.
@@ -1769,11 +2072,15 @@ */ if (IsAbortedTransactionBlockState() && !IsTransactionExitStmtList(portal->parseTrees)) + { +#ifdef USE_REPLICATION + Transaction_Mode = 0; +#endif ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " "commands ignored until end of transaction block"))); - + } /* Check for cancel signal before we start execution */ CHECK_FOR_INTERRUPTS(); @@ -2101,11 +2408,15 @@ */ if (IsAbortedTransactionBlockState() && PreparedStatementReturnsTuples(pstmt)) + { +#ifdef USE_REPLICATION + Transaction_Mode = 0; +#endif ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " "commands ignored until end of transaction block"))); - + } if (whereToSendOutput != DestRemote) return; /* can't actually do anything... */ @@ -2171,11 +2482,15 @@ */ if (IsAbortedTransactionBlockState() && portal->tupDesc) + { +#ifdef USE_REPLICATION + Transaction_Mode = 0; +#endif ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " "commands ignored until end of transaction block"))); - + } if (whereToSendOutput != DestRemote) return; /* can't actually do anything... */ @@ -2332,6 +2647,9 @@ * backend. This is necessary precisely because we don't clean up our * shared memory state. */ +#ifdef USE_REPLICATION + PGR_delete_shm(); +#endif /* USE_REPLICATION */ exit(2); } @@ -2369,6 +2687,9 @@ } } +#ifdef USE_REPLICATION + PGR_delete_shm(); +#endif /* USE_REPLICATION */ errno = save_errno; } @@ -2383,6 +2704,9 @@ void authdie(SIGNAL_ARGS) { +#ifdef USE_REPLICATION + PGR_delete_shm(); +#endif /* USE_REPLICATION */ exit(1); } @@ -3369,6 +3693,14 @@ pgstat_report_activity(""); } +#ifdef USE_REPLICATION + /* + * In Non-CONTROL LOCK CONFLICT mode, we *MUST NOT* send command tag twice. + * So , if it was already sent for lock notification , we didn't send + * tag here. also ReadyForQuery,too. 
+ */ + if(!(PGR_Is_Replicated_Query && PGR_Lock_Noticed)) +#endif ReadyForQuery(whereToSendOutput); send_ready_for_query = false; } @@ -3409,6 +3741,26 @@ if (ignore_till_sync && firstchar != EOF) continue; +#ifdef USE_REPLICATION + if ((firstchar == CMD_TYPE_P_PARSE) || + (firstchar == CMD_TYPE_P_BIND) || + (firstchar == CMD_TYPE_P_DESCRIBE) || + (firstchar == CMD_TYPE_P_EXECUTE) || + (firstchar == CMD_TYPE_P_SYNC) || + (firstchar == CMD_TYPE_P_CLOSE)) + { + if (PGR_Send_Input_Message(firstchar, &input_message) != STATUS_OK) + { + if ((PGR_Is_Stand_Alone() == true) && + (PGR_Stand_Alone->permit == PERMIT_READ_ONLY)) + { + elog(WARNING, "This query is not permitted when all replication servers fell down "); + break; + } + } + } +#endif /* USE_REPLICATION */ + switch (firstchar) { case 'Q': /* simple query */ @@ -3622,6 +3974,27 @@ case 'X': case EOF: +#ifdef USE_REPLICATION + if (PGRforceLoadBalance == false) + { + if (PGR_Is_Replicated_Query == false) + { + PGR_Noticed_Abort = true; + PGRsend_system_command(CMD_STS_TRANSACTION_ABORT, CMD_TYPE_FRONTEND_CLOSED); + } + else if ((Transaction_Mode >= 1) && (PGR_Noticed_Abort == false)) + { + if (PGR_Did_Commit_Transaction() == true) + { + pgstat_report_activity("commit"); + exec_simple_query("commit"); + } + } + } + /* + PGR_Notice_Transaction_Query_Aborted(); + */ +#endif /* USE_REPLICATION */ /* * Reset whereToSendOutput to prevent ereport from attempting * to send any more messages to client. 
diff -aruN postgresql-8.2.4/src/backend/tcop/pquery.c pgcluster-1.7.0rc7/src/backend/tcop/pquery.c --- postgresql-8.2.4/src/backend/tcop/pquery.c 2006-10-04 02:29:58.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/tcop/pquery.c 2007-02-18 22:52:16.000000000 +0100 @@ -24,6 +24,9 @@ #include "tcop/utility.h" #include "utils/memutils.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* * ActivePortal is the currently executing Portal (the most closely nested, @@ -188,6 +191,19 @@ strcpy(completionTag, "???"); break; } +#ifdef USE_REPLICATION + if ((PGR_Is_Replicated_Query == true ) && + (PGR_Get_Cluster_Status() != STATUS_RECOVERY)) + { + /* + * Replicated *SELECT* query is used to replicate + * ONLY lock and function execution , results . All of + * them will be discarded by pgrp processes. + * So , we don't need to send it. + */ + dest = None_Receiver; + } +#endif /*USE_REPLICATION */ } /* Now take care of any queued AFTER triggers */ diff -aruN postgresql-8.2.4/src/backend/tcop/utility.c pgcluster-1.7.0rc7/src/backend/tcop/utility.c --- postgresql-8.2.4/src/backend/tcop/utility.c 2006-10-04 02:29:58.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/tcop/utility.c 2007-02-18 22:52:16.000000000 +0100 @@ -54,6 +54,9 @@ #include "utils/guc.h" #include "utils/syscache.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* * Error-checking support for DROP commands @@ -1289,29 +1292,48 @@ case T_SelectStmt: tag = "SELECT"; +#ifdef USE_REPLICATION + PGR_Not_Replication_Query = true; +#endif /* USE_REPLICATION */ break; case T_TransactionStmt: { TransactionStmt *stmt = (TransactionStmt *) parsetree; +#ifdef USE_REPLICATION + bool isInTransaction=IsTransactionBlock(); +#endif /* USE_REPLICATION */ + switch (stmt->kind) { case TRANS_STMT_BEGIN: tag = "BEGIN"; +#ifdef USE_REPLICATION + PGR_Not_Replication_Query=isInTransaction; +#endif /* USE_REPLICATION */ break; case TRANS_STMT_START: tag = "START TRANSACTION"; 
+#ifdef USE_REPLICATION + PGR_Not_Replication_Query=isInTransaction; +#endif /* USE_REPLICATION */ break; case TRANS_STMT_COMMIT: tag = "COMMIT"; +#ifdef USE_REPLICATION + PGR_Not_Replication_Query=!isInTransaction; +#endif /* USE_REPLICATION */ break; case TRANS_STMT_ROLLBACK: case TRANS_STMT_ROLLBACK_TO: tag = "ROLLBACK"; +#ifdef USE_REPLICATION + PGR_Not_Replication_Query=!isInTransaction; +#endif /* USE_REPLICATION */ break; case TRANS_STMT_SAVEPOINT: @@ -1343,10 +1365,16 @@ case T_DeclareCursorStmt: tag = "DECLARE CURSOR"; +#ifdef USE_REPLICATION + PGR_Not_Replication_Query = true; +#endif /* USE_REPLICATION */ break; case T_ClosePortalStmt: tag = "CLOSE CURSOR"; +#ifdef USE_REPLICATION + PGR_Not_Replication_Query = true; +#endif /* USE_REPLICATION */ break; case T_FetchStmt: @@ -1355,6 +1383,9 @@ tag = (stmt->ismove) ? "MOVE" : "FETCH"; } +#ifdef USE_REPLICATION + PGR_Not_Replication_Query = true; +#endif /* USE_REPLICATION */ break; case T_CreateDomainStmt: @@ -1677,10 +1708,16 @@ tag = "VACUUM"; else tag = "ANALYZE"; +#ifdef USE_REPLICATION + PGR_Not_Replication_Query = true; +#endif /* USE_REPLICATION */ break; case T_ExplainStmt: tag = "EXPLAIN"; +#ifdef USE_REPLICATION + PGR_Not_Replication_Query = true; +#endif /* USE_REPLICATION */ break; case T_VariableSetStmt: @@ -1689,6 +1726,14 @@ case T_VariableShowStmt: tag = "SHOW"; +#ifdef USE_REPLICATION + { + VariableShowStmt *stmt = (VariableShowStmt *)parsetree; + if (strcasecmp(stmt->name, "replication_server")) { + PGR_Not_Replication_Query = true; + } + } +#endif /* USE_REPLICATION */ break; case T_VariableResetStmt: @@ -1755,10 +1800,16 @@ case T_CheckPointStmt: tag = "CHECKPOINT"; +#ifdef USE_REPLICATION + PGR_Not_Replication_Query = true; +#endif /* USE_REPLICATION */ break; case T_ReindexStmt: tag = "REINDEX"; +#ifdef USE_REPLICATION + PGR_Not_Replication_Query = true; +#endif /* USE_REPLICATION */ break; case T_CreateConversionStmt: @@ -1783,14 +1834,35 @@ case T_PrepareStmt: tag = "PREPARE"; 
+#ifdef USE_REPLICATION + if ((PGRnotReplicatePreparedSelect == true) && + (PGR_is_select_prepare_query() == true)) + { + PGR_Not_Replication_Query = true; + } +#endif /* USE_REPLICATION */ break; case T_ExecuteStmt: tag = "EXECUTE"; +#ifdef USE_REPLICATION + if ((PGRnotReplicatePreparedSelect == true) && + (PGR_is_select_prepared_statement((PrepareStmt *)parsetree) == true)) + { + PGR_Not_Replication_Query = true; + } +#endif /* USE_REPLICATION */ break; case T_DeallocateStmt: tag = "DEALLOCATE"; +#ifdef USE_REPLICATION + if ((PGRnotReplicatePreparedSelect == true) && + (PGR_is_select_prepared_statement((PrepareStmt *)parsetree) == true)) + { + PGR_Not_Replication_Query = true; + } +#endif /* USE_REPLICATION */ break; default: @@ -1800,6 +1872,13 @@ break; } +#ifdef USE_REPLICATION + if(PGRforceLoadBalance == true) + { + PGR_Not_Replication_Query = true; + } +#endif /* USE_REPLICATION */ + return tag; } @@ -1835,7 +1914,12 @@ tag = "SELECT FOR SHARE"; } else + { tag = "SELECT"; +#ifdef USE_REPLICATION + PGR_Not_Replication_Query = true; +#endif /* USE_REPLICATION */ + } break; case CMD_UPDATE: tag = "UPDATE"; @@ -1853,6 +1937,9 @@ elog(WARNING, "unrecognized commandType: %d", (int) parsetree->commandType); tag = "???"; +#ifdef USE_REPLICATION + PGR_Not_Replication_Query = true; +#endif /* USE_REPLICATION */ break; } diff -aruN postgresql-8.2.4/src/backend/utils/adt/float.c pgcluster-1.7.0rc7/src/backend/utils/adt/float.c --- postgresql-8.2.4/src/backend/utils/adt/float.c 2006-10-05 03:40:45.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/utils/adt/float.c 2007-02-18 22:52:16.000000000 +0100 @@ -66,6 +66,9 @@ #include "utils/array.h" #include "utils/builtins.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ #ifndef M_PI /* from my RH5.2 gcc math.h file - thomas 2000-04-03 */ @@ -1886,7 +1889,11 @@ float8 result; /* result [0.0 - 1.0) */ +#ifdef USE_REPLICATION + result = ((double) PGR_Random()) / ((double) MAX_RANDOM_VALUE + 1); 
+#else result = (double) random() / ((double) MAX_RANDOM_VALUE + 1); +#endif /* USE_REPLICATION */ PG_RETURN_FLOAT8(result); } diff -aruN postgresql-8.2.4/src/backend/utils/adt/nabstime.c pgcluster-1.7.0rc7/src/backend/utils/adt/nabstime.c --- postgresql-8.2.4/src/backend/utils/adt/nabstime.c 2006-07-14 16:52:24.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/utils/adt/nabstime.c 2007-02-18 22:52:16.000000000 +0100 @@ -27,6 +27,10 @@ #include "utils/builtins.h" #include "utils/nabstime.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ + #define MIN_DAYNUM (-24856) /* December 13, 1901 */ #define MAX_DAYNUM 24854 /* January 18, 2038 */ @@ -92,7 +96,13 @@ { time_t now; +#ifdef USE_REPLICATION + struct timeval tp; + PGR_GetTimeOfDay(&tp,NULL); + now = tp.tv_sec; +#else now = time(NULL); +#endif /* USE_REPLICATION */ return (AbsoluteTime) now; } @@ -1031,9 +1041,14 @@ { time_t sec; +#ifdef USE_REPLICATION + struct timeval tp; + PGR_GetTimeOfDay(&tp,NULL); + sec = tp.tv_sec; +#else if (time(&sec) < 0) PG_RETURN_ABSOLUTETIME(INVALID_ABSTIME); - +#endif PG_RETURN_ABSOLUTETIME((AbsoluteTime) sec); } @@ -1588,7 +1603,11 @@ int len; pg_time_t tt; +#ifdef USE_REPLICATION + PGR_GetTimeOfDay(&tp,NULL); +#else gettimeofday(&tp, NULL); +#endif /* USE_REPLICATION */ tt = (pg_time_t) tp.tv_sec; pg_strftime(templ, sizeof(templ), "%a %b %d %H:%M:%S.%%06d %Y %Z", pg_localtime(&tt, global_timezone)); diff -aruN postgresql-8.2.4/src/backend/utils/adt/ri_triggers.c pgcluster-1.7.0rc7/src/backend/utils/adt/ri_triggers.c --- postgresql-8.2.4/src/backend/utils/adt/ri_triggers.c 2006-10-04 02:29:59.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/utils/adt/ri_triggers.c 2007-02-18 22:52:16.000000000 +0100 @@ -40,6 +40,9 @@ #include "utils/typcache.h" #include "miscadmin.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* ---------- * Local definitions @@ -271,8 +274,18 @@ * ---------- */ quoteRelationName(pkrelname, pk_rel); 
+#ifdef USE_REPLICATION + if (PGRcheckConstraintWithLock) + snprintf(querystr, sizeof(querystr), "SELECT 1 FROM ONLY %s x FOR UPDATE OF x", + pkrelname); + else + snprintf(querystr, sizeof(querystr), "SELECT 1 FROM ONLY %s x ", + pkrelname); + +#else snprintf(querystr, sizeof(querystr), "SELECT 1 FROM ONLY %s x FOR SHARE OF x", pkrelname); +#endif /* USE_REPLICATION */ /* Prepare and save the plan */ qplan = ri_PlanCheck(querystr, 0, NULL, @@ -416,6 +429,9 @@ queryoids[i] = SPI_gettypeid(fk_rel->rd_att, qkey.keypair[i][RI_KEYPAIR_FK_IDX]); } +#ifdef USE_REPLICATION + if (PGRcheckConstraintWithLock) +#endif /* USE_REPLICATION */ strcat(querystr, " FOR SHARE OF x"); /* Prepare and save the plan */ @@ -577,6 +593,9 @@ queryoids[i] = SPI_gettypeid(pk_rel->rd_att, qkey.keypair[i][RI_KEYPAIR_PK_IDX]); } +#ifdef USE_REPLICATION + if (PGRcheckConstraintWithLock) +#endif /* USE_REPLICATION */ strcat(querystr, " FOR SHARE OF x"); /* Prepare and save the plan */ @@ -733,6 +752,9 @@ queryoids[i] = SPI_gettypeid(pk_rel->rd_att, qkey.keypair[i][RI_KEYPAIR_PK_IDX]); } +#ifdef USE_REPLICATION + if (PGRcheckConstraintWithLock) +#endif /* USE_REPLICATION */ strcat(querystr, " FOR SHARE OF x"); /* Prepare and save the plan */ @@ -922,6 +944,9 @@ queryoids[i] = SPI_gettypeid(pk_rel->rd_att, qkey.keypair[i][RI_KEYPAIR_PK_IDX]); } +#ifdef USE_REPLICATION + if (PGRcheckConstraintWithLock) +#endif /* USE_REPLICATION */ strcat(querystr, " FOR SHARE OF x"); /* Prepare and save the plan */ @@ -1428,6 +1453,9 @@ queryoids[i] = SPI_gettypeid(pk_rel->rd_att, qkey.keypair[i][RI_KEYPAIR_PK_IDX]); } +#ifdef USE_REPLICATION + if (PGRcheckConstraintWithLock) +#endif /* USE_REPLICATION */ strcat(querystr, " FOR SHARE OF x"); /* Prepare and save the plan */ @@ -1607,6 +1635,9 @@ queryoids[i] = SPI_gettypeid(pk_rel->rd_att, qkey.keypair[i][RI_KEYPAIR_PK_IDX]); } +#ifdef USE_REPLICATION + if (PGRcheckConstraintWithLock) +#endif /* USE_REPLICATION */ strcat(querystr, " FOR SHARE OF x"); /* Prepare and 
save the plan */ diff -aruN postgresql-8.2.4/src/backend/utils/adt/timestamp.c pgcluster-1.7.0rc7/src/backend/utils/adt/timestamp.c --- postgresql-8.2.4/src/backend/utils/adt/timestamp.c 2006-11-11 02:14:19.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/utils/adt/timestamp.c 2007-02-18 22:52:16.000000000 +0100 @@ -39,6 +39,9 @@ #error -ffast-math is known to break this code #endif +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* Set at postmaster start */ TimestampTz PgStartTime; @@ -948,7 +951,11 @@ TimestampTz result; struct timeval tp; +#ifdef USE_REPLICATION + PGR_GetTimeOfDay(&tp,NULL); +#else gettimeofday(&tp, NULL); +#endif result = (TimestampTz) tp.tv_sec - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY); diff -aruN postgresql-8.2.4/src/backend/utils/error/assert.c pgcluster-1.7.0rc7/src/backend/utils/error/assert.c --- postgresql-8.2.4/src/backend/utils/error/assert.c 2006-03-05 16:58:46.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/utils/error/assert.c 2007-02-18 22:52:16.000000000 +0100 @@ -19,6 +19,10 @@ #include +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ + /* * ExceptionalCondition - Handles the failure of an Assert() */ @@ -39,6 +43,18 @@ fileName, lineNumber); } +#ifdef USE_REPLICATION + if ((PGR_Check_Lock.dest == TO_REPLICATION_SERVER ) && + (PGR_Need_Notice == true)) + { + PGR_Notice_Transaction_Query_Aborted(); + } + if (PGR_Copy_Data_Need_Replicate) + { + PGR_Set_Copy_Data(PGRCopyData,NULL,0,1); + } +#endif /* USE_REPLICATION */ + #ifdef SLEEP_ON_ASSERT /* diff -aruN postgresql-8.2.4/src/backend/utils/error/elog.c pgcluster-1.7.0rc7/src/backend/utils/error/elog.c --- postgresql-8.2.4/src/backend/utils/error/elog.c 2006-11-28 13:54:42.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/utils/error/elog.c 2007-02-18 22:52:16.000000000 +0100 @@ -70,6 +70,9 @@ #include "utils/memutils.h" #include "utils/ps_status.h" +#ifdef USE_REPLICATION +#include "replicate.h" 
+#endif /* USE_REPLICATION */ /* Global variables */ ErrorContextCallback *error_context_stack = NULL; @@ -314,6 +317,16 @@ MemoryContext oldcontext; ErrorContextCallback *econtext; +#ifdef USE_REPLICATION + int status = 0; + bool parse_error_flag = false; + + if ((edata->message) && (strstr(edata->message,"parse error") != NULL)) + { + parse_error_flag = true; + } +#endif /* USE_REPLICATION */ + recursion_depth++; CHECK_STACK_DEPTH(); @@ -363,6 +376,24 @@ * handler should reset it to something else soon. */ +#ifdef USE_REPLICATION + if (parse_error_flag) + { + if ((PGR_Check_Lock.dest != TO_FRONTEND) && + (Transaction_Mode > 0)) + { + PGR_Force_Replicate_Query(); + } + } + if (PGR_Copy_Data_Need_Replicate) + { + PGR_Set_Copy_Data(PGRCopyData,NULL,0,1); + } + else if (PGR_Need_Notice == true) + { + PGR_Notice_Transaction_Query_Done(); + } +#endif /* USE_REPLICATION */ recursion_depth--; PG_RE_THROW(); } @@ -377,7 +408,16 @@ * client_min_messages above FATAL, so don't look at output_to_client. 
*/ if (elevel >= FATAL && whereToSendOutput == DestRemote) + { +#ifdef USE_REPLICATION + if (PGR_Copy_Data_Need_Replicate) + { + PGR_Set_Copy_Data(PGRCopyData,NULL,0,1); + } +#endif /* USE_REPLICATION */ pq_endcopyout(true); + } + /* Emit the message to the right places */ EmitErrorReport(); @@ -417,6 +457,34 @@ if (PG_exception_stack == NULL && whereToSendOutput == DestRemote) whereToSendOutput = DestNone; +#ifdef USE_REPLICATION + if (CurrentReplicateServer != NULL) + { + if (PGR_Need_Notice == true) + { + PGR_Notice_Transaction_Query_Aborted(); + } + if (PGR_Copy_Data_Need_Replicate) + { + PGR_Set_Copy_Data(PGRCopyData,NULL,0,1); + } + else + { + if ((!PGR_Is_Replicated_Query ) && + (PGR_Check_Lock.dest != TO_FRONTEND) && + (PGR_Reliable_Mode_Wait == true) && + (CurrentReplicateServer->response_mode == PGR_RELIABLE_MODE)) + { + status = PGR_Recv_Trigger(0); + } + } + } + if (TransactionSock != -1) + { + close (TransactionSock); + TransactionSock = -1; + } +#endif /* USE_REPLICATION */ /* * fflush here is just to improve the odds that we get to see the * error message, in case things are so hosed that proc_exit crashes. @@ -436,6 +504,34 @@ if (elevel >= PANIC) { +#ifdef USE_REPLICATION + if (CurrentReplicateServer != NULL) + { + if (PGR_Need_Notice == true) + { + PGR_Notice_Transaction_Query_Aborted(); + } + if (PGR_Copy_Data_Need_Replicate) + { + PGR_Set_Copy_Data(PGRCopyData,NULL,0,1); + } + else + { + if ((!PGR_Is_Replicated_Query ) && + (PGR_Check_Lock.dest != TO_FRONTEND) && + (PGR_Reliable_Mode_Wait == true) && + (CurrentReplicateServer->response_mode == PGR_RELIABLE_MODE)) + { + status = PGR_Recv_Trigger(PGR_Replication_Timeout); + } + } + } + if (TransactionSock != -1) + { + close (TransactionSock); + TransactionSock = -1; + } +#endif /* USE_REPLICATION */ /* * Serious crash time. Postmaster will observe SIGABRT process exit * status and kill the other backends too. 
diff -aruN postgresql-8.2.4/src/backend/utils/fmgr/fmgr.c pgcluster-1.7.0rc7/src/backend/utils/fmgr/fmgr.c --- postgresql-8.2.4/src/backend/utils/fmgr/fmgr.c 2006-10-04 02:30:01.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/utils/fmgr/fmgr.c 2007-02-18 22:52:16.000000000 +0100 @@ -25,6 +25,9 @@ #include "utils/fmgrtab.h" #include "utils/lsyscache.h" #include "utils/syscache.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* * Declaration for old-style function pointer type. This is now used only @@ -218,7 +221,12 @@ ReleaseSysCache(procedureTuple); return; } - +#ifdef USE_REPLICATION + if (PGR_Replicate_Function_Call() != STATUS_OK) + { + return; + } +#endif /* USE_REPLICATION */ switch (procedureStruct->prolang) { case INTERNALlanguageId: diff -aruN postgresql-8.2.4/src/backend/utils/mb/mbutils.c pgcluster-1.7.0rc7/src/backend/utils/mb/mbutils.c --- postgresql-8.2.4/src/backend/utils/mb/mbutils.c 2006-10-04 02:30:02.000000000 +0200 +++ pgcluster-1.7.0rc7/src/backend/utils/mb/mbutils.c 2007-02-18 22:52:16.000000000 +0100 @@ -15,6 +15,9 @@ #include "utils/memutils.h" #include "utils/syscache.h" +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ /* * We handle for actual FE and BE encoding setting encoding-identificator * and encoding-name too. 
It prevent searching and conversion from encoding @@ -442,6 +445,11 @@ dest_encoding; FmgrInfo *flinfo; +#ifdef USE_REPLICATION + if (PGR_Is_Replicated_Query) + return (char *)src; +#endif /* USE_REPLICATION */ + if (is_client_to_server) { src_encoding = ClientEncoding->encoding; diff -aruN postgresql-8.2.4/src/backend/utils/misc/guc.c pgcluster-1.7.0rc7/src/backend/utils/misc/guc.c --- postgresql-8.2.4/src/backend/utils/misc/guc.c 2006-11-29 15:50:07.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/utils/misc/guc.c 2007-02-18 22:52:16.000000000 +0100 @@ -25,6 +25,9 @@ #include #endif +#ifdef USE_REPLICATION +#include "replicate.h" +#endif /* USE_REPLICATION */ #include "access/gin.h" #include "access/twophase.h" @@ -236,6 +239,9 @@ char *role_string; char *session_authorization_string; +#ifdef USE_REPLICATION +static void ShowReplicationServerConfig(DestReceiver *dest); +#endif /* USE_REPLICATION */ /* * Displayable names for context types (enum GucContext) @@ -970,6 +976,40 @@ &pg_krb_caseins_users, false, NULL, NULL }, +#ifdef USE_REPLICATION + { + {"pgr_force_loadbalance", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("force loadbalance mode"), + NULL + }, + &PGRforceLoadBalance, + false, NULL, NULL + }, + { + {"check_constraint_with_lock", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("check constrain with lock"), + NULL + }, + &PGRcheckConstraintWithLock, + false, NULL, NULL + }, + { + {"auto_lock_table", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("auto lock table"), + NULL + }, + &PGRautoLockTable, + true, NULL, NULL + }, + { + {"not_replicate_prepared_select", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("not replicate the prepared as select"), + NULL + }, + &PGRnotReplicatePreparedSelect, + false, NULL, NULL + }, +#endif { {"escape_string_warning", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS, @@ -4830,6 +4870,10 @@ { if (pg_strcasecmp(name, "all") == 0) ShowAllGUCConfig(dest); +#ifdef USE_REPLICATION + else if (pg_strcasecmp(name, 
"replication_server") == 0) + ShowReplicationServerConfig(dest); +#endif else ShowGUCConfigOption(name, dest); } @@ -6512,5 +6556,72 @@ return nbuf; } +#ifdef USE_REPLICATION +/* + * SHOW replication_server: re-check every entry of ReplicateServerData (marking those whose status check fails as DATA_ERR), then emit one row per entry: status, host_name, port_num, recovery_port_num + */ +static void +ShowReplicationServerConfig(DestReceiver *dest) +{ + TupOutputState *tstate; + TupleDesc tupdesc; + char *values[4]; + char buffer[256]; + ReplicateServerInfo *sp; + + /* need a tuple descriptor representing four TEXT columns */ + tupdesc = CreateTemplateTupleDesc(4, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", + TEXTOID, -1, 0 ); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "host_name", + TEXTOID, -1, 0 ); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "port_num", + TEXTOID, -1, 0 ); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "recovery_port_num", + TEXTOID, -1, 0 ); + + /* prepare for projection of tuples */ + tstate = begin_tup_output_tupdesc(dest, tupdesc); + + sp = ReplicateServerData; + while (sp->useFlag != DATA_END) { + if (PGR_Check_Replicate_Server_Status(sp) == STATUS_ERROR) { + PGR_Set_Replication_Server_Status(sp, DATA_ERR); + } + + sp++; + } + + sp = ReplicateServerData; + while (sp->useFlag != DATA_END) { + if (sp->useFlag == DATA_USE) { + values[0] = "ALIVE"; + } else if (sp->useFlag == DATA_ERR) { + values[0] = "DEAD"; + } else if (sp->useFlag == DATA_INIT) { + values[0] = "STANDBY"; + } else { + values[0] = "UNKNOWN"; + } + + values[1] = (char *) sp->hostName; + + snprintf(buffer, sizeof(buffer), "%d", sp->portNumber); + values[2] = pstrdup(buffer); + + snprintf(buffer, sizeof(buffer), "%d", sp->recoveryPortNumber); + values[3] = pstrdup(buffer); + + do_tup_output(tstate, values); + + pfree(values[2]); + pfree(values[3]); + + sp++; + } + + end_tup_output(tstate); +} +#endif /* USE_REPLICATION */ #include "guc-file.c" diff -aruN postgresql-8.2.4/src/backend/utils/misc/postgresql.conf.sample pgcluster-1.7.0rc7/src/backend/utils/misc/postgresql.conf.sample --- 
postgresql-8.2.4/src/backend/utils/misc/postgresql.conf.sample 2007-01-20 22:42:06.000000000 +0100 +++ pgcluster-1.7.0rc7/src/backend/utils/misc/postgresql.conf.sample 2007-02-18 22:52:16.000000000 +0100 @@ -469,3 +469,12 @@ #--------------------------------------------------------------------------- #custom_variable_classes = '' # list of custom variable class names + + +#--------------------------------------------------------------------------- +# PGCluster +#--------------------------------------------------------------------------- + +# auto_lock_table = true +# check_constraint_with_lock = false +# not_replicate_prepared_select = false diff -aruN postgresql-8.2.4/src/bin/initdb/initdb.c pgcluster-1.7.0rc7/src/bin/initdb/initdb.c --- postgresql-8.2.4/src/bin/initdb/initdb.c 2006-10-04 20:58:08.000000000 +0200 +++ pgcluster-1.7.0rc7/src/bin/initdb/initdb.c 2007-02-18 22:52:16.000000000 +0100 @@ -122,6 +122,11 @@ static int n_buffers = 50; static int n_fsm_pages = 20000; +#ifdef USE_REPLICATION +static char *cluster_conf_file; +static char *pgreplicate_conf_file; +static char *pglb_conf_file; +#endif /* USE_REPLICATION */ /* * Warning messages for authentication methods */ @@ -1352,6 +1357,14 @@ free(conflines); +#ifdef USE_REPLICATION + /* cluster.conf */ + conflines = readfile(cluster_conf_file); + snprintf(path, sizeof(path), "%s/cluster.conf", pg_data); + writefile(path, conflines); + chmod(path, 0600); + free(conflines); +#endif /* USE_REPLICATION */ check_ok(); } @@ -2712,6 +2725,11 @@ set_input(&info_schema_file, "information_schema.sql"); set_input(&features_file, "sql_features.txt"); set_input(&system_views_file, "system_views.sql"); +#ifdef USE_REPLICATION + set_input(&cluster_conf_file, "cluster.conf.sample"); + set_input(&pgreplicate_conf_file, "pgreplicate.conf.sample"); + set_input(&pglb_conf_file, "pglb.conf.sample"); +#endif /* USE_REPLICATION */ set_info_version(); @@ -2730,6 +2748,16 @@ desc_file, shdesc_file, conf_file, hba_file, ident_file); 
+#ifdef USE_REPLICATION + fprintf(stderr, + "PGCLUSTER_VERSION=%s\n" + "CLUSTER_CONF_SAMPLE=%s\nPGREPLICATE_CONF_SAMPLE=%s\n" + "PGLB_CONF_SAMPLE=%s\n", + PGCLUSTER_VERSION, + cluster_conf_file, + pgreplicate_conf_file, + pglb_conf_file); +#endif /* USE_REPLICATION */ if (show_setting) exit(0); } @@ -2744,6 +2772,11 @@ check_input(info_schema_file); check_input(features_file); check_input(system_views_file); +#ifdef USE_REPLICATION + check_input(cluster_conf_file); + check_input(pgreplicate_conf_file); + check_input(pglb_conf_file); +#endif /* USE_REPLICATION */ setlocales(); diff -aruN postgresql-8.2.4/src/bin/pg_dump/pg_dump.c pgcluster-1.7.0rc7/src/bin/pg_dump/pg_dump.c --- postgresql-8.2.4/src/bin/pg_dump/pg_dump.c 2006-10-10 01:36:59.000000000 +0200 +++ pgcluster-1.7.0rc7/src/bin/pg_dump/pg_dump.c 2007-02-18 22:52:16.000000000 +0100 @@ -119,6 +119,9 @@ /* flag to turn on/off dollar quoting */ static int disable_dollar_quoting = 0; +#ifdef USE_REPLICATION + bool nonReplicate=true; +#endif static void help(const char *progname); static void expand_schema_name_patterns(SimpleStringList *patterns, @@ -235,6 +238,9 @@ {"column-inserts", no_argument, NULL, 'D'}, {"host", required_argument, NULL, 'h'}, {"ignore-version", no_argument, NULL, 'i'}, +#ifdef USE_REPLICATION + {"non-replicate", no_argument ,NULL, 'r'}, +#endif {"no-reconnect", no_argument, NULL, 'R'}, {"oids", no_argument, NULL, 'o'}, {"no-owner", no_argument, NULL, 'O'}, @@ -368,6 +374,11 @@ pgport = optarg; break; +#ifdef USE_REPLICATION + case 'r': + nonReplicate = true; + break; +#endif case 'R': /* no-op, still accepted for backwards compatibility */ break; @@ -553,6 +564,11 @@ /* * Start serializable transaction to dump consistent data. 
*/ +#ifdef USE_REPLICATION + if(nonReplicate) { + do_sql_command(g_conn, "set pgr_force_loadbalance to on"); + } +#endif /* USE_REPLICATION */ do_sql_command(g_conn, "BEGIN"); do_sql_command(g_conn, "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE"); @@ -751,6 +767,9 @@ printf(_(" -o, --oids include OIDs in dump\n")); printf(_(" -O, --no-owner skip restoration of object ownership\n" " in plain text format\n")); +#ifdef USE_REPLICATION + printf(_(" -r, --non-replicate No queries replicate. Available only in pgcluster.\n")); +#endif printf(_(" -s, --schema-only dump only the schema, no data\n")); printf(_(" -S, --superuser=NAME specify the superuser user name to use in\n" " plain text format\n")); diff -aruN postgresql-8.2.4/src/bin/pg_dump/pg_dumpall.c pgcluster-1.7.0rc7/src/bin/pg_dump/pg_dumpall.c --- postgresql-8.2.4/src/bin/pg_dump/pg_dumpall.c 2006-11-21 23:19:46.000000000 +0100 +++ pgcluster-1.7.0rc7/src/bin/pg_dump/pg_dumpall.c 2007-02-18 22:52:16.000000000 +0100 @@ -97,6 +97,9 @@ {"oids", no_argument, NULL, 'o'}, {"no-owner", no_argument, NULL, 'O'}, {"port", required_argument, NULL, 'p'}, +#ifdef USE_REPLICATION + {"non-replicate", no_argument ,NULL, 'r'}, +#endif {"password", no_argument, NULL, 'W'}, {"schema-only", no_argument, NULL, 's'}, {"superuser", required_argument, NULL, 'S'}, @@ -161,7 +164,7 @@ pgdumpopts = createPQExpBuffer(); - while ((c = getopt_long(argc, argv, "acdDgh:ioOp:sS:U:vWxX:", long_options, &optindex)) != -1) + while ((c = getopt_long(argc, argv, "acdDgh:ioOp:rsS:U:vWxX:", long_options, &optindex)) != -1) { switch (c) { @@ -215,6 +218,11 @@ #endif break; +#ifdef USE_REPLICATION + case 'r': + appendPQExpBuffer(pgdumpopts, " -r"); + break; +#endif /* USE_REPLICATION */ case 's': schema_only = true; appendPQExpBuffer(pgdumpopts, " -s"); @@ -397,6 +405,9 @@ printf(_("\nConnection options:\n")); printf(_(" -h, --host=HOSTNAME database server host or socket directory\n")); printf(_(" -p, --port=PORT database server port number\n")); +#ifdef 
USE_REPLICATION + printf(_(" -r, --non-replicate No queries replicate. Available only in pgcluster.\n")); +#endif /* USE_REPLICATION */ printf(_(" -U, --username=NAME connect as specified database user\n")); printf(_(" -W, --password force password prompt (should happen automatically)\n")); diff -aruN postgresql-8.2.4/src/include/commands/prepare.h pgcluster-1.7.0rc7/src/include/commands/prepare.h --- postgresql-8.2.4/src/include/commands/prepare.h 2006-10-04 02:30:08.000000000 +0200 +++ pgcluster-1.7.0rc7/src/include/commands/prepare.h 2007-02-18 22:52:16.000000000 +0100 @@ -64,4 +64,8 @@ extern bool PreparedStatementReturnsTuples(PreparedStatement *stmt); extern List *FetchPreparedStatementTargetList(PreparedStatement *stmt); +#ifdef USE_REPLICATION +extern bool PGR_is_select_prepared_statement(PrepareStmt *stmt); +#endif /* USE_REPLICATION */ + #endif /* PREPARE_H */ diff -aruN postgresql-8.2.4/src/include/pg_config.h.in pgcluster-1.7.0rc7/src/include/pg_config.h.in --- postgresql-8.2.4/src/include/pg_config.h.in 2006-11-06 04:44:38.000000000 +0100 +++ pgcluster-1.7.0rc7/src/include/pg_config.h.in 2007-02-18 22:52:17.000000000 +0100 @@ -673,3 +673,7 @@ /* Define to empty if the keyword `volatile' does not work. Warning: valid code using `volatile' can become incorrect without. Disable with care. */ #undef volatile + +/* PGCluster version */ +#undef PGCLUSTER_VERSION + diff -aruN postgresql-8.2.4/src/include/replicate.h pgcluster-1.7.0rc7/src/include/replicate.h --- postgresql-8.2.4/src/include/replicate.h 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/include/replicate.h 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,223 @@ +/*------------------------------------------------------------------------- + * + * replicate.h + * Primary include file for replicate server .c files + * + * This should be the first file included by replicate modules. 
+ * + *------------------------------------------------------------------------- + */ +#ifndef REPLICATE_H +#define REPLICATE_H + +#ifndef _SYS_TIME_H +#include <sys/time.h> +#endif +#include "tcop/dest.h" +#include "storage/proc.h" +#include "lib/stringinfo.h" +#include "replicate_com.h" + +#define STAND_ALONE_TAG "When_Stand_Alone" +#define NOT_REPLICATE_INFO_TAG "Not_Replicate_Info" +#define DB_NAME_TAG "DB_Name" +#define TABLE_NAME_TAG "Table_Name" +#define RSYNC_PATH_TAG "Rsync_Path" +#define RSYNC_OPTION_TAG "Rsync_Option" +#define RSYNC_COMPRESS_TAG "Rsync_Compress" +#define PG_DUMP_PATH_TAG "Pg_Dump_Path" + +#define CLUSTER_CONF_FILE "cluster.conf" +#define DEFAULT_RSYNC "/usr/bin/rsync" +#define DEFAULT_PG_DUMP "/usr/local/pgsql/bin/pg_dump" +#define NOT_SESSION_AUTHORIZATION (0) +#define SESSION_AUTHORIZATION_BEGIN (1) +#define SESSION_AUTHORIZATION_END (2) + +#define READ_ONLY_IF_STAND_ALONE "read_only" +#define READ_WRITE_IF_STAND_ALONE "read_write" +#define PERMIT_READ_ONLY (1) +#define PERMIT_READ_WRITE (2) +#define STATUS_REPLICATED (3) +#define STATUS_CONTINUE (4) +#define STATUS_CONTINUE_SELECT (5) +#define STATUS_NOT_REPLICATE (6) +#define STATUS_SKIP_QUERY (7) +#define STATUS_RECOVERY (11) +#define STATUS_REPLICATION_ABORT (98) +#define STATUS_DEADLOCK_DETECT (99) + +#define TO_REPLICATION_SERVER (0) +#define TO_FRONTEND (1) + +#define PGR_DEADLOCK_DETECTION_MSG "deadlock detected!" +#define PGR_REPLICATION_ABORT_MSG "replication aborted!" 
+#define SKIP_QUERY_1 "begin; select getdatabaseencoding(); commit" +#define SKIP_QUERY_2 "BEGIN; SELECT usesuper FROM pg_catalog.pg_user WHERE usename = '%s'; COMMIT" +#define SKIP_QUERY_3 "SET autocommit TO 'on'" +#define SKIP_QUERY_4 "SET search_path = public" +#define SYS_QUERY_1 "set pgr_force_loadbalance to on" + +#define PGR_1ST_RECOVERY (1) +#define PGR_2ND_RECOVERY (2) +#define PGR_COLD_RECOVERY (1) +#define PGR_HOT_RECOVERY (2) +#define PGR_WITHOUT_BACKUP (3) + +#define PGR_MESSAGE_OTHER (0) +#define PGR_MESSAGE_SELECT (1) +#define PGR_MESSAGE_PREPARE (2) +#define PGR_MESSAGE_EXECUTE (3) +#define PGR_MESSAGE_DEALLOCATE (4) + +typedef struct +{ + bool is_stand_alone; + int permit; +} PGR_Stand_Alone_Type; + +typedef struct +{ + char db_name[DBNAME_MAX_LENGTH]; + char table_name[TABLENAME_MAX_LENGTH]; +} PGR_Not_Replicate_Type; + +typedef struct +{ + bool check_lock_conflict; + bool deadlock; + int status_lock_conflict; + int dest; +} PGR_Check_Lock_Type; + +typedef struct +{ + char * query_string; + int query_len; + char cmdSts; + char cmdType; + char useFlag; +} PGR_Retry_Query_Type; + + +/* replication log */ +typedef struct { + uint32_t PGR_Replicate_ID; + uint32_t PGR_Request_ID; +} PGR_ReplicationLog_Info; + +typedef struct { + char * password; + char md5Salt[4]; + char cryptSalt[2]; +} PGR_Password_Info; + +extern char * Query_String; +extern int TransactionQuery; +extern int Transaction_Mode; +extern bool PGR_Noticed_Abort; +extern bool Session_Authorization_Mode; +extern bool Create_Temp_Table_Mode; +extern int RecoveryPortNumber; +extern char * RsyncPath; +extern char * RsyncOption; +extern bool RsyncCompress; +extern char * PgDumpPath; +extern int TransactionSock; +extern ReplicateNow * ReplicateCurrentTime; +extern CopyData * PGRCopyData; +extern bool PGR_Copy_Data_Need_Replicate; +extern PGR_Stand_Alone_Type * PGR_Stand_Alone; +extern PGR_Not_Replicate_Type * PGR_Not_Replicate; +extern int PGR_Not_Replicate_Rec_Num; +extern bool autocommit; 
+extern bool PGR_Is_Replicated_Query; +extern PGR_Check_Lock_Type PGR_Check_Lock; +extern int PGR_Sock_To_Replication_Server; +extern bool PGR_Need_Notice; +extern bool PGR_Lock_Noticed; +extern bool PGR_Recovery_Option; +extern int PGR_recovery_mode; +extern ReplicateServerInfo * CurrentReplicateServer; +extern ReplicateServerInfo * LastReplicateServer; +extern char * PGRSelfHostName; +extern int PGR_Pending_Sem_Num; +extern int PGR_Response_Mode; +extern bool PGR_Reliable_Mode_Wait; +extern PGR_Retry_Query_Type PGR_Retry_Query; +extern bool needToUpdateReplicateIdOnNextQueryIsDone; +extern PGR_ReplicationLog_Info ReplicationLog_Info; +extern bool PGR_Not_Replication_Query; +extern bool PGR_Is_Sync_OID; +extern PGR_Password_Info * PGR_password; + +/* backend/utils/misc/guc.c */ +extern bool PGRforceLoadBalance; +extern bool PGRcheckConstraintWithLock; +extern bool PGRautoLockTable; +extern bool PGRnotReplicatePreparedSelect; + +/* in backend/libpq/replicate.c */ +extern int PGR_Init_Replicate_Server_Data(void); +extern int PGR_Set_Replicate_Server_Socket(void); +extern int PGR_get_replicate_server_socket ( ReplicateServerInfo * sp , int socket_type ); +extern ReplicateServerInfo * PGR_get_replicate_server_info(void); +extern ReplicateServerInfo * PGR_check_replicate_server_info(void); +extern char * PGR_Send_Replicate_Command(char * query_string, int query_len, char cmdSts ,char cmdType); +extern bool PGR_Is_Replicated_Command(char * query); +extern int Xlog_Check_Replicate(int operation); +extern int PGR_Replicate_Function_Call(void); +extern void PGR_delete_shm(void); +extern int PGR_replication(char * query_string, CommandDest dest, Node *parsetree, const char * commandTag); +extern bool PGR_Is_System_Command(char * query); +extern int PGR_Call_System_Command(char * command); +extern int PGR_GetTimeOfDay(struct timeval *tp,struct timezone *tpz); +extern long PGR_Random(void); +extern int PGR_Set_Current_Time(char * sec, char * usec); +extern int 
PGR_Send_Copy(CopyData * copy, int end); +extern CopyData * PGR_Set_Copy_Data(CopyData * copy, char *str, int len, int end); +extern char * PGR_scan_terminate( char * str); +extern bool PGR_Is_Stand_Alone(void); +extern void PGR_Send_Message_To_Frontend(char * msg); +extern void PGR_Notice_Transaction_Query_Done(void); +extern void PGR_Notice_Transaction_Query_Aborted(void); +extern int PGRsend_system_command(char cmdSts, char cmdType); +extern int PGR_Notice_Conflict(void); +extern int PGR_Recv_Trigger (int user_timeout); +extern void PGR_Set_Replication_Server_Status( ReplicateServerInfo * sp, int status); +extern int PGR_Is_Skip_Replication(char * query); +extern bool PGR_Did_Commit_Transaction(void); +extern int PGR_Set_Transaction_Mode(int mode,const char * commandTag); +extern char * PGR_Remove_Comment(char * str); +extern void PGR_Force_Replicate_Query(void); +extern void PGR_Notice_DeadLock(void); +extern void PGR_Set_Cluster_Status(int status); +extern int PGR_Get_Cluster_Status(void); +extern int PGR_Check_Replicate_Server_Status(ReplicateServerInfo * sp); +extern int PGR_lo_import(char * filename); +extern int PGR_lo_create(int flags); +extern int PGR_lo_open(Oid lobjId,int32 mode); +extern int PGR_lo_close(int32 fd); +extern int PGR_lo_write(int fd, char *buf, int len); +extern int PGR_lo_lseek(int32 fd, int32 offset, int32 whence); +extern int PGR_lo_unlink(Oid lobjId); +extern uint32_t PGRget_replication_id(void); +extern Oid PGRGetNewObjectId(Oid last_id); +extern int PGR_Send_Input_Message(char cmdType,StringInfo input_message); +extern bool PGR_is_select_prepare_query(void); +extern char * PGR_get_md5salt(char * md5Salt, char * string); +extern int PGR_recv_replicate_result(int sock,char * result,int user_timeout); + +/* in backend/libpq/recovery.c */ +extern int PGR_Master_Main(void); +extern int PGR_Recovery_Main(int mode); +extern int PGR_recovery_error_send(void); +extern int PGR_recovery_finish_send(void); +extern int 
PGR_recovery_queue_data_req(void); + +/* in backend/libpq/lifecheck.c */ +extern int PGR_Lifecheck_Main(void); + +/* in backend/access/transam/xact.c */ +extern void PGR_Reload_Start_Time(void); +#endif /* REPLICATE_H */ diff -aruN postgresql-8.2.4/src/include/replicate_com.h pgcluster-1.7.0rc7/src/include/replicate_com.h --- postgresql-8.2.4/src/include/replicate_com.h 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/include/replicate_com.h 2007-03-01 16:27:15.000000000 +0100 @@ -0,0 +1,432 @@ +/*------------------------------------------------------------------------- + * + * replicate_com.h + * Primary include file for replicate server .c files + * + * This should be the first file included by replicate modules. + * + *------------------------------------------------------------------------- + */ +#ifndef REPLICATE_COM_H +#define REPLICATE_COM_H 1 + +#ifndef _SYS_TYPES_H +#include <sys/types.h> +#endif +#ifndef _INTTYPES_H +#include <inttypes.h> +#endif +#ifndef _NETINET_IN_H +#include <netinet/in.h> +#endif + +#include "c.h" +#include "pg_config.h" + +/* default values */ +#define DEFAULT_PGLB_PORT (6001) +#define DEFAULT_PGLB_RECOVERY_PORT (6101) +#define DEFAULT_PGLB_LIFECHECK_PORT (6201) +#define DEFAULT_CLUSTER_PORT (5432) +#define DEFAULT_CLUSTER_RECOVERY_PORT (7101) +#define DEFAULT_CLUSTER_LIFECHECK_PORT (7201) +#define DEFAULT_PGRP_PORT (8001) +#define DEFAULT_PGRP_RECOVERY_PORT (8101) +#define DEFAULT_PGRP_LIFECHECK_PORT (8201) +#define DEFAULT_PGRP_RLOG_PORT (8301) +#define MAX_DB_SERVER (32) + +/************************** +* * +* Packet ID definition * +* * +***************************/ +/*========================= + Replication packet id +===========================*/ +#define CMD_SYS_REPLICATE 'R' +/*------------------------- + Simple Query +--------------------------*/ +#define CMD_STS_SET_SESSION_AUTHORIZATION 'S' +#define CMD_STS_TRANSACTION 'T' +#define CMD_STS_TEMP_TABLE 'E' +#define CMD_STS_QUERY 'Q' +#define CMD_STS_OTHER 'O' + +#define CMD_TYPE_VACUUM 'V' +#define 
CMD_TYPE_ANALYZE 'A' +#define CMD_TYPE_REINDEX 'N' +#define CMD_TYPE_SELECT 'S' +#define CMD_TYPE_EXPLAIN 'X' +#define CMD_TYPE_SET 'T' +#define CMD_TYPE_RESET 't' +#define CMD_TYPE_INSERT 'I' +#define CMD_TYPE_DELETE 'D' +#define CMD_TYPE_EXECUTE 'U' +#define CMD_TYPE_UPDATE 'U' +#define CMD_TYPE_BEGIN 'B' +#define CMD_TYPE_COMMIT 'E' +#define CMD_TYPE_ROLLBACK 'R' +#define CMD_TYPE_CONNECTION_CLOSE 'x' +#define CMD_TYPE_SESSION_AUTHORIZATION_BEGIN 'a' +#define CMD_TYPE_SESSION_AUTHORIZATION_END 'b' +#define CMD_TYPE_SAVEPOINT 's' +#define CMD_TYPE_ROLLBACK_TO_SAVEPOINT 'r' +#define CMD_TYPE_RELEASE_SAVEPOINT 'l' +#define CMD_TYPE_OTHER 'O' + +/*========================= + System call packet id +===========================*/ +#define CMD_SYS_CALL 'S' +#define CMD_SYS_PREREPLICATE 'Z' + +#define CMD_STS_NOTICE 'N' +#define CMD_STS_RESPONSE 'R' +#define CMD_STS_TRANSACTION_ABORT 'A' +#define CMD_STS_QUERY_SUSPEND 'P' +#define CMD_STS_QUERY_DONE 'D' + +#define CMD_TYPE_COMMIT_CONFIRM 'c' +#define CMD_TYPE_QUERY_CONFIRM 'q' +#define CMD_TYPE_DEADLOCK_DETECT 'd' +#define CMD_TYPE_FRONTEND_CLOSED 'x' + +/*---------------------------- + Copy Command +------------------------------*/ +#define CMD_STS_COPY 'C' + +#define CMD_TYPE_COPY 'C' +#define CMD_TYPE_COPY_DATA 'd' +#define CMD_TYPE_COPY_DATA_END 'e' + +/*---------------------------- + Large Object +------------------------------*/ +#define CMD_STS_LARGE_OBJECT 'L' + +#define CMD_TYPE_LO_IMPORT 'I' +#define CMD_TYPE_LO_CREATE 'C' +#define CMD_TYPE_LO_OPEN 'O' +#define CMD_TYPE_LO_WRITE 'W' +#define CMD_TYPE_LO_LSEEK 'S' +#define CMD_TYPE_LO_CLOSE 'X' +#define CMD_TYPE_LO_UNLINK 'U' + +/*------------------------- + Prepare/Params Query +--------------------------*/ +#define CMD_STS_PREPARE 'P' + +#define CMD_TYPE_P_PARSE 'P' +#define CMD_TYPE_P_BIND 'B' +#define CMD_TYPE_P_EXECUTE 'E' +#define CMD_TYPE_P_FASTPATH 'F' +#define CMD_TYPE_P_CLOSE 'C' +#define CMD_TYPE_P_DESCRIBE 'D' +#define CMD_TYPE_P_FLUSH 'H' +#define 
CMD_TYPE_P_SYNC 'S' + +/*========================= + Lifecheck packet id +===========================*/ +#define CMD_SYS_LIFECHECK 'W' +#define CMD_STS_LOADBALANCER 'A' +#define CMD_STS_CLUSTER 'B' +#define CMD_STS_REPLICATOR 'C' + +#define PGR_TRANSACTION_SOCKET (0) +#define PGR_QUERY_SOCKET (1) + +#define DATA_FREE (0) +#define DATA_INIT (1) +#define DATA_USE (2) +#define DATA_ERR (90) +#define DATA_END (-1) +#define HOSTNAME_MAX_LENGTH (128) +#define DBNAME_MAX_LENGTH (128) +#define USERNAME_MAX_LENGTH (128) +#define PASSWORD_MAX_LENGTH (128) +#define TABLENAME_MAX_LENGTH (128) +#define PATH_MAX_LENGTH (256) +#define MAX_SERVER_NUM (128) +#define MAX_RETRY_TIMES (3) +#define MAX_SOCKET_QUEUE (100000) +#define TRANSACTION_ERROR_RESULT "TRANSACTION_ERROR" +#define REPLICATE_SERVER_SHM_KEY (1020) +/* target -> replicate */ +#define RECOVERY_PREPARE_REQ (1) +/* replicate -> master */ +#define RECOVERY_PGDATA_REQ (2) +/* master -> replicate */ +#define RECOVERY_PGDATA_ANS (3) +/* replicate -> target */ +#define RECOVERY_PREPARE_ANS (4) +/* target -> replicate */ +#define RECOVERY_START_REQ (5) +/* replicate -> master */ +#define RECOVERY_FSYNC_REQ (6) +/* master -> replicate */ +#define RECOVERY_FSYNC_ANS (7) +/* replicate -> target */ +#define RECOVERY_START_ANS (8) +/* target -> replicate */ +#define RECOVERY_QUEUE_DATA_REQ (9) +/* replicate -> target */ +#define RECOVERY_QUEUE_DATA_ANS (10) +/* target -> replicate */ +#define RECOVERY_FINISH (11) + +#define RECOVERY_ERROR_OCCUPIED (100) +#define RECOVERY_ERROR_CONNECTION (101) +#define RECOVERY_ERROR_TARGET_ONLY (102) +#define RECOVERY_ERROR_ANS (200) + +/* lifecheck ask from cluster db */ +#define LIFECHECK_ASK_FROM_CLUSTER (1) +/* lifecheck response from replication server */ +#define LIFECHECK_RES_FROM_REPLICATOR (2) +/* lifecheck ask from replication server */ +#define LIFECHECK_ASK_FROM_REPLICATOR (3) +/* lifecheck response from cluster db */ +#define LIFECHECK_RES_FROM_CLUSTER (4) + +#define 
REPLICATION_SERVER_INFO_TAG "Replicate_Server_Info" +#define HOST_NAME_TAG "Host_Name" +#define PORT_TAG "Port" +#define RECOVERY_PORT_TAG "Recovery_Port" +#define LIFECHECK_PORT_TAG "LifeCheck_Port" +#define TIMEOUT_TAG "Replication_Timeout" +#define LIFECHECK_TIMEOUT_TAG "LifeCheck_Timeout" +#define LIFECHECK_INTERVAL_TAG "LifeCheck_Interval" + +#define RECOVERY_INIT (0) +#define RECOVERY_PREPARE_START (1) +#define RECOVERY_START_1 (2) +#define RECOVERY_CLEARED (3) +#define RECOVERY_WAIT_CLEAN (10) +#define RECOVERY_ERROR (99) + +/* response mode */ +#define PGR_FAST_MODE (0) +#define PGR_NORMAL_MODE (1) +#define PGR_RELIABLE_MODE (2) + +#define RECOVERY_TIMEOUT (600) +#ifndef COMPLETION_TAG_BUFSIZE +#define COMPLETION_TAG_BUFSIZE (128) +#endif + +/* replicate log type */ +#define FROM_R_LOG_TYPE (1) +#define FROM_C_DB_TYPE (2) +#define CONNECTION_SUSPENDED_TYPE (3) + +#define PGR_SYSTEM_COMMAND_FUNC "PGR_SYSTEM_COMMAND_FUNCTION" +#define PGR_STARTUP_REPLICATION_SERVER_FUNC_NO (1) +#define PGR_CHANGE_REPLICATION_SERVER_FUNC_NO (2) +#define PGR_SET_CURRENT_TIME_FUNC_NO (3) +#define PGR_NOTICE_DEADLOCK_DETECTION_FUNC_NO (4) +#define PGR_TRANSACTION_CONFIRM_ANSWER_FUNC_NO (5) +#define PGR_RELIABLE_MODE_DONE_FUNC_NO (6) +#define PGR_NOTICE_ABORT_FUNC_NO (7) +#define PGR_SET_CURRENT_REPLICATION_QUERY_ID_NO (8) +#define PGR_QUERY_CONFIRM_ANSWER_FUNC_NO (9) +#define PGR_GET_OID_FUNC_NO (10) +#define PGR_SET_OID_FUNC_NO (11) + +#define PGR_CMD_ARG_NUM (10) +#define PGR_LOCK_CONFLICT_NOTICE_CMD "PGR_LOCK_CONFLICT_NOTICE_CMD" +#define PGR_DEADLOCK_DETECT_NOTICE_CMD "PGR_DEADLOCK_DETECT_NOTICE_CMD" +#define PGR_QUERY_DONE_NOTICE_CMD "PGR_QUERY_DONE_NOTICE_CMD" +#define PGR_QUERY_ABORTED_NOTICE_CMD "PGR_QUERY_ABORTED_NOTICE_CMD" +#define PGR_RETRY_LOCK_QUERY_CMD "PGR_RETRY_LOCK_QUERY_CMD" +#define PGR_NOT_YET_REPLICATE_NOTICE_CMD "PGR_NOT_YET_REPLICATE_NOTICE_CMD" +#define PGR_ALREADY_REPLICATED_NOTICE_CMD "PGR_ALREADY_REPLICATED_NOTICE_CMD" +#define PGR_NOT_YET_COMMIT (0) 
+#define PGR_ALREADY_COMMITTED (1) + +#define COPYBUFSIZ (8192) +#define MAX_WORDS (24) +#define MAX_WORD_LETTERS (48) +#define PGR_MESSAGE_BUFSIZE (128) +#define INT_LENGTH (12) +#define PGR_MAX_COUNTER (0x0FFFFFFF) +#define PGR_GET_OVER_FLOW_FILTER (0xF0000000) +#define PGR_GET_DATA_FILTER (0x0FFFFFFF) +#define PGR_SET_OVER_FLOW (0x10000000) +#define PGR_MIN_COUNTER (0x0000000F) + +#define STRCMP(x,y) (strncmp(x,y,strlen(y))) + +/* life check target */ +#define SYN_TO_LOAD_BALANCER (0) +#define SYN_TO_CLUSTER_DB (1) +#define SYN_TO_REPLICATION_SERVER (2) +#define LIFE_CHECK_TRY_COUNT (2) +#define LIFE_CHECK_STOP (0) +#define LIFE_CHECK_START (1) + +#ifndef HAVE_UNION_SEMUN +union semun { + int val; + struct semid_ds *buf; + unsigned short int *array; + struct seminfo *__buf; +}; +#endif + +typedef struct ReplicateHeaderType +{ + char cmdSys; + char cmdSts; /* + Q:query + T:transaction + */ + char cmdType; /* + S:select + I:insert + D:delete + U:update + B:begin + E:commit/rollback/end + O:others + */ + char rlog; /* + -- kind of replication log -- + 1: send from replication log + 2: send from cluster db (should be retry) + 3: connection suspended + */ + uint16_t port; + uint16_t pid; + uint32_t query_size; + char from_host[HOSTNAME_MAX_LENGTH]; + char dbName[DBNAME_MAX_LENGTH]; + char userName[USERNAME_MAX_LENGTH]; + struct timeval tv; + uint32_t query_id; + int isAutoCommit; /* 0 if autocommit is off. 
1 if autocommit is on */ + uint32_t request_id; + uint32_t replicate_id; + char password[PASSWORD_MAX_LENGTH]; + char md5Salt[4]; + char cryptSalt[2]; + char dummySalt[2]; +} ReplicateHeader; + +typedef struct RecoveryPacketType +{ + uint16_t packet_no; /* + 1:start recovery prepare + 2:ask pgdata + 3:ans pgdata + 4:send master info + 5:start queueing query + 6:requst fsync + 7:ready to fsync + 8:pepared master + 9:finished rsync + */ + uint16_t max_connect; + uint16_t port; + uint16_t recoveryPort; + char hostName[HOSTNAME_MAX_LENGTH]; + char pg_data[PATH_MAX_LENGTH]; + char userName[USERNAME_MAX_LENGTH]; +} RecoveryPacket; + +typedef struct +{ + char table[128]; + int rec_no; + char key[128]; + char value[128]; + char * last; + char * next; +} ConfDataType; + + +typedef struct ReplicateServerInfoType +{ + uint32_t useFlag; + char hostName[HOSTNAME_MAX_LENGTH]; + uint16_t portNumber; + uint16_t recoveryPortNumber; + uint16_t lifecheckPortNumber; + uint16_t RLogPortNumber; + uint32_t sock; + uint32_t rlog_sock; + uint32_t replicate_id; + uint16_t response_mode; + uint16_t retry_count; +} ReplicateServerInfo; + + +typedef struct ReplicateNowType +{ + uint32_t replicate_id; + int useFlag; + int use_seed; + int use_time; + int offset_sec; + int offset_usec; + struct timeval tp; +} ReplicateNow; + +typedef struct CopyDataType +{ + int cnt; + char copy_data[COPYBUFSIZ]; +} CopyData; + +typedef struct ClusterDBInfoType +{ + int status; +} ClusterDBInfo; + +typedef struct +{ + uint32_t arg1; + uint32_t arg2; + uint32_t arg3; + char buf[1]; +} LOArgs; + +typedef struct +{ + int length; + char data[1]; +} ArrayData; + +extern ConfDataType * ConfData_Top; +extern ConfDataType * ConfData_End; +extern ReplicateServerInfo * ReplicateServerData; +extern ClusterDBInfo * ClusterDBData; +extern int ReplicateServerShmid; +extern int ClusterDBShmid; +extern bool PGR_Under_Replication_Server; +extern int PGR_Replication_Timeout; +extern int PGR_Lifecheck_Timeout; +extern int 
PGR_Lifecheck_Interval; + +/* in backend/libpq/replicate_com.c */ +extern int PGR_Create_Socket_Connect(int * fdP, char * hostName , unsigned short portNumber); +extern void PGR_Close_Sock(int * sock); +extern int PGR_Create_Socket_Bind(int * fdP, char * hostName , unsigned short portNumber); +extern int PGR_Create_Acception(int fd, int * sockP, char * hostName , unsigned short portNumber); +extern int PGR_Free_Conf_Data(void); +extern int PGR_Get_Conf_Data(char * dir , char * fname); +extern void PGRset_recovery_packet_no(RecoveryPacket * packet, int packet_no); +extern unsigned int PGRget_ip_by_name(char * host); +extern int PGRget_time_value(char *str); + +extern void PGRwrite_log_file(FILE * fp, const char * fmt,...); +extern void show_debug(const char * fmt,...); +extern void show_error(const char * fmt,...); + + + +#endif /* REPLICATE_COM_H */ diff -aruN postgresql-8.2.4/src/include/storage/lmgr.h pgcluster-1.7.0rc7/src/include/storage/lmgr.h --- postgresql-8.2.4/src/include/storage/lmgr.h 2006-08-18 18:09:13.000000000 +0200 +++ pgcluster-1.7.0rc7/src/include/storage/lmgr.h 2007-02-18 22:52:17.000000000 +0100 @@ -15,6 +15,7 @@ #define LMGR_H #include "storage/lock.h" +#include "storage/bufmgr.h" #include "utils/rel.h" @@ -69,4 +70,5 @@ /* Knowledge about which locktags describe temp objects */ extern bool LockTagIsTemp(const LOCKTAG *tag); +extern void XactLockTableWaitForCluster(TransactionId xid,Buffer buffer); #endif /* LMGR_H */ diff -aruN postgresql-8.2.4/src/include/storage/proc.h pgcluster-1.7.0rc7/src/include/storage/proc.h --- postgresql-8.2.4/src/include/storage/proc.h 2006-10-04 02:30:10.000000000 +0200 +++ pgcluster-1.7.0rc7/src/include/storage/proc.h 2007-02-18 22:52:17.000000000 +0100 @@ -97,6 +97,9 @@ SHM_QUEUE myProcLocks[NUM_LOCK_PARTITIONS]; struct XidCache subxids; /* cache for subtransaction XIDs */ +#ifdef USE_REPLICATION + unsigned int replicationId; /* id for replication. 
*/ +#endif }; /* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */ diff -aruN postgresql-8.2.4/src/interfaces/libpq/Makefile pgcluster-1.7.0rc7/src/interfaces/libpq/Makefile --- postgresql-8.2.4/src/interfaces/libpq/Makefile 2006-12-28 01:01:12.000000000 +0100 +++ pgcluster-1.7.0rc7/src/interfaces/libpq/Makefile 2007-02-18 22:52:17.000000000 +0100 @@ -33,7 +33,7 @@ OBJS= fe-auth.o fe-connect.o fe-exec.o fe-misc.o fe-print.o fe-lobj.o \ fe-protocol2.o fe-protocol3.o pqexpbuffer.o pqsignal.o fe-secure.o \ - md5.o ip.o wchar.o encnames.o noblock.o pgstrcasecmp.o thread.o \ + dllist.o md5.o ip.o wchar.o encnames.o noblock.o pgstrcasecmp.o thread.o \ $(filter crypt.o getaddrinfo.o inet_aton.o open.o snprintf.o strerror.o strlcpy.o, $(LIBOBJS)) ifeq ($(PORTNAME), cygwin) @@ -89,6 +89,9 @@ encnames.c wchar.c : % : $(backend_src)/utils/mb/% rm -f $@ && $(LN_S) $< . +dllist.c : % : $(backend_src)/lib/dllist.c + rm -f $@ && $(LN_S) $< . + # We need several not-quite-identical variants of .DEF files to build libpq # DLLs for Windows. These are made from the single source file exports.txt. 
@@ -169,7 +172,7 @@ rm -f '$(DESTDIR)$(includedir)/libpq-fe.h' '$(DESTDIR)$(includedir_internal)/libpq-int.h' '$(DESTDIR)$(includedir_internal)/pqexpbuffer.h' '$(DESTDIR)$(datadir)/pg_service.conf.sample' clean distclean: clean-lib - rm -f $(OBJS) pg_config_paths.h crypt.c getaddrinfo.c inet_aton.c noblock.c open.c pgstrcasecmp.c snprintf.c strerror.c strlcpy.c thread.c md5.c ip.c encnames.c wchar.c pthread.h exports.list + rm -f $(OBJS) pg_config_paths.h crypt.c getaddrinfo.c inet_aton.c noblock.c open.c pgstrcasecmp.c snprintf.c strerror.c strlcpy.c thread.c md5.c ip.c encnames.c wchar.c pthread.h exports.list dllist.c rm -f pg_config_paths.h # Might be left over from a Win32 client-only build maintainer-clean: distclean diff -aruN postgresql-8.2.4/src/interfaces/libpq/fe-auth.c pgcluster-1.7.0rc7/src/interfaces/libpq/fe-auth.c --- postgresql-8.2.4/src/interfaces/libpq/fe-auth.c 2006-10-04 02:30:12.000000000 +0200 +++ pgcluster-1.7.0rc7/src/interfaces/libpq/fe-auth.c 2007-02-18 22:52:17.000000000 +0100 @@ -51,6 +51,10 @@ #include "fe-auth.h" #include "libpq/md5.h" +#ifdef USE_REPLICATION +#include "replicate_com.h" +bool PGR_Under_Replication_Server = false; +#endif /* USE_REPLICATION */ #ifdef KRB5 /* @@ -412,6 +416,19 @@ free(crypt_pwd); return STATUS_ERROR; } +#ifdef USE_REPLICATION + if (PGR_Under_Replication_Server) + { + /* + * When this module is called from the replication server, + * there is no need encrypt password. 
+ * Since the password was already encrypted at the Cluster DB + */ + int size = 2 * (MD5_PASSWD_LEN + 1); + memset(crypt_pwd,0, size); + strncpy(crypt_pwd,password, size); + } +#endif /* USE_REPLICATION */ break; } case AUTH_REQ_CRYPT: diff -aruN postgresql-8.2.4/src/makefiles/Makefile.aix pgcluster-1.7.0rc7/src/makefiles/Makefile.aix --- postgresql-8.2.4/src/makefiles/Makefile.aix 2006-09-19 17:36:08.000000000 +0200 +++ pgcluster-1.7.0rc7/src/makefiles/Makefile.aix 2007-02-18 22:52:17.000000000 +0100 @@ -44,3 +44,5 @@ $(CC) $(LDFLAGS) $(LDFLAGS_SL) -o $@ $*.o -Wl,-bE:$*$(EXPSUFF) $(SHLIB_LINK) sqlmansect = 7 +CFLAGS += -pthread +LDFLAGS += -L/usr/lib/threads diff -aruN postgresql-8.2.4/src/makefiles/Makefile.freebsd pgcluster-1.7.0rc7/src/makefiles/Makefile.freebsd --- postgresql-8.2.4/src/makefiles/Makefile.freebsd 2006-04-19 18:32:08.000000000 +0200 +++ pgcluster-1.7.0rc7/src/makefiles/Makefile.freebsd 2007-02-18 22:52:17.000000000 +0100 @@ -28,3 +28,5 @@ endif sqlmansect = 7 + +LIBS += -lc_r diff -aruN postgresql-8.2.4/src/makefiles/Makefile.hpux pgcluster-1.7.0rc7/src/makefiles/Makefile.hpux --- postgresql-8.2.4/src/makefiles/Makefile.hpux 2006-02-07 18:36:13.000000000 +0100 +++ pgcluster-1.7.0rc7/src/makefiles/Makefile.hpux 2007-02-18 22:52:17.000000000 +0100 @@ -10,6 +10,9 @@ # correctly in the LP64 data model. LIBS := -lxnet $(LIBS) +# add thread lib for PGCluster +LIBS := -lpthread $(LIBS) + # Set up rpath so that the executables don't need SHLIB_PATH to be set. # (Note: --disable-rpath is a really bad idea on this platform...) 
ifeq ($(with_gnu_ld), yes) diff -aruN postgresql-8.2.4/src/makefiles/Makefile.linux pgcluster-1.7.0rc7/src/makefiles/Makefile.linux --- postgresql-8.2.4/src/makefiles/Makefile.linux 2005-12-09 22:19:36.000000000 +0100 +++ pgcluster-1.7.0rc7/src/makefiles/Makefile.linux 2007-02-18 22:52:17.000000000 +0100 @@ -14,3 +14,4 @@ $(CC) -shared -o $@ $< sqlmansect = 7 +LIBS += -lpthread diff -aruN postgresql-8.2.4/src/makefiles/Makefile.netbsd pgcluster-1.7.0rc7/src/makefiles/Makefile.netbsd --- postgresql-8.2.4/src/makefiles/Makefile.netbsd 2006-04-19 18:32:08.000000000 +0200 +++ pgcluster-1.7.0rc7/src/makefiles/Makefile.netbsd 2007-02-18 22:52:17.000000000 +0100 @@ -30,3 +30,4 @@ endif sqlmansect = 7 +LIBS += -lpthread diff -aruN postgresql-8.2.4/src/makefiles/Makefile.openbsd pgcluster-1.7.0rc7/src/makefiles/Makefile.openbsd --- postgresql-8.2.4/src/makefiles/Makefile.openbsd 2006-04-19 18:32:08.000000000 +0200 +++ pgcluster-1.7.0rc7/src/makefiles/Makefile.openbsd 2007-02-18 22:52:17.000000000 +0100 @@ -28,3 +28,4 @@ endif sqlmansect = 7 +LIBS += -lc_r diff -aruN postgresql-8.2.4/src/makefiles/Makefile.solaris pgcluster-1.7.0rc7/src/makefiles/Makefile.solaris --- postgresql-8.2.4/src/makefiles/Makefile.solaris 2005-12-09 22:19:36.000000000 +0100 +++ pgcluster-1.7.0rc7/src/makefiles/Makefile.solaris 2007-02-18 22:52:17.000000000 +0100 @@ -20,3 +20,4 @@ $(LD) -G -Bdynamic -o $@ $< sqlmansect = 5sql +LIBS += -lpthread diff -aruN postgresql-8.2.4/src/makefiles/Makefile.sunos4 pgcluster-1.7.0rc7/src/makefiles/Makefile.sunos4 --- postgresql-8.2.4/src/makefiles/Makefile.sunos4 2002-09-05 00:54:18.000000000 +0200 +++ pgcluster-1.7.0rc7/src/makefiles/Makefile.sunos4 2007-02-18 22:52:17.000000000 +0100 @@ -11,3 +11,4 @@ $(LD) -assert pure-text -Bdynamic -o $@ $< sqlmansect = 7 +LIBS += -lpthread diff -aruN postgresql-8.2.4/src/pgcluster/Makefile pgcluster-1.7.0rc7/src/pgcluster/Makefile --- postgresql-8.2.4/src/pgcluster/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ 
pgcluster-1.7.0rc7/src/pgcluster/Makefile 2007-02-18 22:52:17.000000000 +0100
@@ -0,0 +1,19 @@
+#-------------------------------------------------------------------------
+#
+# Makefile for src/pgcluster (server programs)
+#
+#-------------------------------------------------------------------------
+
+subdir = src/pgcluster
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+
+DIRS := libpgc pgrp pglb tool
+
+# Forward the target to each directory in DIRS order; stop at the first failure.
+all install installdirs uninstall depend distprep:
+	@for dir in $(DIRS); do $(MAKE) -C $$dir $@ || exit; done
+
+# Forward the cleanup target to every directory, ignoring failures.
+clean distclean maintainer-clean:
+	-@for dir in $(DIRS); do $(MAKE) -C $$dir $@; done
diff -aruN postgresql-8.2.4/src/pgcluster/libpgc/Makefile pgcluster-1.7.0rc7/src/pgcluster/libpgc/Makefile
--- postgresql-8.2.4/src/pgcluster/libpgc/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/pgcluster/libpgc/Makefile 2007-02-18 22:52:17.000000000 +0100
@@ -0,0 +1,31 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for libpgc subsystem (common library for replication server)
+#
+#-------------------------------------------------------------------------
+
+subdir = src/pgcluster/libpgc
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = sem.o show.o signal.o
+
+all: SUBSYS.o
+
+# Combine the helper objects into one object file (linked by ../pglb as EXTRA_OBJS).
+SUBSYS.o: $(OBJS)
+	$(LD) $(LDREL) $(LDOUT) SUBSYS.o $(OBJS)
+
+depend dep:
+	$(CC) -MM $(CFLAGS) *.c >depend
+
+# ../Makefile forwards maintainer-clean here as well, so give it a rule.
+distclean maintainer-clean: clean
+
+clean:
+	rm -f SUBSYS.o $(OBJS)
+
+ifeq (depend,$(wildcard depend))
+include depend
+endif
diff -aruN postgresql-8.2.4/src/pgcluster/libpgc/libpgc.h pgcluster-1.7.0rc7/src/pgcluster/libpgc/libpgc.h
--- postgresql-8.2.4/src/pgcluster/libpgc/libpgc.h 1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/pgcluster/libpgc/libpgc.h 2007-02-18 22:52:17.000000000 +0100
@@ -0,0 +1,47 @@
+/*-------------------------------------------------------------------------
+ *
+ * libpgc.h
+ *    external definition of the function for pgreplicate and pglb
+ *
+ * This should be the first file included by replicate modules.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LIBPGC_H
+#define LIBPGC_H
+
+#include
+
+/* character length of IP address */
+#define ADDRESS_LENGTH (24)
+
+/* logging file data tag in configuration file */
+#define LOG_INFO_TAG "Log_File_Info"
+#define FILE_NAME_TAG "File_Name"
+#define FILE_SIZE_TAG "File_Size"
+#define LOG_ROTATION_TAG "Rotate"
+
+typedef struct {
+	char file_name[256];
+	FILE * fp;
+	int max_size;
+	int rotation;
+} LogFileInf;
+
+extern LogFileInf * LogFileData;
+/* external definition of the function in sem.c */
+extern void PGRsem_unlock( int semid, short sem_num );
+extern void PGRsem_lock( int semid, short sem_num );
+
+/* external definition of the function in show.c */
+extern FILE * PGRopen_log_file(char * fname, int max_size, int rotation);
+extern void PGRclose_log_file(FILE * fp);
+extern void show_debug(const char * fmt,...);
+extern void show_error(const char * fmt,...);
+extern void PGRwrite_log_file(FILE * fp, const char * fmt,...);
+
+/* external definition of the function in signal.c */
+typedef void (*PGRsighandler)(int);
+extern PGRsighandler
PGRsignal(int signo, PGRsighandler sighandler);
+
+#endif /* LIBPGC_H */
diff -aruN postgresql-8.2.4/src/pgcluster/libpgc/sem.c pgcluster-1.7.0rc7/src/pgcluster/libpgc/sem.c
--- postgresql-8.2.4/src/pgcluster/libpgc/sem.c 1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/pgcluster/libpgc/sem.c 2007-02-18 22:52:17.000000000 +0100
@@ -0,0 +1,87 @@
+/*--------------------------------------------------------------------
+ * FILE:
+ *     sem.c
+ *
+ * NOTE:
+ *     This file is composed of the functions to call with the source
+ *     at pgreplicate for the semaphore control.
+ *
+ * Portions Copyright (c) 2003-2006, Atsushi Mitani
+ *--------------------------------------------------------------------
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+extern void show_debug(const char * fmt,...);
+
+void PGRsem_unlock( int semid, short sem_num );
+void PGRsem_lock( int semid, short sem_num );
+
+/*
+ * Retry delay after semop() fails with an error other than EINTR,
+ * expressed in milliseconds.  usleep() takes microseconds, so the
+ * call sites multiply by 1000.
+ */
+#define PGR_SEM_UNLOCK_WAIT_MSEC (100)
+#define PGR_SEM_LOCK_WAIT_MSEC (500)
+
+/*
+ * PGRsem_unlock()
+ *	Release: add 1 to semaphore sem_num of the set semid.
+ *	Retries until semop() succeeds -- at once on EINTR, after
+ *	PGR_SEM_UNLOCK_WAIT_MSEC on any other error.
+ *	NOTE(review): an error that never clears (e.g. EINVAL, EIDRM)
+ *	keeps the loop running forever; confirm that is intended.
+ */
+void
+PGRsem_unlock( int semid, short sem_num )
+{
+	int status = 0;
+	struct sembuf sops;
+
+	sops.sem_num = sem_num;
+	sops.sem_op = 1;
+	/*sops.sem_flg = IPC_NOWAIT;*/
+	sops.sem_flg = 0;
+	do
+	{
+		status = semop(semid, &sops, 1);
+		if ((status == -1) && (errno != EINTR))
+		{
+			usleep(PGR_SEM_UNLOCK_WAIT_MSEC * 1000);	/* msec -> usec */
+		}
+	} while (status == -1);
+}
+
+/*
+ * PGRsem_lock()
+ *	Acquire: subtract 1 from semaphore sem_num of the set semid,
+ *	blocking in semop() until that is possible (sem_flg is 0).
+ *	Same retry policy as PGRsem_unlock(), with a delay of
+ *	PGR_SEM_LOCK_WAIT_MSEC.
+ */
+void
+PGRsem_lock( int semid, short sem_num )
+{
+	int status = 0;
+	struct sembuf sops;
+
+	sops.sem_num = sem_num;
+	sops.sem_op = -1;
+	/*sops.sem_flg = IPC_NOWAIT;*/
+	sops.sem_flg = 0;
+	do
+	{
+		status = semop(semid, &sops, 1);
+		if ((status == -1) && (errno != EINTR))
+		{
+			usleep(PGR_SEM_LOCK_WAIT_MSEC * 1000);	/* msec -> usec */
+		}
+	} while (status == -1);
+}
+
diff -aruN postgresql-8.2.4/src/pgcluster/libpgc/show.c pgcluster-1.7.0rc7/src/pgcluster/libpgc/show.c
--- postgresql-8.2.4/src/pgcluster/libpgc/show.c 1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/pgcluster/libpgc/show.c 2007-02-18 22:52:17.000000000 +0100
@@ -0,0
+1,262 @@
+/*--------------------------------------------------------------------
+ * FILE:
+ *     show.c
+ *
+ * NOTE:
+ *     This file is composed of the logging and debug functions
+ *
+ * Portions Copyright (c) 2003-2006, Atsushi Mitani
+ *--------------------------------------------------------------------
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "libpgc.h"
+
+#define TIMESTAMP_SIZE 19 /* format `YYYY-MM-DD HH:MM:SS' */
+
+/*--------------------------------------
+ * PROTOTYPE DECLARATION
+ *--------------------------------------
+ */
+static char* get_current_timestamp(void);
+static int file_rotation(char * fname, int max_rotation);
+
+FILE * PGRopen_log_file(char * fname, int max_size, int rotation);
+void PGRclose_log_file(FILE * fp);
+void show_debug(const char * fmt,...);
+void show_error(const char * fmt,...);
+void PGRwrite_log_file(FILE * fp, const char * fmt,...);
+
+extern int Debug_Print;
+extern int Log_Print;
+
+LogFileInf * LogFileData = NULL;
+
+/*
+ * get_current_timestamp()
+ *	Return the current local time formatted as `YYYY-MM-DD HH:MM:SS'.
+ *	The string lives in a static buffer overwritten by the next call.
+ */
+static char*
+get_current_timestamp(void)
+{
+	time_t now;
+	static char buf[TIMESTAMP_SIZE + 1];
+
+	now = time(NULL);
+	strftime(buf, sizeof(buf),
+		"%Y-%m-%d %H:%M:%S", localtime(&now));
+	return buf;
+}
+
+/*
+ * show_debug()
+ *	When Debug_Print is set, print a timestamped DEBUG line to stdout;
+ *	if Log_Print is also set and LogFileData is not NULL, append the
+ *	message (cut to sizeof(buf) - 1 bytes) to the log file as well.
+ */
+void
+show_debug(const char * fmt,...)
+{
+	va_list ap;
+	char *timestamp;
+	char buf[256];
+
+	if (Debug_Print)
+	{
+		timestamp = get_current_timestamp();
+		fprintf(stdout,"%s [%d] DEBUG:",timestamp, getpid());
+		va_start(ap,fmt);
+		vfprintf(stdout,fmt,ap);
+		va_end(ap);
+		fprintf(stdout,"\n");
+		fflush(stdout);
+		if ((Log_Print) && (LogFileData != NULL))
+		{
+			FILE * fp = NULL;
+			fp = PGRopen_log_file(LogFileData->file_name, LogFileData->max_size, LogFileData->rotation);
+			va_start(ap,fmt);
+			vsnprintf(buf,sizeof(buf),fmt,ap);
+			va_end(ap);
+			/* buf is message text, not a format: never pass it as fmt */
+			PGRwrite_log_file(fp, "%s", buf);
+			PGRclose_log_file(fp);
+		}
+	}
+}
+
+/*
+ * show_error()
+ *	When Debug_Print is set, print a timestamped ERROR line to stderr.
+ *	Whether or not Debug_Print is set, append the message to the log
+ *	file when Log_Print is set and LogFileData is not NULL.
+ */
+void
+show_error(const char * fmt,...)
+{
+	va_list ap;
+	char buf[256], *timestamp;
+
+	if (Debug_Print)
+	{
+		timestamp = get_current_timestamp();
+		fprintf(stderr,"%s [%d] ERROR:",timestamp, getpid());
+		va_start(ap,fmt);
+		vfprintf(stderr,fmt,ap);
+		va_end(ap);
+		fprintf(stderr,"\n");
+		fflush(stderr);
+	}
+	if ((Log_Print) && (LogFileData != NULL))
+	{
+		FILE * fp = NULL;
+		fp = PGRopen_log_file(LogFileData->file_name, LogFileData->max_size, LogFileData->rotation);
+		va_start(ap,fmt);
+		vsnprintf(buf,sizeof(buf),fmt,ap);
+		va_end(ap);
+		/* buf is message text, not a format: never pass it as fmt */
+		PGRwrite_log_file(fp, "%s", buf);
+		PGRclose_log_file(fp);
+	}
+}
+
+/*
+ * PGRwrite_log_file()
+ *	Append one line to fp: the ctime() timestamp, then the printf-style
+ *	message (cut to sizeof(buf) - 1 bytes).  Does nothing if fp is NULL.
+ *	fmt is expanded by vsnprintf(), so callers holding preformatted
+ *	text must pass it as ("%s", text).
+ */
+void
+PGRwrite_log_file(FILE * fp, const char * fmt,...)
+{
+	char buf[256];
+	char log[288];
+	char * p;
+	va_list ap;
+	time_t t;
+
+	if (fp == NULL)
+	{
+		return;
+	}
+	if (time(&t) < 0)
+	{
+		return;
+	}
+	snprintf(log,sizeof(log),"%s ",ctime(&t));
+	p = strchr(log,'\n');
+	if (p != NULL)
+	{
+		*p = ' ';
+	}
+	va_start(ap,fmt);
+	vsnprintf(buf,sizeof(buf),fmt,ap);
+	va_end(ap);
+	strcat(log,buf);
+	strcat(log,"\n");
+	if (fputs(log,fp) >= 0)
+	{
+		fflush(fp);
+	}
+}
+
+/*
+ * PGRopen_log_file()
+ *	Open fname in append mode.  When max_size > 0 and the file already
+ *	exceeds max_size bytes, file_rotation() is run first.
+ *	Returns NULL if fname is NULL, rotation fails or fopen() fails.
+ */
+FILE *
+PGRopen_log_file(char * fname, int max_size, int rotation)
+{
+	int rtn;
+	struct stat st;
+
+	if (fname == NULL)
+	{
+		return (FILE *)NULL;
+	}
+
+	if (max_size > 0)
+	{
+		rtn = stat(fname,&st);
+		if (rtn == 0)
+		{
+			if (st.st_size > max_size)
+			{
+				if (file_rotation(fname, rotation) < 0)
+				{
+					return (FILE *)NULL;
+				}
+			}
+		}
+	}
+	return (fopen(fname,"a"));
+}
+
+/*
+ * PGRclose_log_file()
+ *	Flush and close fp; a NULL fp is ignored.
+ */
+void
+PGRclose_log_file(FILE * fp)
+{
+	if (fp != NULL)
+	{
+		fflush(fp);
+		fclose(fp);
+	}
+}
+
+static int
+file_rotation(char * fname, int max_rotation)
+{
+	char * func = "file_rotation()";
+	int i;
+	int rtn;
+	struct stat st;
+	char old_fname[256];
+	char new_fname[256];
+
+	if ((fname == NULL) || (max_rotation < 0))
+	{
+		return -1;
+	}
+
+	for ( i = max_rotation ; i > 1 ; i -- )
+	{
+		sprintf(old_fname,"%s.%d",fname,i-1);
+		rtn = stat(old_fname,&st);
+		if (rtn == 0)
+		{
+			sprintf(new_fname,"%s.%d",fname,i);
+			rtn = rename(old_fname, new_fname);
+			if (rtn <
0)
+			{
+				show_error("%s:rotate failed: (%s)",func,strerror(errno));
+				return rtn;
+			}
+		}
+	}
+	if (max_rotation > 0)
+	{
+		sprintf(new_fname,"%s.1",fname);
+		rtn = rename(fname, new_fname);
+	}
+	else
+	{
+		rtn = unlink(fname);
+	}
+
+	return rtn;
+}
+
diff -aruN postgresql-8.2.4/src/pgcluster/libpgc/signal.c pgcluster-1.7.0rc7/src/pgcluster/libpgc/signal.c
--- postgresql-8.2.4/src/pgcluster/libpgc/signal.c 1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/pgcluster/libpgc/signal.c 2007-02-18 22:52:17.000000000 +0100
@@ -0,0 +1,40 @@
+/*--------------------------------------------------------------------
+ * FILE:
+ *     signal.c
+ *
+ * NOTE:
+ *     This file is composed of the functions to set signal handler
+ *
+ * Portions Copyright (c) 2003-2006, Atsushi Mitani
+ *--------------------------------------------------------------------
+ */
+
+#include
+#include "pg_config.h"
+#include "libpgc.h"
+
+/*
+ * Set up a signal handler
+ *
+ * Installs sighandler for signo and returns the handler that was in
+ * place before, or SIG_ERR on failure.  With HAVE_POSIX_SIGNALS this
+ * uses sigaction() with an empty mask and sa_flags = 0; otherwise it
+ * falls back to signal().
+ */
+PGRsighandler
+PGRsignal(int signo, PGRsighandler sighandler)
+{
+#if !defined(HAVE_POSIX_SIGNALS)
+	return signal(signo, sighandler);
+#else
+	struct sigaction act,
+				oact;
+
+	act.sa_handler = sighandler;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = 0;
+	if (sigaction(signo, &act, &oact) < 0)
+		return SIG_ERR;
+	return oact.sa_handler;
+#endif /* !HAVE_POSIX_SIGNALS */
+}
diff -aruN postgresql-8.2.4/src/pgcluster/pglb/AUTHORS pgcluster-1.7.0rc7/src/pgcluster/pglb/AUTHORS
--- postgresql-8.2.4/src/pgcluster/pglb/AUTHORS 1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/pgcluster/pglb/AUTHORS 2007-02-18 22:52:17.000000000 +0100
@@ -0,0 +1,4 @@
+Authors of pglb
+
+pglb was written by Atsushi Mitani.
+pglb is based on pg_pool which is written by Tatsuo Ishii.
diff -aruN postgresql-8.2.4/src/pgcluster/pglb/COPYING pgcluster-1.7.0rc7/src/pgcluster/pglb/COPYING
--- postgresql-8.2.4/src/pgcluster/pglb/COPYING 1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/pgcluster/pglb/COPYING 2007-02-18 22:52:17.000000000 +0100
@@ -0,0 +1,14 @@
+Copyright (c) 2003-2006 Atsushi Mitani
+
+Permission to use, copy, modify, and distribute this software and
+its documentation for any purpose and without fee is hereby
+granted, provided that the above copyright notice appear in all
+copies and that both that copyright notice and this permission
+notice appear in supporting documentation, and that the name of the
+author not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior
+permission. The author makes no representations about the
+suitability of this software for any purpose. It is provided "as
+is" without express or implied warranty.
+
+Portions copyright (c) 2003-2006, Tatsuo Ishii
diff -aruN postgresql-8.2.4/src/pgcluster/pglb/Makefile pgcluster-1.7.0rc7/src/pgcluster/pglb/Makefile
--- postgresql-8.2.4/src/pgcluster/pglb/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/pgcluster/pglb/Makefile 2007-02-18 22:52:17.000000000 +0100
@@ -0,0 +1,42 @@
+#-------------------------------------------------------------------------
+#
+# Makefile for src/pgcluster/pglb
+#
+#-------------------------------------------------------------------------
+
+subdir = src/pgcluster/pglb
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS= child.o cluster_table.o load_balance.o main.o pool_auth.o \
+	pool_connection_pool.o pool_process_query.o pool_stream.o \
+	pool_params.o recovery.o socket.o lifecheck.o
+
+# Objects produced by other directories (backend/libpq and ../libpgc).  This
+# Makefile has no rule for them, so they must exist before pglb is linked.
+EXTRA_OBJS = $(top_builddir)/src/backend/libpq/replicate_com.o ../libpgc/SUBSYS.o
+
+CFLAGS += -DPRINT_DEBUG
+override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) -DBINDIR=\"$(bindir)\"
+
+all: pglb
+
+# Link the load balancer.  The output carries the $(X) executable suffix so
+# that it is the same file the install, uninstall and clean rules refer to.
+pglb: $(OBJS) $(libpq_builddir)/libpq.a
+	$(CC) $(CFLAGS) $(OBJS) $(EXTRA_OBJS) $(libpq) $(LDFLAGS) $(LIBS) -o $@$(X)
+
+install: all installdirs
+	$(INSTALL_PROGRAM) pglb$(X) $(DESTDIR)$(bindir)/pglb$(X)
+	$(INSTALL_DATA) pglb.conf.sample $(DESTDIR)$(datadir)/pglb.conf.sample
+
+installdirs:
+	$(mkinstalldirs) $(DESTDIR)$(bindir)
+	$(mkinstalldirs) $(DESTDIR)$(datadir)
+
+uninstall:
+	rm -f $(addprefix $(DESTDIR)$(bindir)/, pglb$(X))
+	rm -f $(DESTDIR)$(datadir)/pglb.conf.sample
+
+clean distclean maintainer-clean:
+	rm -f pglb$(X) $(OBJS)
diff -aruN postgresql-8.2.4/src/pgcluster/pglb/child.c pgcluster-1.7.0rc7/src/pgcluster/pglb/child.c
--- postgresql-8.2.4/src/pgcluster/pglb/child.c 1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/pgcluster/pglb/child.c 2007-02-18 22:52:17.000000000 +0100
@@ -0,0 +1,1194 @@
+/*--------------------------------------------------------------------
+ * FILE:
+ *     child.c
+ *
+ * NOTE:
+ *     This file is composed of the functions to call with the source
+ *     at child process of pglb.
+ * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + * Portions Copyright (c) 2003-2006, Tatsuo Ishii + *-------------------------------------------------------------------- + */ +/* + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that the above copyright notice appear in all + * copies and that both that copyright notice and this permission + * notice appear in supporting documentation, and that the name of the + * author not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. The author makes no representations about the + * suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * +*/ +#include "postgres.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_NETINET_TCP_H +#include +#endif + +#ifdef HAVE_CRYPT_H +#include +#endif + +#include "postgres_fe.h" +#include "libpq/pqcomm.h" + +#include "replicate_com.h" +#include "pglb.h" + +/*-------------------------------------- + * GLOBAL VARIABLE DECLARATION + *-------------------------------------- + */ +POOL_CONNECTION * Frontend = NULL; + +/*-------------------------------------- + * PROTOTYPE DECLARATION + *-------------------------------------- + */ +int PGRpre_fork_children(ClusterTbl * ptr); +int PGRpre_fork_child(ClusterTbl * ptr); +int PGRdo_child( int use_pool); +int PGRcreate_child(ClusterTbl * cluster_p); +pid_t PGRscan_child_tbl(ClusterTbl * cluster_p); +void notice_backend_error(void); +void do_pooling_child(int sig); +int PGRset_status_to_child_tbl(pid_t pid, int status); +int PGRadd_child_tbl(ClusterTbl * cluster_p, pid_t pid, int status); +int PGRget_child_status(pid_t pid); +void 
PGRreturn_connection_full_error(void); +void PGRreturn_no_connection_error(void); +void PGRquit_children_on_cluster(int rec_no); + +#ifdef NONE_BLOCK +static void set_nonblock(int fd); +#endif +static void unset_nonblock(int fd); +static POOL_CONNECTION *do_accept(int unix_fd, int inet_fd); +static PGR_StartupPacket *read_startup_packet(POOL_CONNECTION *cp); +static int send_startup_packet(POOL_CONNECTION_POOL_SLOT *cp); +static void cancel_request(CancelPacket *sp, int secondary_backend); +static POOL_CONNECTION_POOL *connect_backend(PGR_StartupPacket *sp, POOL_CONNECTION *frontend); +static int send_params(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +static void child_end(int sig); +static void PGRreturn_with_error(char *msg); + + +/*-------------------------------------------------------------------- + * SYMBOL + * PGRpre_fork_children() + * NOTES + * pre forked child precesses + * ARGS + * ClusterTbl * ptr: pointer of cluster server table (I) + * RETURN + * OK: STATUS_OK + * NG: STATUS_ERROR + *-------------------------------------------------------------------- + */ +int +PGRpre_fork_children(ClusterTbl * ptr) +{ + int cnt; + + if (ptr == NULL) + { + return STATUS_ERROR; + } + cnt = 0 ; + while ((ptr->useFlag != TBL_END) && (cnt < ClusterNum)) + { + PGRpre_fork_child(ptr); + cnt ++; + ptr ++; + } + return STATUS_OK; +} + +/*-------------------------------------------------------------------- + * SYMBOL + * PGRpre_fork_child() + * NOTES + * pre forked child precess + * ARGS + * ClusterTbl * ptr: pointer of cluster server table (I) + * RETURN + * OK: STATUS_OK + * NG: STATUS_ERROR + *-------------------------------------------------------------------- + */ +int +PGRpre_fork_child(ClusterTbl * ptr) +{ + pid_t pid = 0; + int i; + + if (ptr == NULL) + { + return STATUS_ERROR; + } + if (ptr->useFlag == TBL_END) + { + return STATUS_ERROR; + } + for ( i = 0 ; i < ptr->max_connect * Max_Pool ; i ++) + { + pid = PGRcreate_child(ptr); + } + return 
STATUS_OK; +} +/*-------------------------------------------------------------------- + * SYMBOL + * PGRdo_child() + * NOTES + * execute child process + * ARGS + * int use_pool: usage flag of connection pooling (I) + * RETURN + * OK: STATUS_OK + * NG: STATUS_ERROR + *-------------------------------------------------------------------- + */ +int +PGRdo_child( int use_pool) +{ + char * func = "PGRdo_child()"; + pid_t pid = 0; + PGR_StartupPacket *sp = NULL; + POOL_CONNECTION *frontend = NULL; + POOL_CONNECTION_POOL *backend = NULL; + int status = 0; + int connection_reuse = 1; + int ssl_request = 0; + int count = 0; + + pid = getpid(); +#ifdef PRINT_DEBUG + show_debug("%s:I am %d",func, pid); +#endif + + /* set up signal handlers */ + PGRsignal(SIGALRM, SIG_DFL); + PGRsignal(SIGTERM, child_end); + PGRsignal(SIGHUP, child_end); + PGRsignal(SIGINT, child_end); + PGRsignal(SIGUSR1, SIG_IGN); + PGRsignal(SIGUSR2, SIG_IGN); + +#ifdef NONE_BLOCK + /* set listen fds to none block */ + set_nonblock(Frontend_FD.unix_fd); + set_nonblock(Frontend_FD.inet_fd); +#endif + +retry_accept: + /* perform accept() */ + frontend = do_accept(Frontend_FD.unix_fd,Frontend_FD.inet_fd); + if (frontend == NULL) + { + /* accept() failed. return to the accept() loop */ + PGRset_status_to_child_tbl(pid,TBL_FREE); + return STATUS_ERROR; + } + + /* unset frontend fd tp none block */ + unset_nonblock(frontend->fd); + + /* read the startup packet */ + sp = 0; +retry_startup: + if (sp) + { + free(sp->startup_packet); + free(sp->database); + free(sp->user); + free(sp); + } + + sp = read_startup_packet(frontend); + if (sp == NULL) + { + /* failed to read the startup packet. return to the + accept() loop */ + pool_close(frontend); + PGRset_status_to_child_tbl(pid,TBL_FREE); + return STATUS_ERROR; + } + PGRset_status_to_child_tbl(pid,TBL_ACCEPT); + + /* cancel request? 
*/ + if (sp->major == 1234 && sp->minor == 5678) + { + cancel_request((CancelPacket *)sp->startup_packet, 0); + pool_close(frontend); + return STATUS_ERROR; + } + + /* SSL? */ + if (sp->major == 1234 && sp->minor == 5679) + { + /* SSL not supported */ +#ifdef PRINT_DEBUG + show_debug("%s:SSLRequest: sent N; retry startup",func); +#endif + if (ssl_request && use_pool) + { + pool_close(frontend); + return STATUS_ERROR; + } + + /* + * say to the frontend "we do not suppport SSL" + * note that this is not a NOTICE response despite it's an 'N'! + */ + pool_write_and_flush(frontend, "N", 1); + ssl_request = 1; + goto retry_startup; + } + + /* + * Ok, negotiaton with frontend has been done. Let's go to the next step. + */ + /* + * if there's no connection associated with user and database, + * we need to connect to the backend and send the startup packet. + */ + count = 0; + if ((backend = pool_get_cp(sp->user, sp->database, sp->major)) == NULL) + { + connection_reuse = 0; + + if ((backend = connect_backend(sp, frontend)) == NULL) + { + /* + PGRset_status_on_cluster_tbl(TBL_ERROR,CurrentCluster); + return STATUS_ERROR; + */ + goto retry_accept; + } + } + else + { + /* reuse existing connection to backend */ + + if (pool_do_reauth(frontend, backend)) + { + pool_close(frontend); + return STATUS_ERROR; + } + + if (MAJOR(backend) == 3) + { + if (send_params(frontend, backend)) + { + pool_close(frontend); + return STATUS_ERROR; + } + } + + /* send ReadyForQuery to frontend */ + pool_write(frontend, "Z", 1); + + if (MAJOR(backend) == 3) + { + int len; + char tstate; + + len = htonl(5); + pool_write(frontend, &len, sizeof(len)); + tstate = TSTATE(backend); + pool_write(frontend, &tstate, 1); + } + + if (pool_flush(frontend) < 0) + { + pool_close(frontend); + return STATUS_ERROR; + } + + } + + /* query process loop */ + for (;;) + { + POOL_STATUS status; + + status = pool_process_query(frontend, backend, 0); + + switch (status) + { + /* client exits */ + case POOL_END: + /* do 
not cache connection to template0, template1, regression */ + if (!strcmp(sp->database, "template0") || !strcmp(sp->database, "template1") || + !strcmp(sp->database, "regression") || use_pool == NOT_USE_CONNECTION_POOL) + { + pool_close(frontend); + pool_send_frontend_exits(backend); + pool_discard_cp(sp->user, sp->database, sp->major); + } + else + { + POOL_STATUS status1; + + /* send reset request to backend */ + status1 = pool_process_query(frontend, backend, 1); + pool_close(frontend); + + /* if we detect errors on resetting connection, we need to discard + * this connection since it might be in unknown status + */ + if (status1 != POOL_CONTINUE) + pool_discard_cp(sp->user, sp->database, sp->major); + else + pool_connection_pool_timer(backend); + } + break; + + /* error occured. discard backend connection pool + and disconnect connection to the frontend */ + case POOL_ERROR: + show_error("%s:do_child: exits with status 1 due to error",func); + break; + + /* fatal error occured. just exit myself... 
*/ + case POOL_FATAL: + show_error("%s:do_child: fatal error occured",func); + notice_backend_error(); + break; + + /* not implemented yet */ + case POOL_IDLE: + do_accept(Frontend_FD.unix_fd,Frontend_FD.inet_fd); +#ifdef PRINT_DEBUG + show_debug("%s:accept while idle",func); +#endif + break; + + default: + break; + } + + if (status != POOL_CONTINUE) + break; + } + if ((status == POOL_ERROR) || + (status == POOL_FATAL)) + { + PGRset_status_to_child_tbl(pid,TBL_FREE); + return STATUS_ERROR; + } + PGRset_status_to_child_tbl(pid,TBL_INIT); + return STATUS_OK; +} + +/*-------------------------------------------------------------------- + * SYMBOL + * PGRcreate_child() + * NOTES + * create child process + * ARGS + * ClusterTbl * ptr: pointer of cluster server table (I) + * RETURN + * OK: STATUS_OK + * NG: STATUS_ERROR + *-------------------------------------------------------------------- + */ +int +PGRcreate_child(ClusterTbl * cluster_p) +{ + char * func = "PGRcreate_child()"; + pid_t pid,pgid; + + if (cluster_p == NULL) + return STATUS_ERROR; + +#ifdef PRINT_DEBUG + show_debug("%s:create child [%d@%s]",func,cluster_p->port,cluster_p->hostName); +#endif + PGRsignal(SIGCHLD,PGRrecreate_child); + pgid = getpgid((pid_t)0); + pid = fork(); + if (pid < 0) + { + show_error("%s:fork() failed. 
(%s)",func,strerror(errno)); + return STATUS_ERROR; + } + if (pid == 0) + { + CurrentCluster = cluster_p; + if (pool_init_cp()) + { + show_error("%s:pool_init_cp failed",func); + exit(1); + } + PGRsignal(SIGCHLD,PGRchild_wait); + PGRsignal(SIGTERM, child_end); + PGRsignal(SIGHUP, child_end); + PGRsignal(SIGINT, child_end); + PGRsignal(SIGUSR1,do_pooling_child); + setpgid((pid_t)0,pgid); + for (;;) + { + pause(); + PGRsignal(SIGUSR1,do_pooling_child); + } +#ifdef PRINT_DEBUG + show_debug("%s:create child end [%d@%s]",func,cluster_p->port,cluster_p->hostName); +#endif + child_end(SIGTERM); + } + else + { + PGRadd_child_tbl(cluster_p,pid,TBL_INIT); + } + return pid; +} + +/*-------------------------------------------------------------------- + * SYMBOL + * PGRscan_child_tbl() + * NOTES + * get a child process id that is waiting for connection + * with the cluster server + * ARGS + * ClusterTbl * ptr: pointer of cluster server table (I) + * RETURN + * OK: child process id + * NG: 0 + *-------------------------------------------------------------------- + */ +pid_t +PGRscan_child_tbl(ClusterTbl * cluster_p) +{ + char * func = "PGRscan_child_tbl()"; + ChildTbl * p; + + if ( cluster_p == NULL) + { + show_error("%s:Cluster_Tbl is not initialize",func); + return STATUS_ERROR; + } + p = Child_Tbl; + if ( p == NULL) + { + show_error("%s:Child_Tbl is not initialize",func); + return STATUS_ERROR; + } + while(p->useFlag != TBL_END) + { + if (p->pid <= 0) + { + p++; + continue; + } + if ((p->useFlag == TBL_INIT) && + (p->rec_no == cluster_p->rec_no)) + { + p->useFlag = TBL_USE; + return (p->pid); + } + p++; + } + return 0; +} + +/* notice backend connection error using SIGUSR2 */ +void +notice_backend_error(void) +{ + pid_t pid = getpid(); + + PGRset_status_to_child_tbl(pid,TBL_ERROR); + PGRset_status_on_cluster_tbl(TBL_ERROR_NOTICE,CurrentCluster); + + /* + kill(parent, SIGUSR2); + sleep(1); + */ +} + + +/* + * start up pooling child process + */ +void +do_pooling_child(int sig) 
+{ + char * func = "do_pooling_child()"; + int rtn; + pid_t pid; + + pid = getpid(); + rtn = PGRdo_child(USE_CONNECTION_POOL); + PGRrelease_connection(CurrentCluster); + if (rtn != STATUS_OK) + { + show_error("%s:PGRdo_child failed",func); + child_end(SIGTERM); + } + return ; +} + +/* + * set status in child process table + */ +int +PGRset_status_to_child_tbl(pid_t pid, int status) +{ + char * func = "PGRset_status_to_child_tbl()"; + ChildTbl * p; + + p = Child_Tbl; + if ( p == NULL) + { + show_error("%s:Child_Tbl is not initialize",func); + return STATUS_ERROR; + } + while(p->useFlag != TBL_END) + { + if (p->pid == pid) + { + p->useFlag = status; + return STATUS_OK; + } + p++; + } + return STATUS_ERROR; +} + +/* + * add child process data in child process table + */ +int +PGRadd_child_tbl(ClusterTbl * cluster_p, pid_t pid, int status) +{ + char * func = "PGRadd_child_tbl()"; + ChildTbl * p; + + p = Child_Tbl; + if ( cluster_p == NULL) + { + show_error("%s:Cluster_Tbl is not initialize",func); + return STATUS_ERROR; + } + if ( p == NULL) + { + show_error("%s:Child_Tbl is not initialize",func); + return STATUS_ERROR; + } + while(p->useFlag != TBL_END) + { + if ((p->useFlag == TBL_FREE) || + (p->useFlag == TBL_ERROR)) + { + p->useFlag = status; + p->rec_no = cluster_p->rec_no; + p->pid = pid; + return STATUS_OK; + } + p++; + } + return STATUS_ERROR; +} + +int +PGRget_child_status(pid_t pid) +{ + char * func = "PGRget_child_status()"; + ChildTbl * p; + + p = Child_Tbl; + if ( p == NULL) + { + show_error("%s:Child_Tbl is not initialize",func); + return STATUS_ERROR; + } + + while (p->useFlag != TBL_END) + { + if (p->pid == pid) + { + return p->useFlag; + } + p++; + } + return STATUS_ERROR; +} + +void +PGRreturn_connection_full_error(void) +{ + PGRreturn_with_error( "Sorry, backend connection is full\n"); +} + +void +PGRreturn_no_connection_error(void) { + PGRreturn_with_error("pglb could not connect to server: no cluster available.\n"); +} + +static void 
+PGRreturn_with_error (char *msg) +{ + PGR_StartupPacket *sp = NULL; + POOL_CONNECTION *frontend = NULL; + + + /* perform accept() */ + frontend = do_accept(Frontend_FD.unix_fd,Frontend_FD.inet_fd); + if (frontend == NULL) + { + /* accept() failed. return to the accept() loop */ + return ; + } + sp = read_startup_packet(frontend); + if (sp == NULL) + { + /* failed to read the startup packet. return to the + accept() loop */ + pool_close(frontend); + return ; + } + pool_write_and_flush(frontend, "E", 1); + pool_write_and_flush(frontend, msg, strlen(msg)+1); + pool_close(frontend); + return ; +} +/* Send SIGTERM to each child recorded for cluster rec_no, call PGRchild_wait() and set its slot to DATA_FREE; SIGCHLD is ignored meanwhile. */ +void +PGRquit_children_on_cluster(int rec_no) +{ + char * func = "PGRquit_children_on_cluster()"; + ChildTbl * p; + + if (Child_Tbl == NULL) + { + return; + } + PGRsignal(SIGCHLD,SIG_IGN); + p = Child_Tbl; + while(p->useFlag != TBL_END) + { + if ((p->pid > 0) && (p->rec_no == rec_no)) /* skip slots without a real pid: kill() with pid <= 0 signals whole process groups */ + { + if (kill (p->pid,SIGTERM) == -1) + { + show_error("%s:could not stop pid: %d (%s)",func,p->pid,strerror(errno)); + return; + } + PGRchild_wait(SIGTERM); + p->useFlag = DATA_FREE; + } + p++; + } + if (Use_Connection_Pool) + { + PGRsignal(SIGCHLD,PGRrecreate_child); + } + else + { + PGRsignal(SIGCHLD,PGRchild_wait); + } +} + +/* ------------------------------------------------------------------- + * private functions + * ------------------------------------------------------------------- + */ + +#ifdef NONE_BLOCK +/* + * set non-block flag + */ +static void set_nonblock(int fd) +{ + char* func = "set_nonblock()"; + int var; + + /* set fd to none blocking */ + var = fcntl(fd, F_GETFL, 0); + if (var == -1) + { + show_error("%s:fcntl failed. %s", func,strerror(errno)); + child_end(SIGTERM); + } + if (fcntl(fd, F_SETFL, var | O_NONBLOCK) == -1) + { + show_error("%s:fcntl failed. 
%s", func,strerror(errno)); + child_end(SIGTERM); + } +} +#endif + +/* + * unset non-block flag + */ +static void unset_nonblock(int fd) +{ + char * func = "unset_nonblock()"; + int var; + + /* set fd to none blocking */ + var = fcntl(fd, F_GETFL, 0); + if (var == -1) + { + show_error("%s,fcntl failed. %s", func,strerror(errno)); + child_end(SIGTERM); + } + if (fcntl(fd, F_SETFL, var & ~O_NONBLOCK) == -1) + { + show_error("%s,fcntl failed. %s", func,strerror(errno)); + child_end(SIGTERM); + } +} + + +/* +* perform accept() and returns new fd +*/ +static POOL_CONNECTION *do_accept(int unix_fd, int inet_fd) +{ + char * func = "do_accept()"; + fd_set readmask; + int fds; + struct sockaddr addr; + socklen_t addrlen; + int fd = 0; + int afd; + int inet = 0; + POOL_CONNECTION *cp; +#ifdef ACCEPT_PERFORMANCE + struct timeval now1, now2; + static long atime; + static int cnt; +#endif + + FD_ZERO(&readmask); + FD_SET(unix_fd, &readmask); + if (inet_fd) + FD_SET(inet_fd, &readmask); + + fds = select(Max(unix_fd, inet_fd)+1, &readmask, NULL, NULL, NULL); + if (fds == -1) + { + if (errno == EAGAIN || errno == EINTR) + return NULL; + + show_error("%s:select() failed. reason %s",func, strerror(errno)); + return NULL; + } + + if (fds == 0) + return NULL; + + if (FD_ISSET(unix_fd, &readmask)) + { + fd = unix_fd; + } + + if (FD_ISSET(inet_fd, &readmask)) + { + fd = inet_fd; + inet++; + } + + /* + * Note that some SysV systems do not work here. For those + * systems, we need some locking mechanism for the fd. + */ + addrlen = sizeof(addr); + +#ifdef ACCEPT_PERFORMANCE + gettimeofday(&now1,0); +#endif + afd = accept(fd, &addr, &addrlen); + if (afd < 0) + { + /* + * "Resource temporarily unavailable" (EAGAIN or EWOULDBLOCK) + * can be silently ignored. + */ + if (errno != EAGAIN && errno != EWOULDBLOCK) + show_error("%s:accept() failed. 
reason: %s",func, strerror(errno)); + return NULL; + } +#ifdef ACCEPT_PERFORMANCE + gettimeofday(&now2,0); + atime += (now2.tv_sec - now1.tv_sec)*1000000 + (now2.tv_usec - now1.tv_usec); + cnt++; + if (cnt % 100 == 0) + { + show_error("%s:cnt: %d atime: %ld",func, cnt, atime); + } +#endif +#ifdef PRINT_DEBUG + show_debug("%s:I am %d accept fd %d",func, getpid(), afd); +#endif + + /* set NODELAY and KEEPALIVE options if INET connection */ + if (inet) + { + int on = 1; + + if (setsockopt(afd, IPPROTO_TCP, TCP_NODELAY, + (char *) &on, + sizeof(on)) < 0) + { + show_error("%s:do_accept: setsockopt() failed: %s",func, strerror(errno)); + close(afd); + return NULL; + } + if (setsockopt(afd, SOL_SOCKET, SO_KEEPALIVE, + (char *) &on, + sizeof(on)) < 0) + { + show_error("%s:do_accept: setsockopt() failed: %s", func,strerror(errno)); + close(afd); + return NULL; + } + } + + if ((cp = pool_open(afd)) == NULL) + { + close(afd); + return NULL; + } + return cp; +} + +/* +* read startup packet from cp; returns a heap-allocated packet, or NULL on any failure +*/ +static PGR_StartupPacket *read_startup_packet(POOL_CONNECTION *cp) +{ + char * func = "read_startup_packet()"; + PGR_StartupPacket *sp; + PGR_StartupPacket_v2 *sp2; + int protov; + int len; + char *p; + + sp = (PGR_StartupPacket *)calloc(1, sizeof(PGR_StartupPacket)); /* zeroed so members that are never filled in stay NULL */ + if (!sp) + { + show_error("%s:read_startup_packet: out of memory",func); + return NULL; + } + + /* read startup packet length */ + if (pool_read(cp, &len, sizeof(len))) + { + free(sp); + return NULL; + } + len = ntohl(len); + len -= sizeof(len); + + if (len < (int) sizeof(protov)) /* body must at least hold the protocol version copied out below */ + { + show_error("%s:read_startup_packet: incorrect packet length (%d)", func,len); + free(sp); + return NULL; + } + + sp->startup_packet = calloc(len, 1); + if (!sp->startup_packet) + { + show_error("%s:read_startup_packet: out of memory",func); + free(sp); + return NULL; + } + + /* read startup packet */ + if (pool_read(cp, sp->startup_packet, len)) + { + free(sp->startup_packet); free(sp); + return NULL; + } + + sp->len = len; + memcpy(&protov, sp->startup_packet, sizeof(protov)); + 
sp->major = ntohl(protov)>>16; + sp->minor = ntohl(protov) & 0x0000ffff; + p = sp->startup_packet; + + switch(sp->major) + { + case PROTO_MAJOR_V2: /* V2 */ + sp2 = (PGR_StartupPacket_v2 *)(sp->startup_packet); + + sp->database = calloc(SM_DATABASE+1, 1); + if (!sp->database) + { + show_error("%s:read_startup_packet: out of memory",func); + free(sp); + return NULL; + } + strncpy(sp->database, sp2->database, SM_DATABASE); + + sp->user = calloc(SM_USER+1, 1); + if (!sp->user) + { + show_error("%s:read_startup_packet: out of memory",func); + free(sp); + return NULL; + } + strncpy(sp->user, sp2->user, SM_USER); + + break; + + case PROTO_MAJOR_V3: /* V3 */ + p += sizeof(int); /* skip protocol version info */ + + while(*p) + { + if (!strcmp("user", p)) + { + p += (strlen(p) + 1); + sp->user = strdup(p); + if (!sp->user) + { + show_error("%s:read_startup_packet: out of memory",func); + free(sp); + return NULL; + } + } + else if (!strcmp("database", p)) + { + p += (strlen(p) + 1); + sp->database = strdup(p); + if (!sp->database) + { + show_error("%s:read_startup_packet: out of memory",func); + free(sp); + return NULL; + } + } + p += (strlen(p) + 1); + } + break; + + case 1234: /* cancel or SSL request */ + /* set dummy database, user info */ + sp->database = calloc(1, 1); + if (!sp->database) + { + show_error("%s:read_startup_packet: out of memory",func); + free(sp); + return NULL; + } + sp->user = calloc(1, 1); + if (!sp->user) + { + show_error("%s:read_startup_packet: out of memory",func); + free(sp); + return NULL; + } + break; + + default: + show_error("%s:read_startup_packet: invalid major no: %d",func, sp->major); + free(sp); + return NULL; + } + +#ifdef PRINT_DEBUG + show_debug("%s:Protocol Major: %d Minor: %d database: %s user: %s", + func,sp->major, sp->minor, sp->database, sp->user); +#endif + + return sp; +} + +/* +* send startup packet +*/ +static int send_startup_packet(POOL_CONNECTION_POOL_SLOT *cp) +{ + int len; + + len = htonl(cp->sp->len + sizeof(len)); + 
pool_write(cp->con, &len, sizeof(len)); + return pool_write_and_flush(cp->con, cp->sp->startup_packet, cp->sp->len); +} + +/* + * process cancel request + */ +static void cancel_request(CancelPacket *sp, int secondary_backend) +{ + char * func = "cancel_request()"; + int len; + int fd; + POOL_CONNECTION *con; + char hostName[128]; + +#ifdef PRINT_DEBUG + show_debug("%s:Cancel request received",func); +#endif + + if (CurrentCluster == NULL) + { + return; + } + if (gethostname(hostName,sizeof(hostName)) < 0) + { + show_error("%s:gethostname() failed. (%s)",func,strerror(errno)); + return ; + } + if (secondary_backend) + { + if (PGRis_same_host(hostName,CurrentCluster->hostName)) + fd = connect_unix_domain_socket(1); + else + fd = connect_inet_domain_socket(1); + } + else + { + if (PGRis_same_host(hostName,CurrentCluster->hostName)) + fd = connect_unix_domain_socket(0); + else + fd = connect_inet_domain_socket(0); + } + + if (fd < 0) + { + show_error("%s:Could not create socket for sending cancel request",func); + return; + } + + con = pool_open(fd); + if (con == NULL) + return; + + len = htonl(sizeof(len) + sizeof(CancelPacket)); + pool_write(con, &len, sizeof(len)); + + if (pool_write_and_flush(con, sp, sizeof(CancelPacket)) < 0) + show_error("%s:Could not send cancel request packet",func); + pool_close(con); +} + +static POOL_CONNECTION_POOL *connect_backend(PGR_StartupPacket *sp, POOL_CONNECTION *frontend) +{ + char * func ="connect_backend()"; + POOL_CONNECTION_POOL *backend; + + /* connect to the backend */ + backend = pool_create_cp(); + if (backend == NULL) + { + pool_send_error_message(frontend, sp->major, "XX000", "connection cache is full", "", + "increace max_pool", __FILE__, __LINE__); + pool_close(frontend); + return NULL; + } + + /* mark this is a backend connection */ + backend->slots[0]->con->isbackend = 1; + /* + * save startup packet info + */ + backend->slots[0]->sp = sp; + + if (pool_config_replication_enabled) + { + 
backend->slots[1]->con->isbackend = 1; + backend->slots[1]->con->issecondary_backend = 1; + /* + * save startup packet info + */ + backend->slots[1]->sp = sp; + } + + /* send startup packet */ + if (send_startup_packet(backend->slots[0]) < 0) + { + show_error("%s:do_child: fails to send startup packet to the backend",func); + pool_close(frontend); + return NULL; + } + + /* send startup packet */ + if (pool_config_replication_enabled) + { + if (send_startup_packet(backend->slots[1]) < 0) + { + show_error("%s:do_child: fails to send startup packet to the secondary backend",func); + pool_close(frontend); + return NULL; + } + } + + /* + * do authentication stuff + */ + if (pool_do_auth(frontend, backend)) + { + pool_close(frontend); + pool_discard_cp(sp->user, sp->database, sp->major); + return NULL; + } + return backend; +} + +static int send_params(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend) +{ + char * func = "send_params()"; + int index; + char *name, *value; + int len, sendlen; + + index = 0; + while (pool_get_param(&MASTER(backend)->params, index++, &name, &value) == 0) + { + pool_write(frontend, "S", 1); + len = sizeof(sendlen) + strlen(name) + 1 + strlen(value) + 1; + sendlen = htonl(len); + pool_write(frontend, &sendlen, sizeof(sendlen)); + pool_write(frontend, name, strlen(name) + 1); + pool_write(frontend, value, strlen(value) + 1); + } + + if (pool_flush(frontend)) + { + show_error("%s:pool_send_params: pool_flush() failed",func); + return -1; + } + return 0; +} + +/* + * ending function of child process + */ +static void +child_end(int sig) +{ + PGRsignal(sig,SIG_IGN); + + pool_finish(); + exit(0); +} diff -aruN postgresql-8.2.4/src/pgcluster/pglb/cluster_table.c pgcluster-1.7.0rc7/src/pgcluster/pglb/cluster_table.c --- postgresql-8.2.4/src/pgcluster/pglb/cluster_table.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pglb/cluster_table.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,343 @@ 
+/*-------------------------------------------------------------------- + * FILE: + * cluster_tbl.c + * + * NOTE: + * This file is composed of the functions to use a cluster table. + * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + *-------------------------------------------------------------------- + */ +/* + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that the above copyright notice appear in all + * copies and that both that copyright notice and this permission + * notice appear in supporting documentation, and that the name of the + * author not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. The author makes no representations about the + * suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * +*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_NETINET_TCP_H +#include +#endif + +#include "replicate_com.h" +#include "pglb.h" + + +/*-------------------------------------- + * PROTOTYPE DECLARATION + *-------------------------------------- + */ +int PGRis_cluster_alive(void) ; +ClusterTbl * PGRscan_cluster(void); +void PGRset_key_of_cluster(ClusterTbl * ptr, RecoveryPacket * packet); +ClusterTbl * PGRadd_cluster_tbl (ClusterTbl * conf_data); +ClusterTbl * PGRset_status_on_cluster_tbl (int status, ClusterTbl * ptr); +ClusterTbl * PGRsearch_cluster_tbl(ClusterTbl * conf_data); + +static int set_cluster_tbl(ClusterTbl * ptr , ClusterTbl * conf_data); +static ClusterTbl * search_free_cluster_tbl(void ); +static void write_cluster_status_file(ClusterTbl * ptr); + +int PGRis_cluster_alive(void) +{ + 
ClusterTbl * ptr = NULL; + int use=0; + ptr = Cluster_Tbl; + + PGRsem_lock(ClusterSemid,MAX_DB_SERVER); + while (ptr->useFlag != TBL_END) + { + if ((ptr->useFlag == TBL_USE) || (ptr->useFlag == TBL_INIT)) + { + use++; + } + ptr++; + } + PGRsem_unlock(ClusterSemid,MAX_DB_SERVER); + return use==0 ? STATUS_ERROR : STATUS_OK; +} + +ClusterTbl * +PGRscan_cluster(void) +{ + char * func = "PGRscan_cluster"; + ClusterTbl * ptr = NULL; + ClusterTbl * rtn = NULL; + int min_use_rate = 100; + int use_rate = 0; + int cnt = 0; + + + ptr = Cluster_Tbl; + if (ptr == NULL) + { + show_error("%s:Cluster Table is not initialize",func); + return (ClusterTbl *)NULL; + } +#ifdef PRINT_DEBUG + show_debug("%s:%d ClusterDB can be used",func,ClusterNum); +#endif + PGRsem_lock(ClusterSemid,MAX_DB_SERVER); + while ((cnt <= ClusterNum) && (ptr->useFlag != TBL_END)) + { +#ifdef PRINT_DEBUG + show_debug("%s:%s [%d],useFlag->%d max->%d use_num->%d\n", + func, ptr->hostName,ptr->port,ptr->useFlag,ptr->max_connect,ptr->use_num); +#endif + cnt ++; + if ((ptr->useFlag != TBL_USE) && (ptr->useFlag != TBL_INIT)) + { + ptr ++; + continue; + } + if (ptr->max_connect <= ptr->use_num) + { + ptr ++; + continue; + } + if (ptr->use_num > 0) + { + use_rate = ptr->use_num * 100 / ptr->max_connect ; + } + else + { + use_rate = 0; + rtn = ptr; + break; + } + if (min_use_rate > use_rate) + { + min_use_rate = use_rate; + rtn = ptr; + } + ptr ++; + } + if (rtn != NULL) + { + rtn->use_num ++; + if (rtn->useFlag == TBL_INIT) + { + PGRset_status_on_cluster_tbl (TBL_USE,rtn); + } + } + PGRsem_unlock(ClusterSemid,MAX_DB_SERVER); + return rtn; +} + +void +PGRset_key_of_cluster(ClusterTbl * ptr, RecoveryPacket * packet) +{ + int max_connect = 0; + int port = 0; + + memset(ptr,0,sizeof(ClusterTbl)); + memcpy(ptr->hostName,packet->hostName,sizeof(ptr->hostName)); + max_connect = ntohs(packet->max_connect); + if (max_connect >= 0) + { + ptr->max_connect = max_connect; + } + else + { + ptr->max_connect = DEFAULT_CONNECT_NUM; + 
} + port = ntohs(packet->port); + if ( port >= 0) + { + ptr->port = port; + } + else + { + ptr->port = DEFAULT_PORT; + } +} + +ClusterTbl * +PGRadd_cluster_tbl (ClusterTbl * conf_data) +{ + char * func = "PGRadd_cluster_tbl()"; + ClusterTbl * ptr; + + ptr = PGRsearch_cluster_tbl(conf_data); + if ((ptr != NULL) && + ((ptr->useFlag == TBL_USE ) || ((ptr->useFlag == TBL_INIT)))) + { + ptr->max_connect = conf_data->max_connect; + ptr->use_num = 0; + ptr->rate = 0; + return ptr; + } + ptr = search_free_cluster_tbl(); + if (ptr == (ClusterTbl *) NULL) + { + show_error("%s:no more free space in cluster table",func); + return (ClusterTbl *)NULL; + } + if (ClusterNum < Max_DB_Server) + { + set_cluster_tbl( ptr, conf_data); + return ptr; + } + return (ClusterTbl *)NULL; +} + +ClusterTbl * +PGRset_status_on_cluster_tbl (int status, ClusterTbl * ptr) +{ +#ifdef PRINT_DEBUG + char * func = "PGRset_status_on_cluster_tbl()"; +#endif + + if (ptr != (ClusterTbl*)NULL) + { + if (ptr->useFlag != status) + { +#ifdef PRINT_DEBUG + show_debug("%s:host:%s port:%d max:%d use:%d status%d", + func, ptr->hostName,ptr->port,ptr->max_connect,ptr->useFlag,status); +#endif + ptr->useFlag = status; + write_cluster_status_file(ptr); + if (status == TBL_INIT) + { + if (ClusterNum < Max_DB_Server) + ClusterNum ++ ; + } + else if (status != TBL_STOP) + { + if (ClusterNum > 0) + ClusterNum -- ; + } + } + } + return ptr; +} + +static void +write_cluster_status_file(ClusterTbl * ptr) +{ + switch( ptr->useFlag) + { + case TBL_FREE: + PGRwrite_log_file(StatusFp,"port(%d) host:%s free", + ptr->port, + ptr->hostName); + break; + case TBL_INIT: + PGRwrite_log_file(StatusFp,"port(%d) host:%s initialize", + ptr->port, + ptr->hostName); + break; + case TBL_USE: + PGRwrite_log_file(StatusFp,"port(%d) host:%s start use", + ptr->port, + ptr->hostName); + break; + case TBL_ERROR: + PGRwrite_log_file(StatusFp,"port(%d) host:%s error", + ptr->port, + ptr->hostName); + break; + case TBL_END: + 
PGRwrite_log_file(StatusFp,"port(%d) host:%s end", + ptr->port, + ptr->hostName); + break; + } +} + +ClusterTbl * +PGRsearch_cluster_tbl(ClusterTbl * conf_data) +{ + ClusterTbl *ptr; + int cnt = 0; + int rec_num = 0; + + ptr = Cluster_Tbl; + while ((cnt <= ClusterNum) && (rec_num < Max_DB_Server)) + { + if (ptr->port > 0) + { + if ((!strcmp(ptr->hostName,conf_data->hostName)) && + (ptr->port == conf_data->port)) + { + return ptr; + } + if ((ptr->useFlag == TBL_USE) || (ptr->useFlag == TBL_INIT)) + { + cnt ++; + } + } + ptr ++; + rec_num ++; + } + return (ClusterTbl *)NULL; +} + +static int +set_cluster_tbl(ClusterTbl * ptr , ClusterTbl * conf_data) +{ + int rec_no; + + rec_no = ptr->rec_no; + memcpy(ptr->hostName,conf_data->hostName,sizeof(ptr->hostName)); + ptr->max_connect = conf_data->max_connect; + ptr->port = conf_data->port; + ptr->use_num = conf_data->use_num; + ptr->rate = conf_data->rate; + PGRset_status_on_cluster_tbl (TBL_INIT, ptr); + + return STATUS_OK; +} + +static ClusterTbl * +search_free_cluster_tbl(void ) +{ + ClusterTbl *ptr; + int cnt = 0; + + ptr = Cluster_Tbl; + while ((cnt <= ClusterNum ) && (cnt < Max_DB_Server)) + { + if ((ptr->useFlag == TBL_FREE) || (ptr->useFlag == TBL_ERROR)) + { + return ptr; + } + cnt ++; + ptr ++; + } + return (ClusterTbl *)NULL; +} + diff -aruN postgresql-8.2.4/src/pgcluster/pglb/lifecheck.c pgcluster-1.7.0rc7/src/pgcluster/pglb/lifecheck.c --- postgresql-8.2.4/src/pgcluster/pglb/lifecheck.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pglb/lifecheck.c 2007-03-01 16:27:15.000000000 +0100 @@ -0,0 +1,329 @@ +/*-------------------------------------------------------------------- + * FILE: + * lifecheck.c + * + * NOTE: + * This file is composed of the functions to call with the source + * at pgreplicate for the lifecheck. 
+ * + * Portions Copyright (c) 2003-2007, Atsushi Mitani + *-------------------------------------------------------------------- + */ +#include "postgres.h" +#include "postgres_fe.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libpq-fe.h" +#include "libpq-int.h" +#include "fe-auth.h" + +#include +#include +#include +#include + +#ifdef HAVE_NETINET_TCP_H +#include +#endif + +#ifdef HAVE_SYS_SELECT_H +#include +#endif + + +#ifdef HAVE_CRYPT_H +#include +#endif + + +#ifdef MULTIBYTE +#include "mb/pg_wchar.h" +#endif + +#include "access/xact.h" +#include "lib/dllist.h" +#include "libpq/pqformat.h" +#include "replicate_com.h" +#include "pglb.h" + +#define PING_DB "template1" +#define PING_QUERY "SELECT 1" + +static ClusterTbl * PGR_Cluster_DB_4_Lifecheck = (ClusterTbl*)NULL; + +/*-------------------------------------- + * PROTOTYPE DECLARATION + *-------------------------------------- + */ +int PGRlifecheck_main(int fork_wait_time); +PGconn * PGRcreateConn( char * host, char * port,char * database, char * userName, char * password, char * md5Salt, char * cryptSalt ); + +static bool is_started_loadbalance(void); +static void set_timeout(SIGNAL_ARGS); +static int lifecheck_loop(void); +static int ping_cluster(PGconn * conn); +static void set_cluster_status(ClusterTbl * host_ptr, int status); +/* Fork the lifecheck child process; the parent returns at once (STATUS_ERROR if fork() failed), the child never returns. */ +int +PGRlifecheck_main(int fork_wait_time) +{ + bool started = false; + pid_t pgid = 0; + pid_t pid = 0; + + pgid = getpgid(0); + pid = fork(); + if (pid != 0) + { + return (pid < 0) ? STATUS_ERROR : STATUS_OK; /* a failed fork() must not be reported as success */ + } + + /* + * in child process, + * run the periodic lifecheck + */ + setpgid(0,pgid); + + PGRsignal(SIGHUP, PGRexit_subprocess); + PGRsignal(SIGTERM, PGRexit_subprocess); + PGRsignal(SIGINT, PGRexit_subprocess); + PGRsignal(SIGQUIT, PGRexit_subprocess); + PGRsignal(SIGALRM, set_timeout); + + if (fork_wait_time > 0) { + sleep(fork_wait_time); + } + + if (PGRuserName == NULL) + { + PGRuserName = 
getenv("LOGNAME"); + if (PGRuserName == NULL) + { + PGRuserName = getenv("USER"); + if (PGRuserName == NULL) + PGRuserName = "postgres"; + } + } + + for (;;) + { + started = is_started_loadbalance(); + if (!started) + { + /* wait next lifecheck as interval */ + sleep(PGR_Lifecheck_Interval); + continue; + } + + /* life check to all cluster dbs */ + lifecheck_loop(); + + /* wait next lifecheck as interval */ + sleep(PGR_Lifecheck_Interval); + } + return STATUS_OK; +} + +static bool +is_started_loadbalance(void) +{ + ClusterTbl * host_ptr = (ClusterTbl*)NULL; + + host_ptr = Cluster_Tbl; + if (host_ptr == NULL) + { + return false; + } + while(host_ptr->useFlag != TBL_END) + { + if (host_ptr->useFlag == TBL_USE) + { + return true; + } + host_ptr ++; + } + return false; +} + +static void +set_timeout(SIGNAL_ARGS) +{ + if (PGR_Cluster_DB_4_Lifecheck != NULL) + { + set_cluster_status( PGR_Cluster_DB_4_Lifecheck, TBL_ERROR); + } + PGRsignal(SIGALRM, set_timeout); +} +/* Ping every TBL_USE / TBL_INIT entry of Cluster_Tbl once, under an alarm() timeout, and record the result with set_cluster_status(). */ +static int +lifecheck_loop(void) +{ + ClusterTbl * host_ptr = (ClusterTbl*)NULL; + char port[8]; + char * host = NULL; + PGconn * conn = NULL; + + host_ptr = Cluster_Tbl; + if (host_ptr == NULL) + { + return STATUS_ERROR; + } + alarm(0); + while(host_ptr->useFlag != TBL_END) + { + /* + * check the status of the cluster DB: skip entries that are neither in use nor initialized + */ + if ((host_ptr->useFlag != TBL_USE) && (host_ptr->useFlag != TBL_INIT)) + { + host_ptr ++; + continue; + } + snprintf(port,sizeof(port),"%d", host_ptr->port); + host = (char *)(host_ptr->hostName); + /* set host data */ + PGR_Cluster_DB_4_Lifecheck = host_ptr; + + /* set alarm as lifecheck timeout */ + alarm(PGR_Lifecheck_Timeout); + + /* connect DB */ + conn = PGRcreateConn(host,port, PING_DB ,PGRuserName,"","",""); + if ((conn != NULL) && + (ping_cluster(conn) == STATUS_OK)) + { + set_cluster_status(host_ptr,TBL_USE); + } + else + { + set_cluster_status(host_ptr,TBL_ERROR); + } + /* reset alarm */ + alarm(0); + + PQfinish(conn); + conn = NULL; + host_ptr ++; + } + + return 
STATUS_OK; +} + +static int +ping_cluster(PGconn * conn) +{ + int status = 0; + PGresult * res = (PGresult *)NULL; + + res = PQexec(conn, PING_QUERY ); + + status = PQresultStatus(res); + if (res != NULL) + { + PQclear(res); + } + if ((status == PGRES_NONFATAL_ERROR ) || + (status == PGRES_FATAL_ERROR )) + { + return STATUS_ERROR; + } + return STATUS_OK; +} +/* Open a libpq connection, retrying while its status is CONNECTION_BAD; returns NULL once cnt exceeds PGLB_CONNECT_RETRY_TIME. */ +PGconn * +PGRcreateConn( char * host, char * port,char * database, char * userName, char * password, char * md5Salt, char * cryptSalt ) +{ + int cnt = 0; + PGconn * conn = NULL; + char pwd[256]; + + memset(pwd,0,sizeof(pwd)); + if ((password != NULL) && (*password != '\0')) + { + if ((strncmp(password,"md5",3) == 0) && (md5Salt != NULL)) + { + snprintf(pwd,sizeof(pwd),"%s(%d)(%d)(%d)(%d)",password, + *md5Salt,*(md5Salt+1),*(md5Salt+2),*(md5Salt+3)); + } + else + { + strncpy(pwd,password,sizeof(pwd)-1); /* pwd was zero-filled above, so it stays NUL-terminated */ + } + } + conn = PQsetdbLogin(host, port, NULL, NULL, database, userName, pwd); + /* check to see that the backend Connection was successfully made */ + cnt = 0; + while (PQstatus(conn) == CONNECTION_BAD) + { + if (conn != NULL) + { + PQfinish(conn); + conn = NULL; + } + conn = PQsetdbLogin(host, port, NULL, NULL, database, userName, pwd); + if (cnt > PGLB_CONNECT_RETRY_TIME ) + { + if (conn != NULL) + { + PQfinish(conn); + conn = NULL; + } + return (PGconn *)NULL; + } + + if(PQstatus(conn) == CONNECTION_BAD && h_errno==2) + { + usleep(PGR_SEND_WAIT_MSEC); + cnt ++; + } + else if(!strncasecmp(PQerrorMessage(conn),"FATAL: Sorry, too many clients already",30) || + !strncasecmp(PQerrorMessage(conn),"FATAL: Non-superuser connection limit",30) ) + { + usleep(PGR_SEND_WAIT_MSEC); + cnt ++; + } + else if(!strncasecmp(PQerrorMessage(conn),"FATAL: The database system is starting up",40) ) + { + usleep(PGR_SEND_WAIT_MSEC); + } + else + { + usleep(PGR_SEND_WAIT_MSEC); + cnt ++; + } + } + return conn; +} + +static void +set_cluster_status(ClusterTbl * host_ptr, int status) +{ + if (host_ptr == NULL) + return; + if (status == TBL_ERROR) + { + 
host_ptr->retry_count ++; + if (host_ptr->retry_count > PGLB_CONNECT_RETRY_TIME ) + { + PGRset_status_on_cluster_tbl(status, host_ptr); + } + } + else + { + host_ptr->retry_count = 0; + PGRset_status_on_cluster_tbl(status, host_ptr); + } +} diff -aruN postgresql-8.2.4/src/pgcluster/pglb/load_balance.c pgcluster-1.7.0rc7/src/pgcluster/pglb/load_balance.c --- postgresql-8.2.4/src/pgcluster/pglb/load_balance.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pglb/load_balance.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,252 @@ +/*-------------------------------------------------------------------- + * FILE: + * load_balance.c + * + * NOTE: + * This file is composed of the functions of load balance modules + * with connection pooling or not + * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + *-------------------------------------------------------------------- + */ +/* + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that the above copyright notice appear in all + * copies and that both that copyright notice and this permission + * notice appear in supporting documentation, and that the name of the + * author not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. The author makes no representations about the + * suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. 
+ * +*/ +#include "postgres.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_NETINET_TCP_H +#include +#endif + +#include "replicate_com.h" +#include "pglb.h" + +/*-------------------------------------- + * PROTOTYPE DECLARATION + *-------------------------------------- + */ +int PGRload_balance(void); +int PGRload_balance_with_pool(void); +char PGRis_connection_full(ClusterTbl * ptr); +void PGRrelease_connection(ClusterTbl * ptr); +void PGRchild_wait(int sig); + +/*-------------------------------------------------------------------- + * SYMBOL + * PGRload_balance() + * NOTES + * load balance module that normal connection is used + * ARGS + * void + * RETURN + * OK: STATUS_OK + * NG: STATUS_ERROR + *-------------------------------------------------------------------- + */ +int +PGRload_balance(void) +{ + char * func = "PGRload_balance()"; + pid_t pid,pgid; + int count; + int status; + ClusterTbl * cluster_p = NULL; + + PGRsignal(SIGCHLD, PGRchild_wait); + /* get the least locaded cluster server info */ + cluster_p = PGRscan_cluster(); + count = 0; + while (cluster_p == NULL ) + { + if ( count > PGLB_CONNECT_RETRY_TIME) + { + show_error("%s:no cluster available",func); + return STATUS_ERROR; + } + cluster_p = PGRscan_cluster(); + count ++; + } + + pgid = getpgid((pid_t)0); + pid = fork(); + if (pid < 0) + { + show_error("%s:fork() failed. 
(%s)",func,strerror(errno)); + exit(1); + } + if (pid == 0) + { + setpgid((pid_t)0,pgid); + CurrentCluster = cluster_p; + + if (pool_init_cp()) + { + show_error("%s:pool_init_cp failed",func); + exit(1); + } + PGRsem_lock(ClusterSemid,cluster_p->rec_no); + if (PGRget_child_status(getpid()) == STATUS_ERROR) + { + PGRadd_child_tbl(cluster_p, getpid(), TBL_USE); + } + PGRsem_unlock(ClusterSemid,cluster_p->rec_no); + PGRdo_child(NOT_USE_CONNECTION_POOL ); + PGRrelease_connection(cluster_p); + PGRset_status_to_child_tbl(getpid(), TBL_FREE); + exit(0); + } + else if (pid > 0) + { + PGRsem_lock(ClusterSemid,cluster_p->rec_no); + if (PGRget_child_status(pid) == STATUS_ERROR) + { + PGRadd_child_tbl(cluster_p, pid, TBL_USE); + } + PGRsem_unlock(ClusterSemid,cluster_p->rec_no); + status = PGRget_child_status(pid); + while (status == TBL_USE) + { + status = PGRget_child_status(pid); + usleep(20); + } + return STATUS_OK; + } + else + { + return STATUS_ERROR; + } +} + +/*-------------------------------------------------------------------- + * SYMBOL + * PGRload_balance_with_pool() + * NOTES + * load balance module that connection pooling system is used + * ARGS + * void + * RETURN + * OK: STATUS_OK + * NG: STATUS_ERROR + *-------------------------------------------------------------------- + */ +int +PGRload_balance_with_pool(void) +{ + char * func = "PGRload_balance_with_pool()"; + int count; + pid_t pid; + ClusterTbl * cluster_p = NULL; + int status = TBL_USE; + + /* get the least locaded cluster server info */ + cluster_p = PGRscan_cluster(); + count = 0; + while (cluster_p == NULL ) + { + if ( count > PGLB_CONNECT_RETRY_TIME) + { + show_error("%s:no cluster available",func); + PGRreturn_no_connection_error(); + return STATUS_ERROR; + } + cluster_p = PGRscan_cluster(); + count ++; + } + pid = PGRscan_child_tbl(cluster_p); + if ((pid == 0) || (pid == STATUS_ERROR)) + { + show_error("%s:no child process available",func); + return STATUS_ERROR; + } + kill(pid,SIGUSR1); + + 
status = PGRget_child_status(pid); + while (status == TBL_USE) + { + status = PGRget_child_status(pid); + usleep(20); + } + + return STATUS_OK; + +} + +char +PGRis_connection_full(ClusterTbl * ptr) +{ + char rtn = 1; + + if (ptr == NULL) + { + return rtn; + } + PGRsem_lock(ClusterSemid,ptr->rec_no); + if (ptr->max_connect > ptr->use_num) + { + rtn = 0; + } + PGRsem_unlock(ClusterSemid,ptr->rec_no); + return rtn; +} + +void +PGRrelease_connection(ClusterTbl * ptr) +{ + if (ptr == NULL) + { + return; + } + PGRsem_lock(ClusterSemid,MAX_DB_SERVER); + if (ptr->use_num > 0) + { + ptr->use_num --; + } + PGRsem_unlock(ClusterSemid,MAX_DB_SERVER); +} + +void +PGRchild_wait(int sig) +{ + pid_t pid = 0; + int ret = 0; + + do { + pid = waitpid(-1,&ret,WNOHANG); + if ((pid <= 0) && (WTERMSIG(ret) > 0)) + { + pid = 1; + } + } while(pid > 0); +} diff -aruN postgresql-8.2.4/src/pgcluster/pglb/main.c pgcluster-1.7.0rc7/src/pgcluster/pglb/main.c --- postgresql-8.2.4/src/pgcluster/pglb/main.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pglb/main.c 2007-03-01 16:27:15.000000000 +0100 @@ -0,0 +1,1137 @@ +/*-------------------------------------------------------------------- + * FILE: + * main.c + * + * NOTE: + * This file is composed of the main function of pglb. + * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + *-------------------------------------------------------------------- + */ +/* + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that the above copyright notice appear in all + * copies and that both that copyright notice and this permission + * notice appear in supporting documentation, and that the name of the + * author not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. 
The author makes no representations about the + * suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * +*/ +#include "postgres.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_NETINET_TCP_H +#include +#endif + +#ifdef HAVE_CRYPT_H +#include +#endif + +#ifdef HAVE_GETOPT_H +#include +#endif + +#include "replicate_com.h" +#include "pglb.h" + + + +#define IPC_NMAXSEM (32) +/*-------------------------------------- + * GLOBAL VARIABLE DECLARATION + *-------------------------------------- + */ +/* for replicate_com.h */ +ConfDataType * ConfData_Top = (ConfDataType *)NULL; +ConfDataType * ConfData_End = (ConfDataType *)NULL; +int MapTableShmid = -1; +int LifeCheckStartShmid = -1; +char * LifeCheckStartFlag = NULL; +int LifeCheckTimeOut = 10; +FILE * StatusFp = (FILE *)NULL; +char * PGRStatusFileName = NULL; +char * PGRLogFileName = NULL; +char * PGRuserName = NULL; +int Log_Print = 0; +int Debug_Print = 0; + +char * ResolvedName = NULL; +int Recv_Port_Number = 0; +int Recovery_Port_Number = 0; +uint16_t LifeCheck_Port_Number = 0; +int Use_Connection_Pool = 0; +int Max_Pool = 1; +int Connection_Life_Time = 0; +int Max_DB_Server = 0; +int MaxBackends = 0; +ClusterTbl * Cluster_Tbl = (ClusterTbl *)NULL; +int ClusterNum = 0; +int ClusterShmid = 0; +int ClusterSemid = 0; +ChildTbl * Child_Tbl = (ChildTbl *)NULL; +int ChildShmid = 0; +char * PGR_Data_Path = NULL; +char * PGR_Write_Path = NULL; +char * Backend_Socket_Dir = NULL; +FrontSocket Frontend_FD; +ClusterTbl * CurrentCluster = NULL; +int PGR_Lifecheck_Timeout = 3; +int PGR_Lifecheck_Interval = 15; + +int fork_wait_time = 0; + +extern char *optarg; + +/*-------------------------------------- + * PROTOTYPE DECLARATION + 
*-------------------------------------- + */ +static int init_pglb(char * path); +static void pglb_exit(int signal_args); +static void load_balance_main(void); +static void daemonize(void); +static void write_pid_file(void); +static void stop_pglb(void); +static int is_exist_pid_file(void); +static ClusterTbl * scan_cluster_by_pid(pid_t pid); +static void usage(void); +static void close_child(int signal_args); + +void PGRrecreate_child(int signal_args); +void PGRexit_subprocess(int sig); + +/*-------------------------------------------------------------------- + * SYMBOL + * init_pglb() + * NOTES + * Reading of the setup file + * and the initialization of the memory area. + * ARGS + * char * path: path of the setup file (I) + * RETURN + * OK: STATUS_OK + * NG: STATUS_ERROR + *-------------------------------------------------------------------- + */ +static int +init_pglb(char * path) +{ + char * func = "init_pglb()"; + + ConfDataType * conf; + ClusterTbl cluster_tbl[MAX_DB_SERVER]; + int size = 0; + int rec_no = 0; + int i; + int max_connect = 0; + union semun sem_arg; + char fname[256]; + + /* + * read configuration file + */ + if (path == NULL) + { + path = "."; + } + if (PGR_Get_Conf_Data(path,PGLB_CONF_FILE) != STATUS_OK) + { + show_error("%s:PGR_Get_Conf_Data failed",func); + return STATUS_ERROR; + } + + size = sizeof(LogFileInf); + LogFileData = (LogFileInf *) malloc(size); + if (LogFileData == NULL) + { + show_error("%s:malloc() failed. reason: %s", func,strerror(errno)); + return STATUS_ERROR; + } + memset(LogFileData,0,size); + + /* cluster db status file open */ + if (PGRStatusFileName == NULL) + { + snprintf(fname,sizeof(fname),"%s/%s",PGR_Write_Path,PGLB_STATUS_FILE); + } + else + { + memcpy(fname,PGRStatusFileName,sizeof(fname)); + } + StatusFp = fopen(fname, "a"); + if (StatusFp == NULL) + { + show_error("%s:open() %s file failed. 
(%s)", + func,fname, strerror(errno)); + exit(1); + } + + Backend_Socket_Dir = malloc(128); + if (Backend_Socket_Dir == NULL) + { + show_error("%s:malloc() failed. (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + memset(Backend_Socket_Dir,0,128); + /* set initiarize data */ + strcpy(Backend_Socket_Dir,"/tmp"); + Max_Pool = 1; + Connection_Life_Time = 0; + Use_Connection_Pool = 0; + + conf = ConfData_Top; + while (conf != (ConfDataType *)NULL) + { + /* get cluster db servers name */ + if (!strcmp(conf->table,CLUSTER_SERVER_TAG)) + { + rec_no = conf->rec_no; + if (!strcmp(conf->key,HOST_NAME_TAG)) + { + memcpy(cluster_tbl[rec_no].hostName,conf->value,sizeof(cluster_tbl[rec_no].hostName)); + conf = (ConfDataType*)conf->next; + continue; + } + if (!strcmp(conf->key,PORT_TAG)) + { + cluster_tbl[rec_no].port = atoi(conf->value); + conf = (ConfDataType*)conf->next; + continue; + } + if (!strcmp(conf->key,MAX_CONNECT_TAG)) + { + cluster_tbl[rec_no].max_connect = atoi(conf->value); + conf = (ConfDataType*)conf->next; + continue; + } + } + /* get logging file data */ + else if (!strcmp(conf->table, LOG_INFO_TAG)) + { + if (!strcmp(conf->key, FILE_NAME_TAG)) + { + strncpy(LogFileData->file_name, conf->value ,sizeof(LogFileData->file_name)); + LogFileData->fp = NULL; + conf = (ConfDataType*)conf->next; + continue; + } + if (!strcmp(conf->key, FILE_SIZE_TAG)) + { + int i,len; + char * ptr; + int unit = 1; + len = strlen(conf->value); + ptr = conf->value; + for (i = 0; i < len ; i ++,ptr++) + { + if ((! isdigit(*ptr)) && (! 
isspace(*ptr))) + { + switch (*ptr) + { + case 'K': + case 'k': + unit = 1024; + break; + case 'M': + case 'm': + unit = 1024*1024; + break; + case 'G': + case 'g': + unit = 1024*1024*1024; + break; + } + *ptr = '\0'; + break; + } + } + LogFileData->max_size = atoi(conf->value) * unit; + conf = (ConfDataType*)conf->next; + continue; + } + if (!strcmp(conf->key, LOG_ROTATION_TAG)) + { + LogFileData->rotation = atoi(conf->value); + conf = (ConfDataType*)conf->next; + continue; + } + } + else + { + if (!strcmp(conf->key,HOST_NAME_TAG)) + { + int ip; + ip=PGRget_ip_by_name(conf->value); + if (ResolvedName == NULL) + { + ResolvedName = malloc(ADDRESS_LENGTH); + } + if (ResolvedName == NULL) + { + continue; + } + else + { + memset(ResolvedName,0,ADDRESS_LENGTH); + } + + sprintf(ResolvedName, + "%d.%d.%d.%d", + (ip ) & 0xff , + (ip >> 8) & 0xff , + (ip >> 16) & 0xff , + (ip >> 24) & 0xff ); + conf = (ConfDataType*)conf->next; + continue; + } + /* get port number for receive querys */ + else if (!strcmp(conf->key,RECV_PORT_TAG)) + { + Recv_Port_Number = atoi(conf->value); + conf = (ConfDataType*)conf->next; + continue; + } + /* get port number for recovery session */ + else if (!strcmp(conf->key,RECOVERY_PORT_TAG)) + { + Recovery_Port_Number = atoi(conf->value); + conf = (ConfDataType*)conf->next; + continue; + } + else if (!strcmp(conf->key,MAX_CLUSTER_TAG)) + { + Max_DB_Server = atoi(conf->value); + conf = (ConfDataType*)conf->next; + continue; + } + else if (!strcmp(conf->key,USE_CONNECTION_POOL_TAG)) + { + if (!strcmp(conf->value,"yes")) + { + Use_Connection_Pool = 1; + } + conf = (ConfDataType*)conf->next; + continue; + } + else if (!strcmp(conf->key,MAX_POOL_TAG)) + { + Max_Pool = atoi(conf->value); + if (Max_Pool < 0) + Max_Pool = 1; + conf = (ConfDataType*)conf->next; + continue; + } + else if (!strcmp(conf->key,CONNECTION_LIFE_TIME)) + { + Connection_Life_Time = atoi(conf->value); + if (Connection_Life_Time < 0) + Connection_Life_Time = 0; + conf = 
(ConfDataType*)conf->next; + continue; + } + else if (!strcmp(conf->key,BACKEND_SOCKET_DIR_TAG)) + { + strncpy(Backend_Socket_Dir,conf->value,128); + conf = (ConfDataType*)conf->next; + continue; + } + else if (!STRCMP(conf->key,LIFECHECK_TIMEOUT_TAG)) + { + /* get lifecheck timeout */ + PGR_Lifecheck_Timeout = PGRget_time_value(conf->value); + if ((PGR_Lifecheck_Timeout < 1) || (PGR_Lifecheck_Timeout > 3600)) + { + show_error("%s is out of range. It should be between 1sec-1hr.\n",LIFECHECK_TIMEOUT_TAG); + return STATUS_ERROR; + } + conf = (ConfDataType*)conf->next; + continue; + } + else if (!STRCMP(conf->key,LIFECHECK_INTERVAL_TAG)) + { + /* get lifecheck interval */ + PGR_Lifecheck_Interval = PGRget_time_value(conf->value); + if ((PGR_Lifecheck_Interval < 1) || (PGR_Lifecheck_Interval > 3600)) + { + show_error("%s is out of range. It should between 1sec-1hr.\n",LIFECHECK_INTERVAL_TAG); + return STATUS_ERROR; + } + conf = (ConfDataType*)conf->next; + continue; + } + } + conf = (ConfDataType*)conf->next; + } + if (Max_DB_Server <= 0) + { + show_error("%s:Max_DB_Server is wrong value. %s/%s file should be broken",func, path, PGLB_CONF_FILE); + exit(1); + } + /* shared memory allocation for cluster table */ + size = sizeof(ClusterTbl) * Max_DB_Server; + + ClusterShmid = shmget(IPC_PRIVATE,size,IPC_CREAT | IPC_EXCL | 0600); + if (ClusterShmid < 0) + { + show_error("%s:ClusterShm shmget() failed. (%s)", func,strerror(errno)); + return STATUS_ERROR; + } + Cluster_Tbl = (ClusterTbl *)shmat(ClusterShmid,0,0); + if (Cluster_Tbl == (ClusterTbl *)-1) + { + show_error("%s:shmat() failed. (%s)", func,strerror(errno)); + return STATUS_ERROR; + } + memset(Cluster_Tbl,0,size); + + if ((ClusterSemid = semget(IPC_PRIVATE,MAX_DB_SERVER+1,IPC_CREAT | IPC_EXCL | 0600)) < 0) + { + show_error("%s:semget() failed. 
(%s)",func,strerror(errno)); + return STATUS_ERROR; + } + for ( i = 0 ; i <= MAX_DB_SERVER ; i ++) + { + semctl(ClusterSemid, i, GETVAL, sem_arg); + sem_arg.val = 1; + semctl(ClusterSemid, i, SETVAL, sem_arg); + } + ClusterNum = 0; + /* set cluster db server name into cluster db server table */ + for ( i = 0 ; i < Max_DB_Server ; i ++) + { + (Cluster_Tbl + i)->rec_no = i; + } + (Cluster_Tbl + i)->useFlag = TBL_END; + max_connect = 0; + for ( i = 0 ; i <= rec_no ; i ++) + { + cluster_tbl[i].use_num = 0; + cluster_tbl[i].rate = 0; + if (cluster_tbl[i].max_connect < 0) + { + cluster_tbl[i].max_connect = 0; + } + if (max_connect < cluster_tbl[i].max_connect) + { + max_connect = cluster_tbl[i].max_connect; + } + PGRadd_cluster_tbl(&cluster_tbl[i]); + } + + /* shared memory allocation for children table */ + size = sizeof(ChildTbl) * (Max_DB_Server + 1) * max_connect * Max_Pool; +#ifdef PRINT_DEBUG + show_debug("%s:Child_Tbl size is[%d]",func,size); +#endif + + ChildShmid = shmget(IPC_PRIVATE,size,IPC_CREAT | IPC_EXCL | 0600); + if (ChildShmid < 0) + { + show_error("%s:ChildShm shmget() failed. (%s)",func, strerror(errno)); + return STATUS_ERROR; + } + Child_Tbl = (ChildTbl *)shmat(ChildShmid,0,0); + if (Child_Tbl == (ChildTbl *)-1) + { + show_error("%s:shmat() failed. 
(%s)", func,strerror(errno)); + return STATUS_ERROR; + } + memset(Child_Tbl, 0, size); + (Child_Tbl + ( Max_DB_Server * max_connect * Max_Pool) -1)->useFlag = TBL_END; + + PGR_Free_Conf_Data(); + + return STATUS_OK; +} + +/*-------------------------------------------------------------------- + * SYMBOL + * pglb_exit() + * NOTES + * Closing of pglb process + * ARGS + * int signal_args: signal number (I) + * RETURN + * none + *-------------------------------------------------------------------- + */ +static void +pglb_exit(int signal_args) +{ + char fname[256]; + int rtn; + + Child_Tbl->useFlag = TBL_END; + PGRsignal(SIGCHLD,SIG_IGN); + PGRsignal(signal_args,SIG_IGN); + kill (0,signal_args); + while (wait(NULL) > 0 ) + ; + + if (ClusterShmid > 0) + { + rtn = shmdt((char *)Cluster_Tbl); + shmctl(ClusterShmid,IPC_RMID,(struct shmid_ds *)NULL); + ClusterShmid = 0; + Cluster_Tbl = NULL; + } + if (ChildShmid > 0) + { + rtn = shmdt((char *)Child_Tbl); + shmctl(ChildShmid,IPC_RMID,(struct shmid_ds *)NULL); + ChildShmid = 0; + Child_Tbl = NULL; + } + if (ClusterSemid > 0) + { + semctl(ClusterSemid, 0, IPC_RMID); + ClusterSemid = 0; + } + + if (StatusFp != NULL) + { + fflush(StatusFp); + fclose(StatusFp); + } + if (Frontend_FD.unix_fd != 0) + { + close(Frontend_FD.unix_fd); + Frontend_FD.unix_fd = 0; + snprintf(fname, sizeof(fname), "%s/.s.PGSQL.%d", Backend_Socket_Dir,Recv_Port_Number); + unlink(fname); + } + if (Frontend_FD.inet_fd != 0) + { + close(Frontend_FD.inet_fd); + Frontend_FD.inet_fd = 0; + } + /* + PGRsyn_quit(); + */ + snprintf(fname, sizeof(fname), "%s/%s", PGR_Write_Path, PGLB_PID_FILE); + unlink(fname); + + if (ResolvedName != NULL) + { + free(ResolvedName); + ResolvedName = NULL; + } + exit(0); +} + +/*-------------------------------------------------------------------- + * SYMBOL + * load_balance_main() + * NOTES + * This is a main module of load balance function + * ARGS + * void + * RETURN + * none + 
*-------------------------------------------------------------------- + */ +static void +load_balance_main(void) +{ + char * func = "load_balance_main()"; + int status; + int rtn; + int count = 0; + + Frontend_FD.unix_fd = PGRcreate_unix_domain_socket(Backend_Socket_Dir, Recv_Port_Number); + if (Frontend_FD.unix_fd < 0) + { + show_error("%s:PGRcreate_unix_domain_socket failed",func); + pglb_exit(SIGTERM); + } + Frontend_FD.inet_fd = PGRcreate_recv_socket(ResolvedName, Recv_Port_Number); + if (Frontend_FD.inet_fd < 0) + { + show_error("%s:PGRcreate_recv_socket failed",func); + pglb_exit(SIGTERM); + } + if (Use_Connection_Pool) + { + PGRsignal(SIGCHLD,PGRrecreate_child); + rtn = PGRpre_fork_children(Cluster_Tbl); + if (rtn != STATUS_OK) + { + show_error("%s:PGRpre_fork_children failed",func); + pglb_exit(SIGTERM); + } + } + + for (;;) + { + fd_set rmask; + struct timeval timeout; + + timeout.tv_sec = 60; + timeout.tv_usec = 0; + + /* + * Wait for something to happen. + */ + FD_ZERO(&rmask); + FD_SET(Frontend_FD.unix_fd,&rmask); + if(Frontend_FD.inet_fd) + FD_SET(Frontend_FD.inet_fd,&rmask); + rtn = select(Max(Frontend_FD.unix_fd, Frontend_FD.inet_fd) + 1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout); + if( rtn > 0) + { + if(PGRis_cluster_alive() == STATUS_ERROR) { + show_error("%s:all clusters were dead.",func); + PGRreturn_no_connection_error(); + count=0; + } + else + { + if (Use_Connection_Pool) + { + status = PGRload_balance_with_pool(); + } + else + { + status = PGRload_balance(); + } + if (status != STATUS_OK) + { + show_error("%s:load balance process failed",func); + if ( count > PGLB_CONNECT_RETRY_TIME) + { + show_error("%s:no cluster available",func); + PGRreturn_connection_full_error(); + count = 0; + } + count ++; + } + else + { + count = 0; + } + } + } + } +} + +/*-------------------------------------------------------------------- + * SYMBOL + * daemonize() + * NOTES + * Daemonize this process + * ARGS + * void + * RETURN + * none + 
*-------------------------------------------------------------------- + */ +static void +daemonize(void) +{ + char * func = "daemonize()"; + int i; + pid_t pid; + + pid = fork(); + if (pid == (pid_t) -1) + { + show_error("%s:fork() failed. (%s)",func, strerror(errno)); + exit(1); + return; /* not reached */ + } + else if (pid > 0) + { /* parent */ + exit(0); + } + +#ifdef HAVE_SETSID + if (setsid() < 0) + { + show_error("%s:setsid() failed. (%s)", func,strerror(errno)); + exit(1); + } +#endif + + i = open("/dev/null", O_RDWR); + dup2(i, 0); + dup2(i, 1); + dup2(i, 2); + close(i); +} + + +/*-------------------------------------------------------------------- + * SYMBOL + * write_pid_file() + * NOTES + * The process ID is written in the file. + * This process ID is used when finish pglb. + * ARGS + * void + * RETURN + * none + *-------------------------------------------------------------------- + */ +static void +write_pid_file(void) +{ + char * func = "write_pid_file()"; + FILE *fd; + char fname[256]; + char pidbuf[128]; + + snprintf(fname, sizeof(fname), "%s/%s", PGR_Write_Path, PGLB_PID_FILE); + fd = fopen(fname, "w"); + if (!fd) + { + show_error("%s:open() %s file failed. (%s)", + func,fname, strerror(errno)); + exit(1); + } + snprintf(pidbuf, sizeof(pidbuf), "%d", getpid()); + fwrite(pidbuf, strlen(pidbuf), 1, fd); + if (fclose(fd)) + { + show_error("%s:fwrite() %s file failed. 
(%s)", + func,fname, strerror(errno)); + exit(1); + } +} + + +/*-------------------------------------------------------------------- + * SYMBOL + * stop_pglb() + * NOTES + * Stop the pglb process + * ARGS + * void + * RETURN + * none + *-------------------------------------------------------------------- + */ +static void +stop_pglb(void) +{ + char * func = "stop_pglb()"; + FILE *fd; + char fname[256]; + char pidbuf[128]; + pid_t pid; + + if (PGR_Write_Path == NULL) + { + PGR_Write_Path = "."; + } + snprintf(fname, sizeof(fname), "%s/%s", PGR_Write_Path, PGLB_PID_FILE); + fd = fopen(fname, "r"); + if (!fd) + { + show_error("%s:open() %s file failed. (%s)", + func,fname, strerror(errno)); + exit(1); + } + memset(pidbuf,0,sizeof(pidbuf)); + fread(pidbuf, sizeof(pidbuf), 1, fd); + fclose(fd); + pid = atoi(pidbuf); + if (kill (pid,SIGTERM) == -1) + { + show_error("%s:could not stop pid: %d (%s)",func,pid,strerror(errno)); + exit(1); + } +} + + +/*-------------------------------------------------------------------- + * SYMBOL + * is_exist_pid_file() + * NOTES + * Check existence of pid file. 
+ * ARGS + * void + * RETURN + * 1: the pid file is exist + * 0: the pid file is not exist + *-------------------------------------------------------------------- + */ +static int +is_exist_pid_file(void) +{ + char fname[256]; + struct stat buf; + + snprintf(fname, sizeof(fname), "%s/%s", PGR_Write_Path, PGLB_PID_FILE); + if (stat(fname,&buf) == 0) + { + /* pid file is exist */ + return 1; + } + else + { + /* pid file is not exist */ + return 0; + } +} + + +/*-------------------------------------------------------------------- + * SYMBOL + * PGRrecreate_child() + * NOTES + * create the child process again which it hunged up + * ARGS + * int signal_args: signal number (expecting the SIGCHLD) + * RETURN + * none + *-------------------------------------------------------------------- + */ +void +PGRrecreate_child(int signal_args) +{ + pid_t pid = 0; + int status; + ClusterTbl * cluster_p; + +ReWait: + + errno = 0; +#ifdef HAVE_WAITPID + while ((pid = waitpid(-1, &status, WNOHANG)) > 0) + { +#else + while ((pid = wait3(&status, WNOHANG, NULL)) > 0) + { +#endif + cluster_p = scan_cluster_by_pid(pid); + pid = PGRcreate_child(cluster_p); + } + if ((pid < 0) && (errno == EINTR)) + goto ReWait; +} + +/*-------------------------------------------------------------------- + * SYMBOL + * close_child() + * NOTES + * Hung up child process + * ARGS + * int signal_args: signal number (expecting the SIGUSR2) + * RETURN + * none + *-------------------------------------------------------------------- + */ +static void +close_child(int signal_args) +{ + char * func = "close_child()"; + ChildTbl * child; + ClusterTbl * cluster; + int rec_no = -1; + + if (( Cluster_Tbl == NULL) || (Child_Tbl == NULL)) + { + show_error("%s:Cluster_Tbl or Child_Tbl is not initialize",func); + return ; + } + cluster = Cluster_Tbl; + while(cluster->useFlag != TBL_END) + { + if (cluster->useFlag == TBL_ERROR_NOTICE) + { + rec_no = cluster->rec_no; + PGRset_status_on_cluster_tbl(TBL_ERROR,cluster); + break; + 
} + cluster++; + } + if (rec_no < 0) + { + return; + } + child = Child_Tbl; + while(child->useFlag != TBL_END) + { + if (child->rec_no == rec_no) + { + if (kill (child->pid,SIGTERM) == -1) + { + show_error("%s:could not stop pid: %d (%s)",func,child->pid,strerror(errno)); + return; + } + PGRchild_wait(signal_args); + child->useFlag = DATA_FREE; + } + child++; + } + PGRsignal(SIGUSR2, close_child); +} + +/*-------------------------------------------------------------------- + * SYMBOL + * scan_cluster_by_pid() + * NOTES + * get cluster server record from child process id + * ARGS + * pid_t pid: child process id (I) + * RETURN + * OK: pointer of cluster table + * NG: NULL + *-------------------------------------------------------------------- + */ +static ClusterTbl * +scan_cluster_by_pid(pid_t pid) +{ + char * func = "scan_cluster_by_pid()"; + ChildTbl * child_p; + ClusterTbl * cluster_p; + int cnt; + + child_p = Child_Tbl; + if (child_p == NULL) + { + show_error("%s:Child Table is not initialize",func); + return NULL; + } + cluster_p = Cluster_Tbl; + if (cluster_p == NULL) + { + show_error("%s:Cluster Table is not initialize",func); + return NULL; + } + + while (child_p->useFlag != TBL_END) + { + if (child_p->pid == pid) + { + break; + } + child_p++; + } + if (child_p->useFlag == TBL_END) + { + show_error("%s:pid:%d not found in child table",func,pid); + return NULL; + } + + cnt = 0; + while ((cluster_p->useFlag != TBL_END) && (cnt < ClusterNum)) + { + if (cluster_p->rec_no == child_p->rec_no) + { + return cluster_p; + } + cluster_p++; + cnt ++; + } + return NULL; +} + +/*-------------------------------------------------------------------- + * SYMBOL + * usage() + * NOTES + * show usage of pglb + * ARGS + * void + * RETURN + * none + *-------------------------------------------------------------------- + */ +static void +usage(void) +{ + char * path; + + path = getenv("PGDATA"); + if (path == NULL) + path = "."; + fprintf(stderr,"pglb version [%s]\n",PGLB_VERSION); 
+ fprintf(stderr,"A load balancer for PostgreSQL\n\n"); + fprintf(stderr,"usage: pglb [-D path_of_config_file] [-W path_of_work_files] [-n][-v][-h][stop | restart]\n"); + fprintf(stderr," config file default path: %s/%s\n",path, PGLB_CONF_FILE); + fprintf(stderr," -l: print error logs in the log file.\n"); + fprintf(stderr," -n: don't run in daemon mode.\n"); + fprintf(stderr," -v: debug mode. need '-n' flag\n"); + fprintf(stderr," -h: print this help\n"); + fprintf(stderr," stop: stop pglb\n"); + fprintf(stderr," restart: restart pglb\n"); +} + +/*-------------------------------------------------------------------- + * SYMBOL + * main() + * NOTES + * main module of pglb + * ARGS + * int argc: number of parameter + * char ** argv: value of parameter + * RETURN + * none + *-------------------------------------------------------------------- + */ +int +main(int argc, char ** argv) +{ + int opt = 0; + char * r_path = NULL; + char * w_path = NULL; + int detach = 1; + + PGRsignal(SIGHUP, pglb_exit); + PGRsignal(SIGINT, pglb_exit); + PGRsignal(SIGQUIT, pglb_exit); + PGRsignal(SIGTERM, pglb_exit); + PGRsignal(SIGALRM, SIG_IGN); /* ignored */ + PGRsignal(SIGPIPE, SIG_IGN); /* ignored */ + PGRsignal(SIGTTIN, SIG_IGN); /* ignored */ + PGRsignal(SIGTTOU, SIG_IGN); /* ignored */ + PGRsignal(SIGCHLD,PGRchild_wait); + PGRsignal(SIGUSR1, SIG_IGN); /* ignored */ + PGRsignal(SIGUSR2, close_child); /* close child process */ + r_path = getenv("PGDATA"); + if (r_path == NULL) + r_path = "."; + + while ((opt = getopt(argc, argv, "U:D:W:w:lvnh")) != -1) + { + switch (opt) + { + case 'U': + if (!optarg) + { + usage(); + exit(1); + } + PGRuserName = strdup(optarg); + break; + case 'D': + if (!optarg) + { + usage(); + exit(1); + } + r_path = optarg; + break; + case 'W': + if (!optarg) + { + usage(); + exit(1); + } + w_path = optarg; + break; + case 'w': + fork_wait_time = atoi(optarg); + if (fork_wait_time < 0) + fork_wait_time = 0; + break; + case 'l': + Log_Print = 1; + break; + case 
'v': + Debug_Print = 1; + break; + case 'n': + detach = 0; + break; + case 'h': + usage(); + exit(0); + break; + default: + usage(); + exit(1); + } + } + PGR_Data_Path = r_path; + if (w_path == NULL) + { + PGR_Write_Path = PGR_Data_Path; + } + else + { + PGR_Write_Path = w_path; + } + + if (optind == (argc-1) && + ((!strcmp(argv[optind],"stop")) || + (!strcmp(argv[optind],"restart")))) + { + stop_pglb(); + if (!strcmp(argv[optind],"stop")) + { + exit(0); + } + } + else if (optind == argc) + { + if (is_exist_pid_file()) + { + fprintf(stderr,"pid file %s/%s found. is another pglb running?", PGR_Write_Path, PGLB_PID_FILE); + exit(1); + } + } + else if (optind < argc) + { + usage(); + exit(1); + } + + if (detach) + { + daemonize(); + } + write_pid_file(); + + if (init_pglb(PGR_Data_Path) != STATUS_OK) + { + exit(0); + } + + /* call recovery process */ + PGRrecovery_main(fork_wait_time); + + /* call lifecheck process */ + PGRlifecheck_main(fork_wait_time); + + /* start loadbalance module */ + load_balance_main(); + pglb_exit(0); + return STATUS_OK; +} + +void +PGRexit_subprocess(int sig) +{ + pglb_exit(sig); +} diff -aruN postgresql-8.2.4/src/pgcluster/pglb/pglb.conf.sample pgcluster-1.7.0rc7/src/pgcluster/pglb/pglb.conf.sample --- postgresql-8.2.4/src/pgcluster/pglb/pglb.conf.sample 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pglb/pglb.conf.sample 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,73 @@ +#============================================================ +# Load Balance Server configuration file +#------------------------------------------------------------- +# file: pglb.conf +#------------------------------------------------------------- +# This file controls: +# o which hosts are db cluster server +# o which port use connect to db cluster server +# o how many connections are allowed on each DB server +#============================================================ +#------------------------------------------------------------- +# 
set cluster DB server information +# o Host_Name : Hostname of Cluster +# Please write a host name by FQDN or IP address. +# o Port : Connection port for postmaster +# o Max_Connection : Maximum number of connections to postmaster +#------------------------------------------------------------- +# +# master.pgcluster.org +# 5432 +# 32 +# +# +# post2.pgcluster.org +# 5432 +# 32 +# +# +# post3.pgcluster.org +# 5432 +# 32 +# +#------------------------------------------------------------- +# set Load Balance server information +# o Host_Name : The host name of this load balance server +# Please write a host name by FQDN or IP address. +# o Backend_Socket_Dir : Unix domain socket path for the backend +# o Receive_Port Connection port from client +# o Recovery_Port : Connection port for recovery process +# o Max_Cluster_Num : Maximum number of cluster DB servers +# o Use_Connection_Pooling : Use connection pool [yes/no] +# o Lifecheck_Timeout : Timeout of the lifecheck response +# o Lifecheck_Interval : Interval time of the lifecheck +# (range 1s - 1h) +# 10s -- 10 seconds +# 10min -- 10 minutes +# 1h -- 1 hours +#------------------------------------------------------------- + loadbalancer.pgcluster.org + /tmp + 5432 + 6001 + 128 + no + 3s + 15s +#------------------------------------------------------------- +# A setup of a log files +# +# o File_Name : Log file name with full path +# o File_Size : Maximum size of each log files +# Please specify in a number and unit(K or M) +# 10 -- 10 Byte +# 10K -- 10 KByte +# 10M -- 10 MByte +# o Rotate : Rotation times +# If specified 0, old versions are removed. 
+#------------------------------------------------------------- + + /tmp/pglb.log + 1M + 3 + diff -aruN postgresql-8.2.4/src/pgcluster/pglb/pglb.h pgcluster-1.7.0rc7/src/pgcluster/pglb/pglb.h --- postgresql-8.2.4/src/pgcluster/pglb/pglb.h 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pglb/pglb.h 2007-03-01 16:27:49.000000000 +0100 @@ -0,0 +1,472 @@ +/*-------------------------------------------------------------------- + * FILE: + * pglb.h + * + * Portions Copyright (c) 2003-2006 Atsushi Mitani + *-------------------------------------------------------------------- + */ +#ifndef PGLB_H +#define PGLB_H + +#define PGLB_VERSION "1.7.0rc7" + +#include "../libpgc/libpgc.h" + +/* + * from pool.h + */ + +/* + * define this if you do not want to issue RESET ALL at each new + * connection. Also you need to define this for 7.1 or prior + * PostgreSQL since they do not support RESET ALL + */ +#undef NO_RESET_ALL + +/* undef this if you have problems with non blocking accept() */ +#define NONE_BLOCK + +#define POOLMAXPATHLEN 8192 + +/* configuration file name */ +#define POOL_CONF_FILE_NAME "pgpool.conf" + +/* pid file directory */ +#define DEFAULT_LOGDIR "/tmp" + +/* Unix domain socket directory */ +#define DEFAULT_SOCKET_DIR "/tmp" + +/* pid file name */ +#define PID_FILE_NAME "pgpool.pid" + +/* strict mode comment in SQL */ +#define STRICT_MODE_STR "/*STRICT*/" +#define STRICT_MODE(s) (strncasecmp((s), STRICT_MODE_STR, strlen(STRICT_MODE_STR)) == 0) + +typedef enum { + POOL_CONTINUE = 0, + POOL_IDLE, + POOL_END, + POOL_ERROR, + POOL_FATAL +} POOL_STATUS; + +/* protocol major version numbers */ +#define PROTO_MAJOR_V2 2 +#define PROTO_MAJOR_V3 3 + +/* + * startup packet definitions (v2) stolen from PostgreSQL + */ +#define SM_DATABASE 64 +#define SM_USER 32 +#define SM_OPTIONS 64 +#define SM_UNUSED 64 +#define SM_TTY 64 + +typedef struct PGR_StartupPacket_v2 +{ + int protoVersion; /* Protocol version */ + char database[SM_DATABASE]; /* Database 
name */ + char user[SM_USER]; /* User name */ + char options[SM_OPTIONS]; /* Optional additional args */ + char unused[SM_UNUSED]; /* Unused */ + char tty[SM_TTY]; /* Tty for debug output */ +} PGR_StartupPacket_v2; + +/* startup packet info */ +typedef struct +{ + char *startup_packet; /* raw startup packet without packet length (malloced area) */ + int len; /* raw startup packet length */ + int major; /* protocol major version */ + int minor; /* protocol minor version */ + char *database; /* database name in startup_packet (malloced area) */ + char *user; /* user name in startup_packet (malloced area) */ +} PGR_StartupPacket; + +typedef struct CancelPacket +{ + int protoVersion; /* Protocol version */ + int pid; /* backend process id */ + int key; /* cancel key */ +} CancelPacket; + +/* + * configuration parameters + */ +typedef struct { + int inetdomain; /* should we make an INET domain socket too? */ + int port; /* port # to bind */ + char *socket_dir; /* pgpool socket directory */ + char *backend_host_name; /* backend host name */ + int backend_port; /* backend port # */ + char *secondary_backend_host_name; /* secondary backend host name */ + int secondary_backend_port; /* secondary backend port # */ + int num_init_children; /* # of children initially pre-forked */ + int child_life_time; /* if idle for this many seconds, child exits */ + int connection_life_time; /* if idle for this many seconds, connection closes */ + int max_pool; /* max # of connection pool per child */ + char *logdir; /* logging directory */ + char *backend_socket_dir; /* Unix domain socket directory for the PostgreSQL server */ + int replication_mode; /* replication mode */ + int replication_strict; /* if non 0, wait for completion of the + query sent to master to avoid deadlock */ + /* + * if secondary does not respond within this many milliseconds, abort this session. + * this is not compatible with replication_strict = 1. 0 means no timeout. 
+ */ + int replication_timeout; + + int load_balance_mode; /* load balance mode */ + + /* the following do not exist in the configuration file */ + char *current_backend_host_name; /* current backend host name */ + int current_backend_port; /* current backend port # */ + int replication_enabled; /* replication mode enabled */ + + int replication_stop_on_mismatch; /* if there's a data mismatch between master and secondary + * start degeneration to stop replication mode + */ +} POOL_CONFIG; + +#define MAX_PASSWORD_SIZE (1024) + +typedef struct { + int num; /* number of entries */ + char **names; /* parameter names */ + char **values; /* values */ +} ParamStatus; + +/* + * stream connection structure + */ +typedef struct { + int fd; /* fd for connection */ + FILE *write_fd; /* stream write connection */ + + char *hp; /* pending data buffer head address */ + int po; /* pending data offset */ + int bufsz; /* pending data buffer size */ + int len; /* pending data length */ + + char *sbuf; /* buffer for pool_read_string */ + int sbufsz; /* its size in bytes */ + + char *buf2; /* buffer for pool_read2 */ + int bufsz2; /* its size in bytes */ + + int isbackend; /* this connection is for backend if non 0 */ + int issecondary_backend; /* this connection is for secondary backend if non 0 */ + + char tstate; /* transaction state (V3 only) */ + + /* + * the following are remembered so that an authenticated connection can be re-used + */ + int auth_kind; /* 0: trust, 3: clear text password, 4: crypt password, 5: md5 password */ + int pwd_size; /* password (sent back from frontend) size in host order */ + char password[MAX_PASSWORD_SIZE]; /* password (sent back from frontend) */ + char salt[4]; /* password salt */ + + /* + * following are used to remember current session paramter status. 
+ * re-used connection will need them (V3 only) + */ + ParamStatus params; + + int no_forward; /* if non 0, do not write to frontend */ + +} POOL_CONNECTION; + +/* + * connection pool structure + */ +typedef struct { + PGR_StartupPacket *sp; /* startup packet info */ + int pid; /* backend pid */ + int key; /* cancel key */ + POOL_CONNECTION *con; + time_t closetime; /* absolute time in second when the connection closed + * if 0, that means the connection is under use. + */ +} POOL_CONNECTION_POOL_SLOT; + +#define MAX_CONNECTION_SLOTS 2 + +typedef struct { + int num; /* number of slots */ + POOL_CONNECTION_POOL_SLOT *slots[MAX_CONNECTION_SLOTS]; +} POOL_CONNECTION_POOL; + +#define MASTER_CONNECTION(p) ((p)->slots[0]) +#define SECONDARY_CONNECTION(p) ((p)->slots[1]) +#define MASTER(p) MASTER_CONNECTION(p)->con +#define SECONDARY(p) SECONDARY_CONNECTION(p)->con +#define MAJOR(p) MASTER_CONNECTION(p)->sp->major +#define TSTATE(p) MASTER(p)->tstate + +#define Max(x, y) ((x) > (y) ? (x) : (y)) +#define Min(x, y) ((x) < (y) ? 
(x) : (y)) + +/* + * pglb + */ + +typedef struct { + int useFlag; + int sock; +}SocketTbl; + +typedef struct { + int useFlag; + char hostName[HOSTNAME_MAX_LENGTH]; + unsigned short port; + short max_connect; + int use_num; + int rate; + int rec_no; + int retry_count; +}ClusterTbl; + +typedef struct { + long mtype; + char mdata[1]; +}MsgData; + +typedef struct { + int useFlag; + int rec_no; + pid_t pid; +}ChildTbl; + +#define UNIX_DOMAIN_FD (0) +#define INET_DOMAIN_FD (1) +typedef struct { + int unix_fd; + int inet_fd; +}FrontSocket; + +#define pool_config_inetdomain (0) +#define pool_config_replication_mode (0) +#define pool_config_replication_strict (0) +#define pool_config_replication_timeout (0) +#define pool_config_replication_enabled (0) +#define pool_config_load_balance_mode (0) +#define pool_config_replication_stop_on_mismatch (0) +#define pool_config_port (Recv_Port_Number) +#define pool_config_socket_dir (Backend_Socket_Dir) +#define pool_config_backend_host_name (CurrentCluster->hostName) +#define pool_config_backend_port (CurrentCluster->port) +#define pool_config_secondary_backend_host_name (CurrentCluster->hostName) +#define pool_config_secondary_backend_port (CurrentCluster->port) +#define pool_config_num_init_children (CurrentCluster->max_connect) +#define pool_config_child_life_time (Connection_Life_Time) +#define pool_config_connection_life_time (Connection_Life_Time) +#define pool_config_max_pool (Max_Pool) +#define pool_config_logdir "./" +#define pool_config_backend_socket_dir (Backend_Socket_Dir) +#define pool_config_current_backend_host_name (CurrentCluster->hostName) +#define pool_config_current_backend_port (CurrentCluster->port) +#define REPLICATION (0) +#define IN_LOAD_BALANCE (0) + +/* + * for pglb + */ +#define MAX_DB_SERVER (32) +#define PGLB_MAX_SOCKET_QUEUE (10000) +#define CLUSTER_TBL_SHM_KEY (1010) +#define PGLB_CONNECT_RETRY_TIME (3) +#define DEFAULT_CONNECT_NUM (32) +#define DEFAULT_PORT (5432) +#define BUF_SIZE (16384) +#define 
TBL_FREE (0) +#define TBL_INIT (1) +#define TBL_USE (2) +#define TBL_STOP (3) +#define TBL_ACCEPT (10) +#define TBL_ERROR_NOTICE (98) +#define TBL_ERROR (99) +#define TBL_END (-1) +#define STATUS_OK (0) +#define STATUS_ERROR (-1) +#ifdef RECOVERY_PREPARE_REQ +#define ADD_DB RECOVERY_PREPARE_REQ +#else +#define ADD_DB (1) +#endif +#ifdef RECOVERY_PGDATA_ANS +#define STOP_DB RECOVERY_PGDATA_ANS +#else +#define STOP_DB (3) +#endif +#ifdef RECOVERY_FINISH +#define START_DB RECOVERY_FINISH +#else +#define START_DB (9) +#endif +#define DELETE_DB (99) +#define QUERY_TERMINATE (0x00) +#define RESPONSE_TERMINATE (0x5a) +#define PGLB_CONF_FILE "pglb.conf" +#define PGLB_PID_FILE "pglb.pid" +#define PGLB_STATUS_FILE "pglb.sts" +#define PGLB_LOG_FILE "pglb.log" +#define CLUSTER_SERVER_TAG "Cluster_Server_Info" +#define MAX_CONNECT_TAG "Max_Connect" +#define RECOVERY_PORT_TAG "Recovery_Port" +#define RECV_PORT_TAG "Receive_Port" +#define MAX_CLUSTER_TAG "Max_Cluster_Num" +#define USE_CONNECTION_POOL_TAG "Use_Connection_Pooling" +#define MAX_POOL_TAG "Max_Pool_Each_Server" +#define BACKEND_SOCKET_DIR_TAG "Backend_Socket_Dir" +#define CONNECTION_LIFE_TIME "Connection_Life_Time" +#define NOT_USE_CONNECTION_POOL (0) +#define USE_CONNECTION_POOL (1) + +#define PGR_SEND_RETRY_CNT (100) +#define PGR_SEND_WAIT_MSEC (500) +#define PGR_RECV_RETRY_CNT (100) +#define PGR_RECV_WAIT_MSEC (500) + +extern int Recv_Port_Number; +extern int Recovery_Port_Number; +extern uint16_t LifeCheck_Port_Number; +extern int Use_Connection_Pool; +extern int Max_Pool; +extern int Connection_Life_Time; +extern int Msg_Id; +extern ClusterTbl * Cluster_Tbl; +extern int Max_DB_Server; +extern int MaxBackends; +extern char * Backend_Socket_Dir; +extern int ClusterShmid; +extern int ClusterSemid; +extern int ChildShmid; +extern int ClusterNum; +extern ChildTbl * Child_Tbl; +extern char * PGR_Data_Path; +extern char * PGR_Write_Path; +extern char * Backend_Socket_Dir; +extern FrontSocket Frontend_FD; +extern FILE * 
StatusFp; +extern char * ResolvedName; +extern char * PGRuserName; + +/* for child.c */ +extern POOL_CONNECTION * Frontend; +extern ClusterTbl * CurrentCluster; + +extern char * Function; + +extern POOL_CONNECTION_POOL *pool_connection_pool; /* connection pool */ + +/* extern of main.c */ +extern void PGRrecreate_child(int signal_args); +extern void PGRexit_subprocess(int sig); + +/* extern of child.c */ +extern int PGRpre_fork_children(ClusterTbl * ptr); +extern int PGRpre_fork_child(ClusterTbl * ptr); +extern int PGRdo_child( int use_pool); +extern int PGRcreate_child(ClusterTbl * cluster_p); +extern pid_t PGRscan_child_tbl(ClusterTbl * cluster_p); +extern void notice_backend_error(void); +extern void do_pooling_child(int sig); +extern int PGRset_status_to_child_tbl(pid_t pid, int status); +extern int PGRadd_child_tbl(ClusterTbl * cluster_p, pid_t pid, int status); +extern int PGRget_child_status(pid_t pid); +extern void PGRreturn_connection_full_error(void); +extern void PGRreturn_no_connection_error(void); +extern void PGRquit_children_on_cluster(int rec_no); + +/* extern of cluster_table.c */ +extern int PGRis_cluster_alive(void) ; +extern ClusterTbl * PGRscan_cluster(void); +extern void PGRset_key_of_cluster(ClusterTbl * ptr, RecoveryPacket * packet); +extern ClusterTbl * PGRadd_cluster_tbl (ClusterTbl * conf_data); +extern ClusterTbl * PGRset_status_on_cluster_tbl (int status, ClusterTbl * ptr); +extern ClusterTbl * PGRsearch_cluster_tbl(ClusterTbl * conf_data); + +/* extern of load_balance.c */ +extern int PGRload_balance(void); +extern int PGRload_balance_with_pool(void); +extern char PGRis_connection_full(ClusterTbl * ptr); +extern void PGRuse_connection(ClusterTbl * ptr); +extern void PGRrelease_connection(ClusterTbl * ptr); +extern void PGRchild_wait(int sig); + +/* extern of recovery.c */ +extern void PGRrecovery_main(int fork_wait_fime); + +/* extern of socket.c */ +extern int PGRcreate_unix_domain_socket(char * sock_dir, unsigned short port); +extern 
int PGRcreate_recv_socket(char * hostName , unsigned short portNumber); +extern int PGRcreate_acception(int fd, char * hostName , unsigned short portNumber); +extern void PGRclose_sock(int * sock); +extern int PGRread_byte(int sock,char * buf,int len, int flag); +extern int PGRcreate_cluster_socket( int * sock, ClusterTbl * ptr ); + +/* extern of pool_auth.c */ +extern int pool_do_auth(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *cp); +extern int pool_do_reauth(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *cp); +extern int pool_read_message_length(POOL_CONNECTION_POOL *cp); +extern signed char pool_read_kind(POOL_CONNECTION_POOL *cp); + +/* extern of pool_connection_pool.c */ +extern int pool_init_cp(void); +extern POOL_CONNECTION_POOL *pool_get_cp(char *user, char *database, int protoMajor); +extern void pool_discard_cp(char *user, char *database, int protoMajor); +extern POOL_CONNECTION_POOL *pool_create_cp(void); +extern void pool_connection_pool_timer(POOL_CONNECTION_POOL *backend); +extern void pool_backend_timer_handler(int sig); +extern int connect_inet_domain_socket(int secondary_backend); +extern int connect_unix_domain_socket(int secondary_backend); +extern char PGRis_same_host(char * host1, char * host2); +extern void pool_finish(void); + +/* extern of pool_process_query.c */ +extern POOL_STATUS pool_process_query(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend, int connection_reuse); +extern POOL_STATUS ErrorResponse(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +extern void pool_enable_timeout(); +extern void pool_disable_timeout(); +extern int pool_check_fd(POOL_CONNECTION *cp, int notimeout); +extern void pool_send_frontend_exits(POOL_CONNECTION_POOL *backend); +extern POOL_STATUS SimpleForwardToFrontend(char kind, POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +extern POOL_STATUS SimpleForwardToBackend(char kind, POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +extern POOL_STATUS 
ParameterStatus(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +extern void pool_send_error_message(POOL_CONNECTION *frontend, int protoMajor, char *code, char *message, char *detail, char *hint, char *file, int line); + +/* extern of pool_params.c */ +extern int pool_init_params(ParamStatus *params); +extern void pool_discard_params(ParamStatus *params); +extern char *pool_find_name(ParamStatus *params, char *name, int *pos); +extern int pool_get_param(ParamStatus *params, int index, char **name, char **value); +extern int pool_add_param(ParamStatus *params, char *name, char *value); +extern void pool_param_debug_print(ParamStatus *params); + +/* extern of pool_stream.c */ +extern POOL_CONNECTION *pool_open(int fd); +extern void pool_close(POOL_CONNECTION *cp); +extern int pool_read(POOL_CONNECTION *cp, void *buf, int len); +extern char *pool_read2(POOL_CONNECTION *cp, int len); +extern int pool_write(POOL_CONNECTION *cp, void *buf, int len); +extern int pool_flush(POOL_CONNECTION *cp); +extern int pool_write_and_flush(POOL_CONNECTION *cp, void *buf, int len); +extern char *pool_read_string(POOL_CONNECTION *cp, int *len, int line); + +/* + * external prototype in show.c + */ +extern void show_error(const char * fmt,...); +extern void show_debug(const char * fmt,...); +extern void PGRwrite_log_file(FILE * fp, const char * fmt,...); + +/* + * external prototype in lifecheck.c + */ +extern int PGRlifecheck_main(int fork_wait_time); + +#endif /* PGLB_H */ diff -aruN postgresql-8.2.4/src/pgcluster/pglb/pool_auth.c pgcluster-1.7.0rc7/src/pgcluster/pglb/pool_auth.c --- postgresql-8.2.4/src/pgcluster/pglb/pool_auth.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pglb/pool_auth.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,959 @@ +/*-------------------------------------------------------------------- + * FILE: + * pool_auth.c + * + * NOTE: + * authenticaton stuff + * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + * Portions 
Copyright (c) 2003-2006, Tatsuo Ishii + *-------------------------------------------------------------------- + */ +/* + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that the above copyright notice appear in all + * copies and that both that copyright notice and this permission + * notice appear in supporting documentation, and that the name of the + * author not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. The author makes no representations about the + * suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * +*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include "replicate_com.h" +#include "pglb.h" + +int pool_do_auth(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *cp); +int pool_do_reauth(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *cp); +int pool_read_message_length(POOL_CONNECTION_POOL *cp); +signed char pool_read_kind(POOL_CONNECTION_POOL *cp); + +static POOL_STATUS pool_send_auth_ok(POOL_CONNECTION *frontend, int pid, int key, int protoMajor); +static int do_clear_text_password(POOL_CONNECTION *backend, POOL_CONNECTION *frontend, int reauth, int protoMajor); +static int do_crypt(POOL_CONNECTION *backend, POOL_CONNECTION *frontend, int reauth, int protoMajor); +static int do_md5(POOL_CONNECTION *backend, POOL_CONNECTION *frontend, int reauth, int protoMajor); + +/* +* do authentication against backend. if success return 0 otherwise non 0. +*/ +int pool_do_auth(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *cp) +{ + char * func = "pool_do_auth()"; + int status; + signed char kind; + int pid, pid1; + int key, key1; + int protoMajor; + int length; + + protoMajor = MAJOR(cp); + + kind = pool_read_kind(cp); + if (kind < 0) + { + return -1; + } + + /* error response? 
*/ + if (kind == 'E') + { + /* we assume error response at this stage is likely version + * protocol mismatch (v3 frontend vs. v2 backend). So we throw + * a V2 protocol error response in the hope that v3 frontend + * will negotiate again using v2 protocol. + */ + show_error("%s:pool_do_auth: maybe protocol version mismatch (current version %d)",func, protoMajor); + ErrorResponse(frontend, cp); + return -1; + } + else if (kind != 'R') + { + show_error("%s:pool_do_auth: expect \"R\" got %c",func, kind); + return -1; + } + + /* + * message length (v3 only) */ + if (protoMajor == PROTO_MAJOR_V3 && pool_read_message_length(cp) < 0) + { + return -1; + } + + /* + * read authentication request kind. + * + * 0: authentication ok + * 1: kerberos v4 + * 2: kerberos v5 + * 3: clear text password + * 4: crypt password + * 5: md5 password + * 6: scm credential + * + * in replication mode, we only supports kind = 0, 3. this is because to "salt" + * cannot be replicated among master and secondary. + * in non replication mode, we supports kind = 0, 3, 4, 5 + */ + + status = pool_read(MASTER(cp), &pid, sizeof(pid)); + if (status < 0) + { + show_error("%s:pool_do_auth: read authentication kind failed",func); + return -1; + } + + if (REPLICATION) + { + status = pool_read(SECONDARY(cp), &pid1, sizeof(pid1)); + + if (status < 0) + { + show_error("%s:pool_do_auth: read authentication kind from secondary failed",func); + return -1; + } + } + + pid = ntohl(pid); + + /* trust? */ + if (pid == 0) + { + if (protoMajor == PROTO_MAJOR_V3) + { + int msglen; + + pool_write(frontend, "R", 1); + msglen = htonl(8); + pool_write(frontend, &msglen, sizeof(msglen)); + msglen = htonl(0); + if (pool_write_and_flush(frontend, &msglen, sizeof(msglen)) < 0) + { + return -1; + } + } + MASTER(cp)->auth_kind = 0; + } + + /* clear text password authentication? 
*/ + else if (pid == 3) + { +#ifdef PRINT_DEBUG + show_debug("%s:trying clear text password authentication",func); +#endif + + pid = do_clear_text_password(MASTER(cp), frontend, 0, protoMajor); + + if (pid >= 0 && REPLICATION) + { + pid = do_clear_text_password(SECONDARY(cp), frontend, 0, protoMajor); + } + } + + /* crypt authentication? */ + else if (pid == 4) + { +#ifdef PRINT_DEBUG + show_debug("%s:trying crypt authentication",func); +#endif + + pid = do_crypt(MASTER(cp), frontend, 0, protoMajor); + + if (pid >= 0 && REPLICATION) + { + pid = do_crypt(SECONDARY(cp), frontend, 0, protoMajor); + } + } + + /* md5 authentication? */ + else if (pid == 5) + { +#ifdef PRINT_DEBUG + show_debug("%s:trying md5 authentication",func); +#endif + + pid = do_md5(MASTER(cp), frontend, 0, protoMajor); + + if (pid >= 0 && REPLICATION) + { + pid = do_md5(SECONDARY(cp), frontend, 0, protoMajor); + } + } + + if (pid != 0) + { + show_error("%s:pool_do_auth: backend does not return authenticaton ok",func); + return -1; + } + + /* + * authentication ok. now read pid and secret key from the + * backend + */ + kind = pool_read_kind(cp); + if (kind < 0) + { + return -1; + } + + /* error response? 
*/ + if (kind == 'E') + { + if (protoMajor == PROTO_MAJOR_V2) + ErrorResponse(frontend, cp); + else + SimpleForwardToFrontend(kind, frontend, cp); + return -1; + } + else if (kind != 'K') + { + if (protoMajor == PROTO_MAJOR_V3) + { + /* process parameter status: forward each 'S' message to the frontend until another kind arrives */ + while (kind == 'S') + { + if (ParameterStatus(frontend, cp) != POOL_CONTINUE) + return -1; + + pool_flush(frontend); + + kind = pool_read_kind(cp); + if (kind < 0) + { + show_error("%s:pool_do_auth: failed to read kind while processing ParamterStatus",func); + return -1; + } + } + } + else + { + show_error("%s:pool_do_auth: expect \"K\" got %c",func, kind); + return -1; + } + } + + /* + * message length (V3 only): must be 12 for the key data read below + */ + if (protoMajor == PROTO_MAJOR_V3 && (length = pool_read_message_length(cp)) != 12) + { + show_error("%s:pool_do_auth: invalid messages length(%d) for BackendKeyData",func, length); + return -1; + } + + /* + * OK, read pid and secret key + */ + + /* pid (stored and later forwarded exactly as read, no byte-order conversion) */ + pool_read(MASTER(cp), &pid, sizeof(pid)); + MASTER_CONNECTION(cp)->pid = pid; + + /* key */ + pool_read(MASTER(cp), &key, sizeof(key)); + MASTER_CONNECTION(cp)->key = key; + + if (REPLICATION) + { + pool_read(SECONDARY(cp), &pid1, sizeof(pid1)); + SECONDARY_CONNECTION(cp)->pid = pid1; /* the secondary's own pid, not the master's */ + + /* key */ + pool_read(SECONDARY(cp), &key1, sizeof(key1)); + SECONDARY_CONNECTION(cp)->key = key1; /* the secondary's own cancel key, not the master's */ + } + + return (pool_send_auth_ok(frontend, pid, key, protoMajor)); /* the frontend is given the master's pid and key */ +} + +/* +* do re-authentication for reused connection. if success return 0 otherwise non 0. 
+*/ +int pool_do_reauth(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *cp) +{ + char * func = "pool_do_reauth()"; + int status; + int protoMajor; + + protoMajor = MAJOR(cp); + + switch(MASTER(cp)->auth_kind) + { + case 0: + /* trust */ + status = 0; + break; + + case 3: + /* clear text password */ + status = do_clear_text_password(MASTER(cp), frontend, 1, protoMajor); + break; + + case 4: + /* crypt password */ + status = do_crypt(MASTER(cp), frontend, 1, protoMajor); + break; + + case 5: + /* md5 password */ + status = do_md5(MASTER(cp), frontend, 1, protoMajor); + break; + + default: + show_error("%s: unknown authentication request code %d", + func,MASTER(cp)->auth_kind); + return -1; + } + + if (status == 0) + { + if (protoMajor == PROTO_MAJOR_V3) + { + int msglen; + + pool_write(frontend, "R", 1); + msglen = htonl(8); + pool_write(frontend, &msglen, sizeof(msglen)); + msglen = htonl(0); + if (pool_write_and_flush(frontend, &msglen, sizeof(msglen)) < 0) + { + return -1; + } + } + } + else + { +#ifdef PRINT_DEBUG + show_debug("%s: authentication failed",func); +#endif + return -1; + } + + return (pool_send_auth_ok(frontend, MASTER_CONNECTION(cp)->pid, MASTER_CONNECTION(cp)->key, protoMajor) != POOL_CONTINUE); +} + +/* +* send authentication ok to frontend. if success return 0 otherwise non 0. 
+*/ +static POOL_STATUS pool_send_auth_ok(POOL_CONNECTION *frontend, int pid, int key, int protoMajor) +{ + char kind; + int len; + + if (protoMajor == PROTO_MAJOR_V2) + { + /* V2 only: return "Authentication OK" ('R' followed by a zero int) to the frontend */ + kind = 'R'; + pool_write(frontend, &kind, 1); + len = htonl(0); + if (pool_write_and_flush(frontend, &len, sizeof(len)) < 0) + { + return -1; + } + } + + /* send backend key data: 'K', a length of 12 for V3 only, then pid and key as passed in */ + kind = 'K'; + pool_write(frontend, &kind, 1); + if (protoMajor == PROTO_MAJOR_V3) + { + len = htonl(12); + pool_write(frontend, &len, sizeof(len)); + } + pool_write(frontend, &pid, sizeof(pid)); + if (pool_write_and_flush(frontend, &key, sizeof(key)) < 0) + { + return -1; + } + + return 0; +} + +/* + * perform clear text password authentication + */ +static int do_clear_text_password(POOL_CONNECTION *backend, POOL_CONNECTION *frontend, int reauth, int protoMajor) +{ + char * func = "do_clear_text_password()"; + static int size; + static char password[MAX_PASSWORD_SIZE]; + char response; + int kind; + int len; + + /* master? 
*/ + if (!backend->issecondary_backend) + { + pool_write(frontend, "R", 1); /* authenticaton */ + if (protoMajor == PROTO_MAJOR_V3) + { + len = htonl(8); + pool_write(frontend, &len, sizeof(len)); + } + kind = htonl(3); /* clear text password authentication */ + pool_write_and_flush(frontend, &kind, sizeof(kind)); /* indicating clear text password authentication */ + + /* read password packet */ + if (protoMajor == PROTO_MAJOR_V2) + { + if (pool_read(frontend, &size, sizeof(size))) + { + show_error("%s: failed to read password packet size",func); + return -1; + } + } + else + { + char k; + + if (pool_read(frontend, &k, sizeof(k))) + { + show_error("%s: failed to read password packet \"p\"",func); + return -1; + } + if (k != 'p') + { + show_error("%s:packet does not start with \"p\"",func); + return -1; + } + if (pool_read(frontend, &size, sizeof(size))) + { + show_error("%s: failed to read password packet size",func); + return -1; + } + } + + if ((ntohl(size) - 4) > sizeof(password)) + { + show_error("%s: password is too long (size: %d)",func, ntohl(size) - 4); + return -1; + } + + if (pool_read(frontend, password, ntohl(size) - 4)) + { + show_error("%s: failed to read password (size: %d)",func, ntohl(size) - 4); + return -1; + } + } + + /* connection reusing? 
*/ + if (reauth) + { + if ((ntohl(size) - 4) != backend->pwd_size) + { +#ifdef PRINT_DEBUG + show_debug("%s; password size does not match in re-authetication",func); +#endif + return -1; + } + + if (memcmp(password, backend->password, backend->pwd_size) != 0) + { +#ifdef PRINT_DEBUG + show_debug("%s; password does not match in re-authetication",func); +#endif + return -1; + } + + return 0; + } + + /* send password packet to backend */ + if (protoMajor == PROTO_MAJOR_V3) + pool_write(backend, "p", 1); + pool_write(backend, &size, sizeof(size)); + pool_write_and_flush(backend, password, ntohl(size) -4); + if (pool_read(backend, &response, sizeof(response))) + { + show_error("%s: failed to read authentication response",func); + return -1; + } + + if (response != 'R') + { +#ifdef PRINT_DEBUG + show_debug("%s: backend does not return R while processing clear text password authentication",func); +#endif + return -1; + } + + if (protoMajor == PROTO_MAJOR_V3) + { + if (pool_read(backend, &len, sizeof(len))) + { + show_error("%s: failed to read authentication packet size",func); + return -1; + } + + if (ntohl(len) != 8) + { + show_error("%s: incorrect authentication packet size (%d)",func, ntohl(len)); + return -1; + } + } + + /* expect to read "Authentication OK" response. kind should be 0... 
*/ + if (pool_read(backend, &kind, sizeof(kind))) + { +#ifdef PRINT_DEBUG + show_debug("%s: failed to read Authentication OK response",func); +#endif + return -1; + } + + /* if authenticated, save info */ + if (!reauth && kind == 0) + { + if (!backend->issecondary_backend && protoMajor == PROTO_MAJOR_V3) + { + int msglen; + + pool_write(frontend, "R", 1); + msglen = htonl(8); + pool_write(frontend, &msglen, sizeof(msglen)); + msglen = htonl(0); + if (pool_write_and_flush(frontend, &msglen, sizeof(msglen)) < 0) + { + return -1; + } + } + + backend->auth_kind = 3; + backend->pwd_size = ntohl(size) - 4; + memcpy(backend->password, password, backend->pwd_size); + } + return kind; +} + +/* + * perform crypt authetication + */ +static int do_crypt(POOL_CONNECTION *backend, POOL_CONNECTION *frontend, int reauth, int protoMajor) +{ + char * func = "do_crypt()"; + char salt[2]; + static int size; + static char password[MAX_PASSWORD_SIZE]; + char response; + int kind; + int len; + + if (!reauth) + { + /* read salt */ + if (pool_read(backend, salt, sizeof(salt))) + { + show_error("%s: failed to read salt",func); + return -1; + } + } + else + { + memcpy(salt, backend->salt, sizeof(salt)); + } + + /* master? 
*/ + if (!backend->issecondary_backend) + { + pool_write(frontend, "R", 1); /* authenticaton */ + if (protoMajor == PROTO_MAJOR_V3) + { + len = htonl(10); + pool_write(frontend, &len, sizeof(len)); + } + kind = htonl(4); /* crypt authentication */ + pool_write(frontend, &kind, sizeof(kind)); /* indicating crypt authentication */ + pool_write_and_flush(frontend, salt, sizeof(salt)); /* salt */ + + /* read password packet */ + if (protoMajor == PROTO_MAJOR_V2) + { + if (pool_read(frontend, &size, sizeof(size))) + { + show_error("%s: failed to read password packet size",func); + return -1; + } + } + else + { + char k; + + if (pool_read(frontend, &k, sizeof(k))) + { + show_error("%s: failed to read password packet",func); + return -1; + } + if (k != 'p') + { + show_error("%s: password packet does not start with \"p\"",func); + return -1; + } + if (pool_read(frontend, &size, sizeof(size))) + { + show_error("%s: failed to read password packet size",func); + return -1; + } + } + + if ((ntohl(size) - 4) > sizeof(password)) + { + show_error("%s: password is too long(size: %d)", func,ntohl(size) - 4); + return -1; + } + + if (pool_read(frontend, password, ntohl(size) - 4)) + { + show_error("%s: failed to read password (size: %d)", func,ntohl(size) - 4); + return -1; + } + } + + /* connection reusing? 
*/ + if (reauth) + { +#ifdef PRINT_DEBUG + show_debug("%s:size: %d saved_size: %d",func, (ntohl(size) - 4), backend->pwd_size); +#endif + if ((ntohl(size) - 4) != backend->pwd_size) + { +#ifdef PRINT_DEBUG + show_debug("%s: password size does not match in re-authetication",func); +#endif + return -1; + } + + if (memcmp(password, backend->password, backend->pwd_size) != 0) + { +#ifdef PRINT_DEBUG + show_debug("%s: password does not match in re-authetication",func); +#endif + return -1; + } + + return 0; + } + + /* send password packet to backend */ + if (protoMajor == PROTO_MAJOR_V3) + pool_write(backend, "p", 1); + pool_write(backend, &size, sizeof(size)); + pool_write_and_flush(backend, password, ntohl(size) -4); + if (pool_read(backend, &response, sizeof(response))) + { + show_error("%s: failed to read authentication response",func); + return -1; + } + + if (response != 'R') + { +#ifdef PRINT_DEBUG + show_debug("%s: backend does not return R while processing crypt authentication(%02x) secondary: %d",func, response, backend->issecondary_backend); +#endif + return -1; + } + + if (protoMajor == PROTO_MAJOR_V3) + { + if (pool_read(backend, &len, sizeof(len))) + { + show_error("%s: failed to read authentication packet size",func); + return -1; + } + + if (ntohl(len) != 8) + { + show_error("%s: incorrect authentication packet size (%d)",func, ntohl(len)); + return -1; + } + } + + /* expect to read "Authentication OK" response. kind should be 0... 
*/ + if (pool_read(backend, &kind, sizeof(kind))) + { +#ifdef PRINT_DEBUG + show_debug("%s: failed to read Authentication OK response",func); +#endif + return -1; + } + + /* if authenticated, save info */ + if (!reauth && kind == 0) + { + if (protoMajor == PROTO_MAJOR_V3) + { + int msglen; + + pool_write(frontend, "R", 1); + msglen = htonl(8); + pool_write(frontend, &msglen, sizeof(msglen)); + msglen = htonl(0); + if (pool_write_and_flush(frontend, &msglen, sizeof(msglen)) < 0) + { + return -1; + } + } + backend->auth_kind = 4; + backend->pwd_size = ntohl(size) - 4; + memcpy(backend->password, password, backend->pwd_size); + memcpy(backend->salt, salt, sizeof(salt)); + } + return kind; +} + +/* + * perform MD5 authetication + */ +static int do_md5(POOL_CONNECTION *backend, POOL_CONNECTION *frontend, int reauth, int protoMajor) +{ + char * func = "do_md5()"; + char salt[4]; + static int size; + static char password[MAX_PASSWORD_SIZE]; + char response; + int kind; + int len; + + if (!reauth) + { + /* read salt */ + if (pool_read(backend, salt, sizeof(salt))) + { + show_error("%s: failed to read salt",func); + return -1; + } + } + else + { + memcpy(salt, backend->salt, sizeof(salt)); + } + + /* master? 
*/ + if (!backend->issecondary_backend) + { + pool_write(frontend, "R", 1); /* authenticaton */ + if (protoMajor == PROTO_MAJOR_V3) + { + len = htonl(12); + pool_write(frontend, &len, sizeof(len)); + } + kind = htonl(5); + pool_write(frontend, &kind, sizeof(kind)); /* indicating MD5 */ + pool_write_and_flush(frontend, salt, sizeof(salt)); /* salt */ + + /* read password packet */ + if (protoMajor == PROTO_MAJOR_V2) + { + if (pool_read(frontend, &size, sizeof(size))) + { + show_error("%s: failed to read password packet size",func); + return -1; + } + } + else + { + char k; + + if (pool_read(frontend, &k, sizeof(k))) + { + show_error("%s: failed to read password packet \"p\"",func); + return -1; + } + if (k != 'p') + { + show_error("%s: password packet does not start with \"p\"",func); + return -1; + } + if (pool_read(frontend, &size, sizeof(size))) + { + show_error("%s: failed to read password packet size",func); + return -1; + } + } + + if ((ntohl(size) - 4) > sizeof(password)) + { + show_error("%s: password is too long(size: %d)",func, ntohl(size) - 4); + return -1; + } + + if (pool_read(frontend, password, ntohl(size) - 4)) + { + show_error("%s: failed to read password (size: %d)",func, ntohl(size) - 4); + return -1; + } + } + + /* connection reusing? 
*/ + if (reauth) + { + if ((ntohl(size) - 4) != backend->pwd_size) + { +#ifdef PRINT_DEBUG + show_debug("%s; password size does not match in re-authetication",func); +#endif + return -1; + } + + if (memcmp(password, backend->password, backend->pwd_size) != 0) + { +#ifdef PRINT_DEBUG + show_debug("%s; password does not match in re-authetication",func); +#endif + return -1; + } + + return 0; + } + + /* send password packet to backend */ + if (protoMajor == PROTO_MAJOR_V3) + pool_write(backend, "p", 1); + pool_write(backend, &size, sizeof(size)); + pool_write_and_flush(backend, password, ntohl(size) -4); + if (pool_read(backend, &response, sizeof(response))) + { + show_error("%s: failed to read authentication response",func); + return -1; + } + + if (response != 'R') + { +#ifdef PRINT_DEBUG + show_debug("%s: backend does not return R while processing MD5 authentication %c", func,response); +#endif + return -1; + } + + if (protoMajor == PROTO_MAJOR_V3) + { + if (pool_read(backend, &len, sizeof(len))) + { + show_error("%s: failed to read authentication packet size",func); + return -1; + } + + if (ntohl(len) != 8) + { + show_error("%s: incorrect authentication packet size (%d)",func, ntohl(len)); + return -1; + } + } + + /* expect to read "Authentication OK" response. kind should be 0... 
*/ + if (pool_read(backend, &kind, sizeof(kind))) + { +#ifdef PRINT_DEBUG + show_debug("%s: failed to read Authentication OK response",func); +#endif + return -1; + } + + /* if authenticated, save info */ + if (!reauth && kind == 0) + { + if (protoMajor == PROTO_MAJOR_V3) + { + int msglen; + + pool_write(frontend, "R", 1); + msglen = htonl(8); + pool_write(frontend, &msglen, sizeof(msglen)); + msglen = htonl(0); + if (pool_write_and_flush(frontend, &msglen, sizeof(msglen)) < 0) + { + return -1; + } + } + backend->auth_kind = 5; + backend->pwd_size = ntohl(size) - 4; + memcpy(backend->password, password, backend->pwd_size); + memcpy(backend->salt, salt, sizeof(salt)); + } + return kind; +} + +/* + * read message length (V3 only) + */ +int pool_read_message_length(POOL_CONNECTION_POOL *cp) +{ + char * func = "pool_read_message_length()"; + int status; + int length, length1; + + status = pool_read(MASTER(cp), &length, sizeof(length)); + if (status < 0) + { + show_error("%s: error while reading message length",func); + return -1; + } + length = ntohl(length); + + if (REPLICATION) + { + status = pool_read(SECONDARY(cp), &length1, sizeof(length1)); + if (status < 0) + { + show_error("%s: error while reading message length from secondary backend",func); + return -1; + } + length1 = ntohl(length1); + + if (length != length1) + { + show_error("%s: length does not match between backends master(%d) secondary(%d)", + func,length, length1); + return -1; + } + } + + if (length < 0) + { + show_error("%s:read_message_length: invalid message length (%d)", func, length); + return -1; + } + + return length; +} + +signed char pool_read_kind(POOL_CONNECTION_POOL *cp) +{ + char * func = "pool_read_kind()"; + int status; + char kind, kind1; + + status = pool_read(MASTER(cp), &kind, sizeof(kind)); + if (status < 0) + { + show_error("%s:read_message_kind: error while reading message kind",func); + return -1; + } + + if (REPLICATION) + { + status = pool_read(SECONDARY(cp), &kind1, 
sizeof(kind1)); + if (status < 0) + { + show_error("%s: error while reading message kind from secondary backend",func); + return -1; + } + + if (kind != kind1) + { + show_error("%s: kind does not match between backends master(%d) secondary(%d)", + func, kind, kind1); + return -1; + } + } + + return kind; +} diff -aruN postgresql-8.2.4/src/pgcluster/pglb/pool_connection_pool.c pgcluster-1.7.0rc7/src/pgcluster/pglb/pool_connection_pool.c --- postgresql-8.2.4/src/pgcluster/pglb/pool_connection_pool.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pglb/pool_connection_pool.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,535 @@ +/*-------------------------------------------------------------------- + * FILE: + * pool_connection_pool.c + * + * NOTE: + * connection pool stuff + * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + * Portions Copyright (c) 2003-2006, Tatsuo Ishii + *-------------------------------------------------------------------- + */ +/* + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that the above copyright notice appear in all + * copies and that both that copyright notice and this permission + * notice appear in supporting documentation, and that the name of the + * author not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. The author makes no representations about the + * suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. 
+ * + */ +#include "postgres.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_NETINET_TCP_H +#include +#endif + +#include "replicate_com.h" +#include "pglb.h" + +POOL_CONNECTION_POOL *pool_connection_pool; /* connection pool */ + +int pool_init_cp(void); +POOL_CONNECTION_POOL *pool_get_cp(char *user, char *database, int protoMajor); +void pool_discard_cp(char *user, char *database, int protoMajor); +POOL_CONNECTION_POOL *pool_create_cp(void); +void pool_connection_pool_timer(POOL_CONNECTION_POOL *backend); +void pool_backend_timer_handler(int sig); +int connect_inet_domain_socket(int secondary_backend); +int connect_unix_domain_socket(int secondary_backend); +char PGRis_same_host(char * host1, char * host2); +void pool_finish(void); + + +static POOL_CONNECTION_POOL_SLOT *create_cp(POOL_CONNECTION_POOL_SLOT *cp, int secondary_backend); +static POOL_CONNECTION_POOL *new_connection(POOL_CONNECTION_POOL *p); + + + +/* +* initialize connection pools. this should be called once at the startup. 
+*/ +int pool_init_cp(void) +{ + char * func = "pool_init_cp()"; + pool_connection_pool = (POOL_CONNECTION_POOL *)malloc(sizeof(POOL_CONNECTION_POOL)*Max_Pool); + if (pool_connection_pool == NULL) + { + show_error("%s: malloc() failed[%s]",func,strerror(errno)); + return -1; + } + memset(pool_connection_pool, 0, sizeof(POOL_CONNECTION_POOL)*Max_Pool); + + return 0; +} + +/* +* find connection by user and database +*/ +POOL_CONNECTION_POOL *pool_get_cp(char *user, char *database, int protoMajor) +{ + char * func = "pool_get_cp()"; + int i; + + POOL_CONNECTION_POOL *p = pool_connection_pool; + + if (p == NULL) + { + show_error("%s: pool_connection_pool is not initialized",func); + return NULL; + } + + for (i=0;isp->major == protoMajor && + MASTER_CONNECTION(p)->sp->user != NULL && + strcmp(MASTER_CONNECTION(p)->sp->user, user) == 0 && + strcmp(MASTER_CONNECTION(p)->sp->database, database) == 0) + { + /* mark this connection is under use */ + MASTER_CONNECTION(p)->closetime = 0; + return p; + } + p++; + } + return NULL; +} + +/* + * disconnect and release a connection to the database + */ +void pool_discard_cp(char *user, char *database, int protoMajor) +{ + char * func = "pool_discard_cp()"; + POOL_CONNECTION_POOL *p = pool_get_cp(user, database, protoMajor); + + if (p == NULL) + { + show_error("%s: cannot get connection pool for user %s datbase %s", func,user, database); + return; + } + + free(MASTER_CONNECTION(p)->sp->user); + free(MASTER_CONNECTION(p)->sp->database); + free(MASTER_CONNECTION(p)->sp->startup_packet); + pool_close(MASTER_CONNECTION(p)->con); + + memset(p, 0, sizeof(POOL_CONNECTION_POOL)); +} + + +/* +* create a connection pool by user and database +*/ +POOL_CONNECTION_POOL *pool_create_cp(void) +{ + char * func = "pool_create_cp()"; + int i; + time_t closetime; + POOL_CONNECTION_POOL *oldestp; + + POOL_CONNECTION_POOL *p = pool_connection_pool; + + if (p == NULL) + { + show_error("%s: pool_connection_pool is not initialized",func); + return NULL; + 
} + + for (i=0; iclosetime; + for (i=0; isp->user, + MASTER_CONNECTION(p)->sp->database, + MASTER_CONNECTION(p)->closetime); +#endif + if (MASTER_CONNECTION(p)->closetime < closetime) + { + closetime = MASTER_CONNECTION(p)->closetime; + oldestp = p; + } + p++; + } + + p = oldestp; + pool_send_frontend_exits(p); + +#ifdef PRINT_DEBUG + show_debug("%s:discarding old %d th connection. user: %s database: %s", + func, + oldestp - pool_connection_pool, + MASTER_CONNECTION(p)->sp->user, + MASTER_CONNECTION(p)->sp->database); +#endif + + free(MASTER_CONNECTION(p)->sp->user); + free(MASTER_CONNECTION(p)->sp->database); + free(MASTER_CONNECTION(p)->sp->startup_packet); + pool_close(MASTER_CONNECTION(p)->con); + + memset(p, 0, sizeof(POOL_CONNECTION_POOL)); + + return new_connection(p); +} + +/* + * set backend connection close timer + */ +void pool_connection_pool_timer(POOL_CONNECTION_POOL *backend) +{ +#ifdef PRINT_DEBUG + char * func = "pool_connection_pool_timer()"; +#endif + POOL_CONNECTION_POOL *p = pool_connection_pool; + int i; + +#ifdef PRINT_DEBUG + show_debug("%s:pool_connection_pool_timer: called",func); +#endif + + MASTER_CONNECTION(backend)->closetime = time(NULL); /* set connection close time */ + + if (Connection_Life_Time == 0) + return; + + /* look for any other timeout */ + for (i=0;isp->user == NULL) + continue; + + if (p != backend && MASTER_CONNECTION(p)->closetime) + return; + } + + /* no other timer found. 
set my timer */ +#ifdef PRINT_DEBUG + show_debug("%s: set alarm after %d seconds",func, Connection_Life_Time); +#endif + signal(SIGALRM, pool_backend_timer_handler); + alarm(Connection_Life_Time); +} + +/* + * backend connection close timer handler + */ +void pool_backend_timer_handler(int sig) +{ +#define TMINTMAX 0x7fffffff + +#ifdef PRINT_DEBUG + char * func = "pool_backend_timer_handler()"; +#endif + POOL_CONNECTION_POOL *p = pool_connection_pool; + int i; + time_t now; + time_t nearest = TMINTMAX; + + now = time(NULL); + +#ifdef PRINT_DEBUG + show_debug("%s:called at %d", func,now); +#endif + + for (i=0;isp->user == NULL) + continue; + + /* timer expire? */ + if (MASTER_CONNECTION(p)->closetime) + { +#ifdef PRINT_DEBUG + show_debug("%s: expire time: %d", + func, + MASTER_CONNECTION(p)->closetime+Connection_Life_Time); +#endif + + if (now >= (MASTER_CONNECTION(p)->closetime+Connection_Life_Time)) + { + /* discard expired connection */ +#ifdef PRINT_DEBUG + show_debug("%s: expires user %s database %s", func, MASTER_CONNECTION(p)->sp->user, MASTER_CONNECTION(p)->sp->database); +#endif + + pool_send_frontend_exits(p); + + free(MASTER_CONNECTION(p)->sp->user); + free(MASTER_CONNECTION(p)->sp->database); + free(MASTER_CONNECTION(p)->sp->startup_packet); + pool_close(MASTER_CONNECTION(p)->con); + + memset(p, 0, sizeof(POOL_CONNECTION_POOL)); + } + else + { + /* look for nearest timer */ + if (MASTER_CONNECTION(p)->closetime < nearest) + nearest = MASTER_CONNECTION(p)->closetime; + } + } + } + + /* any remaining timer */ + if (nearest != TMINTMAX) + { + nearest = Connection_Life_Time - (now - nearest); + if (nearest <= 0) + nearest = 1; + signal(SIGALRM, pool_backend_timer_handler); + alarm(nearest); + } +} + +int connect_inet_domain_socket(int secondary_backend) +{ + char * func = "connect_inet_domain_socket()"; + int fd; + int len; + int on = 1; + struct sockaddr_in addr; + struct hostent *hp; + + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) + { + 
show_error("%s: socket() failed: %s",func, strerror(errno)); + return -1; + } + + /* set nodelay */ + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (char *) &on, + sizeof(on)) < 0) + { + show_error("%s: setsockopt() failed: %s", func, strerror(errno)); + close(fd); + return -1; + } + + memset((char *) &addr, 0, sizeof(addr)); + ((struct sockaddr *)&addr)->sa_family = AF_INET; + + addr.sin_port = htons(CurrentCluster->port); + len = sizeof(struct sockaddr_in); + + hp = gethostbyname(CurrentCluster->hostName); + + if ((hp == NULL) || (hp->h_addrtype != AF_INET)) + { + show_error("%s: gethostbyname() failed: %s host: %s",func, strerror(errno), CurrentCluster->hostName); + close(fd); + return -1; + } + memmove((char *) &(addr.sin_addr), + (char *) hp->h_addr, + hp->h_length); + + if (connect(fd, (struct sockaddr *)&addr, len) < 0) + { + show_error("%s: connect() failed: %s",func,strerror(errno)); + close(fd); + return -1; + } + return fd; +} + +int connect_unix_domain_socket(int secondary_backend) +{ + char * func = "connect_unix_domain_socket()"; + struct sockaddr_un addr; + int fd; + int len; + int port; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd == -1) + { + show_error("%s: setsockopt() failed: %s", func,strerror(errno)); + return -1; + } + + port = CurrentCluster->port; + memset((char *) &addr, 0, sizeof(addr)); + ((struct sockaddr *)&addr)->sa_family = AF_UNIX; + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/.s.PGSQL.%d", + Backend_Socket_Dir, + CurrentCluster->port); +#ifdef PRINT_DEBUG + show_debug("%s:postmaster Unix domain socket: %s",func, addr.sun_path); +#endif + + len = sizeof(struct sockaddr_un); + + if (connect(fd, (struct sockaddr *)&addr, len) < 0) + { + show_error("%s: connect() failed: %s",func, strerror(errno)); + close(fd); + return -1; + } +#ifdef PRINT_DEBUG + show_debug("%s:connected to postmaster Unix domain socket: %s fd: %d", func,addr.sun_path, fd); +#endif + return fd; +} + +static POOL_CONNECTION_POOL_SLOT 
*create_cp(POOL_CONNECTION_POOL_SLOT *cp, int secondary_backend)
+{
+	char * func = "create_cp()";
+	int fd;
+	char hostName[HOSTNAME_MAX_LENGTH];
+
+	if (gethostname(hostName,sizeof(hostName)) < 0)
+	{
+		show_error("%s:gethostname() failed. (%s)",func,strerror(errno));
+		return NULL;
+	}
+	if (PGRis_same_host(hostName,CurrentCluster->hostName) == 1)
+	{
+#ifdef PRINT_DEBUG
+		show_debug("%s:[%s] [%s] is same",func,hostName,CurrentCluster->hostName);
+#endif
+		fd = connect_unix_domain_socket(secondary_backend);
+	}
+	else
+	{
+		fd = connect_inet_domain_socket(secondary_backend);
+	}
+
+	if (fd < 0)
+	{
+		/* fatal error, notice to parent and exit */
+		notice_backend_error();
+		exit(1);
+	}
+
+	cp->con = pool_open(fd);
+	cp->closetime = 0;
+	return cp;
+}
+
+/*
+ * new_connection
+ *
+ * Allocate the master slot of pool entry p, open the backend connection
+ * through create_cp() and initialize the parameter-status storage of
+ * that connection.
+ *
+ * Returns p on success.  On failure the slot is released, the slot
+ * pointer is reset to NULL and NULL is returned.
+ */
+static POOL_CONNECTION_POOL *new_connection(POOL_CONNECTION_POOL *p)
+{
+	char * func = "new_connection()";
+
+	/* create master connection */
+	MASTER_CONNECTION(p) = malloc(sizeof(POOL_CONNECTION_POOL_SLOT));
+	if (MASTER_CONNECTION(p) == NULL)
+	{
+		show_error("%s: malloc() failed [%s]",func,strerror(errno));
+		return NULL;
+	}
+
+	/*
+	 * create_cp() returns NULL when gethostname() fails.  The slot is
+	 * still uninitialized in that case, so it must not be used below.
+	 */
+	if (create_cp(MASTER_CONNECTION(p), 0) == NULL)
+	{
+		show_error("%s: create_cp() failed",func);
+		free(MASTER_CONNECTION(p));
+		MASTER_CONNECTION(p) = NULL;
+		return NULL;
+	}
+
+	/* initialize Parameter Status save structure */
+	if (pool_init_params(&MASTER(p)->params))
+	{
+		/*
+		 * Close the connection and drop the slot so the pool entry does
+		 * not keep a half-initialized connection.
+		 * NOTE(review): assumes pool_close() copes with a connection
+		 * whose params were only partly allocated -- confirm.
+		 */
+		pool_close(MASTER_CONNECTION(p)->con);
+		free(MASTER_CONNECTION(p));
+		MASTER_CONNECTION(p) = NULL;
+		return NULL;
+	}
+	p->num = 1; /* number of slots */
+
+	return p;
+}
+
+char PGRis_same_host(char * host1, char * host2)
+{
+	unsigned int ip1, ip2;
+
+	if ((host1 == NULL) || (host2 == NULL))
+	{
+		return 0;
+	}
+	ip1 = PGRget_ip_by_name( host1);
+	ip2 = PGRget_ip_by_name( host2);
+	if (ip1 == ip2)
+	{
+		return 1;
+	}
+	return 0;
+}
+
+void pool_finish(void)
+{
+	char * func = "pool_finish()";
+	int i;
+
+	POOL_CONNECTION_POOL *p = pool_connection_pool;
+return;
+	if (p == NULL)
+	{
+		show_error("%s:pool_connection_pool is not initialized",func);
+		return ;
+	}
+
+	for (i=0 ; isp->user != NULL)
+		{
+			free(MASTER_CONNECTION(p)->sp->user);
+			MASTER_CONNECTION(p)->sp->user = NULL;
+		}
+		if (MASTER_CONNECTION(p)->sp->database != NULL)
+		{
+			
free(MASTER_CONNECTION(p)->sp->database); + MASTER_CONNECTION(p)->sp->database = NULL; + } + if (MASTER_CONNECTION(p)->sp->startup_packet != NULL) + { + free(MASTER_CONNECTION(p)->sp->startup_packet); + MASTER_CONNECTION(p)->sp->startup_packet = NULL; + } + */ + if (MASTER_CONNECTION(p)->con != NULL) + { + pool_close(MASTER_CONNECTION(p)->con); + MASTER_CONNECTION(p)->con = NULL; + } + memset(p, 0, sizeof(POOL_CONNECTION_POOL)); + p++; + } + free((char *)pool_connection_pool); + pool_connection_pool = NULL; +} + diff -aruN postgresql-8.2.4/src/pgcluster/pglb/pool_params.c pgcluster-1.7.0rc7/src/pgcluster/pglb/pool_params.c --- postgresql-8.2.4/src/pgcluster/pglb/pool_params.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pglb/pool_params.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,184 @@ +/*-------------------------------------------------------------------- + * FILE: + * pool_params.c + * + * NOTE: + * connection pool stuff + * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + * Portions Copyright (c) 2003-2006, Tatsuo Ishii + *-------------------------------------------------------------------- + */ +/* + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that the above copyright notice appear in all + * copies and that both that copyright notice and this permission + * notice appear in supporting documentation, and that the name of the + * author not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. The author makes no representations about the + * suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. 
+ * + */ + +#include +#include +#include +#include +#include + +#ifdef HAVE_NETINET_TCP_H +#include +#endif + +#include "replicate_com.h" +#include "pglb.h" + +#define MAX_PARAM_ITEMS 128 + +int pool_init_params(ParamStatus *params); +void pool_discard_params(ParamStatus *params); +char *pool_find_name(ParamStatus *params, char *name, int *pos); +int pool_get_param(ParamStatus *params, int index, char **name, char **value); +int pool_add_param(ParamStatus *params, char *name, char *value); +void pool_param_debug_print(ParamStatus *params); + +/* + * initialize parameter structure + */ +int pool_init_params(ParamStatus *params) +{ + char * func = "pool_init_params()"; + + params->num = 0; + params->names = malloc(MAX_PARAM_ITEMS*sizeof(char *)); + if (params->names == NULL) + { + show_error("%s: cannot allocate memory",func); + return -1; + } + params->values = malloc(MAX_PARAM_ITEMS*sizeof(char *)); + if (params->values == NULL) + { + show_error("%s: cannot allocate memory",func); + return -1; + } + return 0; +} + +/* + * discard parameter structure + */ +void pool_discard_params(ParamStatus *params) +{ + int i; + + for (i=0;inum;i++) + { + free(params->names[i]); + free(params->values[i]); + } + free(params->names); + free(params->values); +} + +/* + * find param value by name. if found, its value is returned + * also, pos is set + * if not found, NULL is returned + */ +char *pool_find_name(ParamStatus *params, char *name, int *pos) +{ + int i; + + for (i=0;inum;i++) + { + if (!strcmp(name, params->names[i])) + { + *pos = i; + return params->values[i]; + } + } + return NULL; +} + +/* + * return name and value by index. 
+ */ +int pool_get_param(ParamStatus *params, int index, char **name, char **value) +{ + if (index < 0 || index >= params->num) + return -1; + + *name = params->names[index]; + *value = params->values[index]; + + return 0; +} + +/* + * add or replace name/value pair + */ +int pool_add_param(ParamStatus *params, char *name, char *value) +{ + char * func = "pool_add_param()"; + int pos; + + if (pool_find_name(params, name, &pos)) + { + /* name already exists */ + if (strlen(params->values[pos]) < strlen(value)) + { + params->values[pos] = realloc(params->values[pos], strlen(value) + 1); + if (params->values[pos] == NULL) + { + show_error("%s: cannot allocate memory",func); + return -1; + } + } + strcpy(params->values[pos], value); + } + else + { + int num; + + /* add name/value pair */ + if (params->num >= MAX_PARAM_ITEMS) + { + show_error("%s: no more room for num",func); + return -1; + } + num = params->num; + params->names[num] = strdup(name); + if (params->names[num] == NULL) + { + show_error("%s: cannot allocate memory",func); + return -1; + } + params->values[num] = strdup(value); + if (params->values[num] == NULL) + { + show_error("%s: cannot allocate memory",func); + return -1; + } + params->num++; + } + return 0; +} + +void pool_param_debug_print(ParamStatus *params) +{ +#ifdef PRINT_DEBUG + char * func = "pool_param_debug_print()"; +#endif + int i; + + for (i=0;inum;i++) + { +#ifdef PRINT_DEBUG + show_debug("%s: No.%d: name: %s value: %s",func, i, params->names[i], params->values[i]); +#endif + } +} diff -aruN postgresql-8.2.4/src/pgcluster/pglb/pool_process_query.c pgcluster-1.7.0rc7/src/pgcluster/pglb/pool_process_query.c --- postgresql-8.2.4/src/pgcluster/pglb/pool_process_query.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pglb/pool_process_query.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,2100 @@ +/*-------------------------------------------------------------------- + * FILE: + * pool_process_query.c + * + * NOTE: + * 
query processing stuff + * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + * Portions Copyright (c) 2003-2006, Tatsuo Ishii + *-------------------------------------------------------------------- + */ +/* + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that the above copyright notice appear in all + * copies and that both that copyright notice and this permission + * notice appear in supporting documentation, and that the name of the + * author not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. The author makes no representations about the + * suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * +*/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "postgres_fe.h" +#include "libpq/pqcomm.h" + +#include "replicate_com.h" +#include "pglb.h" + +POOL_STATUS pool_process_query(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend, int connection_reuse); +POOL_STATUS ErrorResponse(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +void pool_enable_timeout(void); +void pool_disable_timeout(void); +int pool_check_fd(POOL_CONNECTION *cp, int notimeout); +void pool_send_frontend_exits(POOL_CONNECTION_POOL *backend); +POOL_STATUS SimpleForwardToFrontend(char kind, POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +POOL_STATUS SimpleForwardToBackend(char kind, POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +POOL_STATUS ParameterStatus(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +void pool_send_error_message(POOL_CONNECTION *frontend, int protoMajor, char *code, char *message, char *detail, char *hint, char *file, int line); + + +static POOL_STATUS Query(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend, char *query); +static 
POOL_STATUS ReadyForQuery(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend, int send_ready); +static POOL_STATUS CompleteCommandResponse(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +static int RowDescription(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +static POOL_STATUS AsciiRow(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend, short num_fields); +static POOL_STATUS BinaryRow(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend, short num_fields); +static POOL_STATUS CursorResponse(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +static POOL_STATUS NoticeResponse(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +static POOL_STATUS CopyInResponse(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +static POOL_STATUS CopyOutResponse(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +static POOL_STATUS CopyDataRows(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend, int copyin); +static POOL_STATUS EmptyQueryResponse(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +static POOL_STATUS NotificationResponse(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +static POOL_STATUS FunctionCall(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +static POOL_STATUS FunctionResultResponse(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +static POOL_STATUS ProcessFrontendResponse(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +static int synchronize(POOL_CONNECTION *cp); +static void process_reporting(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend); +static int reset_backend(POOL_CONNECTION_POOL *backend, int qcnt); +static int load_balance_enabled(POOL_CONNECTION_POOL *backend, char *sql); +static void start_load_balance(POOL_CONNECTION_POOL *backend); +static void end_load_balance(POOL_CONNECTION_POOL *backend); + +static POOL_CONNECTION_POOL_SLOT *slots[MAX_CONNECTION_SLOTS]; + +POOL_STATUS 
pool_process_query(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend, + int connection_reuse) +{ + char * func = "pool_process_query()"; + char kind, kind1; /* packet kind (backend) */ + char fkind; /* packet kind (frontend) */ + short num_fields = 0; + fd_set readmask; + fd_set writemask; + fd_set exceptmask; + int fds; + POOL_STATUS status; + int state; /* 0: ok to issue commands 1: waiting for "ready for query" response */ + int qcnt; + + frontend->no_forward = connection_reuse; + qcnt = 0; + state = 0; + + for (;;) + { + kind = kind1 = 0; + fkind = 0; + + if (state == 0 && connection_reuse) + { + int st = 0; + + /* send query for resetting connection such as "ROLLBACK" "RESET ALL"... */ + st = reset_backend(backend, qcnt); + + if (st < 0) /* error? */ + return POOL_END; + + else if (st == 0) /* no query issued? */ + { + qcnt++; + continue; + } + + else if (st == 1) /* more query remains */ + { + state = 1; + qcnt++; + continue; + } + + else if (st == 2) /* no more qury */ + { + frontend->no_forward = 0; + return POOL_CONTINUE; + } + + } + + if ((!REPLICATION && MASTER(backend)->len == 0 && frontend->len == 0) || + (REPLICATION && MASTER(backend)->len == 0 && + SECONDARY(backend)->len == 0 + && frontend->len == 0)) + { + + struct timeval timeout; + + timeout.tv_sec = 1; + timeout.tv_usec = 0; + + FD_ZERO(&readmask); + FD_ZERO(&writemask); + FD_ZERO(&exceptmask); + if (!connection_reuse) + FD_SET(frontend->fd, &readmask); + FD_SET(MASTER(backend)->fd, &readmask); + if (REPLICATION) + FD_SET(SECONDARY(backend)->fd, &readmask); + if (!connection_reuse) + FD_SET(frontend->fd, &exceptmask); + FD_SET(MASTER(backend)->fd, &exceptmask); + + if (connection_reuse) + { + if (REPLICATION) + fds = select(Max(SECONDARY(backend)->fd, MASTER(backend)->fd) + 1, + &readmask, &writemask, &exceptmask, NULL); + else + fds = select(MASTER(backend)->fd+1, &readmask, &writemask, &exceptmask, NULL); + } + else + { + if (REPLICATION) + fds = select(Max(SECONDARY(backend)->fd, + 
Max(frontend->fd, MASTER(backend)->fd)+1), + &readmask, &writemask, &exceptmask, NULL); + else + fds = select(Max(frontend->fd, MASTER(backend)->fd)+1, + &readmask, &writemask, &exceptmask, NULL); + } + + if (fds == -1) + { + if (errno == EINTR) + continue; + + show_error("%s:select() failed. reason: %s",func, strerror(errno)); + return POOL_ERROR; + } + + if (fds == 0) + { + return POOL_CONTINUE; + } + + if (FD_ISSET(MASTER(backend)->fd, &readmask)) + { + pool_read(MASTER(backend), &kind, 1); +#ifdef PRINT_DEBUG + show_debug("%s:read kind from backend %c", func,kind); +#endif + } + + if (REPLICATION && FD_ISSET(SECONDARY(backend)->fd, &readmask)) + { + pool_read(SECONDARY(backend), &kind1, 1); +#ifdef PRINT_DEBUG + show_debug("%s:read kind from secondary backend %c", func,kind1); +#endif + } + + if (!connection_reuse && FD_ISSET(frontend->fd, &exceptmask)) + { + return POOL_END; + } + if (FD_ISSET(MASTER(backend)->fd, &exceptmask)) + { + return POOL_ERROR; + } + + if (!connection_reuse && FD_ISSET(frontend->fd, &readmask)) + { + status = ProcessFrontendResponse(frontend, backend); + if (status != POOL_CONTINUE) + return status; + + continue; + } + } + else + { + if (MASTER(backend)->len > 0) + { + pool_read(MASTER(backend), &kind, 1); + if (REPLICATION) + { + pool_read(SECONDARY(backend), &kind1, 1); + if (kind == '\0' || kind != kind1) + { + show_error("%s: kind does not match between backends master(%c) secondary(%c)", + func, kind, kind1); + pool_send_error_message(frontend, MAJOR(backend), "XX000", + "kind mismatch between backends", "", + "check data consistency between master and secondary", __FILE__, __LINE__); + + if (pool_config_replication_stop_on_mismatch) + return POOL_FATAL; + else + return POOL_ERROR; + } + } +#ifdef PRINT_DEBUG + show_debug("%s:read kind from backend pending data %c len: %d po: %d", func, kind, MASTER(backend)->len, MASTER(backend)->po); +#endif + } + if (frontend->len > 0) + { + status = ProcessFrontendResponse(frontend, backend); 
+ if (status != POOL_CONTINUE) + return status; + + continue; + } + } + + /* this is the synchronous point */ + if (REPLICATION) + { + if (kind == 0) + { + pool_read(MASTER(backend), &kind, 1); + } + if (kind1 == 0) + { + pool_read(SECONDARY(backend), &kind1, 1); + } + if (kind == '\0' || kind != kind1) + { + show_error("%s: kind does not match between backends master(%c) secondary(%c)", + func, kind, kind1); + pool_send_error_message(frontend, MAJOR(backend), "XX000", + "kind mismatch between backends", "", + "check data consistency between master and secondary", __FILE__, __LINE__); + + if (pool_config_replication_stop_on_mismatch) + return POOL_FATAL; + else + return POOL_ERROR; + } + } + + /* + * Prrocess backend Response + */ + + if (MAJOR(backend) == PROTO_MAJOR_V3) + { + switch (kind) + { + case 'G': + /* CopyIn response */ + status = CopyInResponse(frontend, backend); + break; + case 'S': + /* Paramter Status */ + status = ParameterStatus(frontend, backend); + break; + case 'Z': + /* Ready for query */ + status = ReadyForQuery(frontend, backend, 1); + break; + default: + status = SimpleForwardToFrontend(kind, frontend, backend); + break; + } + } + else + { + switch (kind) + { + case 'A': + /* Notification response */ + status = NotificationResponse(frontend, backend); + break; + + case 'B': + /* BinaryRow */ + status = BinaryRow(frontend, backend, num_fields); + break; + + case 'C': + /* Complete command response */ + status = CompleteCommandResponse(frontend, backend); + break; + + case 'D': + /* AsciiRow */ + status = AsciiRow(frontend, backend, num_fields); + break; + + case 'E': + /* Error Response */ + status = ErrorResponse(frontend, backend); + break; + + case 'G': + /* CopyIn Response */ + status = CopyInResponse(frontend, backend); + break; + + case 'H': + /* CopyOut Response */ + status = CopyOutResponse(frontend, backend); + break; + + case 'I': + /* Empty Query Response */ + status = EmptyQueryResponse(frontend, backend); + break; + + case 'N': 
+ /* Notice Response */ + status = NoticeResponse(frontend, backend); + break; + + case 'P': + /* CursorResponse */ + status = CursorResponse(frontend, backend); + break; + + case 'T': + /* RowDescription */ + status = RowDescription(frontend, backend); + if (status < 0) + return POOL_ERROR; + + num_fields = status; + status = POOL_CONTINUE; + break; + + case 'V': + /* FunctionResultResponse and FunctionVoidResponse */ + status = FunctionResultResponse(frontend, backend); + break; + + case 'Z': + /* Ready for query */ + status = ReadyForQuery(frontend, backend, 1); + break; + + default: + show_error("%s:Unknown message type %c(%02x)",func, kind, kind); + exit(1); + } + } + + if (status != POOL_CONTINUE) + return status; + + if (kind == 'Z' && frontend->no_forward && state == 1) + { + state = 0; + } + + } + return POOL_CONTINUE; +} + +static POOL_STATUS Query(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend, char *query) +{ +#ifdef PRINT_DEBUG + char * func = "Query()"; +#endif + char *string; + int len; + static char *sq = "show pool_status"; + + if (query == NULL) + { + /* read actual query */ + if (MAJOR(backend) == PROTO_MAJOR_V3) + { + if (pool_read(frontend, &len, sizeof(len)) < 0) + return POOL_END; + len = ntohl(len) - 4; + string = pool_read2(frontend, len); + } + else + string = pool_read_string(frontend, &len, 0); + + if (string == NULL) + return POOL_END; + } + else + { + len = strlen(query)+1; + string = query; + } + +#ifdef PRINT_DEBUG + show_debug("%s: %s", func,string); +#endif + + /* process status reporting? 
*/ + if (strncasecmp(sq, string, strlen(sq)) == 0) + { +#ifdef PRINT_DEBUG + show_debug("%s:process reporting",func); +#endif + process_reporting(frontend, backend); + return POOL_CONTINUE; + } + + /* load balance trick */ + if (load_balance_enabled(backend, string)) + start_load_balance(backend); + + /* forward the query to the backend */ + pool_write(MASTER(backend), "Q", 1); + + if (MAJOR(backend) == PROTO_MAJOR_V3) + { + int sendlen = htonl(len + 4); + pool_write(MASTER(backend), &sendlen, sizeof(sendlen)); + } + + if (pool_write_and_flush(MASTER(backend), string, len) < 0) + { + return POOL_END; + } + + if (REPLICATION) + { + /* in "strict mode" we need to wait for master completing the query */ + if (pool_config_replication_strict || STRICT_MODE(string)) + if (synchronize(MASTER(backend))) + return POOL_END; + + pool_write(SECONDARY(backend), "Q", 1); + if (MAJOR(backend) == PROTO_MAJOR_V3) + { + int sendlen = htonl(len + 4); + pool_write(SECONDARY(backend), &sendlen, sizeof(sendlen)); + } + + if (pool_write_and_flush(SECONDARY(backend), string, len) < 0) + { + return POOL_END; + } + } + return POOL_CONTINUE; +} + +static POOL_STATUS ReadyForQuery(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend, int send_ready) +{ +#ifdef PRINT_DEBUG + char * func = "ReadyForQuery()"; +#endif + + pool_flush(frontend); + + if (send_ready) + { + pool_write(frontend, "Z", 1); + + if (MAJOR(backend) == PROTO_MAJOR_V3) + { + int len; + signed char state; + + if ((len = pool_read_message_length(backend)) < 0) + return POOL_END; + +#ifdef PRINT_DEBUG + show_debug("%s: message length: %d", func, len); +#endif + + len = htonl(len); + pool_write(frontend, &len, sizeof(len)); + + state = pool_read_kind(backend); + if (state < 0) + return POOL_END; + + /* set transaction state */ +#ifdef PRINT_DEBUG + show_debug("%s: transaction state: %c", func, state); +#endif + MASTER(backend)->tstate = state; + if (REPLICATION) + SECONDARY(backend)->tstate = state; + + pool_write(frontend, 
&state, 1); + } + + if (pool_flush(frontend)) + return POOL_END; + } + + /* end load balance mode */ + if (IN_LOAD_BALANCE) + end_load_balance(backend); + + return ProcessFrontendResponse(frontend, backend); +} + +static POOL_STATUS CompleteCommandResponse(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend) +{ + char * func = "CompleteCommandResponse()"; + char *string, *string1; + int len, len1; + + /* read command tag */ + string = pool_read_string(MASTER(backend), &len, 0); + if (string == NULL) + return POOL_END; + + if (REPLICATION) + { + string1 = pool_read_string(SECONDARY(backend), &len1, 0); + if (string1 == NULL) + return POOL_END; + + if (len != len1) + { + show_error("%s: message length does not match between master(%d \"%s\",) and secondary(%d \"%s\",)", + func, len, string, len1, string1); + } + } + + /* forward to the frontend */ + pool_write(frontend, "C", 1); +#ifdef PRINT_DEBUG + show_debug("%s: string: \"%s\"",func, string); +#endif + if (pool_write(frontend, string, len) < 0) + { + return POOL_END; + } + return POOL_CONTINUE; +} + +static int RowDescription(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend) +{ + char * func = "RowDescription()"; + short num_fields, num_fields1; + int oid, mod; + int oid1, mod1; + short size, size1; + char *string, *string1; + int len, len1; + int i; + + /* # of fields (could be 0) */ + pool_read(MASTER(backend), &num_fields, sizeof(short)); + if (REPLICATION) + { + pool_read(SECONDARY(backend), &num_fields1, sizeof(short)); + if (num_fields != num_fields1) + { + show_error("%s: num_fields deos not match between backends master(%d) and secondary(%d)", + func, num_fields, num_fields1); + return POOL_FATAL; + } + } + + /* forward it to the frontend */ + pool_write(frontend, "T", 1); + pool_write(frontend, &num_fields, sizeof(short)); + + num_fields = ntohs(num_fields); + for (i = 0;i 0 */ + if (size > 0) + { + buf = pool_read2(MASTER(backend), size); + if (buf == NULL) + return POOL_END; + } + } + 
+ if (REPLICATION && size1 > 0 && (mask & nullmap1[i/8])) + { + /* read and discard secondary data */ + if (pool_read2(SECONDARY(backend), size1) == NULL) + return POOL_END; + } + + if (buf) + { + pool_write(frontend, buf, size); + snprintf(msgbuf, Min(sizeof(msgbuf), size+1), "%s", buf); +#ifdef PRINT_DEBUG + show_debug("%s: len: %d data: %s", func, size, msgbuf); +#endif + } + + mask >>= 1; + } + + return POOL_CONTINUE; +} + +static POOL_STATUS BinaryRow(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend, + short num_fields) +{ + char * func = "BinaryRow()"; + static char nullmap[8192], nullmap1[8192]; + int nbytes; + int i; + unsigned char mask; + int size, size1; + char *buf; + + pool_write(frontend, "B", 1); + + nbytes = (num_fields + 7)/8; + + if (nbytes <= 0) + return POOL_CONTINUE; + + /* NULL map */ + pool_read(MASTER(backend), nullmap, nbytes); + if (pool_write(frontend, nullmap, nbytes) < 0) + return POOL_END; + + if (REPLICATION) + { + if (pool_read(SECONDARY(backend), nullmap1, nbytes) < 0) + return POOL_END; + + if (memcmp(nullmap, nullmap1, nbytes)) + { + /* XXX: NULLMAP maybe different among + backends. If we were a paranoid, we have to treat + this as a fatal error. However in the real world + we'd better to adapt this situation. Just throw a + log... 
*/ + show_error("%s: NULLMAP differ between master and secondary",func); + } + } + + mask = 0; + + for (i = 0;i 0 */ + if (size > 0) + { + buf = pool_read2(MASTER(backend), size); + if (buf == NULL) + return POOL_END; + } + } + + if (REPLICATION && size1 > 0 && (mask & nullmap1[i/8])) + { + /* read and discard secondary data */ + if (pool_read2(SECONDARY(backend), size1) == NULL) + return POOL_END; + } + + if (buf) + pool_write(frontend, buf, size); + + mask >>= 1; + } + return POOL_CONTINUE; +} + +static POOL_STATUS CursorResponse(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend) +{ + char * func = "CursorResponse()"; + char *string, *string1; + int len, len1; + + /* read cursor name */ + string = pool_read_string(MASTER(backend), &len, 0); + if (string == NULL) + return POOL_END; + if (REPLICATION) + { + string1 = pool_read_string(SECONDARY(backend), &len1, 0); + if (string1 == NULL) + return POOL_END; + if (len != len1) + { + show_error("%s: length does not match between master(%d) and secondary(%d)", + func, len, len1); + show_error("%s: master(%s) secondary(%s)", func, string, string1); + return POOL_END; + } + } + + /* forward to the frontend */ + pool_write(frontend, "P", 1); + if (pool_write(frontend, string, len) < 0) + { + return POOL_END; + } + return POOL_CONTINUE; +} + +POOL_STATUS ErrorResponse(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend) +{ + char *string; + int len; + + /* read error message */ + string = pool_read_string(MASTER(backend), &len, 0); + if (string == NULL) + return POOL_END; + if (REPLICATION) + { + string = pool_read_string(SECONDARY(backend), &len, 0); + if (string == NULL) + return POOL_END; + } + + /* forward to the frontend */ + pool_write(frontend, "E", 1); + if (pool_write_and_flush(frontend, string, len) < 0) + return POOL_END; + + return POOL_CONTINUE; +} + +static POOL_STATUS NoticeResponse(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend) +{ + char *string, *string1; + int len, len1; + + /* 
read notice message */ + string = pool_read_string(MASTER(backend), &len, 0); + if (string == NULL) + return POOL_END; + if (REPLICATION) + { + string1 = pool_read_string(SECONDARY(backend), &len1, 0); + if (string1 == NULL) + return POOL_END; + } + + /* forward to the frontend */ + pool_write(frontend, "N", 1); + if (pool_write_and_flush(frontend, string, len) < 0) + { + return POOL_END; + } + return POOL_CONTINUE; +} + +static POOL_STATUS CopyInResponse(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend) +{ + POOL_STATUS status; + + /* forward to the frontend */ + if (MAJOR(backend) == PROTO_MAJOR_V3) + { + if (SimpleForwardToFrontend('G', frontend, backend) != POOL_CONTINUE) + return POOL_END; + if (pool_flush(frontend) != POOL_CONTINUE) + return POOL_END; + } + else + if (pool_write_and_flush(frontend, "G", 1) < 0) + return POOL_END; + + status = CopyDataRows(frontend, backend, 1); + return status; +} + +static POOL_STATUS CopyOutResponse(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend) +{ + POOL_STATUS status; + + /* forward to the frontend */ + if (MAJOR(backend) == PROTO_MAJOR_V3) + { + if (SimpleForwardToFrontend('H', frontend, backend) != POOL_CONTINUE) + return POOL_END; + if (pool_flush(frontend) != POOL_CONTINUE) + return POOL_END; + } + else + if (pool_write_and_flush(frontend, "H", 1) < 0) + return POOL_END; + + status = CopyDataRows(frontend, backend, 0); + return status; +} + +static POOL_STATUS CopyDataRows(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend, int copyin) +{ +#ifdef PRINT_DEBUG + char * func = "CopyDataRows()"; +#endif + char *string; + int len; + +#ifdef PRINT_DEBUG + int i = 0; + char *buf; +#endif + + for (;;) + { + if (copyin) + { + if (MAJOR(backend) == PROTO_MAJOR_V3) + { + char kind; + POOL_STATUS status; + + if (pool_read(frontend, &kind, 1) < 0) + return POOL_END; + + status = SimpleForwardToBackend(kind, frontend, backend); + if (status == POOL_END) + return status; + + /* CopyData? 
*/ + if (kind == 'd') + continue; + else + break; + } + else + { + string = pool_read_string(frontend, &len, 1); + if (string == NULL) + return POOL_END; + } + } + else + { + /* CopyOut */ + if (MAJOR(backend) == PROTO_MAJOR_V3) + { + signed char kind; + POOL_STATUS status; + + if ((kind = pool_read_kind(backend)) < 0) + return POOL_END; + + status = SimpleForwardToFrontend(kind, frontend, backend); + if (status == POOL_END) + return status; + + /* CopyData? */ + if (kind == 'd') + continue; + else + break; + } + else + { + string = pool_read_string(MASTER(backend), &len, 1); + if (REPLICATION) + string = pool_read_string(SECONDARY(backend), &len, 1); + } + } + + if (string == NULL) + return POOL_END; + +#ifdef PRINT_DEBUG + buf = malloc(len + 1); + if (buf == NULL) + { + show_error("CopyDataRows: malloc failed: %s", strerror(errno)); + return POOL_END; + } + strncpy(buf, string, len); + buf[len] = '\0'; + show_debug("%s: copy line %d %d bytes :%s:",func, i++, len, buf); + free(buf); +#endif + + if (copyin) + { + pool_write(MASTER(backend), string, len); + if (REPLICATION) + pool_write(SECONDARY(backend), string, len); + } + else + pool_write(frontend, string, len); + + if (len == PROTO_MAJOR_V3) + { + /* end of copy? */ + if (string[0] == '\\' && + string[1] == '.' 
&& + string[2] == '\n') + { + break; + } + } + } + + if (copyin) + { + if (pool_flush(MASTER(backend)) <0) + return POOL_END; + if (REPLICATION) + { + if (pool_flush(SECONDARY(backend)) <0) + return POOL_END; + } + } + else + if (pool_flush(frontend) <0) + return POOL_END; + + return POOL_CONTINUE; +} + +static POOL_STATUS EmptyQueryResponse(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend) +{ + char c; + + if (pool_read(MASTER(backend), &c, sizeof(c)) < 0) + return POOL_END; + + if (REPLICATION) + { + if (pool_read(SECONDARY(backend), &c, sizeof(c)) < 0) + return POOL_END; + } + + pool_write(frontend, "I", 1); + return pool_write_and_flush(frontend, "", 1); +} + +static POOL_STATUS NotificationResponse(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend) +{ + int pid, pid1; + char *condition, *condition1; + int len, len1; + + pool_write(frontend, "A", 1); + + if (pool_read(MASTER(backend), &pid, sizeof(pid)) < 0) + return POOL_ERROR; + + if (REPLICATION) + { + if (pool_read(SECONDARY(backend), &pid1, sizeof(pid1)) < 0) + return POOL_ERROR; + } + + condition = pool_read_string(MASTER(backend), &len, 0); + if (condition == NULL) + return POOL_END; + if (REPLICATION) + { + condition1 = pool_read_string(SECONDARY(backend), &len1, 0); + if (condition1 == NULL) + return POOL_END; + } + + pool_write(frontend, &pid, sizeof(pid)); + + return pool_write_and_flush(frontend, condition, len); +} + +static POOL_STATUS FunctionCall(POOL_CONNECTION *frontend, + POOL_CONNECTION_POOL *backend) +{ + char dummy[2]; + int oid; + int argn; + int i; + + pool_write(MASTER(backend), "F", 1); + if (REPLICATION) + pool_write(SECONDARY(backend), "F", 1); + + /* dummy */ + if (pool_read(frontend, dummy, sizeof(dummy)) < 0) + return POOL_ERROR; + pool_write(MASTER(backend), dummy, sizeof(dummy)); + if (REPLICATION) + pool_write(SECONDARY(backend), dummy, sizeof(dummy)); + + /* function object id */ + if (pool_read(frontend, &oid, sizeof(oid)) < 0) + return POOL_ERROR; + + 
pool_write(MASTER(backend), &oid, sizeof(oid)); + if (REPLICATION) + pool_write(SECONDARY(backend), &oid, sizeof(oid)); + + /* number of arguments */ + if (pool_read(frontend, &argn, sizeof(argn)) < 0) + return POOL_ERROR; + pool_write(MASTER(backend), &argn, sizeof(argn)); + if (REPLICATION) + pool_write(SECONDARY(backend), &argn, sizeof(argn)); + + argn = ntohl(argn); + + for (i=0;ilen <= 0 && frontend->no_forward != 0) + return POOL_CONTINUE; + + if (pool_read(frontend, &fkind, 1) < 0) + { + show_error("%s: failed to read kind",func); + return POOL_END; + } + +#ifdef PRINT_DEBUG + show_debug("%s:read kind from frontend %c(%02x)", func, fkind, fkind); +#endif + + switch (fkind) + { + case 'X': + if (MAJOR(backend) == PROTO_MAJOR_V3) + { + int len; + pool_read(frontend, &len, sizeof(len)); + } + status = POOL_END; + break; + + case 'Q': + status = Query(frontend, backend, NULL); + break; + + default: + if (MAJOR(backend) == PROTO_MAJOR_V3) + { + status = SimpleForwardToBackend(fkind, frontend, backend); + if (pool_flush(MASTER(backend))) + status = POOL_ERROR; + if (REPLICATION) + if (pool_flush(SECONDARY(backend))) + status = POOL_ERROR; + } + else if (MAJOR(backend) == PROTO_MAJOR_V2 && fkind == 'F') + status = FunctionCall(frontend, backend); + else + { + show_error("%s: unknown message type %c(%02x)", func, fkind, fkind); + status = POOL_ERROR; + } + break; + } + + return status; +} + +static int timeoutmsec; +/* + * enable read timeout + */ +void pool_enable_timeout(void) +{ + timeoutmsec = pool_config_replication_timeout; +} + +/* + * disable read timeout + */ +void pool_disable_timeout(void) +{ + timeoutmsec = 0; +} + +/* + * wait until read data is ready + */ +static int synchronize(POOL_CONNECTION *cp) +{ + return pool_check_fd(cp, 1); +} + +/* + * wait until read data is ready + * if notimeout is non 0, wait forever. 
+ */ +int pool_check_fd(POOL_CONNECTION *cp, int notimeout) +{ + char * func = "pool_check_fd()"; + fd_set readmask; + fd_set exceptmask; + int fd; + int fds; + struct timeval timeout; + struct timeval *tp; + + fd = cp->fd; + + for (;;) + { + FD_ZERO(&readmask); + FD_ZERO(&exceptmask); + FD_SET(fd, &readmask); + FD_SET(fd, &exceptmask); + + if (notimeout || timeoutmsec == 0) + tp = NULL; + else + { + timeout.tv_sec = 0; + timeout.tv_usec = pool_config_replication_timeout*1000; + tp = &timeout; + } + + fds = select(fd+1, &readmask, NULL, &exceptmask, tp); + + if (fds == -1) + { + if (errno == EAGAIN || errno == EINTR) + continue; + + show_error("%s: select() failed. reason %s",func, strerror(errno)); + break; + } + + if (FD_ISSET(fd, &exceptmask)) + { + show_error("%s: exception occurred",func); + break; + } + + if (fds == 0) + { + show_error("%s: data is not ready tp->tv_sec %d tp->tp_usec %d", func, tp->tv_sec, tp->tv_usec); + break; + } + return 0; + } + return -1; +} + +static void process_reporting(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend) +{ + static char *cursorname = "blank"; + static short num_fields = 3; + static char *field_names[] = {"item", "value", "description"}; + static int oid = 0; + static short fsize = -1; + static int mod = 0; + short n; + int i; + short s; + int len; + short colnum; + + static char nullmap[2] = {0xff, 0xff}; + int nbytes = (num_fields + 7)/8; + +#define MAXVALLEN 512 + + typedef struct { + char *name; + char value[MAXVALLEN+1]; + char *desc; + } POOL_REPORT_STATUS; + +#define MAXITEMS 128 + + POOL_REPORT_STATUS status[MAXITEMS]; + + short nrows; + int size; + int hsize; + + i = 0; + + status[i].name = "inetdomain"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_inetdomain); + status[i].desc = "1 if accepting TCP/IP connection"; + i++; + + status[i].name = "port"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_port); + status[i].desc = "pgpool accepting port number"; + i++; + + status[i].name 
= "socket_dir"; + snprintf(status[i].value, MAXVALLEN, "%s", pool_config_socket_dir); + status[i].desc = "pgpool socket directory"; + i++; + + status[i].name = "backend_host_name"; + snprintf(status[i].value, MAXVALLEN, "%s", pool_config_backend_host_name); + status[i].desc = "master backend host name"; + i++; + + status[i].name = "backend_port"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_backend_port); + status[i].desc = "master backend port number"; + i++; + + status[i].name = "secondary_backend_host_name"; + snprintf(status[i].value, MAXVALLEN, "%s", pool_config_secondary_backend_host_name); + status[i].desc = "secondary backend host name"; + i++; + + status[i].name = "secondary_backend_port"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_secondary_backend_port); + status[i].desc = "secondary backend port number"; + i++; + + status[i].name = "num_init_children"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_num_init_children); + status[i].desc = "# of children initially pre-forked"; + i++; + + status[i].name = "child_life_time"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_child_life_time); + status[i].desc = "if idle for this seconds, child exits (not implemented yet)"; + i++; + + status[i].name = "connection_life_time"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_connection_life_time); + status[i].desc = "if idle for this seconds, connection closes"; + i++; + + status[i].name = "max_pool"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_max_pool); + status[i].desc = "max # of connection pool per child"; + i++; + + status[i].name = "logdir"; + snprintf(status[i].value, MAXVALLEN, "%s", pool_config_logdir); + status[i].desc = "logging directory"; + i++; + + status[i].name = "backend_socket_dir"; + snprintf(status[i].value, MAXVALLEN, "%s", pool_config_backend_socket_dir); + status[i].desc = "Unix domain socket directory for the PostgreSQL server"; + i++; + + status[i].name = 
"replication_mode"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_replication_mode); + status[i].desc = "non 0 if operating in replication mode"; + i++; + + status[i].name = "replication_strict"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_replication_strict); + status[i].desc = "non 0 if operating in strict mode"; + i++; + + status[i].name = "replication_timeout"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_replication_timeout); + status[i].desc = "if secondary does not respond in this milli seconds, abort the session"; + i++; + + status[i].name = "current_backend_host_name"; + snprintf(status[i].value, MAXVALLEN, "%s", pool_config_current_backend_host_name); + status[i].desc = "current master host name"; + i++; + + status[i].name = "current_backend_port"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_current_backend_port); + status[i].desc = "current master port #"; + i++; + + status[i].name = "replication_enabled"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_replication_enabled); + status[i].desc = "non 0 if actually operating in replication mode"; + i++; + + status[i].name = "load_balance_mode"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_load_balance_mode); + status[i].desc = "non 0 if operating in load balancing mode"; + i++; + + status[i].name = "replication_stop_on_mismatch"; + snprintf(status[i].value, MAXVALLEN, "%d", pool_config_replication_stop_on_mismatch); + status[i].desc = "stop replication mode on fatal error"; + i++; + + nrows = i; + + if (MAJOR(backend) == PROTO_MAJOR_V2) + { + /* cursor response */ + pool_write(frontend, "P", 1); + pool_write(frontend, cursorname, strlen(cursorname)+1); + } + + /* row description */ + pool_write(frontend, "T", 1); + + if (MAJOR(backend) == PROTO_MAJOR_V3) + { + len = sizeof(num_fields) + sizeof(len); + + for (i=0;iwrite_fd); + + if (REPLICATION) + { + pool_write(SECONDARY(backend), "X", 1); + if (MAJOR(backend) == PROTO_MAJOR_V3) + { + len 
= htonl(4); + pool_write(MASTER(backend), &len, sizeof(len)); + } + fflush(SECONDARY(backend)->write_fd); + } +} + +/* + * ------------------------------------------------------- + * V3 functions + * ------------------------------------------------------- + */ +POOL_STATUS SimpleForwardToFrontend(char kind, POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend) +{ + char * func = "SimpleForwardToFrontend()"; + int len, len1; + char *p; + int status; + + pool_write(frontend, &kind, 1); + + status = pool_read(MASTER(backend), &len, sizeof(len)); + if (status < 0) + { + show_error("%s: error while reading message length",func); + return POOL_END; + } + + if (REPLICATION) + { + status = pool_read(SECONDARY(backend), &len1, sizeof(len1)); + if (status < 0) + { + show_error("%s: error while reading message length from secondary backend",func); + return POOL_END; + } + + if (len != len1) + { + show_error("%s: length does not match between backends master(%d) secondary(%d) kind:(%c)", + func, ntohl(len), ntohl(len1), kind); + } + } + + pool_write(frontend, &len, sizeof(len)); + + len = ntohl(len); + len -= 4; + + p = pool_read2(MASTER(backend), len); + if (p == NULL) + return POOL_END; + + if (REPLICATION) + { + len1 = ntohl(len1); + len1 -= 4; + if (pool_read2(SECONDARY(backend), len1) == NULL) + return POOL_END; + } + + return pool_write(frontend, p, len); +} + +POOL_STATUS SimpleForwardToBackend(char kind, POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend) +{ + int len; + int sendlen; + char *p; + + if (pool_write(MASTER(backend), &kind, 1)) + return POOL_END; + if (REPLICATION) + if (pool_write(SECONDARY(backend), &kind, 1)) + return POOL_END; + + if (pool_read(frontend, &sendlen, sizeof(sendlen))) + { + return POOL_END; + } + + len = ntohl(sendlen) - 4; + + p = pool_read2(frontend, len); + if (p == NULL) + return POOL_END; + + if (pool_write(MASTER(backend), &sendlen, sizeof(sendlen))) + return POOL_END; + if (pool_write(MASTER(backend), p, len)) + return 
POOL_END; + + if (REPLICATION) + { + if (pool_write(SECONDARY(backend), &sendlen, sizeof(sendlen))) + return POOL_END; + if (pool_write(SECONDARY(backend), p, len)) + return POOL_END; + } + + return POOL_CONTINUE; +} + +POOL_STATUS ParameterStatus(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend) +{ +#ifdef PRINT_DEBUG + char * func = "ParameterStatus()"; +#endif + int len; + int sendlen; + char *p; + char *name; + char *value; + + pool_write(frontend, "S", 1); + + len = pool_read_message_length(backend); + if (len < 0) + { + return POOL_END; + } + + sendlen = htonl(len); + pool_write(frontend, &sendlen, sizeof(sendlen)); + + len -= 4; + + p = pool_read2(MASTER(backend), len); + if (p == NULL) + return POOL_END; + + name = p; + value = p + strlen(name) + 1; + +#ifdef PRINT_DEBUG + show_debug("%s:name: %s value: %s",func, name, value); +#endif + + pool_add_param(&MASTER(backend)->params, name, value); + +#ifdef PRINT_DEBUG + pool_param_debug_print(&MASTER(backend)->params); +#endif + + if (REPLICATION) + if (pool_read2(SECONDARY(backend), len) == NULL) + return POOL_END; + + return pool_write(frontend, p, len); + +} + +/* + * reset backend status. 
return values are:
+ * 0: no query was issued 1: a query was issued 2: no more queries remain -1: error
+ */
+static int reset_backend(POOL_CONNECTION_POOL *backend, int qcnt)
+{
+#ifdef NO_RESET_ALL
+	static char *queries[] = {"ABORT"};
+#else
+	static char *queries[] = {"ABORT", "RESET ALL"};
+#endif
+
+	char *query;
+	int qn = sizeof(queries)/sizeof(char *);
+
+	/* for PGCluster */
+	if (!Use_Connection_Pool)
+		return 2;
+
+	/* qcnt indexes queries[]; past the end means nothing is left to send */
+	if (qcnt >= qn)
+		return 2;
+
+	query = queries[qcnt];
+
+	/* if transaction state is idle, we don't need to issue ABORT */
+	if (TSTATE(backend) == 'I' && !strcmp("ABORT", query))
+		return 0;
+
+	/* no frontend is involved here, hence the NULL first argument */
+	if (Query(NULL, backend, query) != POOL_CONTINUE)
+		return -1;
+
+	return 1;
+}
+
+/*
+ * return non 0 if load balance is possible
+ *
+ * All of the following must hold: load balance mode is configured,
+ * replication is on, the session speaks protocol V3, the transaction
+ * state is idle ('I') and the statement text starts with "SELECT"
+ * (case insensitive, no leading white space skipped).
+ */
+static int load_balance_enabled(POOL_CONNECTION_POOL *backend, char *sql)
+{
+	if (pool_config_load_balance_mode &&
+		REPLICATION &&
+		MAJOR(backend) == PROTO_MAJOR_V3 &&
+		TSTATE(backend) == 'I' &&
+		!strncasecmp(sql, "SELECT", 6))
+		return 1;
+	return 0;
+}
+
+/*
+ * start load balance mode
+ *
+ * Saves every connection slot of backend into slots[] and then puts a
+ * randomly chosen one into slot 0.  end_load_balance() undoes this.
+ */
+static void start_load_balance(POOL_CONNECTION_POOL *backend)
+{
+#ifdef PRINT_DEBUG
+	char * func = "start_load_balance()";
+#endif
+	int i;
+	int master;
+
+	/* save backend connection slots */
+	for (i=0;i<backend->num;i++)
+	{
+		slots[i] = backend->slots[i];
+	}
+
+	/* temporary turn off replication mode */
+	/*REPLICATION = 0; */
+
+	/* choose a master in random manner */
+	master = random() % backend->num;
+	backend->slots[0] = slots[master];
+#ifdef PRINT_DEBUG
+	show_debug("%s: selected master is %d", func,master);
+#endif
+
+	/* start load balancing */
+	/*in_load_balance = 1;*/
+}
+
+/*
+ * finish load balance mode
+ *
+ * Copies the slots saved by start_load_balance() back into backend.
+ */
+static void end_load_balance(POOL_CONNECTION_POOL *backend)
+{
+	int i;
+
+	/* restore backend connection slots */
+	for (i=0;i<backend->num;i++)
+	{
+		backend->slots[i] = slots[i];
+	}
+
+	/* turn on replication mode */
+	/* REPLICATION = 1; */
+
+	/*in_load_balance = 0;*/
+#ifdef PRINT_DEBUG
+	show_debug("end_load_balance: end load balance mode");
+#endif
+}
+
+/*
+ * send error message to frontend
+ *
+ * protoMajor selects the wire format:
+ *   PROTO_MAJOR_V2: 'E' followed by message as a NUL terminated string;
+ *                   code, detail, hint, file and line are not sent.
+ *   PROTO_MAJOR_V3: 'E', a 4 byte length and a list of fields, each one a
+ *                   type byte plus a NUL terminated string (S, C, M,
+ *                   optionally D and H, then F and L), closed by one
+ *                   zero byte.
+ * Any other value is only logged.
+ *
+ * detail and hint are skipped when they are empty strings; they must not
+ * be NULL.  Each field is limited to MAXMSGBUF-1 bytes including its type
+ * byte; longer text is truncated.
+ */
+void pool_send_error_message(POOL_CONNECTION *frontend, int protoMajor,
+							 char *code,
+							 char *message,
+							 char *detail,
+							 char *hint,
+							 char *file,
+							 int line)
+{
+#define MAXDATA	1024
+#define MAXMSGBUF 128
+	char * func = "pool_send_error_message()";
+
+	if (protoMajor == PROTO_MAJOR_V2)
+	{
+		pool_write(frontend, "E", 1);
+		pool_write_and_flush(frontend, message, strlen(message)+1);
+	}
+	else if (protoMajor == PROTO_MAJOR_V3)
+	{
+		/*
+		 * At most seven fields of MAXMSGBUF bytes each plus the final
+		 * zero byte are stored, which fits into MAXDATA.
+		 */
+		char data[MAXDATA];
+		char msgbuf[MAXMSGBUF];
+		int len;
+		int thislen;
+		int sendlen;
+
+		len = 0;
+
+		pool_write(frontend, "E", 1);
+
+		/* error level */
+		thislen = snprintf(msgbuf, MAXMSGBUF, "SERROR");
+		memcpy(data +len, msgbuf, thislen+1);
+		len += thislen + 1;
+
+		/*
+		 * snprintf() returns the length the untruncated text would have
+		 * had, so for caller supplied strings the value is clamped to
+		 * what msgbuf really holds before it is used as a copy length.
+		 */
+
+		/* code */
+		thislen = snprintf(msgbuf, MAXMSGBUF, "C%s", code);
+		if (thislen >= MAXMSGBUF)
+			thislen = MAXMSGBUF - 1;
+		memcpy(data +len, msgbuf, thislen+1);
+		len += thislen + 1;
+
+		/* message */
+		thislen = snprintf(msgbuf, MAXMSGBUF, "M%s", message);
+		if (thislen >= MAXMSGBUF)
+			thislen = MAXMSGBUF - 1;
+		memcpy(data +len, msgbuf, thislen+1);
+		len += thislen + 1;
+
+		/* detail */
+		if (*detail != '\0')
+		{
+			thislen = snprintf(msgbuf, MAXMSGBUF, "D%s", detail);
+			if (thislen >= MAXMSGBUF)
+				thislen = MAXMSGBUF - 1;
+			memcpy(data +len, msgbuf, thislen+1);
+			len += thislen + 1;
+		}
+
+		/* hint */
+		if (*hint != '\0')
+		{
+			thislen = snprintf(msgbuf, MAXMSGBUF, "H%s", hint);
+			if (thislen >= MAXMSGBUF)
+				thislen = MAXMSGBUF - 1;
+			memcpy(data +len, msgbuf, thislen+1);
+			len += thislen + 1;
+		}
+
+		/* file */
+		thislen = snprintf(msgbuf, MAXMSGBUF, "F%s", file);
+		if (thislen >= MAXMSGBUF)
+			thislen = MAXMSGBUF - 1;
+		memcpy(data +len, msgbuf, thislen+1);
+		len += thislen + 1;
+
+		/* line */
+		thislen = snprintf(msgbuf, MAXMSGBUF, "L%d", line);
+		memcpy(data +len, msgbuf, thislen+1);
+		len += thislen + 1;
+
+		/*
+		 * stop null: store the terminator in the first free byte and
+		 * only then count it, so that the last byte sent is the zero
+		 * and not an uninitialized one.
+		 */
+		data[len] = '\0';
+		len++;
+
+		sendlen = len;
+		/* the length word counts itself (4 bytes) plus the body */
+		len = htonl(len + 4);
+		pool_write(frontend, &len, sizeof(len));
+		pool_write_and_flush(frontend, data, sendlen);
+	}
+	else
+		show_error("%s: unknown protocol major %d",func, protoMajor);
+}
diff -aruN
postgresql-8.2.4/src/pgcluster/pglb/pool_stream.c pgcluster-1.7.0rc7/src/pgcluster/pglb/pool_stream.c --- postgresql-8.2.4/src/pgcluster/pglb/pool_stream.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pglb/pool_stream.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,584 @@ +/*-------------------------------------------------------------------- + * FILE: + * pool_stream.c + * + * NOTE: + * stream I/O modules + * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + * Portions Copyright (c) 2003-2006, Tatsuo Ishii + *-------------------------------------------------------------------- + */ +/* +* Permission to use, copy, modify, and distribute this software and +* its documentation for any purpose and without fee is hereby +* granted, provided that the above copyright notice appear in all +* copies and that both that copyright notice and this permission +* notice appear in supporting documentation, and that the name of the +* author not be used in advertising or publicity pertaining to +* distribution of the software without specific, written prior +* permission. The author makes no representations about the +* suitability of this software for any purpose. It is provided "as +* is" without express or implied warranty. 
+*/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "postgres_fe.h"
+#include "libpq/pqcomm.h"
+#include "replicate_com.h"
+#include "pglb.h"
+
+#define READBUFSZ 1024
+
+POOL_CONNECTION *pool_open(int fd);
+void pool_close(POOL_CONNECTION *cp);
+int pool_read(POOL_CONNECTION *cp, void *buf, int len);
+char *pool_read2(POOL_CONNECTION *cp, int len);
+int pool_write(POOL_CONNECTION *cp, void *buf, int len);
+int pool_flush(POOL_CONNECTION *cp);
+int pool_write_and_flush(POOL_CONNECTION *cp, void *buf, int len);
+char *pool_read_string(POOL_CONNECTION *cp, int *len, int line);
+
+static int mystrlen(char *str, int upper, int *flag);
+static int mystrlinelen(char *str, int upper, int *flag);
+static int save_pending_data(POOL_CONNECTION *cp, void *data, int len);
+static int consume_pending_data(POOL_CONNECTION *cp, void *data, int len);
+
+
+/*
+* open read/write file descriptors.
+*
+* fd is stored as cp->fd and additionally wrapped in a stdio stream
+* (cp->write_fd, mode "w").  A pending data buffer of READBUFSZ bytes is
+* allocated as well.
+*
+* returns POOL_CONNECTION on success otherwise NULL.
+* On failure everything allocated here is released again; fd itself is
+* left open for the caller.
+*/
+POOL_CONNECTION *pool_open(int fd)
+{
+	POOL_CONNECTION *cp;
+
+	cp = (POOL_CONNECTION *)malloc(sizeof(POOL_CONNECTION));
+	if (cp == NULL)
+	{
+		show_error("pool_open: malloc failed: %s", strerror(errno));
+		return NULL;
+	}
+
+	memset(cp, 0, sizeof(*cp));
+
+	/*
+	 * initialize pending data buffer.  This is done before fdopen()
+	 * because a stream cannot be released without closing fd, while
+	 * plain memory can simply be freed on the error paths below.
+	 */
+	cp->hp = malloc(READBUFSZ);
+	if (cp->hp == NULL)
+	{
+		show_error("pool_open: malloc failed");
+		free(cp);
+		return NULL;
+	}
+
+	cp->write_fd = fdopen(fd, "w");
+	if (cp->write_fd == NULL)
+	{
+		show_error("pool_open: fdopen failed: %s",strerror(errno));
+		free(cp->hp);
+		free(cp);
+		return NULL;
+	}
+
+	cp->bufsz = READBUFSZ;
+	cp->po = 0;
+	cp->len = 0;
+	cp->sbuf = NULL;
+	cp->sbufsz = 0;
+	/* buf2/bufsz2 is the buffer pool_read2() grows on demand */
+	cp->buf2 = NULL;
+	cp->bufsz2 = 0;
+
+	cp->fd = fd;
+	return cp;
+}
+
+/*
+* close read/write file descriptors.
+*/
+void pool_close(POOL_CONNECTION *cp)
+{
+	/*
+	 * NOTE(review): cp->write_fd is a stdio stream opened on cp->fd (see
+	 * pool_open), so closing the descriptor first means fclose() can no
+	 * longer flush buffered output and then closes an already closed
+	 * descriptor.  The order is kept as is because callers may rely on
+	 * close never blocking -- confirm before changing it.
+	 */
+	close(cp->fd);
+	fclose(cp->write_fd);
+	free(cp->hp);
+	if (cp->sbuf)
+		free(cp->sbuf);
+	if (cp->buf2)
+		free(cp->buf2);
+	pool_discard_params(&cp->params);
+	free(cp);
+}
+
+/*
+* read len bytes from cp
+*
+* Pending data saved by an earlier call is consumed first; the rest is
+* read from cp->fd in chunks of up to READBUFSZ bytes.  Bytes received
+* beyond len are stored with save_pending_data() for the next reader.
+*
+* On a read error or EOF the process exits when cp is a backend
+* connection (after notice_backend_error()); otherwise -1 is returned.
+*
+* returns 0 on success otherwise -1.
+*/
+int pool_read(POOL_CONNECTION *cp, void *buf, int len)
+{
+	static char readbuf[READBUFSZ];
+
+	/* byte cursor into buf; arithmetic on void * is not standard C */
+	char *dst = (char *) buf;
+	int consume_size;
+	int readlen;
+
+	consume_size = consume_pending_data(cp, dst, len);
+	len -= consume_size;
+	dst += consume_size;
+
+	while (len > 0)
+	{
+		if (cp->issecondary_backend)
+		{
+			if (pool_check_fd(cp, 0))
+			{
+				show_error("pool_read: secondary data is not ready. abort this session");
+				exit(1);
+			}
+		}
+
+		readlen = read(cp->fd, readbuf, READBUFSZ);
+		if (readlen == -1)
+		{
+			show_error("pool_read: read failed (%s)", strerror(errno));
+
+			if (cp->isbackend)
+			{
+				/* fatal error, notice to parent and exit */
+				notice_backend_error();
+				exit(1);
+			}
+			else
+			{
+				return -1;
+			}
+		}
+		else if (readlen == 0)
+		{
+			show_error("pool_read: EOF encountered");
+
+			if (cp->isbackend)
+			{
+				/* fatal error, notice to parent and exit */
+				notice_backend_error();
+				exit(1);
+			}
+			else
+			{
+				/*
+				 * if backend offers authentication method, frontend could close connection
+				 */
+				return -1;
+			}
+		}
+
+		if (len < readlen)
+		{
+			/* overrun. we need to save remaining data to pending buffer */
+			if (save_pending_data(cp, readbuf+len, readlen-len))
+				return -1;
+			memmove(dst, readbuf, len);
+			break;
+		}
+
+		memmove(dst, readbuf, readlen);
+		dst += readlen;
+		len -= readlen;
+	}
+
+	return 0;
+}
+
+/*
+* read exactly len bytes from cp
+* returns buffer address on success otherwise NULL.
+*/
+char *pool_read2(POOL_CONNECTION *cp, int len)
+{
+	char *cursor;
+	int needed;
+	int got;
+
+	/* make sure cp->buf2 can hold the pending bytes plus the request */
+	needed = cp->len + len;
+	if (needed > cp->bufsz2)
+	{
+		int grown = ((needed+1)/READBUFSZ+1)*READBUFSZ;
+
+		cp->buf2 = realloc(cp->buf2, grown);
+		if (cp->buf2 == NULL)
+		{
+			show_error("pool_read2: failed to realloc");
+			exit(1);
+		}
+		cp->bufsz2 = grown;
+	}
+
+	/* start with whatever is already pending */
+	cursor = cp->buf2;
+	got = consume_pending_data(cp, cursor, len);
+	cursor += got;
+	len -= got;
+
+	/* then read straight into buf2 until the request is satisfied */
+	for (; len > 0; cursor += got, len -= got)
+	{
+		if (cp->issecondary_backend && pool_check_fd(cp, 0))
+		{
+			show_error("pool_read2: secondary data is not ready. abort this session");
+			exit(1);
+		}
+
+		got = read(cp->fd, cursor, len);
+		if (got > 0)
+			continue;
+
+		if (got == -1)
+			show_error("pool_read2: read failed (%s)", strerror(errno));
+		else if (got == 0)
+			show_error("pool_read2: EOF encountered");
+
+		if (got == -1 || got == 0)
+		{
+			if (cp->isbackend)
+			{
+				/* fatal error, notice to parent and exit */
+				notice_backend_error();
+				exit(1);
+			}
+
+			/*
+			 * frontend side: report failure to the caller (on EOF the
+			 * frontend may simply have closed the connection)
+			 */
+			return NULL;
+		}
+	}
+
+	return cp->buf2;
+}
+
+/*
+* write len bytes from cp
+* returns 0 on success otherwise -1.
+*/ +int pool_write(POOL_CONNECTION *cp, void *buf, int len) +{ + if (!cp->no_forward) + fwrite(buf, len, 1, cp->write_fd); + + return 0; +} + +/* +* flush write buffer +*/ +int pool_flush(POOL_CONNECTION *cp) +{ + if (fflush(cp->write_fd) != 0) + { + show_error("pool_flush: fflush failed (%s)", strerror(errno)); + + if (cp->isbackend) + { + notice_backend_error(); + exit(1); + } + else + { + return -1; + } + } + return 0; +} + +/* +* combo of pool_write and pool_flush +*/ +int pool_write_and_flush(POOL_CONNECTION *cp, void *buf, int len) +{ + if (pool_write(cp, buf, len)) + return -1; + return pool_flush(cp); +} + +/* + * read a string until EOF or NULL is encountered. + * if line is not 0, read until new line is encountered. +*/ +char *pool_read_string(POOL_CONNECTION *cp, int *len, int line) +{ + int readp; + int readsize; + int readlen; + int strlength; + int flag; + int consume_size; + +#ifdef DEBUG + static char pbuf[READBUFSZ]; +#endif + + *len = 0; + readp = 0; + + /* initialize read buffer */ + if (cp->sbufsz == 0) + { + cp->sbuf = malloc(READBUFSZ); + if (cp->sbuf == NULL) + { + show_error("pool_read_string: malloc failed"); + return NULL; + } + cp->sbufsz = READBUFSZ; + *cp->sbuf = '\0'; + } + + /* any pending data? */ + if (cp->len) + { + if (line) + strlength = mystrlinelen(cp->hp+cp->po, cp->len, &flag); + else + strlength = mystrlen(cp->hp+cp->po, cp->len, &flag); + + /* buffer is too small? */ + if ((strlength + 1) > cp->sbufsz) + { + cp->sbufsz = ((strlength+1)/READBUFSZ+1)*READBUFSZ; + cp->sbuf = realloc(cp->sbuf, cp->sbufsz); + if (cp->sbuf == NULL) + { + show_error("pool_read_string: realloc failed"); + return NULL; + } + } + + /* consume pending and save to read string buffer */ + consume_size = consume_pending_data(cp, cp->sbuf, strlength); + + *len = strlength; + + /* is the string null terminated? */ + if (consume_size == strlength && !flag) + { + /* not null or line terminated. 
+ * we need to read more since we have not encountered NULL or new line yet + */ + readsize = cp->sbufsz - strlength; + readp = strlength; + } + else + { +#ifdef PRINT_DEBUG + show_debug("pool_read_string: read all from pending data. po:%d len:%d", + cp->po, cp->len); +#endif + return cp->sbuf; + } + } else + { + readsize = cp->sbufsz; + } + + + for (;;) + { + readlen = read(cp->fd, cp->sbuf+readp, readsize); + if (readlen == -1) + { + show_error("pool_read_string: read() failed. reason:%s", strerror(errno)); + + if (cp->isbackend) + { + notice_backend_error(); + exit(1); + } + else + { + return NULL; + } + } + + if (readlen == 0) + return NULL; + + /* check overrun */ + if (line) + strlength = mystrlinelen(cp->sbuf+readp, readlen, &flag); + else + strlength = mystrlen(cp->sbuf+readp, readlen, &flag); + + if (strlength < readlen) + { + save_pending_data(cp, cp->sbuf+readp+strlength, readlen-strlength); + *len += strlength; +#ifdef PRINT_DEBUG + show_debug("pool_read_string: total result %d with pending data po:%d len:%d", *len, cp->po, cp->len); +#endif + return cp->sbuf; + } + + *len += readlen; + + /* encountered null or newline? */ + if (flag) + { + /* ok we have read all data */ +#ifdef PRINT_DEBUG + show_debug("pool_read_string: total result %d ", *len); +#endif + break; + } + + readp += readlen; + readsize = READBUFSZ; + + if ((*len+readsize) > cp->sbufsz) + { + cp->sbufsz += READBUFSZ; + + cp->sbuf = realloc(cp->sbuf, cp->sbufsz); + if (cp->sbuf == NULL) + { + show_error("pool_read_string: realloc failed"); + return NULL; + } + } + } + return cp->sbuf; +} + +/* + * returns the byte length of str, including \0, no more than upper. + * if encountered \0, flag is set to non 0. 
+ * example: + * mystrlen("abc", 2) returns 2 + * mystrlen("abc", 3) returns 3 + * mystrlen("abc", 4) returns 4 + * mystrlen("abc", 5) returns 4 + */ +static int mystrlen(char *str, int upper, int *flag) +{ + int len; + + *flag = 0; + + for (len = 0;len < upper; len++, str++) + { + if (!*str) + { + len++; + *flag = 1; + break; + } + } + return len; +} + +/* + * returns the byte length of str terminated by \n or \0 (including \n or \0), no more than upper. + * if encountered \0 or \n, flag is set to non 0. + * example: + * mystrlinelen("abc", 2) returns 2 + * mystrlinelen("abc", 3) returns 3 + * mystrlinelen("abc", 4) returns 4 + * mystrlinelen("abc", 5) returns 4 + * mystrlinelen("abcd\nefg", 4) returns 4 + * mystrlinelen("abcd\nefg", 5) returns 5 + * mystrlinelen("abcd\nefg", 6) returns 5 + */ +static int mystrlinelen(char *str, int upper, int *flag) +{ + int len; + + *flag = 0; + + for (len = 0;len < upper; len++, str++) + { + if (!*str || *str == '\n') + { + len++; + *flag = 1; + break; + } + } + return len; +} + +/* + * save pending data + */ +static int save_pending_data(POOL_CONNECTION *cp, void *data, int len) +{ + int reqlen; + size_t realloc_size; + char *p; + + /* to be safe */ + if (cp->len == 0) + cp->po = 0; + + reqlen = cp->po + cp->len + len; + + /* pending buffer is enough? */ + if (reqlen > cp->bufsz) + { + /* too small, enlarge it */ + realloc_size = (reqlen/READBUFSZ+1)*READBUFSZ; + p = realloc(cp->hp, realloc_size); + if (p == NULL) + { + show_error("save_pending_data: realloc failed"); + return -1; + } + + cp->bufsz = realloc_size; + cp->hp = p; + } + + memmove(cp->hp + cp->po + cp->len, data, len); + cp->len += len; + + return 0; +} + +/* + * consume pending data. returns actually consumed data length. 
+ */ +static int consume_pending_data(POOL_CONNECTION *cp, void *data, int len) +{ + int consume_size; + + if (cp->len <= 0) + return 0; + + consume_size = Min(len, cp->len); + memmove(data, cp->hp + cp->po, consume_size); + cp->len -= consume_size; + + if (cp->len <= 0) + cp->po = 0; + else + cp->po += consume_size; + + return consume_size; +} diff -aruN postgresql-8.2.4/src/pgcluster/pglb/recovery.c pgcluster-1.7.0rc7/src/pgcluster/pglb/recovery.c --- postgresql-8.2.4/src/pgcluster/pglb/recovery.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pglb/recovery.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,262 @@ +/*-------------------------------------------------------------------- + * FILE: + * recovery.c + * + * NOTE: + * This file is composed of the functions to call with the source + * at pglb for the recovery. + * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + *-------------------------------------------------------------------- + */ +/* + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that the above copyright notice appear in all + * copies and that both that copyright notice and this permission + * notice appear in supporting documentation, and that the name of the + * author not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. The author makes no representations about the + * suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. 
+ * +*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_NETINET_TCP_H +#include +#endif + +#ifdef HAVE_SYS_SELECT_H +#include +#endif +#include "replicate_com.h" +#include "pglb.h" + + +/*-------------------------------------- + * PROTOTYPE DECLARATION + *-------------------------------------- + */ +void PGRrecovery_main(int fork_wait_time); + +static int set_recovery(RecoveryPacket *packet); +static int receive_recovery(int fd); + + +/*-------------------------------------------------------------------- + * SYMBOL + * PGRrecovery_main() + * NOTES + * main module of recovery function + * ARGS + * void + * RETURN + * none + *-------------------------------------------------------------------- + */ +void +PGRrecovery_main(int fork_wait_time) +{ + char * func = "PGRrecovery_main()"; + int fd = -1; + int rtn; + pid_t pgid = 0; + pid_t pid = 0; + + pgid = getpgid(0); + pid = fork(); + if (pid != 0) + { + return; + } + + PGRsignal(SIGCHLD, SIG_DFL); + PGRsignal(SIGHUP, PGRexit_subprocess); + PGRsignal(SIGINT, PGRexit_subprocess); + PGRsignal(SIGQUIT, PGRexit_subprocess); + PGRsignal(SIGTERM, PGRexit_subprocess); + PGRsignal(SIGPIPE, SIG_IGN); + /* + * in child process, + * call recovery module + */ + setpgid(0,pgid); + + if (fork_wait_time > 0) { +#ifdef PRINT_DEBUG + show_debug("recovery process: wait fork(): pid = %d", getpid()); +#endif + sleep(fork_wait_time); + } + + fd = PGRcreate_recv_socket(ResolvedName, Recovery_Port_Number); + if (fd < 0) + { + show_error("%s:PGRcreate_recv_socket failed",func); + exit(1); + } + + for (;;) + { + fd_set rmask; + struct timeval timeout; + + timeout.tv_sec = 60; + timeout.tv_usec = 0; + + /* + * Wait for something to happen. 
+ */ + FD_ZERO(&rmask); + FD_SET(fd,&rmask); + rtn = select(fd+1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout); + if (rtn && FD_ISSET(fd, &rmask)) + { + receive_recovery(fd); + } + } +} + +/*-------------------------------------------------------------------- + * SYMBOL + * set_recovery() + * NOTES + * check a recovery request from replication server + * ARGS + * void + * RETURN + * none + *-------------------------------------------------------------------- + */ +static int +set_recovery(RecoveryPacket *packet) +{ +#ifdef PRINT_DEBUG + char * func = "set_recovery()"; +#endif + int status = STATUS_OK; + ClusterTbl key; + ClusterTbl * ptr; + + PGRset_key_of_cluster(&key,packet); +#ifdef PRINT_DEBUG + show_debug("%s:received no:%d",func, ntohs(packet->packet_no)); +#endif + switch (ntohs(packet->packet_no)) + { + case RECOVERY_PREPARE_REQ: + /* add cluster db */ +#ifdef PRINT_DEBUG + show_debug("%s:add_db host:%s port:%d max:%d", + func, packet->hostName,ntohs(packet->port),ntohs(packet->max_connect)); +#endif + ptr = PGRsearch_cluster_tbl(&key); + if (ptr == NULL) + { + ptr = PGRadd_cluster_tbl(&key); + } + if (ptr != NULL) + { + PGRset_status_on_cluster_tbl(TBL_STOP,ptr); + if (Use_Connection_Pool) + { + signal(SIGCHLD,PGRrecreate_child); + status = PGRpre_fork_child(ptr); + } + } + break; + case RECOVERY_FINISH: + /* start cluster db */ + ptr = PGRsearch_cluster_tbl(&key); + if (ptr != NULL) + { +#ifdef PRINT_DEBUG + show_debug("%s:start_db host:%s port:%d max:%d", + func,packet->hostName,ntohs(packet->port),ntohs(packet->max_connect)); +#endif + PGRset_status_on_cluster_tbl(TBL_INIT,ptr); + } + break; + case RECOVERY_PGDATA_ANS: + /* stop cluster db */ + ptr = PGRsearch_cluster_tbl(&key); + if (ptr != NULL) + { +#ifdef PRINT_DEBUG + show_debug("%s:stop_db host:%s port:%d max:%d", + func, packet->hostName,ntohs(packet->port),ntohs(packet->max_connect)); +#endif + PGRset_status_on_cluster_tbl(TBL_STOP,ptr); + } + break; + case RECOVERY_ERROR: + /* delete 
cluster db */ + ptr = PGRsearch_cluster_tbl(&key); + if (ptr != NULL) + { + PGRset_status_on_cluster_tbl(TBL_FREE,ptr); + if (Use_Connection_Pool) + { + PGRquit_children_on_cluster(ptr->rec_no); + } + } + break; + /* cluster db has error */ + case RECOVERY_ERROR_CONNECTION: + /* set error cluster db */ + ptr = PGRsearch_cluster_tbl(&key); + if (ptr != NULL) + { + PGRset_status_on_cluster_tbl(TBL_ERROR,ptr); + if (Use_Connection_Pool) + { + PGRquit_children_on_cluster(ptr->rec_no); + } + } + break; + } + return STATUS_OK; +} + +static int +receive_recovery(int fd) +{ + int status = STATUS_ERROR; + int r_size = -1; + int recv_sock = -1; + RecoveryPacket packet; + + recv_sock = PGRcreate_acception(fd,ResolvedName,Recovery_Port_Number); + if (recv_sock >= 0 ) + { + memset(&packet,0, sizeof(RecoveryPacket)); + r_size = PGRread_byte(recv_sock,(char *)&packet,sizeof(RecoveryPacket),MSG_WAITALL); + if ( r_size == sizeof(RecoveryPacket) ) + { + status = set_recovery(&packet); + } + } + PGRclose_sock(&recv_sock); + return status; +} diff -aruN postgresql-8.2.4/src/pgcluster/pglb/socket.c pgcluster-1.7.0rc7/src/pgcluster/pglb/socket.c --- postgresql-8.2.4/src/pgcluster/pglb/socket.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pglb/socket.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,395 @@ +/*-------------------------------------------------------------------- + * FILE: + * socket.c + * + * NOTE: + * This file is composed of the communication modules + * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + *-------------------------------------------------------------------- + */ +/* + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that the above copyright notice appear in all + * copies and that both that copyright notice and this permission + * notice appear in supporting documentation, and that the name of the + * author not be used 
in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. The author makes no representations about the + * suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * +*/ +#include "postgres.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_SYS_SELECT_H +#include +#endif + +#ifdef HAVE_NETINET_TCP_H +#include +#endif + +#include "replicate_com.h" +#include "pglb.h" + + +/*-------------------------------------- + * PROTOTYPE DECLARATION + *-------------------------------------- + */ +int PGRcreate_unix_domain_socket(char * sock_dir, unsigned short port); +int PGRcreate_recv_socket(char * hostName , unsigned short portNumber); +int PGRcreate_acception(int fd, char * hostName , unsigned short portNumber); +void PGRclose_sock(int * sock); +int PGRread_byte(int sock,char * buf,int len, int flag); +int PGRcreate_cluster_socket( int * sock, ClusterTbl * ptr ); + +static int create_send_socket(int * fdP, char * hostName , unsigned short portNumber); + + +/* +* create UNIX domain socket +*/ +int +PGRcreate_unix_domain_socket(char * sock_dir, unsigned short port) +{ + char * func = "PGRcreate_unix_domain_socket()"; + struct sockaddr_un addr; + int fd; + int status; + int len; + + /* set unix domain socket path */ + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd == -1) + { + show_error("%s:Failed to create UNIX domain socket. reason: %s",func, strerror(errno)); + return -1; + } + memset((char *) &addr, 0, sizeof(addr)); + ((struct sockaddr *)&addr)->sa_family = AF_UNIX; + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/.s.PGSQL.%d",sock_dir,port); + len = sizeof(struct sockaddr_un); + status = bind(fd, (struct sockaddr *)&addr, len); + if (status == -1) + { + show_error("%s: bind() failed. 
reason: %s", func, strerror(errno)); + return -1; + } + + if (chmod(addr.sun_path, 0777) == -1) + { + show_error("%s: chmod() failed. reason: %s", func, strerror(errno)); + return -1; + } + + status = listen(fd, PGLB_MAX_SOCKET_QUEUE); + if (status < 0) + { + show_error("%s: listen() failed. reason: %s", func, strerror(errno)); + return -1; + } + return fd; +} + +int +PGRcreate_recv_socket(char * hostName , unsigned short portNumber) +{ + char * func = "PGRcreate_recv_socket()"; + int fd,err; + size_t len = 0; + struct sockaddr_in addr; + int one = 1; + + if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) + { + show_error("%s: socket() failed. (%s)", func, strerror(errno)); + return -1; + } + if ((setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *) &one, sizeof(one))) == -1) + { + PGRclose_sock(&fd); + show_error("%s: setsockopt() failed. (%s)",func, strerror(errno)); + return -1; + } + addr.sin_family = AF_INET; + if ((hostName == NULL) || (hostName[0] == '\0')) + addr.sin_addr.s_addr = htonl(INADDR_ANY); + else + { + struct hostent *hp; + + hp = gethostbyname(hostName); + if ((hp == NULL) || (hp->h_addrtype != AF_INET)) + { + PGRclose_sock(&fd); + return -1; + } + memmove((char *) &(addr.sin_addr), (char *) hp->h_addr, hp->h_length); + } + + addr.sin_port = htons(portNumber); + len = sizeof(struct sockaddr_in); + + err = bind(fd, (struct sockaddr *) & addr, len); + if (err < 0) + { + PGRclose_sock(&fd); + show_error("%s: bind() failed. (%s)",func, strerror(errno)); + return -1; + } + err = listen(fd, PGLB_MAX_SOCKET_QUEUE); + if (err < 0) + { + PGRclose_sock(&fd); + show_error("%s: listen() failed. 
(%s)", func, strerror(errno)); + return -1; + } + return fd; +} + +int +PGRcreate_acception(int fd, char * hostName , unsigned short portNumber) +{ + char * func = "PGRcreate_acception()"; + int sock; + struct sockaddr addr; + size_t len = 0; + int one = 1; + int count; + + len = sizeof(struct sockaddr); + count = 0; + while ((sock = accept(fd,&addr,&len)) < 0) + { + show_error("%s:accept error",func); + PGRclose_sock(&fd); + if ( count > PGLB_CONNECT_RETRY_TIME) + { + return -1; + } + fd = PGRcreate_recv_socket(hostName , portNumber); + count ++; + } + + count = 0; + while (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)) < 0) + { + show_error("%s: setsockopt TCP_NODELAY error (%s)",func, strerror(errno)); + if ( count > PGLB_CONNECT_RETRY_TIME) + { + return -1; + } + count ++; + } + count = 0; + while (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *) &one, sizeof(one)) < 0) + { + show_error("%s:setsockopt SO_KEEPALIVE error (%s)",func,strerror(errno)); + if ( count > PGLB_CONNECT_RETRY_TIME) + { + return -1; + } + count ++; + } + + return sock; +} + +void +PGRclose_sock(int * sock) +{ + close(*sock); + *sock = -1; +} + +int +PGRread_byte(int sock,char * buf,int len, int flag) +{ + char * func = "PGRread_byte()"; + int r; + char * read_ptr; + int read_size = 0; + int max_buf_size ; + int pid; + + pid = getpid(); + max_buf_size = len; + read_ptr = (char*)buf; + for (;;) + { + r = recv(sock,read_ptr + read_size ,max_buf_size - read_size, flag); + if (r < 0) + { + if (errno == EINTR) + { + continue; + } +#ifdef EAGAIN + if (errno == EAGAIN) + { + return read_size; + } +#endif +#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN)) + if (errno == EWOULDBLOCK) + { + show_error("%s:no data (%s)",func,strerror(errno)); + return read_size; + } +#endif +#ifdef ECONNRESET + if (errno == ECONNRESET) + { + PGRclose_sock(&sock); + show_error("%s:connection reset (%s)",func, strerror(errno)); + return -1; + } +#endif + 
show_error("%s:recv() failed. (%s)",func,strerror(errno)); + read_size = -1; + break; + } + if (r > 0) + { + read_size += r; + if (max_buf_size == read_size) + { + break; + } + break; + } + if (read_size) + { + return read_size; + } + else + { + return -1; + } + } + + return read_size; +} + +int +PGRcreate_cluster_socket( int * sock, ClusterTbl * ptr ) +{ + char * func = "PGRcreate_cluster_socket()"; + int status = STATUS_ERROR; + + /* + if (PGRis_connection_full(ptr) == 1) + { + return STATUS_ERROR; + } + */ + if (ptr != (ClusterTbl *) NULL) + { + status = create_send_socket(sock, ptr->hostName, ptr->port) ; + } + else + { + show_error("%s:ClusterTbl is not initialize",func); + } + return status; +} + +static int +create_send_socket(int * fdP, char * hostName , unsigned short portNumber) +{ + char * func = "create_send_socket()"; + int sock; + size_t len = 0; + struct sockaddr_in addr; + int fd; + int one = 1; + +#ifdef PRINT_DEBUG + show_debug("%s: host:%s port:%d",func, hostName,portNumber); +#endif + + memset((char *)&addr,0,sizeof(addr)); + + if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) + { + * fdP = -1; + + show_error("%s:socket() failed. (%s)",func, strerror(errno)); + return STATUS_ERROR; + } + if ((setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *) &one, sizeof(one))) == -1) + { + PGRclose_sock(&fd); + * fdP = -1; + show_error("%s:setsockopt() failed. (%s)", func, strerror(errno)); + return STATUS_ERROR; + return STATUS_ERROR; + } + if ((setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, (char *) &one, sizeof(one))) == -1) + { + PGRclose_sock(&fd); + * fdP = -1; + show_error("%s:setsockopt() failed. (%s)", func, strerror(errno)); + return STATUS_ERROR; + } + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)) < 0) + { + PGRclose_sock(&fd); + * fdP = -1; + show_error("%s:setsockopt() failed. 
(%s)",func, strerror(errno)); + return STATUS_ERROR; + } + + addr.sin_family = AF_INET; + if ((hostName == NULL) || (hostName[0] == '\0')) + addr.sin_addr.s_addr = htonl(INADDR_ANY); + else + { + struct hostent *hp; + + hp = gethostbyname(hostName); + if ((hp == NULL) || (hp->h_addrtype != AF_INET)) + { + PGRclose_sock(&fd); + * fdP = -1; + return STATUS_ERROR; + } + memmove((char *) &(addr.sin_addr), (char *) hp->h_addr, hp->h_length); + } + + addr.sin_port = htons(portNumber); + len = sizeof(struct sockaddr_in); + + if ((sock = connect(fd,(struct sockaddr*)&addr,len)) < 0) + { + PGRclose_sock(&fd); + * fdP = -1; + return STATUS_ERROR; + } + + * fdP = fd; + return STATUS_OK; +} + diff -aruN postgresql-8.2.4/src/pgcluster/pgrp/AUTHORS pgcluster-1.7.0rc7/src/pgcluster/pgrp/AUTHORS --- postgresql-8.2.4/src/pgcluster/pgrp/AUTHORS 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pgrp/AUTHORS 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,3 @@ +Authors of pgrp + +pgrp was written by Atsushi Mitani diff -aruN postgresql-8.2.4/src/pgcluster/pgrp/COPYING pgcluster-1.7.0rc7/src/pgcluster/pgrp/COPYING --- postgresql-8.2.4/src/pgcluster/pgrp/COPYING 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pgrp/COPYING 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,12 @@ +Copyright (c) 2003-2006 Atsushi Mitani + +Permission to use, copy, modify, and distribute this software and +its documentation for any purpose and without fee is hereby +granted, provided that the above copyright notice appear in all +copies and that both that copyright notice and this permission +notice appear in supporting documentation, and that the name of the +author not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. The author makes no representations about the +suitability of this software for any purpose. It is provided "as +is" without express or implied warranty. 
diff -aruN postgresql-8.2.4/src/pgcluster/pgrp/Makefile pgcluster-1.7.0rc7/src/pgcluster/pgrp/Makefile
--- postgresql-8.2.4/src/pgcluster/pgrp/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/pgcluster/pgrp/Makefile 2007-02-18 22:52:17.000000000 +0100
@@ -0,0 +1,47 @@
+#-------------------------------------------------------------------------
+#
+# Makefile for src/pgcluster/pgrp
+#
+#-------------------------------------------------------------------------
+
+subdir = src/pgcluster/pgrp
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+# this setup is for V2 protocol
+#OBJS= cascade.o conf.o main.o recovery.o replicate.o rlog.o
+# this setup is for V3 protocol
+OBJS= pqformat.o cascade.o conf.o main.o recovery.o replicate.o rlog.o lifecheck.o
+
+# Objects produced by other directories; linked in as-is, never built or cleaned here.
+EXTRA_OBJS = $(top_builddir)/src/backend/libpq/replicate_com.o ../libpgc/SUBSYS.o
+
+CFLAGS += -DPRINT_DEBUG
+override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) -DBINDIR=\"$(bindir)\"
+
+.PHONY: all install installdirs uninstall clean distclean maintainer-clean clean_obj
+
+all: pgreplicate
+
+# Link as $@$(X) so the file produced is the pgreplicate$(X) that install and clean refer to.
+pgreplicate: $(OBJS) $(libpq_builddir)/libpq.a
+	$(CC) $(CFLAGS) $(OBJS) $(EXTRA_OBJS) $(libpq) $(libpq_builddir)/libpq.a $(LDFLAGS) $(LIBS) -o $@$(X)
+
+install: all installdirs
+	$(INSTALL_PROGRAM) pgreplicate$(X) $(DESTDIR)$(bindir)/pgreplicate$(X)
+	$(INSTALL_DATA) pgreplicate.conf.sample $(DESTDIR)$(datadir)/pgreplicate.conf.sample
+
+installdirs:
+	$(mkinstalldirs) $(DESTDIR)$(bindir)
+	$(mkinstalldirs) $(DESTDIR)$(datadir)
+
+uninstall:
+	rm -f $(addprefix $(DESTDIR)$(bindir)/, pgreplicate$(X))
+	rm -f $(DESTDIR)$(datadir)/pgreplicate.conf.sample
+
+clean distclean maintainer-clean:
+	rm -f pgreplicate$(X) $(OBJS)
+
+# Remove only the objects and keep the linked program.
+clean_obj:
+	rm -f $(OBJS)
diff -aruN postgresql-8.2.4/src/pgcluster/pgrp/cascade.c pgcluster-1.7.0rc7/src/pgcluster/pgrp/cascade.c
--- postgresql-8.2.4/src/pgcluster/pgrp/cascade.c 1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/pgcluster/pgrp/cascade.c 2007-02-18 22:52:17.000000000 +0100
@@ -0,0 +1,928 @@
+/*-------------------------------------------------------------------- + * FILE: + * cascade.c + * + * NOTE: + * This file is composed of the functions to call with the source + * at pgreplicate for backup and cascade . + * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + *-------------------------------------------------------------------- + */ +#ifdef USE_REPLICATION + +#include "postgres.h" +#include "postgres_fe.h" + +#include +#include +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#ifdef HAVE_FCNTL_H +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_UNISTD_H +#include +#endif +#include +#ifdef HAVE_NETINET_TCP_H +#include +#endif +#include +#include + +#ifdef HAVE_CRYPT_H +#include +#endif + +#ifdef MULTIBYTE +#include "mb/pg_wchar.h" +#endif + +#include "libpq-fe.h" +#include "libpq-int.h" +#include "fe-auth.h" + +#include "access/xact.h" +#include "replicate_com.h" +#include "pgreplicate.h" + +#if 0 +static int count_cascade(int flag); +static void PGRinit_cascade_child(void); +#endif + +static int fixup_socket_for_cascades(int *sock ,ReplicateServerInfo * target); +static ReplicateServerInfo * get_cascade_data(int * cnt, int flag); +static int add_cascade_data(ReplicateHeader * header, ReplicateServerInfo * add_data); +static int update_cascade_data(ReplicateHeader * header, ReplicateServerInfo * update_data); +static void write_cascade_status_file(ReplicateServerInfo * cascade); +static int notice_cascade_data(int sock); +static int notice_cascade_data_to_cluster_db(void); + +/** + * socket variables, moved from Cascade_Inf->(lower|upper)->sock. + * Cascade->Inf is in shared memory, so sometimes cascades returns EBADF due to not initialized socket in specified process. + * 05/10/05 tanida@sraoss.co.jp + */ + +static int lsock=-1; /* socket for lower-cascade. */ +static int usock=-1; /* socket for upper-cascade. 
*/
+
+/*--------------------------------------
+ * PROTOTYPE DECLARATION
+ *--------------------------------------
+ */
+
+#if 0
+static int
+count_cascade(int flag)
+{
+	int cnt = 0;
+	int cascade_cnt = 0;
+	ReplicateServerInfo * cascade = NULL;
+
+	if ((Cascade_Tbl == NULL) || (Cascade_Inf == NULL))
+	{
+		return 0;
+	}
+
+	/* count cascadeing replication server */
+	switch (flag)
+	{
+		case UPPER_CASCADE:
+		case ALL_CASCADE:
+			cascade = Cascade_Tbl;
+			break;
+		case LOWER_CASCADE:
+			cascade = Cascade_Inf->myself;
+			break;
+	}
+
+	if (cascade == NULL)
+	{
+		return 0;
+	}
+	while (cascade->useFlag != DB_TBL_END)
+	{
+		if (cascade->useFlag == DB_TBL_USE)
+		{
+			cascade_cnt ++;
+		}
+		if ((flag == UPPER_CASCADE) &&
+			(cascade == Cascade_Inf->myself))
+		{
+			break;
+		}
+		cnt ++;
+		if (cnt >= MAX_DB_SERVER -1 )
+		{
+			break;
+		}
+		cascade ++;
+	}
+	return cascade_cnt;
+}
+
+static void
+PGRinit_cascade_child(void) {
+	fixup_socket_for_cascades(&usock,NULL);
+	fixup_socket_for_cascades(&lsock,NULL);
+}
+#endif /* if 0 */
+
+/*
+ * build a malloc'ed array holding a copy of the in-use cascade entries,
+ * with the integer fields converted to network byte order.
+ *
+ * flag selects the range: UPPER_CASCADE walks from the top of
+ * Cascade_Tbl down to this server, ALL_CASCADE walks the whole table,
+ * LOWER_CASCADE walks from this server to the end.
+ *
+ * *cnt receives the number of entries copied.  returns the array (to be
+ * freed by the caller), or NULL with *cnt = 0 when nothing is available.
+ */
+static ReplicateServerInfo *
+get_cascade_data(int * cnt, int flag)
+{
+	char * func = "get_cascade_data()";
+	int i = 0;
+	int loop_cnt = 0;
+	int size = 0;
+	ReplicateServerInfo * buf = NULL;
+	ReplicateServerInfo * cascade = NULL;
+
+	/* Cascade_Inf is dereferenced for every flag value below */
+	if (Cascade_Inf == NULL)
+	{
+		*cnt = 0;
+		return NULL;
+	}
+
+	size = sizeof(ReplicateServerInfo) * MAX_DB_SERVER;
+	buf = (ReplicateServerInfo *)malloc(size);
+	if (buf == (ReplicateServerInfo *)NULL)
+	{
+		show_error("%s:malloc failed: (%s)",func,strerror(errno));
+		*cnt = 0;
+		return NULL;
+	}
+	memset(buf,0,size);
+
+	switch (flag)
+	{
+		case UPPER_CASCADE:
+		case ALL_CASCADE:
+			cascade = Cascade_Tbl;
+			break;
+		case LOWER_CASCADE:
+			cascade = Cascade_Inf->myself;
+			break;
+		default:
+			free(buf);
+			*cnt = 0;
+			return NULL;
+
+	}
+
+	if (cascade == NULL)
+	{
+		free(buf);
+		*cnt = 0;
+		return NULL;
+	}
+	PGRsem_lock(CascadeSemID,1);
+	i = 0;
+	loop_cnt = 0;
+	while (cascade->useFlag != DB_TBL_END)
+	{
+		if (cascade->useFlag == DB_TBL_USE)
+		{
+			(buf + i)->useFlag = htonl(cascade->useFlag);
+			strncpy((buf + i)->hostName,cascade->hostName,sizeof(cascade->hostName));
+			(buf + i)->portNumber = htons(cascade->portNumber);
+			(buf + i)->recoveryPortNumber = htons(cascade->recoveryPortNumber);
+			(buf + i)->lifecheckPortNumber = htons(cascade->lifecheckPortNumber);
+			i++;
+		}
+		/* the upper list ends with this server itself */
+		if ((flag == UPPER_CASCADE) &&
+			(cascade == Cascade_Inf->myself))
+		{
+			break;
+		}
+		loop_cnt ++;
+		if (loop_cnt >= MAX_DB_SERVER -1 )
+		{
+			break;
+		}
+		if (Cascade_Inf->end == cascade)
+		{
+			break;
+		}
+		cascade ++;
+	}
+	*cnt = i;
+	PGRsem_unlock(CascadeSemID,1);
+
+	return buf;
+}
+
+/*
+ * replace the whole cascade table with the entries received in
+ * update_data (network byte order; header->query_size gives their
+ * total size in bytes) and re-locate this server in the new table.
+ * returns STATUS_OK, or STATUS_ERROR when the input is missing or
+ * holds MAX_DB_SERVER or more entries.
+ */
+static int
+update_cascade_data(ReplicateHeader * header, ReplicateServerInfo * update_data)
+{
+	char * func = "update_cascade_data()";
+	int size = 0;
+	int cnt = 0;
+	ReplicateServerInfo * ptr = NULL;
+	ReplicateServerInfo * cascade = NULL;
+	char hostName[HOSTNAME_MAX_LENGTH];
+
+
+	show_debug("executing %s",func);
+	if ((header == NULL ) || ( update_data == NULL))
+	{
+		show_error("%s:receive data is wrong",func);
+		return STATUS_ERROR;
+	}
+	if ((Cascade_Tbl == NULL) || (Cascade_Inf == NULL))
+	{
+		show_error("%s:config data read error",func);
+		return STATUS_ERROR;
+	}
+
+
+	size = ntohl(header->query_size);
+	cnt = size / sizeof(ReplicateServerInfo);
+	if (cnt >= MAX_DB_SERVER)
+	{
+		show_error("%s:update cascade data is too large. it's more than %d", func,MAX_DB_SERVER);
+		return STATUS_ERROR;
+	}
+
+	Cascade_Inf->useFlag = DB_TBL_INIT;
+	fixup_socket_for_cascades(&usock,NULL);
+	fixup_socket_for_cascades(&lsock,NULL);
+
+	Cascade_Inf->upper = NULL;
+	Cascade_Inf->lower = NULL;
+
+	/*
+	 * Zero the buffer and leave its last byte alone so the name is
+	 * always terminated; on failure it stays empty and matches nothing.
+	 */
+	memset(hostName,0,sizeof(hostName));
+	if (gethostname(hostName,sizeof(hostName) - 1) != 0)
+	{
+		show_error("%s:gethostname() failed: (%s)",func,strerror(errno));
+	}
+	ptr = update_data;
+	cascade = Cascade_Tbl;
+	memset(cascade,0,(sizeof(ReplicateServerInfo)*MAX_DB_SERVER));
+	Cascade_Inf->top = cascade;
+	while (cnt > 0)
+	{
+
+		cascade->useFlag = ntohl(ptr->useFlag);
+		strncpy(cascade->hostName,ptr->hostName,sizeof(cascade->hostName));
+		cascade->portNumber = ntohs(ptr->portNumber);
+		cascade->recoveryPortNumber = ntohs(ptr->recoveryPortNumber);
+		cascade->lifecheckPortNumber = ntohs(ptr->lifecheckPortNumber);
+
+		if ((!strncmp(cascade->hostName,hostName,sizeof(cascade->hostName))) &&
+			(cascade->portNumber == Port_Number) &&
+			(cascade->recoveryPortNumber == Recovery_Port_Number))
+		{
+			Cascade_Inf->myself = cascade;
+		}
+
+		Cascade_Inf->end = cascade;
+		cascade ++;
+		ptr ++;
+		cnt --;
+		/* keep the table terminated after every entry */
+		cascade->useFlag = DB_TBL_END;
+	}
+	Cascade_Inf->useFlag = DB_TBL_USE;
+
+	return STATUS_OK;
+}
+
+/*
+ * append the entries received in add_data (network byte order;
+ * header->query_size gives their total size in bytes) to the cascade
+ * table right behind this server's own entry.
+ * returns STATUS_OK, or STATUS_ERROR when the input is missing, this
+ * server's own entry is unknown, or the table has no room left.
+ */
+static int
+add_cascade_data(ReplicateHeader * header, ReplicateServerInfo * add_data)
+{
+	char *func = "add_cascade_data()";
+	int size = 0;
+	int cnt = 0;
+	int status = STATUS_OK;
+	ReplicateServerInfo * ptr = NULL;
+	ReplicateServerInfo * cascade = NULL;
+	char hostName[HOSTNAME_MAX_LENGTH];
+
+	if ((header == NULL ) || ( add_data == NULL))
+	{
+		show_error("%s:receive data is wrong",func);
+		return STATUS_ERROR;
+	}
+	if ((Cascade_Tbl == NULL) || (Cascade_Inf == NULL))
+	{
+		show_error("%s:config data read error",func);
+		return STATUS_ERROR;
+	}
+	/* new entries are stored behind our own one, so it has to exist */
+	if (Cascade_Inf->myself == NULL)
+	{
+		show_error("%s:cascade data initialize error",func);
+		return STATUS_ERROR;
+	}
+	size = ntohl(header->query_size);
+	cnt = size / sizeof(ReplicateServerInfo);
+	if (cnt >= MAX_DB_SERVER)
+	{
+		show_error("%s:addtional cascade data is too large. it's more than %d", func,MAX_DB_SERVER);
+		return STATUS_ERROR;
+	}
+
+	Cascade_Inf->useFlag = DB_TBL_INIT;
+	fixup_socket_for_cascades(&lsock,NULL);
+	Cascade_Inf->lower = NULL;
+
+	/* see update_cascade_data: always terminated, empty on failure */
+	memset(hostName,0,sizeof(hostName));
+	if (gethostname(hostName,sizeof(hostName) - 1) != 0)
+	{
+		show_error("%s:gethostname() failed: (%s)",func,strerror(errno));
+	}
+	ptr = add_data;
+	cascade = Cascade_Inf->myself;
+	cascade ++;
+	while (cnt > 0)
+	{
+		/*
+		 * Each entry needs its own slot plus the following one for the
+		 * DB_TBL_END marker.  Assumes myself points into Cascade_Tbl and
+		 * that the table holds MAX_DB_SERVER entries, as set up by
+		 * update_cascade_data() -- TODO confirm for other initializers.
+		 */
+		if (cascade >= Cascade_Tbl + (MAX_DB_SERVER - 1))
+		{
+			show_error("%s:cascade table is full. it's more than %d", func,MAX_DB_SERVER);
+			status = STATUS_ERROR;
+			break;
+		}
+
+		cascade->useFlag = ntohl(ptr->useFlag);
+		strncpy(cascade->hostName,ptr->hostName,sizeof(cascade->hostName));
+		cascade->portNumber = ntohs(ptr->portNumber);
+		cascade->recoveryPortNumber = ntohs(ptr->recoveryPortNumber);
+		cascade->lifecheckPortNumber = ntohs(ptr->lifecheckPortNumber);
+		cascade->replicate_id=-1;
+		cascade->response_mode=-1;
+
+		Cascade_Inf->end = cascade;
+
+		/*
+		 * An entry describing this server is skipped: the slot is not
+		 * advanced and the next entry overwrites it.
+		 * NOTE(review): if that entry is the last one, the slot keeps
+		 * our own data and stays the table end -- confirm intended.
+		 */
+		if ((!strncmp(cascade->hostName,hostName,sizeof(cascade->hostName))) &&
+			(cascade->portNumber == Port_Number) &&
+			(cascade->recoveryPortNumber == Recovery_Port_Number))
+		{
+			ptr ++;
+			cnt --;
+			continue;
+		}
+		cascade ++;
+		cascade->useFlag = DB_TBL_END;
+		ptr ++;
+		cnt --;
+	}
+	Cascade_Inf->useFlag = DB_TBL_USE;
+	return status;
+}
+
+int
+PGRstartup_cascade(void)
+{
+	char * func = "PGRstartup_cascade()";
+	int cnt = 0;
+	int status = STATUS_OK;
+	ReplicateHeader header;
+	ReplicateServerInfo * cascade = NULL;
+	ReplicateServerInfo * buf = NULL;
+
+	if ((Cascade_Tbl == NULL) || (Cascade_Inf == NULL))
+	{
+		show_error("%s:config data read error",func);
+		return STATUS_ERROR;
+	}
+
+	/* count lower server */
+	cascade = Cascade_Inf->myself;
+	if (cascade == NULL)
+	{
+		show_error("%s:cascade data initialize error",func);
+		return STATUS_ERROR;
+	}
+	buf = get_cascade_data(&cnt,LOWER_CASCADE);
+	if (cnt <= 0)
+	{
+		show_error("%s:cascade data get error",func);
+		return STATUS_ERROR;
+	}
+
+	memset(&header,0,sizeof(ReplicateHeader));
+	header.cmdSys = CMD_SYS_CASCADE;
+	header.cmdSts = CMD_STS_TO_UPPER;
+	header.cmdType = CMD_TYPE_ADD;
+	header.query_size = htonl(sizeof(ReplicateServerInfo) * cnt);
+
+	status = PGRsend_upper_cascade(&header, (char *)buf);
+	if (buf != NULL)
+	{
+
free(buf); + } + if (status == STATUS_OK) + { + memset(&header,0,sizeof(ReplicateHeader)); + buf = PGRrecv_cascade_answer( Cascade_Inf->upper, &header); + if (buf == NULL) + { + status=STATUS_ERROR; + } + else + { + if ((header.cmdSys == CMD_SYS_CASCADE) && + (header.cmdSts == CMD_STS_TO_LOWER) && + (header.cmdType == CMD_TYPE_UPDATE_ALL)) + status = update_cascade_data(&header,buf); + free(buf); /* release the answer even when it is not an UPDATE_ALL packet */ + } + } + show_debug("%s:startup packet result is %d",func,status); + return status; +} +/* PGRsend_lower_cascade(): send header+query to the lower cascade server over lsock. When the send fails, reconnect to the current candidate; candidates that cannot be reached are marked DB_TBL_ERROR and the next one is tried. Returns STATUS_OK once a send succeeds, STATUS_ERROR when no lower server is left. */ +int +PGRsend_lower_cascade(ReplicateHeader * header, char * query) +{ + + + char * func = "PGRsend_lower_cascade()"; + ReplicateServerInfo *lower = PGRget_lower_cascade(); + + + while(lower!=NULL) + { + /** + * check lower_cascade validity. + * + */ + if(lsock!=-1 && + PGRsend_cascade(lsock,header,query)==STATUS_OK) + { + return STATUS_OK; + } + else + { + /** + * current lower cascade is missing. + * fix socket , or go to next one. + * + */ + while( lower!=NULL && + fixup_socket_for_cascades(&lsock,lower)!=STATUS_OK) + { + show_error("%s:lower cascade maybe down,challenge new one.",func); + PGRset_cascade_server_status(lower,DB_TBL_ERROR); + lower =PGRget_lower_cascade(); + } + } + Cascade_Inf->lower=lower; + } + + + return STATUS_ERROR; +} + +/* PGRsend_upper_cascade(): same retry scheme as PGRsend_lower_cascade(), towards the upper cascade server over usock. */ +int +PGRsend_upper_cascade(ReplicateHeader * header, char * query) +{ + char * func = "PGRsend_upper_cascade()"; + ReplicateServerInfo *upper = PGRget_upper_cascade(); + + + while(upper!=NULL) + { + /** + * check upper_cascade validity. + * + */ + if(usock!=-1 && + PGRsend_cascade(usock,header,query)==STATUS_OK) + { + return STATUS_OK; + } + else + { + /** + * current upper cascade is missing. + * fix socket , or go to next one. 
+ * + */ + while( upper!=NULL && + fixup_socket_for_cascades(&usock,upper)!=STATUS_OK) + { + show_error("%s:upper cascade maybe down,challenge new one.",func); + PGRset_cascade_server_status(upper,DB_TBL_ERROR); + upper =PGRget_upper_cascade(); + } + } + Cascade_Inf->upper=upper; + } + + return STATUS_ERROR; +} +/* PGRget_lower_cascade(): return the first DB_TBL_USE entry that follows our own entry in the cascade table, or NULL when the table is not set up or no such entry exists before DB_TBL_END. */ +ReplicateServerInfo * +PGRget_lower_cascade(void) +{ + char * func = "PGRget_lower_cascade()"; + ReplicateServerInfo * cascade = NULL; + + if ((Cascade_Tbl == NULL) || (Cascade_Inf == NULL)) + { + show_error("%s:config data read error",func); + return NULL; + } + + /* search forward, starting with the entry after our own */ + + cascade = Cascade_Inf->myself; + if (cascade == NULL) + { + show_error("%s:cascade data initialize error",func); + return NULL; + } + if (cascade->useFlag != DB_TBL_END) + { + cascade ++; + } + while (cascade->useFlag != DB_TBL_END) + { +#ifdef PRINT_DEBUG + show_debug("%s:lower cascade search[%d]@[%s] use[%d]", + func, + cascade->portNumber, + cascade->hostName, + cascade->useFlag); +#endif + if (cascade->useFlag == DB_TBL_USE) + { +#ifdef PRINT_DEBUG + show_debug("%s:find lower cascade",func); +#endif + return cascade; + } + cascade ++; + } + return NULL; +} +/* PGRget_upper_cascade(): return the nearest DB_TBL_USE entry that precedes our own entry, or NULL when we are the top entry, the table is not set up, or no such entry exists. */ +ReplicateServerInfo * +PGRget_upper_cascade(void) +{ + char * func = "PGRget_upper_cascade()"; + ReplicateServerInfo * cascade = NULL; + + if ((Cascade_Tbl == NULL) || (Cascade_Inf == NULL)) + { + show_error("%s:config data read error",func); + return NULL; + } + + + /* search backward from the entry before our own; the walk stops at Cascade_Inf->top */ + cascade = Cascade_Inf->myself; + if ((cascade == NULL) || (Cascade_Inf->top == cascade)) + { + return NULL; + } + cascade --; + while (cascade != NULL) + { + if (cascade->useFlag == DB_TBL_USE) + { + return cascade; + } + if (Cascade_Inf->top == cascade) + { + break; + } + cascade --; + } + return NULL; +} +/* write_cascade_status_file(): append one line describing the entry's current useFlag to the status log (StatusFp). */ +static void +write_cascade_status_file(ReplicateServerInfo * cascade) +{ + switch( cascade->useFlag) + { + case DB_TBL_FREE: + PGRwrite_log_file(StatusFp,"cascade(%s) port(%d) free", + cascade->hostName, + 
cascade->portNumber); + break; + case DB_TBL_INIT: + PGRwrite_log_file(StatusFp,"cascade(%s) port(%d) initialize", + cascade->hostName, + cascade->portNumber); + break; + case DB_TBL_USE: + PGRwrite_log_file(StatusFp,"cascade(%s) port(%d) start use", + cascade->hostName, + cascade->portNumber); + break; + case DB_TBL_ERROR: + PGRwrite_log_file(StatusFp,"cascade(%s) port(%d) error", + cascade->hostName, + cascade->portNumber); + break; + case DB_TBL_TOP: + PGRwrite_log_file(StatusFp,"cascade(%s) port(%d) become top", + cascade->hostName, + cascade->portNumber); + break; + } +} + +void +PGRset_cascade_server_status(ReplicateServerInfo * cascade, int status) +{ + if (cascade == NULL) + { + return; + } + if (cascade->useFlag != status) + { + cascade->useFlag = status; + write_cascade_status_file(cascade); + } +} + +ReplicateServerInfo * +PGRrecv_cascade_answer(ReplicateServerInfo * cascade,ReplicateHeader * header) +{ + ReplicateServerInfo * answer = NULL; + int sock; + + if ((cascade == NULL) || (header == NULL)) + { + return NULL; + } + + /* FIXME: ReplicateServerInfo->sock must be removed in cascading. */ + if(cascade == Cascade_Inf->upper ) + { + sock=usock; + } + else if (cascade == Cascade_Inf->lower ) + { + sock=lsock; + } + else + { + show_debug("PGRrecv_cascade_answer:receiving packet from sock not belogs to cascade->upper / lower. 
maybe missing ."); + sock=cascade->sock; + } + answer = (ReplicateServerInfo*)PGRread_packet(sock,header); + return answer; +} + +int +PGRsend_cascade(int sock , ReplicateHeader * header, char * query) +{ + char * func ="PGRsend_cascade()"; + int s; + char * send_ptr; + char * buf; + int send_size = 0; + int buf_size; + int header_size; + int rtn; + fd_set wmask; + struct timeval timeout; + int query_size = 0; + + /* check parameter */ + if ((header == NULL) || (sock == -1)) + { + return STATUS_ERROR; + } + +#ifdef PRINT_DEBUG + show_debug("%s:PGRsend_cascade sock[%d]",func,sock); +#endif + query_size = ntohl(header->query_size); + header_size = sizeof(ReplicateHeader); + buf_size = header_size + query_size + 4; + buf = malloc(buf_size); + memset(buf,0,buf_size); + buf_size -= 4; + memcpy(buf,header,header_size); + if (query_size > 0) + { + memcpy((char *)(buf+header_size),query,query_size+1); + } + send_ptr = buf; + + for (;;) + { + timeout.tv_sec = 10; + timeout.tv_usec = 0; + + /* + * Wait for something to happen. 
+ */ + FD_ZERO(&wmask); + FD_SET(sock,&wmask); + rtn = select(sock+1, (fd_set *)NULL, &wmask, (fd_set *)NULL, &timeout); /* wait up to 10 seconds for the socket to become writable */ + + if (rtn < 0) + { + if (errno == EINTR || errno == EAGAIN) + continue; + + show_error("%s:select failed ,errno is %s",func , strerror(errno)); + free(buf); + return STATUS_ERROR; + } + + if (rtn && FD_ISSET(sock, &wmask)) /* rtn == 0 (timeout) just waits again */ + { + s = send(sock,send_ptr + send_size,buf_size - send_size ,0); /* send whatever is still outstanding */ + if (s < 0) + { + if (errno == EINTR || errno == EAGAIN) + continue; + else + { + show_error("%s:send failed: %d(%s)",func, errno, strerror(errno)); + free(buf); + return STATUS_ERROR; + } + } + else if (s == 0) + { + show_error("%s:unexpected EOF", func); + free(buf); + return STATUS_ERROR; + } + send_size += s; + if (send_size == buf_size) /* whole packet written */ + { +#ifdef PRINT_DEBUG + show_debug("%s:send[%s] size[%d]",func,query,send_size); +#endif + free(buf); + return STATUS_OK; + } + } + } + return STATUS_OK; /* not reached: the loop above only leaves through return */ +} +/* PGRwait_answer_cascade(): read one packet from sock and discard it; STATUS_OK when a packet arrived, STATUS_ERROR when PGRread_packet() returned NULL. */ +int +PGRwait_answer_cascade(int sock) +{ + ReplicateHeader header; + char * answer = NULL; + + answer = PGRread_packet(sock,&header); + if (answer != NULL) + { + free(answer); + return STATUS_OK; + } + return STATUS_ERROR; +} +/** + * fixup_socket_for_cascades closes *sock when it is open (> 0) and, if target is given, reconnects it to target's host and port. + * returns STATUS_OK if succeeded , or the error status of the connect attempt. + * if target is null , only close socket. 
+ * + * originally written by tanida@sraoss.co.jp + */ +static int +fixup_socket_for_cascades(int *sock, ReplicateServerInfo *target) +{ + if (*sock > 0) + { + close(*sock); + *sock=-1; + } + if(target!=NULL) { + return PGR_Create_Socket_Connect(sock,target->hostName,target->portNumber); + } + return STATUS_OK; +} + +/* notice_cascade_data(): send the complete cascade table to the peer on sock as a CMD_TYPE_UPDATE_ALL packet. Returns STATUS_ERROR for an invalid socket or an empty table, otherwise the status of the send. */ +static int +notice_cascade_data(int sock) +{ + char * func = "notice_cascade_data"; + ReplicateServerInfo *cascade_data = NULL; + ReplicateHeader header; + int cnt = 0; + int size = 0; + int status = STATUS_OK; + if (sock <= 0) + { + return STATUS_ERROR; + } + + cascade_data = get_cascade_data(&cnt, ALL_CASCADE ); + if (cnt <= 0) + { + show_error("%s:cascade data is wrong",func); + free(cascade_data); /* a zero-entry snapshot is still allocated; free(NULL) is a no-op */ + return STATUS_ERROR; + } + size = sizeof (ReplicateServerInfo) * cnt ; + memset(&header,0,sizeof(ReplicateHeader)); + header.cmdSys = CMD_SYS_CASCADE ; + header.cmdSts = CMD_STS_TO_LOWER ; + header.cmdType = CMD_TYPE_UPDATE_ALL; + header.query_size = htonl(size); + status = PGRsend_cascade(sock, &header, (char *)cascade_data ); + /* PGRsend_cascade() works on its own copy of the data, + * so the snapshot is released whatever the outcome */ + free(cascade_data); + + return status; +} +/* PGRcascade_main(): handle a cascade packet received on sock. A CMD_TYPE_ADD request from a lower server is merged into the local table, forwarded upwards, answered with the refreshed table, and announced to the cluster DBs. Always returns STATUS_OK. */ +int +PGRcascade_main(int sock, ReplicateHeader * header, char * query) +{ + switch (header->cmdSts) + { + case CMD_STS_TO_UPPER: + if (header->cmdType == CMD_TYPE_ADD) + { + /* add lower cascade data to myself */ + add_cascade_data(header,(ReplicateServerInfo*)query); + /* send cascade data to upper */ + /* and receive new cascade data from upper */ + PGRstartup_cascade(); + /* return to lower with new cascade data */ + notice_cascade_data(sock); + /* notifies a cascade server's information to Cluster DBs */ + notice_cascade_data_to_cluster_db(); + } + break; + case CMD_STS_TO_LOWER: + /* + * use for cascading replication + */ + break; + } + return STATUS_OK; +} +/* notice_cascade_data_to_cluster_db(): tell the cluster DBs about the current lower cascade server; STATUS_ERROR when there is none. */ +static int +notice_cascade_data_to_cluster_db(void) +{ + char userName[USERNAME_MAX_LENGTH]; + ReplicateServerInfo *s=NULL; + + if (Cascade_Inf->lower == NULL) + { + Cascade_Inf->lower = PGRget_lower_cascade(); + } + if 
(Cascade_Inf->lower == NULL) + { + return STATUS_ERROR; + } + s=Cascade_Inf->lower; + memset(userName,0,sizeof(userName)); + if (getenv("LOGNAME") != NULL) /* LOGNAME may be unset; userName then stays empty */ + strncpy(userName ,getenv("LOGNAME"),sizeof(userName)-1); + PGRnotice_replication_server(s->hostName, + s->portNumber, + s->recoveryPortNumber, + s->lifecheckPortNumber, + userName); + + return STATUS_OK; +} +/* PGRwait_notice_rlog_done(): wait for one packet from the lower cascade server on lsock and discard it. STATUS_ERROR when no lower socket is open. */ +int +PGRwait_notice_rlog_done(void) +{ + ReplicateHeader header; + if (lsock != -1) + { + free(PGRread_packet(lsock,&header)); /* only the arrival matters; release the packet (free(NULL) is a no-op) */ + return STATUS_OK; + } + return STATUS_ERROR; + +} + +/* PGRsend_notice_quit(): send "QUIT_SAFELY" to the lower cascade server and wait for its answer. Always returns STATUS_OK. */ +int +PGRsend_notice_quit(void ) +{ + ReplicateHeader header; + int size = 0; + + size = strlen("QUIT_SAFELY"); + memset(&header,0,sizeof(ReplicateHeader)); + header.cmdSys = CMD_SYS_CALL ; + header.cmdSts = CMD_STS_RESPONSE ; + header.cmdType = CMD_TYPE_FRONTEND_CLOSED; + header.query_size = htonl(size); + PGRsend_lower_cascade(&header, "QUIT_SAFELY"); + PGRwait_notice_rlog_done(); + return STATUS_OK; +} +/* PGRsend_notice_rlog_done(): send PGR_QUERY_DONE_NOTICE_CMD as a cascade response on sock. STATUS_ERROR only for an invalid socket. */ +int +PGRsend_notice_rlog_done(int sock) +{ + ReplicateHeader header; + int size = 0; + + if (sock <= 0) + { + return STATUS_ERROR; + } + + size = strlen(PGR_QUERY_DONE_NOTICE_CMD); + memset(&header,0,sizeof(ReplicateHeader)); + header.cmdSys = CMD_SYS_CASCADE ; + header.cmdSts = CMD_STS_RESPONSE ; + header.cmdType = 0; + header.query_size = htonl(size); + PGRsend_cascade(sock, &header, PGR_QUERY_DONE_NOTICE_CMD); + return STATUS_OK; + +} +#endif /* USE_REPLICATION */ diff -aruN postgresql-8.2.4/src/pgcluster/pgrp/conf.c pgcluster-1.7.0rc7/src/pgcluster/pgrp/conf.c --- postgresql-8.2.4/src/pgcluster/pgrp/conf.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pgrp/conf.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,694 @@ +/*-------------------------------------------------------------------- + * FILE: + * conf.c + * Replication server for PostgreSQL + * + * NOTE: + * Read and set configuration data in this module. 
+ * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + *-------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include "libpq-fe.h" +#include "libpq-int.h" +#include "fe-auth.h" + +#include "replicate_com.h" +#include "pgreplicate.h" + +/*-------------------------------------------------------------------- + * SYMBOL + * PGRget_Conf_Data() + * NOTES + * Initialize memory and tables: opens the status and rid files, reads the configuration file, allocates the tables, shared memory segments and semaphores, then fills them from the parsed configuration + * ARGS + * char * path: directory holding the setup, status and rid files; NULL means "." (I) + * RETURN + * OK: STATUS_OK + * NG: STATUS_ERROR + *-------------------------------------------------------------------- + */ +int +PGRget_Conf_Data(char * path) +{ + char * func = "PGRget_Conf_Data()"; + HostTbl host_tbl[MAX_DB_SERVER]; + ConfDataType * conf = NULL; + int cnt = 0; + int lb_cnt = 0; + int cascade_cnt = 0; + int rec_no = 0; + int lb_rec_no = 0; + int cascade_rec_no = -1; + int i = 0; + int size = 0; + char fname[256]; + union semun sem_arg; + + /* + * allocate the log file descriptor and open the status and rid files + */ + if (path == NULL) + { + path = "."; + } + size = sizeof(LogFileInf); + LogFileData = (LogFileInf *) malloc(size); + if (LogFileData == NULL) + { + show_error("%s:malloc() failed. 
reason: %s", func,strerror(errno)); + return STATUS_ERROR; + } + memset(LogFileData,0,size); + + snprintf(fname,sizeof(fname),"%s/%s",path,PGREPLICATE_STATUS_FILE); + StatusFp = fopen(fname,"a"); + if (StatusFp == NULL) + { + show_error("%s:fopen failed: (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + + snprintf(fname,sizeof(fname),"%s/%s",path,PGREPLICATE_RID_FILE); + RidFp = fopen(fname,"r+"); /* open the existing rid file for update ... */ + if (RidFp == NULL) + { + RidFp = fopen(fname,"w+"); /* ... or create it when it cannot be opened */ + if (RidFp == NULL) + { + show_error("%s:fopen failed: (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + } + + /* + * read configuration file + */ + if (PGR_Get_Conf_Data(path,PGREPLICATE_CONF_FILE) != STATUS_OK) + { + show_error("%s:PGR_Get_Conf_Data failed",func); + return STATUS_ERROR; + } +#ifdef PRINT_DEBUG + show_debug("PGR_Get_Conf_Data ok"); +#endif + + /* allocate response information table */ + PGR_Response_Inf = (ResponseInf *)malloc(sizeof(ResponseInf)); + if (PGR_Response_Inf == NULL) + { + show_error("%s:malloc() failed. reason: %s", func,strerror(errno)); + return STATUS_ERROR; + } + PGR_Response_Inf->response_mode = PGR_NORMAL_MODE; + PGR_Response_Inf->current_cluster = 0; + + /* + * memory allocate load balance table buffer (not cleared here) + */ + LoadBalanceTbl = (RecoveryTbl *)malloc(sizeof(RecoveryTbl)*MAX_DB_SERVER); + if (LoadBalanceTbl == (RecoveryTbl *)NULL) + { + show_error("%s:malloc failed: (%s)",func,strerror(errno)); + return STATUS_ERROR; + } +#ifdef PRINT_DEBUG + show_debug("LoadBalanceTbl allocate ok"); +#endif + + /* + * memory allocate cascade server table buffer (shared memory, MAX_DB_SERVER entries) + */ + size = sizeof(ReplicateServerInfo) * MAX_DB_SERVER; + CascadeTblShmid = shmget(IPC_PRIVATE,size,IPC_CREAT | IPC_EXCL | 0600); + if (CascadeTblShmid < 0) + { + show_error("%s:shmget() failed. 
reason: %s", func,strerror(errno)); + return STATUS_ERROR; + } +#ifdef PRINT_DEBUG + show_debug("%s:CascadeTbl shmget ok",func); +#endif + Cascade_Tbl = (ReplicateServerInfo *)shmat(CascadeTblShmid,0,0); + if (Cascade_Tbl == (ReplicateServerInfo *)-1) + { + show_error("%s:shmat() failed. reason: %s", func,strerror(errno)); + return STATUS_ERROR; + } +#ifdef PRINT_DEBUG + show_debug("%s:CascadeTbl shmat ok",func); +#endif + memset(Cascade_Tbl , 0 , size ); + + /* + * memory allocate cascade index (shared memory) + */ + size = sizeof(CascadeInf); + CascadeInfShmid = shmget(IPC_PRIVATE,size,IPC_CREAT | IPC_EXCL | 0600); + if (CascadeInfShmid < 0) + { + show_error("%s:shmget() failed. reason: %s", func,strerror(errno)); + return STATUS_ERROR; + } +#ifdef PRINT_DEBUG + show_debug("%s:CascadeInf shmget ok",func); +#endif + Cascade_Inf = (CascadeInf *)shmat(CascadeInfShmid,0,0); + if (Cascade_Inf == (CascadeInf *)-1) + { + show_error("%s:shmat() failed. reason: %s",func, strerror(errno)); + return STATUS_ERROR; + } +#ifdef PRINT_DEBUG + show_debug("%s:CascadeInf shmat ok",func); +#endif + memset(Cascade_Inf , 0 , size ); + + /* + * memory allocate replication commit log buffer (shared memory) + */ + size = sizeof(CommitLogInf) * MAX_DB_SERVER * MAX_CONNECTIONS; + CommitLogShmid = shmget(IPC_PRIVATE,size,IPC_CREAT | IPC_EXCL | 0600); + if (CommitLogShmid < 0) + { + show_error("%s:shmget() failed. reason: %s", func, strerror(errno)); + return STATUS_ERROR; + } +#ifdef PRINT_DEBUG + show_debug("%s:CommitLog shmget ok",func); +#endif + Commit_Log_Tbl = (CommitLogInf *)shmat(CommitLogShmid,0,0); + if (Commit_Log_Tbl == (CommitLogInf *)-1) + { + show_error("%s:shmat() failed. 
reason: %s",func, strerror(errno)); + return STATUS_ERROR; + } +#ifdef PRINT_DEBUG + show_debug("%s:Commit_Log_Tbl shmat ok",func); +#endif + memset(Commit_Log_Tbl , 0 , size ); + (Commit_Log_Tbl + (MAX_DB_SERVER * MAX_CONNECTIONS) -1)->inf.useFlag = DB_TBL_END; /* mark the last slot as end of table */ + + /* create a set of two semaphores, each set to 1 */ + if ((SemID = semget(IPC_PRIVATE,2,IPC_CREAT | IPC_EXCL | 0600)) < 0) + { + show_error("%s:semget() failed. (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + for ( i = 0 ; i < 2 ; i ++) + { + semctl(SemID, i, GETVAL, sem_arg); + sem_arg.val = 1; + semctl(SemID, i, SETVAL, sem_arg); + } + + /* create the semaphore set for the cascade table, each set to 1 */ + if ((CascadeSemID = semget(IPC_PRIVATE,2,IPC_CREAT | IPC_EXCL | 0600)) < 0) + { + show_error("%s:semget() failed. (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + for ( i = 0 ; i < 2 ; i ++) + { + semctl(CascadeSemID, i, GETVAL, sem_arg); + sem_arg.val = 1; + semctl(CascadeSemID, i, SETVAL, sem_arg); + } + + /* third semaphore set (VacuumSemID), initialised the same way */ + + if ((VacuumSemID = semget(IPC_PRIVATE,2,IPC_CREAT | IPC_EXCL | 0600)) < 0) + { + show_error("%s:semget() failed. 
(%s)",func,strerror(errno)); + return STATUS_ERROR; + } + for ( i = 0 ; i < 2 ; i ++) + { + semctl(VacuumSemID, i, GETVAL, sem_arg); + sem_arg.val = 1; + semctl(VacuumSemID, i, SETVAL, sem_arg); + } + size = sizeof(ReplicationLogInf); + Replicateion_Log = malloc(size); + if (Replicateion_Log == NULL) + { + show_error("%s:malloc failed: (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + memset(Replicateion_Log , 0 , size ); + Replicateion_Log->RLog_Sock_Path = NULL; +#ifdef PRINT_DEBUG + show_debug("%s:RLog Memory Allocation ok",func); +#endif + + + /* + * walk the parsed configuration list and store each value in its table + */ + conf = ConfData_Top; + while (conf != (ConfDataType *)NULL) + { + show_debug("registering (key,value)=(%s,%s)",conf->key,conf->value); + /* get cluster db data */ + if (!STRCMP(conf->table,CLUSTER_SERVER_TAG)) + { + rec_no = conf->rec_no; + if ((rec_no < 0) || (rec_no >= MAX_DB_SERVER)) + { + /* record does not fit host_tbl[]: skip it, and advance or the loop never ends */ + conf = (ConfDataType*)conf->next; + continue; + } + if (cnt < rec_no) + cnt = rec_no; /* highest cluster record number seen so far */ + if (!STRCMP(conf->key,HOST_NAME_TAG)) + { + int ip; + strncpy(host_tbl[rec_no].hostName,conf->value,sizeof(host_tbl[rec_no].hostName)); + show_debug("registering hostname %s",host_tbl[rec_no].hostName); + ip=PGRget_ip_by_name(conf->value); + + sprintf(host_tbl[rec_no].resolvedName, + "%d.%d.%d.%d", + (ip ) & 0xff , + (ip >> 8) & 0xff , + (ip >> 16) & 0xff , + (ip >> 24) & 0xff ); + show_debug("resolved name is %s",host_tbl[rec_no].resolvedName); + + conf = (ConfDataType*)conf->next; + continue; + } + if (!STRCMP(conf->key,PORT_TAG)) + { + host_tbl[rec_no].port = atoi(conf->value); + conf = (ConfDataType*)conf->next; + continue; + } + if (!STRCMP(conf->key,RECOVERY_PORT_TAG)) + { + host_tbl[rec_no].recoveryPort = atoi(conf->value); + conf = (ConfDataType*)conf->next; + continue; + } + } + /* get cascade server data */ + else if (!STRCMP(conf->table, REPLICATION_SERVER_INFO_TAG)) + { + if ((conf->rec_no < 0) || (conf->rec_no >= MAX_DB_SERVER - 2)) { /* two more slots are needed after it: our own entry and DB_TBL_END */ + conf = (ConfDataType*)conf->next; /* skip the record; advancing keeps the loop finite */ + continue; + } + cascade_rec_no = conf->rec_no ; + if (cascade_cnt < cascade_rec_no) + { + cascade_cnt = cascade_rec_no; + 
} + if (!STRCMP(conf->key,HOST_NAME_TAG)) + { + strncpy((Cascade_Tbl+cascade_rec_no)->hostName,conf->value,sizeof(Cascade_Tbl->hostName)); + conf = (ConfDataType*)conf->next; + continue; + } + if (!STRCMP(conf->key,PORT_TAG)) + { + if (atoi(conf->value) > 0) + { + (Cascade_Tbl+cascade_rec_no)->portNumber = atoi(conf->value); + } + else + { + (Cascade_Tbl+cascade_rec_no)->portNumber = DEFAULT_PGRP_PORT; + } + (Cascade_Tbl+cascade_rec_no)->sock = -1; + + conf = (ConfDataType*)conf->next; + PGRset_cascade_server_status(Cascade_Tbl+cascade_rec_no,DB_TBL_USE); + if (cascade_rec_no == 0) + { + Cascade_Inf->top = Cascade_Tbl; + } + continue; + } + if (!STRCMP(conf->key,RECOVERY_PORT_TAG)) + { + if (atoi(conf->value) > 0) + { + (Cascade_Tbl+cascade_rec_no)->recoveryPortNumber = atoi(conf->value); + } + else + { + (Cascade_Tbl+cascade_rec_no)->recoveryPortNumber = DEFAULT_PGRP_RECOVERY_PORT; + } + (Cascade_Tbl+cascade_rec_no)->rlog_sock=-1; + (Cascade_Tbl+cascade_rec_no +1)->useFlag = DB_TBL_END; + conf = (ConfDataType*)conf->next; + continue; + } + } + /* get loadbalancer table data */ + else if (!STRCMP(conf->table,LOAD_BALANCE_SERVER_TAG)) + { + lb_rec_no = conf->rec_no; + if ((lb_rec_no < 0) || (lb_rec_no >= MAX_DB_SERVER - 1)) + { + /* no room for the record plus the zeroed terminator entry: skip it, and advance or the loop never ends */ + conf = (ConfDataType*)conf->next; + continue; + } + if (lb_cnt < lb_rec_no) + lb_cnt = lb_rec_no; /* highest load balancer record number seen so far */ + if (!STRCMP(conf->key,HOST_NAME_TAG)) + { + strncpy((LoadBalanceTbl + lb_rec_no)->hostName, conf->value,sizeof(LoadBalanceTbl->hostName)); + conf = (ConfDataType*)conf->next; + continue; + } + if (!STRCMP(conf->key,RECOVERY_PORT_TAG)) + { + (LoadBalanceTbl + lb_rec_no)->recoveryPort = atoi(conf->value); + (LoadBalanceTbl + lb_rec_no)->sock = -1; + (LoadBalanceTbl + lb_rec_no)->recovery_sock = -1; + conf = (ConfDataType*)conf->next; + continue; + } + } + /* get logging file data */ + else if (!STRCMP(conf->table, LOG_INFO_TAG)) + { + if (!STRCMP(conf->key, FILE_NAME_TAG)) + { + strncpy(LogFileData->file_name, conf->value ,sizeof(LogFileData->file_name)); + LogFileData->fp = NULL; + conf = 
(ConfDataType*)conf->next; + continue; + } + if (!STRCMP(conf->key, FILE_SIZE_TAG)) + { + int i,len; + char * ptr; + int unit = 1; + len = strlen(conf->value); + ptr = conf->value; + for (i = 0; i < len ; i ++,ptr++) /* find the unit suffix: the first character that is neither digit nor space */ + { + if ((! isdigit((unsigned char)*ptr)) && (! isspace((unsigned char)*ptr))) /* <ctype.h> functions need an unsigned char value */ + { + switch (*ptr) + { + case 'K': + case 'k': + unit = 1024; + break; + case 'M': + case 'm': + unit = 1024*1024; + break; + case 'G': + case 'g': + unit = 1024*1024*1024; + break; + } + *ptr = '\0'; /* cut the value at the suffix so atoi() sees only the number */ + break; + } + } + LogFileData->max_size = atoi(conf->value) * unit; + conf = (ConfDataType*)conf->next; + continue; + } + if (!STRCMP(conf->key, LOG_ROTATION_TAG)) + { + LogFileData->rotation = atoi(conf->value); + conf = (ConfDataType*)conf->next; + continue; + } + } + else + { + if (!STRCMP(conf->key,HOST_NAME_TAG)) + { + int ip; + ip=PGRget_ip_by_name(conf->value); + if (ResolvedName == NULL) + { + ResolvedName = malloc(ADDRESS_LENGTH); + } + if (ResolvedName == NULL) + { + show_error("%s:malloc() failed. reason: %s", func,strerror(errno)); + return STATUS_ERROR; + } + /* the buffer is reused on a repeated host name entry: clear it before formatting */ + memset(ResolvedName,0,ADDRESS_LENGTH); + + + sprintf(ResolvedName, + "%d.%d.%d.%d", + (ip ) & 0xff , + (ip >> 8) & 0xff , + (ip >> 16) & 0xff , + (ip >> 24) & 0xff ); + conf = (ConfDataType*)conf->next; + continue; + } + else if (!STRCMP(conf->key,REPLICATE_PORT_TAG)) + { + Port_Number = atoi(conf->value); + conf = (ConfDataType*)conf->next; + continue; + } + /* get port number for recovery cluster db server */ + else if (!STRCMP(conf->key,RECOVERY_PORT_TAG)) + { + if (atoi(conf->value) > 0) + { + Recovery_Port_Number = atoi(conf->value); + } + else + { + Recovery_Port_Number =DEFAULT_PGRP_RECOVERY_PORT; + } + conf = (ConfDataType*)conf->next; + continue; + } + else if (!STRCMP(conf->key,LIFECHECK_PORT_TAG)) + { + if (atoi(conf->value) > 0) + { + LifeCheck_Port_Number = atoi(conf->value); + } + else + { + LifeCheck_Port_Number = DEFAULT_PGRP_LIFECHECK_PORT; + } + conf = (ConfDataType*)conf->next; + continue; + } + else if (!STRCMP(conf->key,RLOG_PORT_TAG)) + { + if (atoi(conf->value) > 0) + { + 
Replicateion_Log->RLog_Port_Number = atoi(conf->value); + } + else + { + Replicateion_Log->RLog_Port_Number = DEFAULT_PGRP_RLOG_PORT; + } + conf = (ConfDataType*)conf->next; + continue; + } + /* get response mode */ + else if (!STRCMP(conf->key,RESPONSE_MODE_TAG)) + { + if (!STRCMP(conf->value,RESPONSE_MODE_RELIABLE)) + { + PGR_Response_Inf->response_mode = PGR_RELIABLE_MODE; + } + else if (!STRCMP(conf->value,RESPONSE_MODE_FAST)) + { + PGR_Response_Inf->response_mode = PGR_FAST_MODE; + } + else + { + PGR_Response_Inf->response_mode = PGR_NORMAL_MODE; + } + conf = (ConfDataType*)conf->next; + continue; + } + /* get replication log use or not */ + else if (!STRCMP(conf->key,USE_REPLICATION_LOG_TAG)) + { + if (!STRCMP(conf->value,"yes")) + { + PGR_Use_Replication_Log = true; + } + conf = (ConfDataType*)conf->next; + continue; + } + /* get replication timeout */ + else if (!STRCMP(conf->key,TIMEOUT_TAG)) + { + /* get replication timeout; accepted range is 1..3600 */ + PGR_Replication_Timeout = PGRget_time_value(conf->value); + if ((PGR_Replication_Timeout < 1) || (PGR_Replication_Timeout > 3600)) + { + fprintf(stderr,"%s is out of range. It should be between 1sec-1hr.\n",TIMEOUT_TAG); + return STATUS_ERROR; + } + conf = (ConfDataType*)conf->next; + continue; + } + else if (!STRCMP(conf->key,LIFECHECK_TIMEOUT_TAG)) + { + /* get lifecheck timeout; accepted range is 1..3600 */ + PGR_Lifecheck_Timeout = PGRget_time_value(conf->value); + if ((PGR_Lifecheck_Timeout < 1) || (PGR_Lifecheck_Timeout > 3600)) + { + show_error("%s is out of range. It should be between 1sec-1hr.\n",LIFECHECK_TIMEOUT_TAG); + return STATUS_ERROR; + } + conf = (ConfDataType*)conf->next; + continue; + } + else if (!STRCMP(conf->key,LIFECHECK_INTERVAL_TAG)) + { + /* get lifecheck interval; accepted range is 1..3600 */ + PGR_Lifecheck_Interval = PGRget_time_value(conf->value); + if ((PGR_Lifecheck_Interval < 1) || (PGR_Lifecheck_Interval > 3600)) + { + show_error("%s is out of range. 
It should between 1sec-1hr.\n",LIFECHECK_INTERVAL_TAG); + return STATUS_ERROR; + } + conf = (ConfDataType*)conf->next; + continue; + } + } + conf = (ConfDataType*)conf->next; /* entry not consumed by any branch above: move on */ + } + + /* create cluster db server table (shared memory, MAX_DB_SERVER entries) */ + Host_Tbl_Begin = (HostTbl *)NULL; + + size = sizeof(HostTbl) * MAX_DB_SERVER; + HostTblShmid = shmget(IPC_PRIVATE,size,IPC_CREAT | IPC_EXCL | 0600); + if (HostTblShmid < 0) + { + show_error("%s:shmget() failed. reason: %s", func,strerror(errno)); + return STATUS_ERROR; + } +#ifdef PRINT_DEBUG + show_debug("%s:HostTbl shmget ok",func); +#endif + Host_Tbl_Begin = (HostTbl *)shmat(HostTblShmid,0,0); + if (Host_Tbl_Begin == (HostTbl *)-1) + { + show_error("%s:shmat() failed. reason: %s", func, strerror(errno)); + return STATUS_ERROR; + } +#ifdef PRINT_DEBUG + show_debug("%s:HostTbl shmat ok",func); +#endif + memset(Host_Tbl_Begin , 0 , size ); + Host_Tbl_Begin -> useFlag = DB_TBL_END; + + for ( i = 0 ; i <= cnt ; i ++) /* cnt is the highest cluster record number, hence <= */ + { + PGRadd_HostTbl(&host_tbl[i],DB_TBL_INIT); + } + /* set load balance table */ + for ( i = 0 ; i <= lb_cnt ; i ++) + { + (LoadBalanceTbl + i)->port = -1; + (LoadBalanceTbl + i)->sock = -1; + } + memset((LoadBalanceTbl + i),0,sizeof(RecoveryTbl)); /* i == lb_cnt + 1 here: zero the entry after the last one */ + PGR_Free_Conf_Data(); + + /* allocate result buffer of query */ + PGR_Result = malloc(PGR_MESSAGE_BUFSIZE); + if (PGR_Result == NULL) + { + show_error("%s:malloc() failed. reason: %s", func, strerror(errno)); + return STATUS_ERROR; + } + memset(PGR_Result,0,PGR_MESSAGE_BUFSIZE); + + /* allocate log_data */ + PGR_Log_Header = malloc(sizeof(ReplicateHeader)); + if (PGR_Log_Header == NULL) + { + show_error("%s:malloc() failed. reason: %s", func, strerror(errno)); + return STATUS_ERROR; + } + memset(PGR_Log_Header,0,sizeof(ReplicateHeader)); + + /* allocate send query id */ + size = sizeof(unsigned int) * (MAX_DB_SERVER +1); + PGR_Send_Query_ID = malloc (size); + if (PGR_Send_Query_ID == NULL) + { + show_error("%s:malloc() failed. 
reason: %s", func, strerror(errno)); + return STATUS_ERROR; + } + memset(PGR_Send_Query_ID, 0, size); + for ( i = 0 ; i < MAX_DB_SERVER ; i ++) + { + StartReplication[i] = true; + } + + /* set self data into cascade table, in the slot after the last configured cascade record */ + + cascade_rec_no ++; + if (ResolvedName != NULL) + { + strncpy((Cascade_Tbl+cascade_rec_no)->hostName,ResolvedName,ADDRESS_LENGTH); /* NOTE(review): copies ADDRESS_LENGTH bytes; assumes hostName is at least that large - confirm */ + } + else + { + + gethostname((Cascade_Tbl+cascade_rec_no)->hostName,sizeof(Cascade_Tbl->hostName)); + } + (Cascade_Tbl+cascade_rec_no)->portNumber = Port_Number; + (Cascade_Tbl+cascade_rec_no)->recoveryPortNumber = Recovery_Port_Number; + (Cascade_Tbl+cascade_rec_no)->sock = -1; + + PGRset_cascade_server_status(Cascade_Tbl+cascade_rec_no,DB_TBL_USE); + /* terminate the table right after our own entry */ + (Cascade_Tbl+(cascade_rec_no+1))->useFlag = DB_TBL_END; + + Cascade_Inf->top = Cascade_Tbl; + Cascade_Inf->end = Cascade_Tbl+cascade_rec_no; + Cascade_Inf->upper = NULL; + Cascade_Inf->lower = NULL; + if (cascade_rec_no >= 1) + { + Cascade_Inf->upper = (Cascade_Tbl+cascade_rec_no - 1); /* the entry before ours is our upper server */ + } + (Cascade_Tbl+(cascade_rec_no+1))->useFlag = DB_TBL_END; /* repeats the termination done above */ + + Cascade_Inf->myself = (Cascade_Tbl+cascade_rec_no); + Cascade_Inf->useFlag = DB_TBL_USE; + + PGR_Response_Inf->response_mode = PGR_NORMAL_MODE; /* NOTE(review): this overwrites any response mode parsed from the configuration above - confirm it is intended */ + + return STATUS_OK; +} + diff -aruN postgresql-8.2.4/src/pgcluster/pgrp/lifecheck.c pgcluster-1.7.0rc7/src/pgcluster/pgrp/lifecheck.c --- postgresql-8.2.4/src/pgcluster/pgrp/lifecheck.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pgrp/lifecheck.c 2007-03-01 16:27:15.000000000 +0100 @@ -0,0 +1,276 @@ +/*-------------------------------------------------------------------- + * FILE: + * lifecheck.c + * + * NOTE: + * This file is composed of the functions to call with the source + * at pgreplicate for the lifecheck. 
+ * + * Portions Copyright (c) 2003-2007, Atsushi Mitani + *-------------------------------------------------------------------- + */ +#include "postgres.h" +#include "postgres_fe.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* +#include "libpq/pqsignal.h" +#include "utils/guc.h" +#include "miscadmin.h" +#include "nodes/nodes.h" +#include "nodes/parsenodes.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "tcop/tcopprot.h" +#include "postmaster/postmaster.h" +*/ + +#include "libpq-fe.h" +#include "libpq-int.h" +#include "fe-auth.h" + +#include +#include +#include +#include + +#ifdef HAVE_NETINET_TCP_H +#include +#endif + +#ifdef HAVE_SYS_SELECT_H +#include +#endif + + +#ifdef HAVE_CRYPT_H +#include +#endif + + +#ifdef MULTIBYTE +#include "mb/pg_wchar.h" +#endif + +#include "access/xact.h" +#include "lib/dllist.h" +#include "libpq/pqformat.h" +#include "replicate_com.h" +#include "pgreplicate.h" + +#define PING_DB "template1" +#define PING_QUERY "SELECT 1" + +static HostTbl * PGR_Cluster_DB_4_Lifecheck = (HostTbl*)NULL; + +/*-------------------------------------- + * PROTOTYPE DECLARATION + *-------------------------------------- + */ +int PGRlifecheck_main(int fork_wait_time); + +static bool is_started_replication(void); +static void set_timeout(SIGNAL_ARGS); +static int lifecheck_loop(void); +static int ping_cluster(PGconn * conn); +static void set_host_status( HostTbl * host_ptr , int status ); + +int +PGRlifecheck_main(int fork_wait_time) +{ + bool started = false; + pid_t pgid = 0; + pid_t pid = 0; + + pgid = getpgid(0); + pid = fork(); + if (pid != 0) + { + return STATUS_OK; + } + + /* + * in child process, + * call recovery module + */ + setpgid(0,pgid); + + PGRsignal(SIGHUP, PGRexit_subprocess); + PGRsignal(SIGTERM, PGRexit_subprocess); + PGRsignal(SIGINT, PGRexit_subprocess); + PGRsignal(SIGQUIT, PGRexit_subprocess); + PGRsignal(SIGALRM, set_timeout); 
+ + if (fork_wait_time > 0) { + sleep(fork_wait_time); + } + + if (PGRuserName == NULL) + { + PGRuserName = getenv("LOGNAME"); + if (PGRuserName == NULL) + { + PGRuserName = getenv("USER"); + if (PGRuserName == NULL) + PGRuserName = "postgres"; + } + } + + for (;;) + { + started = is_started_replication(); + if (!started) + { + /* wait next lifecheck as interval */ + sleep(PGR_Lifecheck_Interval); + continue; + } + + /* life check to all cluster dbs */ + lifecheck_loop(); + + /* wait next lifecheck as interval */ + sleep(PGR_Lifecheck_Interval); + } + return STATUS_OK; +} + +static bool +is_started_replication(void) +{ + HostTbl * host_ptr = (HostTbl*)NULL; + + host_ptr = Host_Tbl_Begin; + while(host_ptr->useFlag != DB_TBL_END) + { + if (host_ptr->useFlag == DB_TBL_USE) + { + return true; + } + host_ptr ++; + } + return false; +} + +static void +set_timeout(SIGNAL_ARGS) +{ + if (PGR_Cluster_DB_4_Lifecheck != NULL) + { + PGR_Cluster_DB_4_Lifecheck->retry_count ++; + if (PGR_Cluster_DB_4_Lifecheck->retry_count > PGR_CONNECT_RETRY_TIME ) + { + set_host_status(PGR_Cluster_DB_4_Lifecheck,DB_TBL_ERROR); + } + } + PGRsignal(SIGALRM, set_timeout); +} + +static int +lifecheck_loop(void) +{ + HostTbl * host_ptr = (HostTbl*)NULL; + char port[8]; + char * host = NULL; + PGconn * conn = NULL; + + host_ptr = Host_Tbl_Begin; + if (host_ptr == NULL) + { + return STATUS_ERROR; + } + alarm(0); + while(host_ptr->useFlag != DB_TBL_END) + { + /* + * check the status of the cluster DB + */ + if (host_ptr->useFlag != DB_TBL_USE) + { + host_ptr ++; + continue; + } + snprintf(port,sizeof(port),"%d", host_ptr->port); + host = (char *)(host_ptr->resolvedName); + /* set host data */ + PGR_Cluster_DB_4_Lifecheck = host_ptr; + + /* set alarm as lifecheck timeout */ + alarm(PGR_Lifecheck_Timeout); + + /* connect DB */ + conn = PGRcreateConn(host,port, PING_DB ,PGRuserName,"","",""); + if ((conn != NULL) && + (ping_cluster(conn) == STATUS_OK)) + { + set_host_status(host_ptr, DB_TBL_USE); + } + 
else + { + set_host_status(host_ptr, DB_TBL_ERROR); + } + /* reset alarm */ + alarm(0); + + PQfinish(conn); + conn = NULL; + host_ptr ++; + } + + return STATUS_OK; +} + +static int +ping_cluster(PGconn * conn) +{ + int status = 0; + PGresult * res = (PGresult *)NULL; + + res = PQexec(conn, PING_QUERY ); + + status = PQresultStatus(res); + if (res != NULL) + { + PQclear(res); + } + if ((status == PGRES_NONFATAL_ERROR ) || + (status == PGRES_FATAL_ERROR )) + { + return STATUS_ERROR; + } + return STATUS_OK; +} + +static void +set_host_status( HostTbl * host_ptr , int status ) +{ + if (host_ptr == NULL) + return; + if (status == DB_TBL_ERROR) + { + host_ptr->retry_count ++; + if (host_ptr->retry_count > PGR_CONNECT_RETRY_TIME ) + { + PGRset_host_status(host_ptr, status); + } + } + else + { + host_ptr->retry_count = 0; + PGRset_host_status(host_ptr, status); + } +} + diff -aruN postgresql-8.2.4/src/pgcluster/pgrp/main.c pgcluster-1.7.0rc7/src/pgcluster/pgrp/main.c --- postgresql-8.2.4/src/pgcluster/pgrp/main.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pgrp/main.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,935 @@ +/*-------------------------------------------------------------------- + * FILE: + * main.c + * Replication server for PostgreSQL + * + * NOTE: + * This is the main module of the replication server. 
+ * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + *-------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_NETINET_TCP_H +#include +#endif +#ifdef HAVE_SYS_SELECT_H +#include +#endif + +#ifdef HAVE_GETOPT_H +#include +#endif + +#include "miscadmin.h" +#include "nodes/nodes.h" + +#include "libpq-fe.h" +#include "libpq/libpq-fs.h" +#include "libpq-int.h" +#include "fe-auth.h" + + +#include "access/xact.h" +#include "replicate_com.h" +#include "pgreplicate.h" + +#ifdef WIN32 +#include "win32.h" +#endif +#include +#ifdef HAVE_CRYPT_H +#include +#endif + +#ifdef MULTIBYTE +#include "mb/pg_wchar.h" +#endif + +/*-------------------------------------- + * GLOBAL VARIABLE DECLARATION + *-------------------------------------- + */ +/* for replicate_com.h */ + +ConfDataType * ConfData_Top = (ConfDataType *)NULL; +ConfDataType * ConfData_End = (ConfDataType *)NULL; + +/* replication server data */ +char * ResolvedName = NULL; +uint16_t Port_Number = 0; +uint16_t LifeCheck_Port_Number = 0; +uint16_t Recovery_Port_Number = 0; +bool PGR_Parse_Session_Started = false; +int PGR_Replication_Timeout = 60; +int PGR_Lifecheck_Timeout = 3; +int PGR_Lifecheck_Interval = 15; + +/* global table data */ +HostTbl *Host_Tbl_Begin = NULL; +Dllist * Transaction_Tbl_Begin = NULL; +TransactionTbl * Transaction_Tbl_End = NULL; +RecoveryTbl * LoadBalanceTbl = NULL; +RecoveryStatusInf * Recovery_Status_Inf = NULL; +ReplicateHeader * PGR_Log_Header = NULL; +ReplicateServerInfo * Cascade_Tbl = NULL;; +CommitLogInf * Commit_Log_Tbl = NULL; +QueryLogType * Query_Log_Top = NULL; +QueryLogType * Query_Log_End = NULL; +CascadeInf * Cascade_Inf = NULL; +ReplicationLogInf * Replicateion_Log = NULL; +/* IPC's id data */ +int 
RecoveryShmid = 0; +int ReplicateSerializationShmid=0; +int RecoveryMsgShmid = 0; +int *RecoveryMsgid = NULL; +int HostTblShmid = 0; +int LockWaitTblShmid = 0; +int LoadBalanceTblShmid = 0; +int CascadeTblShmid = 0; +int CascadeInfShmid = 0; +int CommitLogShmid = 0; +int QueryLogMsgid = 0; +int QueryLogAnsMsgid = 0; +int PGconnMsgid = 0; +int MaxBackends = 0; +char * PGR_Result = NULL; +int SemID = 0; +int RecoverySemID= 0; +int RecovErysemid = 0; +int VacuumSemID = 0; +int CascadeSemID= 0; +char * PGR_Data_Path = NULL; +char * PGR_Write_Path = NULL; +int IS_SESSION_AUTHORIZATION = 0; +ResponseInf * PGR_Response_Inf = NULL; +bool StartReplication[MAX_DB_SERVER]; +bool PGR_Cascade = false; +bool PGR_Use_Replication_Log = false; +bool PGR_AutoCommit = true; +unsigned int * PGR_Send_Query_ID = NULL; +unsigned int PGR_Query_ID = 0; +volatile bool exit_processing = false; +int pgreplicate_pid = 0; + +int ReplicateSock = -1; +int exit_signo = SIGTERM; + +RecoveryQueueInf RecoveryQueue; +char * Backend_Socket_Dir = NULL; + +unsigned int * PGR_ReplicateSerializationID = NULL; + +int Log_Print = 0; +int Debug_Print = 0; +FILE * LogFp = (FILE *)NULL; +FILE * StatusFp = (FILE *)NULL; +FILE * RidFp = (FILE *)NULL; +FILE * QueueFp = (FILE *)NULL; + +extern char *optarg; +char * PGRuserName = NULL; + +int fork_wait_time = 0; +int Idle_Flag = IDLE_MODE; +volatile bool Exit_Request = false; + +pthread_mutex_t transaction_table_mutex; + +/*-------------------------------------- + * PROTOTYPE DECLARATION + *-------------------------------------- + */ +static void startup_replication_server(void); +static int replicate_loop(int fd); +static void replicate_main(void); +static void quick_exit(SIGNAL_ARGS); +static void daemonize(void); +static void write_pid_file(void); +static void stop_pgreplicate(void); +static bool is_exist_pid_file(void); +static void usage(void); +static void set_exit_processing(int signo); + +/*-------------------------------------------------------------------- 
+ * SYMBOL + * replicate_loop() + * NOTES + * replication module + * ARGS + * int fd : + * RETURN + * OK: STATUS_OK + * NG: STATUS_ERROR + *-------------------------------------------------------------------- + */ +static int +replicate_loop(int fd) +{ + char * func = "replicate_loop()"; + pid_t pgid = 0; + pid_t pid = 0; + int sock = -1; + int rtn = 0; + int cnt = 0; + int result; + bool exist_sys_log=false; + bool exist_replicate=false; + bool clear_connection = false; + + + result = PGR_Create_Acception(fd,&sock,"",Port_Number); + if (result == STATUS_ERROR) + { + show_error("%s: accept failed (%s)", func, strerror(errno)); + if (sock != -1) + close(sock); + return 1; + } + + pgid = getpgid(0); + pid = fork(); + if (pid <0) + { + show_error("%s:fork failed (%s)",func,strerror(errno)); + PGRreplicate_exit(0); + } + if (pid == 0) + { + int status = LOOP_CONTINUE; + bool PGR_Cascade = false; + ReplicateHeader header; + ReplicateHeader header_save_for_recovering; + char * query = NULL; + + if (fork_wait_time > 0) { + sleep(fork_wait_time); + } + + close(fd); + + PGRsignal(SIGHUP, quick_exit); + PGRsignal(SIGINT, quick_exit); + PGRsignal(SIGQUIT, quick_exit); + PGRsignal(SIGTERM, quick_exit); + PGRsignal(SIGALRM, quick_exit); + PGRsignal(SIGPIPE, SIG_IGN); + setpgid(0,pgid); + + if (PGRinit_transaction_table() != STATUS_OK) + { + show_error("transaction table memory allocate failed"); + PGR_Close_Sock(&sock); + exit(1); + } + + pthread_mutex_init(&transaction_table_mutex, NULL); + + /* child loop */ + for (;;) + { + fd_set rmask; + struct timeval timeout; + + timeout.tv_sec = PGR_Replication_Timeout; + timeout.tv_usec = 0; + + if (query != NULL) + { + free(query); + query = NULL; + } + /* + * Wait for something to happen. 
+ */ + FD_ZERO(&rmask); + FD_SET(sock,&rmask); + rtn = select(sock+1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout); + if (rtn < 0) + { + if (errno == EINTR) + continue; + } + + if (rtn && FD_ISSET(sock, &rmask)) + { + query = NULL; + query = PGRread_packet(sock,&header); + if ((query == NULL) || (header.cmdSts == 0)) + { + + if (exist_sys_log) + { + show_error("%s:upper cascade closed? , errno=%d(%s)",func,errno,strerror(errno)); + memset(&header, 0, sizeof(ReplicateHeader)); + header.cmdSys = CMD_SYS_CALL; + header.cmdSts = CMD_STS_QUERY_SUSPEND; + header.query_size = htonl(0); + PGRsend_rlog_to_local(&header, NULL); + exist_sys_log = false; + } + else + { + if (exist_replicate) + { + PGRclear_connections(); + clear_connection = true; + header_save_for_recovering.cmdSts=CMD_TYPE_OTHER; + header_save_for_recovering.cmdType=CMD_TYPE_CONNECTION_CLOSE; + header_save_for_recovering.query_size = htonl(21); + PGRdo_replicate(sock,&header_save_for_recovering,"PGR_CLOSE_CONNECTION"); + } + PGRsend_notice_quit(); + } + break; + } + cnt = 0; + switch (header.cmdSys) + { + case CMD_SYS_LIFECHECK: + PGRreturn_result(sock,"1", PGR_NOWAIT_ANSWER); + break; + case CMD_SYS_PREREPLICATE: + if(Cascade_Inf!=NULL || + Cascade_Inf->upper == NULL) + { + /* 1 means "I am primary replicate server." */ + PGRreturn_result(sock,"1", PGR_NOWAIT_ANSWER); + } + else + { + /* 0 means "I am not primary replicate server." 
*/ + PGRreturn_result(sock,"0", PGR_NOWAIT_ANSWER); + } + break; + case CMD_SYS_REPLICATE: + if (exist_replicate == false) + { + exist_replicate=true; + memcpy(&header_save_for_recovering, + &header, + sizeof(ReplicateHeader)); + } + status = PGRdo_replicate(sock,&header,query); + break; + case CMD_SYS_LOG: + exist_sys_log = true; + PGRsend_rlog_to_local(&header, query); + /* set own replicate id by rlog */ + PGRset_replication_id(ntohl(header.replicate_id)); + PGRsend_notice_rlog_done(sock); + break; + case CMD_SYS_CASCADE: + PGR_Cascade = true; + PGRcascade_main(sock,&header,query); + break; + case CMD_SYS_CALL: + if (header.cmdSts == CMD_STS_TRANSACTION_ABORT) + { + PGRreconfirm_commit(sock,&header); + } + else if (header.cmdSts == CMD_STS_NOTICE) + { + + } + else if (header.cmdSts == CMD_STS_RESPONSE) + { + if (header.cmdType == CMD_TYPE_FRONTEND_CLOSED) + { + PGRsend_notice_rlog_done(sock); + status = LOOP_END; + } + } + break; + default: + show_error("WARNING: unknown Header->cmdSys %c",header.cmdSys); + } + } + if (status == LOOP_END) + { + break; + } + } + + PGR_Close_Sock(&sock); + if (query != NULL) + { + free(query); + query = NULL; + } + if (!clear_connection) + PGRclear_connections(); + PGRdestroy_transaction_table(); + pthread_mutex_destroy(&transaction_table_mutex); + exit(0); + } + else + { + PGR_Close_Sock(&sock); + return 0; + } +} + +static void +startup_replication_server(void) +{ + ReplicateHeader header; + char hostName[HOSTNAME_MAX_LENGTH]; + char userName[USERNAME_MAX_LENGTH]; + char query[256]; + + if (PGRuserName == NULL) + { + PGRuserName = getenv("LOGNAME"); + if (PGRuserName == NULL) + { + PGRuserName = getenv("USER"); + if (PGRuserName == NULL) + PGRuserName = "postgres"; + } + } + memset(&header,0,sizeof(ReplicateHeader)); + memset(query,0,sizeof(query)); + memset(hostName,0,sizeof(hostName)); + memset(userName,0,sizeof(userName)); + if (ResolvedName != NULL) + { + strncpy(hostName,ResolvedName,ADDRESS_LENGTH); + } + else + { + 
gethostname(hostName,sizeof(hostName)-1); + } + strncpy(userName ,PGRuserName,sizeof(userName)-1); + snprintf(query,sizeof(query)-1,"SELECT %s(%d,'%s',%d,%d)", + PGR_SYSTEM_COMMAND_FUNC, + PGR_STARTUP_REPLICATION_SERVER_FUNC_NO, + hostName, + Port_Number, + Recovery_Port_Number); + header.cmdSts = CMD_STS_NOTICE; + header.query_id = htonl(PGRget_next_query_id()); + header.query_size = htonl(strlen(query)); + memcpy(header.from_host,hostName,sizeof(header.from_host)); + memcpy(header.userName,userName,sizeof(header.userName)); + strcpy(header.dbName,"template1"); + replicate_packet_send_internal( &header, query,-1,PGRget_recovery_status(),true); +} + +/*-------------------------------------------------------------------- + * SYMBOL + * replicate_main() + * NOTES + * Replication main module + * ARGS + * void + * RETURN + * none + *-------------------------------------------------------------------- + */ +static void +replicate_main(void) +{ +#ifdef PRINT_DEBUG + char * func = "replicate_main()"; +#endif + int status; + int rtn; + show_debug ("%s:entering replicate_main",func); + + /* cascade start up notice */ + if (Cascade_Inf->upper != NULL) + { + show_debug("initialize cascade information"); + PGRstartup_cascade(); + } + + status = PGR_Create_Socket_Bind(&ReplicateSock, ResolvedName, Port_Number); + + if (status != STATUS_OK) + { + show_debug("%s %d port bind failed. quit.",func,Port_Number); + stop_pgreplicate(); + PGRreplicate_exit(0); + } +#ifdef PRINT_DEBUG + show_debug("%s %d port bind OK",func,Port_Number); +#endif + + + /* replication start up notice */ + startup_replication_server(); + + for (;;) + { + fd_set rmask; + struct timeval timeout; + + if (exit_processing == true) + PGRreplicate_exit(0); + + timeout.tv_sec = PGR_Replication_Timeout; + timeout.tv_usec = 0; + + + /* + * Wait for something to happen. 
+ */ + FD_ZERO(&rmask); + FD_SET(ReplicateSock,&rmask); + rtn = select(ReplicateSock+1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout); + if (rtn < 0) + continue; + + if (rtn && FD_ISSET(ReplicateSock, &rmask)) + { + /* + * get recovery status. + */ + PGRcheck_recovered_host(); + + if (exit_processing == true) + break; + + /* + * call replication module + */ + replicate_loop(ReplicateSock); + } + } +} + +/*-------------------------------------------------------------------- + * SYMBOL + * quick_exit() + * NOTES + * Exit child process + * ARGS + * SIGNAL_ARGS: receive signal number(I) + * RETURN + * none + *-------------------------------------------------------------------- + */ +static void +quick_exit(SIGNAL_ARGS) +{ +#ifdef PRINT_DEBUG + show_debug("quick_exit:signo = %d", postgres_signal_arg); +#endif + exit(0); +} + +/*-------------------------------------------------------------------- + * SYMBOL + * daemonize() + * NOTES + * Daemonize this process + * ARGS + * void + * RETURN + * none + *-------------------------------------------------------------------- + */ +static void +daemonize(void) +{ + char * func = "daemonize()"; + int i; + pid_t pid; + + pid = fork(); + if (pid == (pid_t) -1) + { + show_error("%s:fork() failed. reason: %s",func, strerror(errno)); + exit(1); + return; /* not reached */ + } + else if (pid > 0) + { /* parent */ + exit(0); + } + +#ifdef HAVE_SETSID + if (setsid() < 0) + { + show_error("%s:setsid() failed. reason:%s", func,strerror(errno)); + exit(1); + } +#endif + + i = open("/dev/null", O_RDWR); + dup2(i, 0); + dup2(i, 1); + dup2(i, 2); + close(i); +} + +/*-------------------------------------------------------------------- + * SYMBOL + * write_pid_file() + * NOTES + * The process ID is written in the file. + * This process ID is used when finish pglb. 
+ * ARGS + * void + * RETURN + * none + *-------------------------------------------------------------------- + */ +static void +write_pid_file(void) +{ + char * func = "write_pid_file()"; + FILE *fd; + char fname[256]; + char pidbuf[128]; + + snprintf(fname, sizeof(fname), "%s/%s", PGR_Write_Path, PGREPLICATE_PID_FILE); + fd = fopen(fname, "w"); + if (!fd) + { + show_error("%s:could not open pid file as %s. reason: %s", + func, fname, strerror(errno)); + exit(1); + } + snprintf(pidbuf, sizeof(pidbuf), "%d", getpid()); + fwrite(pidbuf, strlen(pidbuf), 1, fd); + if (fclose(fd)) + { + show_error("%s:could not write pid file as %s. reason: %s", + func,fname, strerror(errno)); + exit(1); + } +} + +/*-------------------------------------------------------------------- + * SYMBOL + * stop_pgreplicate() + * NOTES + * Stop the pgreplicate process + * ARGS + * void + * RETURN + * none + *-------------------------------------------------------------------- + */ +static void +stop_pgreplicate(void) +{ + char * func = "stop_pgreplicate()"; + FILE *fd; + char fname[256]; + char pidbuf[128]; + pid_t pid; + + snprintf(fname, sizeof(fname), "%s/%s", PGR_Write_Path, PGREPLICATE_PID_FILE); + fd = fopen(fname, "r"); + if (!fd) + { + show_error("%s:could not open pid file as %s. reason: %s", + func,fname, strerror(errno)); + exit(1); + } + memset(pidbuf,0,sizeof(pidbuf)); + fread(pidbuf, sizeof(pidbuf), 1, fd); + fclose(fd); + pid = atoi(pidbuf); + + if (kill (pid,SIGTERM) == -1) + { + show_error("%s:could not stop pid: %d, reason: %s",func,pid,strerror(errno)); + exit(1); + } +} + +/*-------------------------------------------------------------------- + * SYMBOL + * is_exist_pid_file() + * NOTES + * Check existence of pid file. 
+ * ARGS + * void + * RETURN + * 1: the pid file is exist + * 0: the pid file is not exist + *-------------------------------------------------------------------- + */ +static bool +is_exist_pid_file(void) +{ + char fname[256]; + struct stat buf; + + snprintf(fname, sizeof(fname), "%s/%s", PGR_Write_Path, PGREPLICATE_PID_FILE); + if (stat(fname,&buf) == 0) + { + /* pid file is exist */ + return true; + } + else + { + /* pid file is not exist */ + return false; + } +} + +/*-------------------------------------------------------------------- + * SYMBOL + * child_wait() + * NOTES + * Waiting for hung up a child + * ARGS + * int signal_args: signal number (expecting the SIGCHLD) + * RETURN + * none + *-------------------------------------------------------------------- + */ +void +child_wait(SIGNAL_ARGS) +{ + pid_t pid = 0; + + do { + int ret; + pid = waitpid(-1,&ret,WNOHANG); + } while(pid > 0); +} + +/*-------------------------------------------------------------------- + * SYMBOL + * usage() + * NOTES + * show usage of pglb + * ARGS + * void + * RETURN + * none + *-------------------------------------------------------------------- + */ +static void +usage(void) +{ + char * path; + + path = getenv("PGDATA"); + if (path == NULL) + path = "."; + fprintf(stderr,"PGReplicate version [%s]\n",PGREPLICATE_VERSION); + fprintf(stderr,"A replication server for cluster DB servers (based on PostgreSQL)\n\n"); + fprintf(stderr,"usage: pgreplicate [-D path_of_config_file] [-W path_of_work_files] [-U login user][-l][-n][-v][-h][stop]\n"); + fprintf(stderr," config file default path: %s/%s\n",path, PGREPLICATE_CONF_FILE); + fprintf(stderr," -l: print error logs in the log file.\n"); + fprintf(stderr," -n: don't run in daemon mode.\n"); + fprintf(stderr," -v: debug mode. 
need '-n' flag\n"); + fprintf(stderr," -h: print this help\n"); + fprintf(stderr," stop: stop pgreplicate\n"); +} + +/*-------------------------------------------------------------------- + * SYMBOL + * main() + * NOTES + * main module of pgreplicate + * ARGS + * int argc: number of parameter + * char ** argv: value of parameter + * RETURN + * none + *-------------------------------------------------------------------- + */ +int +main(int argc, char * argv[]) +{ + char * func = "main()"; + int opt = 0; + char * r_path = NULL; + char * w_path = NULL; + bool detach = true; + pid_t rlog_pid; + + r_path = getenv("PGDATA"); + if (r_path == NULL) + r_path = "."; + while ((opt = getopt(argc, argv, "U:D:W:w:lvnh")) != -1) + { + switch (opt) + { + case 'U': + if (!optarg) + { + usage(); + exit(1); + } + PGRuserName = strdup(optarg); + break; + case 'D': + if (!optarg) + { + usage(); + exit(1); + } + r_path = optarg; + break; + case 'W': + if (!optarg) + { + usage(); + exit(1); + } + w_path = optarg; + break; + case 'w': + fork_wait_time = atoi(optarg); + if (fork_wait_time < 0) + fork_wait_time = 0; + break; + case 'l': + Log_Print = 1; + break; + case 'v': + Debug_Print = 1; + break; + case 'n': + detach = false; + break; + case 'h': + usage(); + exit(0); + break; + default: + usage(); + exit(1); + } + } + PGR_Data_Path = r_path; + if (w_path == NULL) + { + PGR_Write_Path = PGR_Data_Path; + } + else + { + PGR_Write_Path = w_path; + } + + if (optind == (argc-1) && !strncasecmp(argv[optind],"stop",4)) + { + stop_pgreplicate(); + exit(0); + } + else if (optind == argc) + { + if (is_exist_pid_file()) + { + fprintf(stderr,"pid file %s/%s found. 
is another pgreplicate running?", PGR_Write_Path, PGREPLICATE_PID_FILE); + exit(1); + } + } + else if (optind < argc) + { + usage(); + exit(1); + } + + if (detach) + { + daemonize(); + } + + PGR_Under_Replication_Server = true; + write_pid_file(); + pgreplicate_pid = getpid(); + + PGRsignal(SIGINT, set_exit_processing); + PGRsignal(SIGQUIT, set_exit_processing); + PGRsignal(SIGTERM, set_exit_processing); + PGRsignal(SIGCHLD, child_wait); + PGRsignal(SIGPIPE, SIG_IGN); + + if (PGRget_Conf_Data(PGR_Data_Path) != STATUS_OK) + { + show_error("%s:PGRget_Conf_Data error",func); + PGRreplicate_exit(0); + } + if (PGRinit_recovery() != STATUS_OK) + { + show_error("%s:PGRinit_recovery error",func); + PGRreplicate_exit(0); + } + if (PGRload_replication_id() != STATUS_OK) + { + show_error("%s:PGRload_replication_id error",func); + PGRreplicate_exit(0); + } + + if ( PGR_Use_Replication_Log == true ) + { +#ifdef PRINT_DEBUG + show_debug("Use Replication Log. Start PGR_RLog_Main()"); +#endif + rlog_pid = PGR_RLog_Main(); + if (rlog_pid < 0) + { + show_error("%s:PGR_RLog_Main failed",func); + PGRreplicate_exit(0); + } + } + + /* + * fork recovery process + */ + PGRrecovery_main(fork_wait_time); + + /* + * fork lifecheck process + */ + PGRlifecheck_main(fork_wait_time); + + /* + * call replicate module + */ + Replicateion_Log->r_log_sock =-1; + + if (fork_wait_time > 0) { +#ifdef PRINT_DEBUG + show_debug("replicate process: wait fork(): pid = %d", getpid()); +#endif + sleep(fork_wait_time); + } + + replicate_main(); + + PGRreplicate_exit(0); + return STATUS_OK; +} + +static void +set_exit_processing(int signo) +{ + exit_signo = signo; + exit_processing = true; + PGRsignal(signo, SIG_IGN); +} + diff -aruN postgresql-8.2.4/src/pgcluster/pgrp/pgreplicate.conf.sample pgcluster-1.7.0rc7/src/pgcluster/pgrp/pgreplicate.conf.sample --- postgresql-8.2.4/src/pgcluster/pgrp/pgreplicate.conf.sample 1970-01-01 01:00:00.000000000 +0100 +++ 
pgcluster-1.7.0rc7/src/pgcluster/pgrp/pgreplicate.conf.sample 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,113 @@ +#============================================================= +# PGReplicate configuration file +#------------------------------------------------------------- +# file: pgreplicate.conf +#------------------------------------------------------------- +# This file controls: +# o which hosts & port are cluster server +# o which port use for replication request from cluster server +#============================================================= +# +#------------------------------------------------------------- +# A setup of Cluster DB(s) +# +# o Host_Name : The host name of Cluster DB. +# Please write a host name by FQDN. +# DO NOT write IP address. +# o Port : The connection port with postmaster. +# o Recovery_Port : The connection port at the time of +# a recovery sequence. +#------------------------------------------------------------- +# +# master.pgcluster.org +# 5432 +# 7001 +# +# +# clusterdb2.pgcluster.org +# 5432 +# 7001 +# +# +# cluster3.pgcluster.org +# 5432 +# 7001 +# +# +#------------------------------------------------------------- +# A setup of Load Balance Server +# +# o Host_Name : The host name of a load balance server. +# Please write a host name by FQDN or IP address. +# o Recovery_Port : The connection port at the time of +# a recovery sequence . +#------------------------------------------------------------- +# +# loadbalancer.pgcluster.org +# 6001 +# +# +#------------------------------------------------------------ +# A setup of the cascade connection between replication servers. +# When you do not use RLOG recovery, you can skip this setup +# +# o Host_Name : The host name of the upper replication server. +# Please write a host name by FQDN or IP address. +# o Port : The connection port with postmaster. +# o Recovery_Port : The connection port at the time of +# a recovery sequence . 
 +#------------------------------------------------------------ +# +# upper_replicate.pgcluster.org +# 8002 +# 8102 +# +# +#------------------------------------------------------------- +# A setup of a replication server +# +# o Host_Name : The host name of this replication server. +# Please write a host name by FQDN or IP address. +# o Replication_Port : Connection port for replication +# o Recovery_Port : Connection port for recovery +# o RLOG_Port : Connection port for replication log +# o Response_Mode : Timing which returns a response +# normal -- return result of DB which received the query +# reliable -- return result after waiting for response of +# all Cluster DBs. +# o Use_Replication_Log : Use replication log +# [yes/no]. default : no +# o Replication_Timeout : Timeout of each replication response +# o Lifecheck_Timeout : Timeout of the lifecheck response +# o Lifecheck_Interval : Interval time of the lifecheck +# (range 1s - 1h) +# 10s -- 10 seconds +# 10min -- 10 minutes +# 1h -- 1 hour +#------------------------------------------------------------- + replicate.pgcluster.org + 8001 + 8101 + 8301 + normal + no + 1min + 3s + 15s +#------------------------------------------------------------- +# A setup of log files +# +# o File_Name : Log file name with full path +# o File_Size : Maximum size of each log file +# Please specify in a number and unit(K or M) +# 10 -- 10 Byte +# 10K -- 10 KByte +# 10M -- 10 MByte +# o Rotate : Rotation times +# If specified 0, old versions are removed. 
+#------------------------------------------------------------- + + /tmp/pgreplicate.log + 1M + 3 + diff -aruN postgresql-8.2.4/src/pgcluster/pgrp/pgreplicate.h pgcluster-1.7.0rc7/src/pgcluster/pgrp/pgreplicate.h --- postgresql-8.2.4/src/pgcluster/pgrp/pgreplicate.h 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pgrp/pgreplicate.h 2007-03-01 16:27:56.000000000 +0100 @@ -0,0 +1,425 @@ +/*-------------------------------------------------------------------- + * FILE: + * pgreplicate.h + * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + *-------------------------------------------------------------------- + */ +#ifndef PGREPLICATE_H +#define PGREPLICATE_H + +#define PGREPLICATE_VERSION "1.7.0rc7" + +#include "lib/dllist.h" +#include "lib/stringinfo.h" +#include "../libpgc/libpgc.h" + +/* cascade packet id */ +#define CMD_SYS_CASCADE 'C' +#define CMD_STS_TO_UPPER 'U' +#define CMD_STS_TO_LOWER 'L' +#define CMD_TYPE_ADD 'A' +#define CMD_TYPE_DELTE 'D' +#define CMD_TYPE_UPDATE_ALL 'A' + +/* log packet id */ +#define CMD_SYS_LOG 'L' +#define CMD_STS_DELETE_QUERY 'q' +#define CMD_STS_DELETE_TRANSACTION 't' +#define CMD_STS_UPDATE_QUERY 'r' +#define CMD_STS_UPDATE_TRANSACTION 'u' + +#define INIT_TRANSACTION_TBL_NUM (12) +#define FILENAME_MAX_LENGTH (256) +#define MAX_DB_SERVER (32) +#define MAX_CONNECTIONS (128) +#define MAX_QUEUE_FILE_SIZE (0x40000000) +#define PGR_MAX_TICKETS (0x7FFFFFFF) +#define PGR_MAX_QUERY_ID (0x7FFFFFFF) +#define PGR_CONNECT_RETRY_TIME (3) +#define PGR_EXEC_RETRY_TIME (5) +#define DB_TBL_FREE (0) +#define DB_TBL_INIT (1) +#define DB_TBL_USE (2) +#define DB_TBL_ERROR (-1) +#define DB_TBL_TOP (10) +#define DB_TBL_END (11) +#define RECOVERY_FILE_MTYPE (1) +#define QUERY_LOG_MTYPE (2) +#define PGREPLICATE_CONF_FILE "pgreplicate.conf" +#define PGREPLICATE_LOG_FILE "pgreplicate.log" +#define PGREPLICATE_STATUS_FILE "pgreplicate.sts" +#define PGREPLICATE_PID_FILE "pgreplicate.pid" +#define PGREPLICATE_RID_FILE 
"pgreplicate.rid" +#define RECOVERY_QUEUE_FILE "pgr_recovery" +/* setup data tag of the configuration file */ +#define CLUSTER_SERVER_TAG "Cluster_Server_Info" +#define LOAD_BALANCE_SERVER_TAG "LoadBalance_Server_Info" +#define REPLICATE_PORT_TAG "Replication_Port" +#define RECOVERY_PORT_TAG "Recovery_Port" +#define LIFECHECK_PORT_TAG "LifeCheck_Port" +#define RLOG_PORT_TAG "RLOG_Port" +#define RESPONSE_MODE_TAG "Response_Mode" +#define RESPONSE_MODE_FAST "fast" +#define RESPONSE_MODE_NORMAL "normal" +#define RESPONSE_MODE_RELIABLE "reliable" +#define USE_REPLICATION_LOG_TAG "Use_Replication_Log" +#define RESERVED_CONNECTIONS_TAG "Reserved_Connections" +/* semapho numner of recovery queue */ +#define SEM_NUM_OF_RECOVERY (1) +#define SEM_NUM_OF_RECOVERY_QUEUE (2) +/* semapho numner of lock tickets */ +#define SEM_NUM_OF_LOCK (1) +#define STATUS_LOCK_CONFLICT (2) +#define STATUS_DEADLOCK_DETECT (3) +#define STATUS_ABORTED (4) +#define STATUS_NOT_YET_REPLICATE (5) +#define STATUS_ALREADY_REPLICATED (6) +#define STATUS_SKIP_REPLICATE (7) +#define PGR_NOWAIT_ANSWER (0) +#define PGR_WAIT_ANSWER (1) +#define LOOP_CONTINUE (0) +#define LOOP_END (1) +#define LOWER_CASCADE (1) +#define UPPER_CASCADE (2) +#define ALL_CASCADE (3) +#define NOTICE_SYSTEM_CALL_TYPE (10) +#define RECOVERY_QUERY_TYPE (20) + +#define PGR_TIME_OUT (60) +#define PGR_SEND_RETRY_CNT (100) +#define PGR_SEND_WAIT_MSEC (500) +#define PGR_RECV_RETRY_CNT (100) +#define PGR_RECV_WAIT_MSEC (500) +#define PGR_SEM_UNLOCK_WAIT_MSEC (100) +#define PGR_SEM_LOCK_WAIT_MSEC (500) +#define PGR_RECOVERY_RETRY_CNT (6000) +#define PGR_RECOVERY_WAIT_MSEC (500) +#define PGR_CHECK_POINT (300) + +#define PGR_RECOVERY_1ST_STAGE (1) +#define PGR_RECOVERY_2ND_STAGE (2) + +#define IDLE_MODE (0) +#define BUSY_MODE (1) + +/* + * connection table for transaction query + */ +typedef struct { + int useFlag; + int lock; + int transaction_count; + unsigned short port; + unsigned short pid; + unsigned int hostIP; + unsigned int 
srcHostIP; + char host[HOSTNAME_MAX_LENGTH]; + char srcHost[HOSTNAME_MAX_LENGTH]; + char dbName[DBNAME_MAX_LENGTH]; + PGconn * conn; + bool in_transaction; + bool exec_copy; +}TransactionTbl; + +/* + * cluster server table + */ +typedef struct { + int useFlag; + char hostName[HOSTNAME_MAX_LENGTH]; + char resolvedName[24]; + int port; + int recoveryPort; + int hostNum; + int transaction_count; + int retry_count; +}HostTbl; + + +typedef struct { + FILE * queue_fp; + int current_queue_no; +} RecoveryQueueInf; + + +/* + * host table for recovery request + */ +typedef struct { + char hostName[HOSTNAME_MAX_LENGTH]; + char resolvedName[24]; + int port; + int recoveryPort; + int sock; + int recovery_sock; +} RecoveryTbl; + +/* + * status table for recovery + */ +typedef struct { + int useFlag; + int transaction_count; + int recovery_status; + unsigned int replication_id; + HostTbl target_host; + int read_queue_no; + int write_queue_no; + int check_point; + unsigned int file_size; + char write_file[FILENAME_MAX_LENGTH]; + char read_file[FILENAME_MAX_LENGTH]; +} RecoveryStatusInf; + +typedef struct { + long mtype; + char mdata[1]; +} RecoveryQueueFile; + +typedef struct { + long mtype; + unsigned int replicationId; + char mdata[1]; +} RecoveryQueueQuery; + +typedef struct { + unsigned int entry_ticket; + unsigned int lock_wait_queue_length; + int overflow; +} LockWaitInf; + +typedef struct { + int response_mode; + int current_cluster; +} ResponseInf; + +typedef struct { + ReplicateHeader * header; + char * query; + char * next; + char * last; +} QueryLogType; + +typedef struct { + ReplicateServerInfo * top; + ReplicateServerInfo * end; + ReplicateServerInfo * lower; + ReplicateServerInfo * upper; + ReplicateServerInfo * myself; + int useFlag; +} CascadeInf; + +typedef struct { + union + { + int useFlag; + int commit_log_num; + } inf; + ReplicateHeader header; +} CommitLogInf; + +typedef struct { + int useFlag; + char * RLog_Sock_Path; + uint16_t RLog_Port_Number; + int 
r_log_sock; + ReplicateHeader * header; + char * query; +} ReplicationLogInf; + +typedef struct { + char hostName[HOSTNAME_MAX_LENGTH]; + uint16_t port; + uint16_t pid; + uint32_t request_id; +} QueryLogID; + +typedef struct { + QueryLogID query_log_id; + char * last; + char * next; +} ConfirmQueryList; + +typedef struct { + ReplicateHeader * header; + char * query; + int dest; + int current_cluster; + int transaction_count; + HostTbl * host_ptr; + TransactionTbl *transaction_tbl; +} ThreadArgInf; + +/* replication server data */ +extern char * ResolvedName; +extern uint16_t Port_Number; +extern uint16_t LifeCheck_Port_Number; +extern uint16_t Recovery_Port_Number; +extern int Reserved_Connections; +extern bool PGR_Parse_Session_Started; +extern int PGR_Replication_Timeout; + +/* global tables */ +extern HostTbl * Host_Tbl_Begin; +extern Dllist * Transaction_Tbl_Begin; +extern TransactionTbl * Transaction_Tbl_End; +extern RecoveryTbl * LoadBalanceTbl; +extern RecoveryStatusInf * Recovery_Status_Inf; +extern LockWaitInf * Lock_Wait_Tbl; +extern ReplicateHeader * PGR_Log_Header; +extern ReplicateServerInfo * Cascade_Tbl; +extern CascadeInf * Cascade_Inf; +extern CommitLogInf * Commit_Log_Tbl; +extern QueryLogType * Query_Log_Top; +extern QueryLogType * Query_Log_End; +extern ReplicationLogInf * Replicateion_Log; +extern int RecoveryShmid; +extern int ReplicateSerializationShmid; +extern int RecoveryMsgShmid; +extern int *RecoveryMsgid; +extern int HostTblShmid; +extern int LockWaitTblShmid; +extern int CascadeTblShmid; +extern int CascadeInfShmid; +extern int CommitLogShmid; +extern int MaxBackends; +extern char * PGR_Result; +extern int SemID; +extern int RecoverySemID; +extern int CascadeSemID; +extern int LockSemID; +extern int VacuumSemID; +extern char * PGR_Data_Path; +extern char * PGR_Write_Path; +extern FILE * LogFp; +extern FILE * StatusFp; +extern FILE * RidFp; +extern FILE * QueueFp; +extern int Log_Print; +extern int Debug_Print; +extern char * Function; 
+extern int IS_SESSION_AUTHORIZATION; +extern ResponseInf * PGR_Response_Inf; +extern bool StartReplication[MAX_DB_SERVER]; +extern bool PGR_Cascade; +extern bool PGR_Use_Replication_Log; +extern bool PGR_AutoCommit; +extern unsigned int * PGR_ReplicateSerializationID; +extern unsigned int * PGR_Send_Query_ID; +extern unsigned int PGR_Query_ID; +extern volatile bool exit_processing; +extern RecoveryQueueInf RecoveryQueue; +extern int pgreplicate_pid; +extern char * PGRuserName; +extern int exit_signo; + +extern int ReplicateSock; + +/* smart shutdown */ +extern int Idle_Flag; +extern volatile bool Exit_Request; + +/* + * external prototype in main.c + */ +extern void child_wait(SIGNAL_ARGS); + +/* + * external prototype in conf.c + */ +extern int PGRget_Conf_Data(char * path); + +/* + * external prototype in replicate.c + */ +extern int PGRset_replication_id(uint32_t rid); +extern bool PGRis_same_host(char * host1, unsigned short port1 , char * host2, unsigned short port2); +extern HostTbl * PGRadd_HostTbl(HostTbl * conf_data, int useFlag); +extern HostTbl * PGRget_master(void); +extern void PGRset_recovery_status(int status); +extern int PGRget_recovery_status(void); +extern int PGRcheck_recovered_host(void); +extern int PGRset_recovered_host(HostTbl * target,int useFlag); +extern int PGRinit_recovery(void); +extern void PGRexit_subprocess(int signo); +extern void PGRreplicate_exit(int exit_status); +extern int PGRsend_replicate_packet_to_server( HostTbl * host_ptr, ReplicateHeader * header, char *query , char * result,unsigned int replicationId, bool recovery); +extern int PGRreplicate_packet_send_each_server( HostTbl * ptr, bool return_response, ReplicateHeader * header, char * query,int dest); +extern HostTbl * PGRget_HostTbl(char * hostName,int port); +extern int PGRset_queue(ReplicateHeader * header,char * query); +extern int PGRset_host_status(HostTbl * host_ptr,int status); +extern void PGRclear_connections(void); +extern void 
PGRdestroy_transaction_table(void); +extern void PGRsem_unlock( int semid, short sem_num ); +extern void PGRsem_lock( int semid, short sem_num ); +extern int PGRdo_replicate(int sock,ReplicateHeader *header, char * query); +extern int PGRreturn_result(int dest, char * result, int wait); +extern int PGRreplicate_packet_send( ReplicateHeader * header, char * query,int dest,int recovery_status); +extern char * PGRread_packet(int sock, ReplicateHeader *header); +extern void PGRnotice_replication_server(char * hostName, unsigned short portNumber,unsigned short recoveryPortNumber, unsigned short lifecheckPortNumber, char * userName); +extern char * PGRread_query(int sock, ReplicateHeader *header); +extern int PGRsync_oid(ReplicateHeader *header); +extern unsigned int PGRget_next_query_id(void); +extern int PGRinit_transaction_table(void); +extern int replicate_packet_send_internal(ReplicateHeader * header, char * query,int dest,int recovery_status,bool isHeldLock); +extern int PGRsync_oid(ReplicateHeader *header); +extern int PGRload_replication_id(void); +extern PGconn * PGRcreateConn( char * host, char * port,char * database, char * userName, char * password, char * md5Salt, char * cryptSalt ); +/* + * external prototype in recovery.c + */ +extern int PGRsend_load_balance_packet(RecoveryPacket * packet); +extern void PGRrecovery_main(int fork_wait_time); +extern FILE * PGRget_recovery_queue_file_for_write(void); +extern FILE * PGRget_recovery_queue_file_for_read(int next); + +/* + * external prototype in rlog.c + */ +extern int PGRwrite_rlog(ReplicateHeader * header, char * query); +extern ReplicateHeader * PGRget_requested_query(ReplicateHeader * header); +extern void PGRreconfirm_commit(int sock, ReplicateHeader * header); +extern void PGRset_rlog(ReplicateHeader * header, char * query); +extern void PGRunset_rlog(ReplicateHeader * header, char * query); +extern int PGRresend_rlog_to_db(void); +extern void PGRreconfirm_query(int sock, ReplicateHeader * header); 
+extern pid_t PGR_RLog_Main(void); +extern int PGRcreate_send_rlog_socket(void); +extern int PGRsend_rlog_packet(int sock,ReplicateHeader * header, const char * query_string); +extern int PGRrecv_rlog_result(int sock,void * result, int size); +extern int PGRsend_rlog_to_local(ReplicateHeader * header,char * query); +extern int PGRget_rlog_header(ReplicateHeader * header); + +/* + * external prototype in cascade.c + */ +extern int PGRstartup_cascade(void); +extern int PGRsend_lower_cascade(ReplicateHeader * header, char * query); +extern int PGRsend_upper_cascade(ReplicateHeader * header, char * query); +extern int PGRwait_answer_cascade(int sock); +extern ReplicateServerInfo * PGRget_lower_cascade(void); +extern ReplicateServerInfo * PGRget_upper_cascade(void); +extern void PGRset_cascade_server_status(ReplicateServerInfo * cascade, int status); +extern ReplicateServerInfo * PGRrecv_cascade_answer(ReplicateServerInfo * cascade,ReplicateHeader * header); +extern int PGRsend_cascade(int sock , ReplicateHeader * header, char * query); +extern int PGRcascade_main(int sock, ReplicateHeader * header, char * query); +extern int PGRwait_notice_rlog_done(void); +extern int PGRsend_notice_rlog_done(int sock); +extern int PGRsend_notice_quit(void); + +/* + * external prototype in pqformat.c + */ +extern const char * pq_getmsgstring(StringInfo msg); +extern unsigned int pq_getmsgint(StringInfo msg, int b); +extern void pq_copymsgbytes(StringInfo msg, char *buf, int datalen); +extern const char * pq_getmsgbytes(StringInfo msg, int datalen); +extern int pq_getmsgbyte(StringInfo msg); + +/* + * external prototype in lifecheck.c + */ +extern int PGRlifecheck_main(int fork_wait_time); + +#endif /* PGREPLICATE_H */ diff -aruN postgresql-8.2.4/src/pgcluster/pgrp/pqformat.c pgcluster-1.7.0rc7/src/pgcluster/pgrp/pqformat.c --- postgresql-8.2.4/src/pgcluster/pgrp/pqformat.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pgrp/pqformat.c 2007-02-18 
22:52:17.000000000 +0100 @@ -0,0 +1,173 @@ +/*------------------------------------------------------------------------- + * pqformat.c + * Routines for formatting and parsing frontend/backend messages + * + * These routines were copied from src/backend/libpq/pqformat.c. + * The original routines depend on shared backend modules and macros, + * which makes them difficult to link into the replication server. + * Therefore, these routines were customized + * (the shared modules and macros were removed). + * + * Original source code is under the following copyright + * + * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * Message parsing after input: + * pq_getmsgbyte - get a raw byte from a message buffer + * pq_getmsgint - get a binary integer from a message buffer + * pq_getmsgbytes - get raw data from a message buffer + * pq_copymsgbytes - copy raw data from a message buffer + * pq_getmsgstring - get a null-terminated text string (no conversion) + */ + +/* -------------------------------- + * pq_getmsgstring - get a null-terminated text string + * + * Returns a pointer directly into the message buffer; this customized + * version performs no encoding conversion and returns NULL on error. 
+ * -------------------------------- + */ + +#include "postgres.h" +#include +#include +#include +#include +#include +#ifdef HAVE_ENDIAN_H +#include +#endif + +#include "mb/pg_wchar.h" + +#include "libpq-fe.h" +#include "libpq-int.h" +#include "fe-auth.h" +#include "replicate_com.h" +#include "pgreplicate.h" + +const char * pq_getmsgstring(StringInfo msg); +unsigned int pq_getmsgint(StringInfo msg, int b); +void pq_copymsgbytes(StringInfo msg, char *buf, int datalen); +const char * pq_getmsgbytes(StringInfo msg, int datalen); +int pq_getmsgbyte(StringInfo msg); + +const char * +pq_getmsgstring(StringInfo msg) +{ + char *str; + int slen; + + if (msg == NULL) + { + return NULL; + } + str = &msg->data[msg->cursor]; + /* + * It's safe to use strlen() here because a StringInfo is guaranteed to + * have a trailing null byte. But check we found a null inside the + * message. + */ + slen = strlen(str); + if (msg->cursor + slen >= msg->len) + { + return NULL; + } + msg->cursor += slen + 1; + + return str; +} + + +/* -------------------------------- + * pq_getmsgint - get a binary integer from a message buffer + * + * Values are treated as unsigned. + * -------------------------------- + */ +unsigned int +pq_getmsgint(StringInfo msg, int b) +{ + unsigned int result; + unsigned char n8; + uint16 n16; + uint32 n32; + + switch (b) + { + case 1: + pq_copymsgbytes(msg, (char *) &n8, 1); + result = n8; + break; + case 2: + pq_copymsgbytes(msg, (char *) &n16, 2); + result = ntohs(n16); + break; + case 4: + pq_copymsgbytes(msg, (char *) &n32, 4); + result = ntohl(n32); + break; + default: + result = 0; /* keep compiler quiet */ + break; + } + return result; +} + +/* -------------------------------- + * pq_copymsgbytes - copy raw data from a message buffer + * + * Same as above, except data is copied to caller's buffer. 
+ * -------------------------------- + */ +void +pq_copymsgbytes(StringInfo msg, char *buf, int datalen) +{ + if (datalen < 0 || datalen > (msg->len - msg->cursor)) + { + return; + } + memcpy(buf, &msg->data[msg->cursor], datalen); + msg->cursor += datalen; +} + + +/* -------------------------------- + * pq_getmsgbytes - get raw data from a message buffer + * + * Returns a pointer directly into the message buffer; note this + * may not have any particular alignment. + * -------------------------------- + */ +const char * +pq_getmsgbytes(StringInfo msg, int datalen) +{ + const char *result; + + if (datalen < 0 || datalen > (msg->len - msg->cursor)) + { + return NULL; + } + result = &msg->data[msg->cursor]; + msg->cursor += datalen; + return result; +} + +/* -------------------------------- + * pq_getmsgbyte - get a raw byte from a message buffer + * -------------------------------- + */ +int +pq_getmsgbyte(StringInfo msg) +{ + if (msg->cursor >= msg->len) + { + return 0; + } + return (unsigned char) msg->data[msg->cursor++]; +} diff -aruN postgresql-8.2.4/src/pgcluster/pgrp/recovery.c pgcluster-1.7.0rc7/src/pgcluster/pgrp/recovery.c --- postgresql-8.2.4/src/pgcluster/pgrp/recovery.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pgrp/recovery.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,1143 @@ +/*-------------------------------------------------------------------- + * FILE: + * recovery.c + * + * NOTE: + * This file is composed of the functions to call with the source + * at pgreplicate for the recovery. 
+ * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + *-------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_SYS_SELECT_H +#include +#endif + +#ifdef HAVE_CRYPT_H +#include +#endif + +#include "miscadmin.h" +#include "nodes/nodes.h" + +#include "libpq-fe.h" +#include "libpq/libpq-fs.h" +#include "libpq-int.h" +#include "fe-auth.h" + +#include "access/xact.h" +#include "replicate_com.h" +#include "pgreplicate.h" + + +#ifdef WIN32 +#include "win32.h" +#else +#ifdef HAVE_NETINET_TCP_H +#include +#endif +#include +#endif + +#ifdef HAVE_CRYPT_H +#include +#endif + +#ifdef MULTIBYTE +#include "mb/pg_wchar.h" +#endif +#include "pgreplicate.h" + + +/*-------------------------------------- + * GLOBAL VARIABLE DECLARATION + *-------------------------------------- + */ +RecoveryPacket MasterPacketData; +RecoveryTbl Master; +RecoveryTbl Target; + + +/*-------------------------------------- + * PROTOTYPE DECLARATION + *-------------------------------------- + */ +static int read_packet(int sock,RecoveryPacket * packet); +static int read_packet_from_master( RecoveryTbl * host, RecoveryPacket * packet ); +static int send_recovery_packet(int sock, RecoveryPacket * packet); +static int send_packet(RecoveryTbl * host, RecoveryPacket * packet ); +static void start_recovery_prepare(void); +static void reset_recovery_prepare(void); +static void finish_recovery(void); +static bool first_setup_recovery(int * sock, RecoveryPacket * packet); +static int wait_transaction_count_clear(void); +static bool second_setup_recovery (RecoveryPacket * packet); +static void pgrecovery_loop(int fd); +static int PGRsend_queue(RecoveryTbl * master, RecoveryTbl * target); +static int send_vacuum(HostTbl *host, char * userName, int stage); +static char * 
read_queue_file(FILE * fp, ReplicateHeader * header, char * query); + +#ifdef PRINT_DEBUG +static void show_recovery_packet(RecoveryPacket * packet); +#endif + +int PGRsend_load_balance_packet(RecoveryPacket * packet); +void PGRrecovery_main(int fork_wait_time); + +/*----------------------------------------------------------- + * SYMBOL + * read_packet() + * NOTES + * Read recovery packet data + * ARGS + * int sock : socket + * RecoveryPacket * packet : read packet buffer + * RETURN + * -1 : error + * >0 : read size + *----------------------------------------------------------- + */ +static int +read_packet(int sock,RecoveryPacket * packet) +{ +#ifdef PRINT_DEBUG + char * func = "read_packet()"; +#endif + int r = 0; + char * read_ptr = NULL; + int read_size = 0; + int packet_size = 0; + + if (packet == NULL) + { + return -1; + } + read_ptr = (char*)packet; + packet_size = sizeof(RecoveryPacket); + for (;;) + { + r = recv(sock,read_ptr + read_size ,packet_size - read_size, MSG_WAITALL); + if (r < 0) + { + if (errno == EINTR || errno == EAGAIN) + continue; + else + { + show_error("%s:recv failed: %d(%s)",func, errno, strerror(errno)); + return -1; + } + } + else if (r > 0) + { + read_size += r; + if (read_size == packet_size) + { +#ifdef PRINT_DEBUG + show_debug("%s:receive packet",func); + show_recovery_packet(packet); +#endif + return read_size; + } + } + else /* r == 0 */ + { + show_error("%s:unexpected EOF", func); + return -1; + } + } + return -1; +} + +static int +read_packet_from_master( RecoveryTbl * host, RecoveryPacket * packet ) +{ + int read_size = 0; + int rtn; + fd_set rmask; + struct timeval timeout; + + for(;;) + { + timeout.tv_sec = RECOVERY_TIMEOUT; + timeout.tv_usec = 0; + + /* + * Wait for something to happen. 
+ */ + FD_ZERO(&rmask); + FD_SET(host->recovery_sock,&rmask); + rtn = select(host->recovery_sock+1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout); + + if (rtn == 0) /* timeout */ + { + return -1; + } + + if (rtn && FD_ISSET(host->recovery_sock, &rmask)) + { + read_size = read_packet(host->recovery_sock, packet); + return read_size; + } + } +} + +static int +send_recovery_packet(int sock, RecoveryPacket * packet) +{ + char *func = "send_recovery_packet"; + char * send_ptr; + int send_size= 0; + int buf_size = 0; + int s; + + send_ptr = (char *)packet; + buf_size = sizeof(RecoveryPacket); + + for (;;) + { + s = send(sock, send_ptr + send_size,buf_size - send_size ,0); + if (s < 0) + { + if (errno == EINTR || errno == EAGAIN) + continue; + + show_error("%s:send error: %d(%s)", func, errno, strerror(errno)); + return STATUS_ERROR; + } + else if (s == 0) + { + show_error("%s:unexpected EOF", func); + return STATUS_ERROR; + } + + send_size += s; + if (send_size == buf_size) + return STATUS_OK; + } +} + +static int +send_packet(RecoveryTbl * host, RecoveryPacket * packet ) +{ + char * func = "send_packet()"; + int count = 0; + + if (host->recovery_sock == -1) + { + while(PGR_Create_Socket_Connect(&(host->recovery_sock), host->hostName , host->recoveryPort) != STATUS_OK ) + { + if (count > MAX_RETRY_TIMES ) + { + show_error("%s:host[%s] port[%d]PGR_Create_Socket_Connect failed",func,host->hostName, host->recoveryPort); + return STATUS_ERROR; + } + count ++; + } + } + count = 0; + while (send_recovery_packet(host->recovery_sock,packet) != STATUS_OK) + { + close(host->recovery_sock); + host->recovery_sock = -1; + PGR_Create_Socket_Connect(&(host->recovery_sock), host->hostName , host->recoveryPort); +#ifdef PRINT_DEBUG + show_debug("%s:PGR_Create_Socket_Connectsock[%d] host[%s] port[%d]", + func,host->recovery_sock,host->hostName,host->recoveryPort); +#endif + if (count > PGR_CONNECT_RETRY_TIME ) + { + + show_error("%s:send failed and PGR_Create_Socket_Connect 
failed",func); + return STATUS_ERROR; + } + count ++; + } + return STATUS_OK; +} + +static void +start_recovery_prepare(void) +{ + PGRset_recovery_status (RECOVERY_PREPARE_START); +} + +static void +reset_recovery_prepare(void) +{ + PGRset_recovery_status (RECOVERY_INIT); +} + +static void +finish_recovery(void) +{ + PGRset_recovery_status (RECOVERY_INIT); +} + +int +PGRsend_load_balance_packet(RecoveryPacket * packet) +{ + char * func = "PGRsend_load_balance_packet()"; + RecoveryTbl * lbp; + int status; + + lbp = LoadBalanceTbl; + if (lbp == (RecoveryTbl *)NULL) + { + show_error("%s:recovery table is NULL",func); + return STATUS_ERROR; + } + while (lbp->hostName[0] != 0) + { + if (lbp->recovery_sock != -1) + { + close(lbp->recovery_sock); + lbp->recovery_sock = -1; + } +#ifdef PRINT_DEBUG + show_debug("%s:host[%s] port[%d]",func,lbp->hostName,lbp->recoveryPort); +#endif + status = send_packet(lbp,packet); + if (lbp->recovery_sock != -1) + { + close(lbp->recovery_sock); + lbp->recovery_sock = -1; + } + lbp ++; + } + return STATUS_OK; +} + +static int +send_vacuum(HostTbl *host, char * userName, int stage) +{ + int rtn = STATUS_OK; + ReplicateHeader header; + char * query = NULL; + + if (stage == PGR_RECOVERY_1ST_STAGE) + { + query = strdup("VACUUM"); + } + else + { + query = strdup("VACUUM FULL"); + } + memset(&header,0,sizeof(header)); + header.query_size = strlen(query) + 1; + strncpy(header.dbName,"template1",sizeof(header.dbName)); + strncpy(header.userName,userName,sizeof(header.userName)); + header.cmdSys = CMD_SYS_REPLICATE; + header.cmdSts = CMD_STS_QUERY; + header.cmdType = CMD_TYPE_VACUUM; + header.pid = getpid(); + header.query_id = getpid(); + header.isAutoCommit=1; + rtn = PGRsend_replicate_packet_to_server(host,&header,query,PGR_Result,0, true); + if (query !=NULL) + free(query); + return rtn; +} + +static bool +first_setup_recovery(int * sock, RecoveryPacket * packet) +{ + char * func = "first_setup_recovery()"; + int status; + HostTbl * master = 
(HostTbl *)NULL; + bool loop_end = false; + HostTbl host_tbl; + char * userName = NULL; + int ip; + + memset(Target.hostName,0,sizeof(Target.hostName)); + strncpy(Target.hostName,packet->hostName,sizeof(Target.hostName)); + ip = PGRget_ip_by_name(Target.hostName); + sprintf(Target.resolvedName, + "%d.%d.%d.%d", + (ip ) & 0xff , + (ip >> 8) & 0xff , + (ip >> 16) & 0xff , + (ip >> 24) & 0xff ); + Target.port = ntohs(packet->port); + Target.recoveryPort = ntohs(packet->recoveryPort); + Target.sock = *sock; + Target.recovery_sock = *sock; +#ifdef PRINT_DEBUG + show_debug("%s:1st setup target %s",func,Target.hostName); + show_debug("%s:1st setup port %d",func,Target.port); +#endif + /* + * check another recovery process + */ + if (PGRget_recovery_status() != RECOVERY_INIT) + { + /* + * recovery process is already running + */ +#ifdef PRINT_DEBUG + show_debug("%s:already recovery job runing",func); +#endif + memset(packet,0,sizeof(packet)); + PGRset_recovery_packet_no(packet, RECOVERY_ERROR_OCCUPIED) ; + status = send_packet(&Target,packet); + loop_end = true; + return loop_end; + } + /* + * add recovery target to host table + */ +#ifdef PRINT_DEBUG + show_debug("%s:add recovery target to host table",func); +#endif + memcpy(host_tbl.hostName,Target.hostName,sizeof(host_tbl.hostName)); + memcpy(host_tbl.resolvedName,Target.resolvedName,sizeof(host_tbl.resolvedName)); + host_tbl.port = Target.port; + host_tbl.recoveryPort = Target.recoveryPort; + PGRset_recovered_host(&host_tbl,DB_TBL_INIT); + PGRadd_HostTbl(&host_tbl,DB_TBL_INIT); + /* + * send prepare recovery to load balancer + */ + PGRsend_load_balance_packet(packet); + userName = strdup(packet->userName); + + /* + * set RECOVERY_PGDATA_REQ packet data + */ +#ifdef PRINT_DEBUG + show_debug("%s:set RECOVERY_PGDATA_REQ packet data",func); +#endif + memset(packet,0,sizeof(RecoveryPacket)); + PGRset_recovery_packet_no(packet, RECOVERY_PGDATA_REQ ); + +retry_connect_master: + master = PGRget_master(); + if (master == 
(HostTbl *)NULL) + { + /* + * connection error , master may be down + */ + show_error("%s:get master info error , master may be down",func); + PGRset_recovery_packet_no(packet, RECOVERY_ERROR_TARGET_ONLY); + status = send_packet(&Target, packet); + reset_recovery_prepare(); + loop_end = true; + if (userName != NULL) + free(userName); + return loop_end; + } + /* send vauum command to master server */ + status = send_vacuum(master, userName, PGR_RECOVERY_1ST_STAGE ); + if (status != STATUS_OK) + { + PGRset_host_status(master, DB_TBL_ERROR); + goto retry_connect_master; + } + + memcpy(Master.hostName,master->hostName,sizeof(Master.hostName)); + memcpy(Master.resolvedName,master->resolvedName,sizeof(Master.resolvedName)); + Master.sock = -1; + Master.recovery_sock = -1; + Master.port = master->port; + Master.recoveryPort = master->recoveryPort; + +#ifdef PRINT_DEBUG + show_debug("%s:send packet to master %s recoveryPort %d",func, Master.hostName, Master.recoveryPort); +#endif + status = send_packet(&Master, packet); + if (status != STATUS_OK) + { + /* + * connection error , master may be down + */ + show_error("%s:connection error , master may be down",func); + PGRset_host_status(master,DB_TBL_ERROR); + goto retry_connect_master ; + } + + /* + * start prepare of recovery + * set recovery status to "prepare start" + * start transaction count up + */ + start_recovery_prepare(); + /* + * wait answer from master server + */ +#ifdef PRINT_DEBUG + show_debug("%s:wait answer from master server",func); +#endif + memset(packet,0,sizeof(RecoveryPacket)); + read_packet_from_master(&Master, packet); +#ifdef PRINT_DEBUG + show_debug("%s:get answer from master:no[%d]",func,ntohs(packet->packet_no)); +#endif + if (ntohs(packet->packet_no) == RECOVERY_PGDATA_ANS) + { + /* + * send a packet to load balancer that is stopped master's + * load balancing until all recovery process is finished + */ + PGRsend_load_balance_packet(packet); + memcpy((char 
*)&MasterPacketData,packet,sizeof(RecoveryPacket)); + + /* + * prepare answer from master DB + */ + PGRset_recovery_packet_no(packet, RECOVERY_PREPARE_ANS ); + memcpy(packet->hostName,Master.hostName,sizeof(packet->hostName)); + status = send_packet(&Target, packet); + if (status != STATUS_OK) + { + show_error("%s:no[%d] send_packet to target error",func,ntohs(packet->packet_no)); + PGRset_recovery_packet_no(packet, RECOVERY_ERROR_TARGET_ONLY); + status = send_packet(&Master,packet); + reset_recovery_prepare(); + loop_end = true; + } + } + if (userName != NULL) + free(userName); + + + return loop_end; +} + +static int +wait_transaction_count_clear(void) +{ + char * func ="wait_transaction_count_clear()"; + HostTbl * master = (HostTbl *)NULL; + int cnt = 0; + int recovery_status = PGRget_recovery_status(); + + while (recovery_status != RECOVERY_CLEARED) + { + master = PGRget_master(); + if (master == (HostTbl *)NULL) + { + show_error("%s:get master info error , master may be down",func); + continue; + } + if ((recovery_status == RECOVERY_PREPARE_START) && + (master->transaction_count==0)) + { + PGRset_recovery_status(RECOVERY_CLEARED); + break; + } + + sleep(1); +#ifdef PRINT_DEBUG + show_debug("now, waiting clear every transaction for recovery"); +#endif + cnt ++; + if (cnt > RECOVERY_TIMEOUT * 60 ) + { + show_error("sorry, it is timeout for waiting clear transaction"); + return STATUS_ERROR; + } + recovery_status = PGRget_recovery_status(); + } + return STATUS_OK; +} + +static bool +second_setup_recovery (RecoveryPacket * packet) +{ + char * func = "second_setup_recovery()"; + HostTbl * master = (HostTbl *)NULL; + int status; + bool loop_end = false; + char * userName = NULL; + int recovery_status = 0; + + /* send vauum command to master server */ + while ((master = PGRget_master()) != NULL) + { + /* + * wait until all started transactions are going to finish + */ + status = wait_transaction_count_clear(); + if (status != STATUS_OK) + { + 
show_error("%s:transaction is too busy, please try again after",func); + PGRset_recovery_packet_no(packet, RECOVERY_ERROR_TARGET_ONLY); + status = send_packet(&Target,packet); + status = send_packet(&Master,packet); + reset_recovery_prepare(); + return true; + } + userName = strdup(packet->userName); + status = send_vacuum(master, userName, PGR_RECOVERY_2ND_STAGE ); + if (status != STATUS_OK) + { + PGRset_host_status(master, DB_TBL_ERROR); + if (userName != NULL) + { + free(userName); + userName = NULL; + } + continue; + } + break; + } + + if (master == NULL) + { + show_error("%s:vacuum error , master may be down",func); + PGRset_recovery_packet_no(packet, RECOVERY_ERROR_TARGET_ONLY); + status = send_packet(&Target,packet); + status = send_packet(&Master,packet); + reset_recovery_prepare(); + + return true; + } + + recovery_status = PGRget_recovery_status(); + if ((recovery_status != RECOVERY_PREPARE_START) && + (recovery_status != RECOVERY_WAIT_CLEAN) && + (recovery_status != RECOVERY_CLEARED)) + { + show_error("%s:queue set failed. 
stop to recovery",func); + PGRset_recovery_packet_no(packet, RECOVERY_ERROR_CONNECTION); + status = send_packet(&Target,packet); + status = send_packet(&Master,packet); + reset_recovery_prepare(); + if (userName != NULL) + free(userName); + return true; + } + + /* + * then, send fsync request to master DB + */ + PGRset_recovery_packet_no(packet, RECOVERY_FSYNC_REQ ); + status = send_packet(&Master,packet); + if (status != STATUS_OK) + { + /* + * connection error , master may be down + */ + show_error("%s:connection error , master may be down",func); + PGRset_recovery_packet_no(packet, RECOVERY_ERROR_CONNECTION); + status = send_packet(&Target,packet); + status = send_packet(&Master,packet); + reset_recovery_prepare(); + if (userName != NULL) + free(userName); + return true; + } + + recovery_status = PGRget_recovery_status(); + if ((recovery_status != RECOVERY_PREPARE_START) && + (recovery_status != RECOVERY_WAIT_CLEAN) && + (recovery_status != RECOVERY_CLEARED)) + { + show_error("%s:queue set failed. 
stop to recovery",func); + PGRset_recovery_packet_no(packet, RECOVERY_ERROR_CONNECTION); + status = send_packet(&Target,packet); + status = send_packet(&Master,packet); + reset_recovery_prepare(); + if (userName != NULL) + free(userName); + return true; + } + + /* + * wait answer from master server + */ + memset(packet,0,sizeof(RecoveryPacket)); + read_packet_from_master(&Master,packet); + if (ntohs(packet->packet_no) == RECOVERY_FSYNC_ANS ) + { + /* + * master DB finished fsync + */ + PGRset_recovery_packet_no(packet, RECOVERY_START_ANS ); + memcpy(packet->hostName,Master.hostName,sizeof(packet->hostName)); + status = send_packet(&Target,packet); + if (status != STATUS_OK) + { + finish_recovery(); + loop_end = true; + } + } + else + { + show_error("%s:failure answer returned",func); + PGRset_recovery_packet_no(packet, RECOVERY_ERROR_CONNECTION); + status = send_packet(&Target,packet); + status = send_packet(&Master,packet); + reset_recovery_prepare(); + loop_end = true; + } + if (userName != NULL) + free(userName); + return loop_end; +} + +static char * +read_queue_file(FILE * fp, ReplicateHeader * header, char *query) +{ + char * func = "read_queue_file()"; + int size = 0; + + if (fp == NULL) + { + return NULL; + } + if (fread((char*)header,sizeof(ReplicateHeader),1,fp) < 1) + { + return NULL; + } + size = ntohl(header->query_size); + if (size >= 0) + { + query = malloc(size+4); + if (query == NULL) + { + show_error("%s:malloc failed:(%s)",func,strerror(errno)); + } + memset(query,0,size+4); + if (size > 0) + { + if (fread(query,size,1,fp) < 1) + { + return NULL; + } + } + return query; + } + return NULL; +} + +/** + * send queries from queue. 
+ * + * return + * STATUS_OK - success both + * STATUS_ERROR - fail both + */ +static int +PGRsend_queue(RecoveryTbl * master, RecoveryTbl * target) +{ + char * func = "PGRsend_queue()"; + HostTbl * master_ptr = NULL; + HostTbl * target_ptr = NULL; + RecoveryQueueFile * msg = NULL; + FILE * rfp = NULL; + ReplicateHeader header; + char * query = NULL; + int size = 0; + int status = 0; + int query_size = 0; + int rtn=0; + + if (master == (RecoveryTbl *)NULL) + { + show_error("%s:there is no master ",func); + return STATUS_ERROR; + } +#ifdef PRINT_DEBUG + show_debug("%s:master %s - %d",func,master->hostName,master->port); +#endif + master_ptr = PGRget_HostTbl(master->resolvedName,master->port); + if (master_ptr == (HostTbl *)NULL) + { + show_error("%s:master table is null",func); + return STATUS_ERROR; + } + if (target != (RecoveryTbl *)NULL) + { +#ifdef PRINT_DEBUG + show_debug("%s:target %s - %d",func,target->hostName,target->port); +#endif + target_ptr = PGRget_HostTbl(target->resolvedName,target->port); + if (target_ptr == (HostTbl *)NULL) + { + show_error("%s:target table is null",func); + return STATUS_ERROR; + } + } + + size = sizeof(RecoveryQueueFile) + FILENAME_MAX_LENGTH; + msg = (RecoveryQueueFile *)malloc(size+4); + if (msg == NULL) + { +#ifdef PRINT_DEBUG + show_debug("%s:malloc() failed. 
reason: %s",func, strerror(errno)); +#endif + return STATUS_ERROR; + } + memset(msg,0,size+4); + status = STATUS_OK; + /* drain the message queue without blocking; each message names one queue file */ + while (msgrcv(*RecoveryMsgid , msg, FILENAME_MAX_LENGTH, 0, IPC_NOWAIT) > 0 ) + { + strncpy(Recovery_Status_Inf->read_file,(char *)(msg->mdata),FILENAME_MAX_LENGTH); + PGRsem_lock(RecoverySemID, SEM_NUM_OF_RECOVERY_QUEUE); + if (!strncmp(Recovery_Status_Inf->write_file,Recovery_Status_Inf->read_file,sizeof(Recovery_Status_Inf->write_file))) + { + memset(Recovery_Status_Inf->write_file,0,sizeof(Recovery_Status_Inf->write_file)); + } + PGRsem_unlock(RecoverySemID, SEM_NUM_OF_RECOVERY_QUEUE); + rfp = fopen(Recovery_Status_Inf->read_file,"r"); + if (rfp == NULL) + { + show_error("%s:queue file [%s] can not be opened:(%s)",func,Recovery_Status_Inf->read_file,strerror(errno)); + /* msg was malloc'ed by this function; release it on the error path too */ + free(msg); + return STATUS_ERROR; + } + while ((query = read_queue_file(rfp, &header,query)) != NULL) + { + query_size = ntohl(header.query_size); + if (query_size < 0) + { + if (query != NULL) + { + free(query); + query = NULL; + } + break; + } + PGR_Response_Inf->current_cluster = 0; + rtn=PGRsend_replicate_packet_to_server(master_ptr,&header,query,PGR_Result,ntohl(header.replicate_id), true); + if (target_ptr != NULL) + { + PGR_Response_Inf->current_cluster = 1; + rtn=PGRsend_replicate_packet_to_server(target_ptr,&header,query,PGR_Result,ntohl(header.replicate_id), true); + } + /* every record is a fresh malloc from read_queue_file(); free it before the next read overwrites the pointer */ + free(query); + query = NULL; + } + if (query != NULL) + { + free(query); + query = NULL; + } + if (rfp != NULL) + { + fclose(rfp); + rfp = NULL; + unlink(Recovery_Status_Inf->read_file); + memset(Recovery_Status_Inf->read_file,0,sizeof(Recovery_Status_Inf->read_file)); + } + } + /* the receive buffer is no longer needed once the queue is drained */ + free(msg); +#ifdef PRINT_DEBUG + show_debug("%s:send_queue return status %d",func,status); +#endif + return status; +} + +static void +pgrecovery_loop(int fd) +{ + char * func = "pgrecovery_loop()"; + int count; + int sock; + int status; + bool loop_end = false; + RecoveryPacket packet; + HostTbl new_host; + RecoveryTbl * lbp; + + lbp = LoadBalanceTbl; + if (lbp == (RecoveryTbl 
*)NULL) + { + show_error("%s:recovery table is NULL",func); + return ; + } +#ifdef PRINT_DEBUG + show_debug("%s:recovery accept port %d",func, Recovery_Port_Number); +#endif + count = 0; + while ((status = PGR_Create_Acception(fd,&sock,"",Recovery_Port_Number)) != STATUS_OK) + { + show_error("%s:PGR_Create_Acception failed",func); + PGR_Close_Sock(&sock); + sock = -1; + if ( count > PGR_CONNECT_RETRY_TIME) + { + return; + } + count ++; + } + if(sock==-1) { + show_error("can't create recovery socket.exit."); + PGRreplicate_exit(1); + } + for(;;) + { + int read_size = 0; + int rtn; + fd_set rmask; + struct timeval timeout; + + timeout.tv_sec = RECOVERY_TIMEOUT; + timeout.tv_usec = 0; + + /* + * Wait for something to happen. + */ + FD_ZERO(&rmask); + FD_SET(sock,&rmask); + /* + * read packet from target cluster server + */ + rtn = select(sock+1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout); + + if (rtn == 0) /* timeout */ + { + /* release the accepted socket before leaving, as the loop_end exit of this function does */ + PGR_Close_Sock(&sock); + return; + } + + if (rtn && FD_ISSET(sock, &rmask)) + { + read_size = read_packet(sock, &packet); + } + else + { + continue; + } + +#ifdef PRINT_DEBUG + show_debug("%s:receive packet no:%d",func,ntohs(packet.packet_no)); +#endif + + switch (ntohs(packet.packet_no)) + { + case RECOVERY_PREPARE_REQ : + /* + * start prepare of recovery + */ + +#ifdef PRINT_DEBUG + show_debug("%s:1st master %s - %d", + func,Master.hostName,Master.port); + show_debug("%s:1st target %s - %d", + func,Target.hostName,Target.port); +#endif + + loop_end = first_setup_recovery(&sock, &packet); +#ifdef PRINT_DEBUG + show_debug("%s:first_setup_recovery end:%d ",func,loop_end); +#endif + break; + case RECOVERY_START_REQ : + /* + * now, recovery process will start + * stop the transaction count up + * start queueing and stop send all queries for master DB + */ +#ifdef PRINT_DEBUG + show_debug("%s:2nd master %s - %d", + func, Master.hostName,Master.port); + show_debug("%s:2nd target %s - %d", + func, Target.hostName,Target.port); +#endif + loop_end = second_setup_recovery 
(&packet); +#ifdef PRINT_DEBUG + show_debug("%s:second_setup_recovery end :%d ", + func,loop_end); +#endif + break; + case RECOVERY_QUEUE_DATA_REQ : + /* + * send all queries in queue + */ + +#ifdef PRINT_DEBUG + show_debug("%s:last master %s - %d", + func, Master.hostName,Master.port); + show_debug("%s:last target %s - %d", + func, Target.hostName,Target.port); +#endif + status = PGRsend_queue(&Master,&Target); + if (status == STATUS_OK) + { + memcpy(new_host.hostName,Target.hostName,sizeof(new_host.hostName)); + memcpy(new_host.resolvedName,Target.resolvedName,sizeof(new_host.resolvedName)); + new_host.port = Target.port; + new_host.recoveryPort = Target.recoveryPort; + PGRset_recovered_host(&new_host,DB_TBL_USE); + PGRadd_HostTbl(&new_host,DB_TBL_USE); + PGRset_recovery_packet_no(&packet, RECOVERY_QUEUE_DATA_ANS ); + status = send_packet(&Target, &packet); + if (status != STATUS_OK) + { + finish_recovery(); + } + } + else + { + /* connection error , master or target may be down */ + show_error("%s:PGRsend_queue failed",func); + PGRset_recovery_packet_no(&packet, RECOVERY_ERROR_CONNECTION); + status = send_packet(&Target,&packet); + finish_recovery(); + } + loop_end = true; + break; + case RECOVERY_FINISH : + /* + * finished rsync DB datas from master to target + */ + /* + * stop queueing, and re-initialize recovery status + */ + finish_recovery(); + loop_end = true; + /* + * send finish recovery to load balancer + */ + if (Master.recovery_sock != -1) + { + close(Master.recovery_sock); + Master.recovery_sock = -1; + } + if (Target.recovery_sock != -1) + { + close(Target.recovery_sock); + Target.recovery_sock = -1; + } + send_packet(&Master, &packet); + MasterPacketData.packet_no = packet.packet_no; + PGRsend_load_balance_packet(&MasterPacketData); + PGRsend_load_balance_packet(&packet); + memset((char *)&MasterPacketData,0,sizeof(RecoveryPacket)); + break; + case RECOVERY_ERROR_ANS : +#ifdef PRINT_DEBUG + show_debug("%s:recovery error accept. 
top queueing and initiarse recovery status",func); +#endif + status = PGRsend_queue(&Master,NULL); + memset(&packet,0,sizeof(RecoveryPacket)); + PGRset_recovery_packet_no(&packet, RECOVERY_ERROR_ANS); + send_packet(&Master, &packet); + finish_recovery(); + loop_end = true; + PGRset_recovery_packet_no(&MasterPacketData, RECOVERY_FINISH ); + PGRsend_load_balance_packet(&MasterPacketData); + memset((char *)&MasterPacketData,0,sizeof(RecoveryPacket)); + break; + default: + /* the format has a %s, so the function name must be passed like in every other show_error() call */ + show_error("%s:unknown packet. abort to parse",func); + loop_end=true; + break; + } + if (loop_end) + { + if (Master.sock != -1) + { + close (Master.sock); + } + if (Master.recovery_sock != -1) + { + close (Master.recovery_sock); + } + PGR_Close_Sock(&sock); + return; + } + } +} + +void +PGRrecovery_main(int fork_wait_time) +{ + char * func = "PGRrecovery_main()"; + int status; + int fd = -1; + int rtn; + pid_t pgid = 0; + pid_t pid = 0; + + pgid = getpgid(0); + pid = fork(); + if (pid != 0) + { + return; + } + + PGRsignal(SIGCHLD, SIG_DFL); + PGRsignal(SIGHUP, PGRexit_subprocess); + PGRsignal(SIGINT, PGRexit_subprocess); + PGRsignal(SIGQUIT, PGRexit_subprocess); + PGRsignal(SIGTERM, PGRexit_subprocess); + PGRsignal(SIGPIPE, SIG_IGN); + /* + * in child process, + * call recovery module + */ + setpgid(0,pgid); + + if (fork_wait_time > 0) { +#ifdef PRINT_DEBUG + show_debug("recovery process: wait fork(): pid = %d", getpid()); +#endif + sleep(fork_wait_time); + } + +#ifdef PRINT_DEBUG + show_debug("%s:PGRrecovery_main bind port %d",func,Recovery_Port_Number); +#endif + status = PGR_Create_Socket_Bind(&fd, "", Recovery_Port_Number); + if (status != STATUS_OK) + { + show_error("%s:PGR_Create_Socket_Bind failed",func); + exit(1); + } + memset(&MasterPacketData,0,sizeof(RecoveryPacket)); + memset(&Master,0,sizeof(RecoveryTbl)); + memset(&Target,0,sizeof(RecoveryTbl)); + for (;;) + { + fd_set rmask; + struct timeval timeout; + + timeout.tv_sec = RECOVERY_TIMEOUT; + timeout.tv_usec = 0; + + /* + * Wait for 
something to happen. + */ + FD_ZERO(&rmask); + FD_SET(fd,&rmask); + rtn = select(fd+1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout); + if (rtn && FD_ISSET(fd, &rmask)) + { + pgrecovery_loop(fd); + } + } +} + +#ifdef PRINT_DEBUG +static void +show_recovery_packet(RecoveryPacket * packet) +{ + show_debug("no = %d",ntohs(packet->packet_no)); + show_debug("max_connect = %d",ntohs(packet->max_connect)); + show_debug("port = %d",ntohs(packet->port)); + show_debug("recoveryPort = %d",ntohs(packet->recoveryPort)); + if (packet->hostName != NULL) + show_debug("hostName = %s",packet->hostName); + if (packet->pg_data != NULL) + show_debug("pg_data = %s",packet->pg_data); +} +#endif diff -aruN postgresql-8.2.4/src/pgcluster/pgrp/replicate.c pgcluster-1.7.0rc7/src/pgcluster/pgrp/replicate.c --- postgresql-8.2.4/src/pgcluster/pgrp/replicate.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pgrp/replicate.c 2007-03-01 16:27:15.000000000 +0100 @@ -0,0 +1,4088 @@ +/*-------------------------------------------------------------------- + * FILE: + * replicate.c + * + * NOTE: + * This file is composed of the functions to call with the source + * at pgreplicate for the replication. 
+ * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + *-------------------------------------------------------------------- + */ +#include "postgres.h" +#include "postgres_fe.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "libpq-fe.h" +#include "libpq-int.h" +#include "fe-auth.h" + +#include +#include +#include +#include + +#ifdef HAVE_NETINET_TCP_H +#include +#endif + +#ifdef HAVE_SYS_SELECT_H +#include +#endif + + +#ifdef HAVE_CRYPT_H +#include +#endif + + +#ifdef MULTIBYTE +#include "mb/pg_wchar.h" +#endif + +#include "access/xact.h" +#include "lib/dllist.h" +#include "libpq/pqformat.h" +#include "replicate_com.h" +#include "pgreplicate.h" + + +#define IPC_NMAXSEM (32) + +/*-------------------------------------- + * PROTOTYPE DECLARATION + *-------------------------------------- + */ +static TransactionTbl * setTransactionTbl(HostTbl * host_ptr, ReplicateHeader * header); +static TransactionTbl * insertTransactionTbl( HostTbl * host_ptr, TransactionTbl * datap); +static TransactionTbl * getTransactionTbl( HostTbl * host_ptr, ReplicateHeader * header); +static void deleteTransactionTbl(HostTbl * host_ptr,ReplicateHeader * header); + +static HostTbl * deleteHostTbl(HostTbl * ptr); +static bool is_master_in_recovery(char * host, int port,int recovery_status); +static void sem_quit(int semid); +static int send_cluster_status_to_load_balance(HostTbl * host_ptr,int status); +static void set_transaction_status(int status); +static void check_transaction_status(ReplicateHeader * header,TransactionTbl *transaction); +static HostTbl * check_host_transaction_status(ReplicateHeader * header,HostTbl *host ); +static void clearHostTbl(void); +static bool is_need_sync_time(ReplicateHeader * header); +static bool is_need_wait_answer(ReplicateHeader * header); +static void write_host_status_file(HostTbl * host_ptr); + +static void delete_template(HostTbl * ptr, 
ReplicateHeader * header); +static char * check_copy_command(char * query); +static int read_answer(int dest); +static bool is_autocommit_off(char * query); +static bool is_autocommit_on(char * query); +static unsigned int get_host_ip_from_tbl(char * host); +static unsigned int get_srcHost_ip_from_tbl(char * srcHost); + +static int next_replication_id(void); +static void check_replication_id(void); +static bool is_need_use_rlog(ReplicateHeader * header); +static bool is_need_queue_jump( ReplicateHeader * header,char * query); +static int check_delete_transaction (HostTbl * host_ptr, ReplicateHeader * header); + +static bool is_executed_query_in_origin( ReplicateHeader *header ); +static bool is_executed_query( PGconn *conn,ReplicateHeader *header ); + +static void * thread_send_source(void * arg); +static void * thread_send_cluster(void * arg); + +static int send_replicate_packet_to_server( TransactionTbl * transaction_tbl, int current_cluster, HostTbl * host_ptr, ReplicateHeader * header, char *query , char * result,unsigned int replicationId, bool recovery); +static int check_result( PGresult * res ); +static bool compare_results(int *results, int size, int source_id); + +static int send_func(HostTbl * host_ptr,ReplicateHeader * header, char * func,char * result); +static uint32_t get_oid(HostTbl * host_ptr,ReplicateHeader * header); +static int set_oid(HostTbl * host_ptr,ReplicateHeader * header, uint32_t oid); +static int replicate_lo( PGconn * conn, ReplicateHeader * header, LOArgs * query); +static int notice_abort(HostTbl * host_ptr,ReplicateHeader * header); +static FILE * create_queue_file(void); +static int add_queue_file(char * data, int size); + +static int send_p_parse (PGconn * conn, StringInfo input_message); +static int send_p_bind (PGconn * conn, StringInfo input_message); +static int send_p_describe (PGconn * conn, StringInfo input_message); +static int send_p_execute (PGconn * conn, StringInfo input_message); +static int send_p_sync (PGconn * 
conn, StringInfo input_message); +static int send_p_close (PGconn * conn, StringInfo input_message); +static void set_string_info(StringInfo input_message, ReplicateHeader * header, char * query); + +int replicate_packet_send_internal(ReplicateHeader * header, char * query,int dest,int recovery_status,bool isHeldLock); +bool PGRis_same_host(char * host1, unsigned short port1 , char * host2, unsigned short port2); +HostTbl * PGRadd_HostTbl(HostTbl * conf_data, int useFlag); +HostTbl * PGRget_master(void); +void PGRset_recovery_status(int status); +int PGRget_recovery_status(void); +int PGRcheck_recovered_host(void); +int PGRset_recovered_host(HostTbl * target,int useFlag); +int PGRinit_recovery(void); +void PGRexit_subprocess(int signo); +void PGRreplicate_exit(int exit_status); +int PGRsend_replicate_packet_to_server( HostTbl * host_ptr, ReplicateHeader * header, char *query , char * result,unsigned int replicationId, bool recovery); +HostTbl * PGRget_HostTbl(char * resolvedName,int port); +int PGRset_queue(ReplicateHeader * header,char * query); +int PGRset_host_status(HostTbl * host_ptr,int status); +void PGRclear_transactions(void); +void PGRclear_connections(); +int PGRset_replication_id(uint32_t id); +int PGRdo_replicate(int sock,ReplicateHeader *header, char * query); +int PGRreturn_result(int dest, char * result,int wait); +int PGRreplicate_packet_send( ReplicateHeader * header, char * query,int dest,int recovery_status); +char * PGRread_packet(int sock, ReplicateHeader *header); +char * PGRread_query(int sock, ReplicateHeader *header); +PGconn * PGRcreateConn( char * host, char * port,char * database, char * userName, char * password, char * md5Salt, char * cryptSalt ); + +unsigned int PGRget_next_query_id(void); +int PGRinit_transaction_table(void); +int PGRsync_oid(ReplicateHeader *header); +int PGRload_replication_id(void); +extern pthread_mutex_t transaction_table_mutex; + +bool +PGRis_same_host(char * host1, unsigned short port1 , char * host2, 
unsigned short port2) +{ +#ifdef PRINT_DEBUG + char * func = "PGRis_same_host()"; +#endif + unsigned int ip1, ip2; + + if ((host1[0] == '\0' ) || (host2[0] == '\0') || + ( port1 != port2 )) + { +#ifdef PRINT_DEBUG + show_debug("%s:target host",func); +#endif + return false; + } + ip1 = PGRget_ip_by_name( host1); + ip2 = PGRget_ip_by_name( host2); + + if ((ip1 == ip2) && (port1 == port2)) + { + return true; + } + return false; +} + +PGconn * +PGRcreateConn( char * host, char * port,char * database, char * userName, char * password, char * md5Salt, char * cryptSalt ) +{ + char * func = "PGRcreateConn()"; + int cnt = 0; + PGconn * conn = NULL; + char pwd[256]; + + memset(pwd,0,sizeof(pwd)); + if (*password != '\0') + { + if ((strncmp(password,"md5",3) == 0) && (md5Salt != NULL)) + { + sprintf(pwd,"%s(%d)(%d)(%d)(%d)",password, + *md5Salt,*(md5Salt+1),*(md5Salt+2),*(md5Salt+3)); + } + else + { + strncpy(pwd,password,sizeof(pwd)); + } + } + conn = PQsetdbLogin(host, port, NULL, NULL, database, userName, pwd); + /* check to see that the backend Connection was successfully made */ + cnt = 0; + while (PQstatus(conn) == CONNECTION_BAD) + { + if (conn != NULL) + { + PQfinish(conn); + conn = NULL; + } + conn = PQsetdbLogin(host, port, NULL, NULL, database, userName, pwd); + if (cnt > PGR_CONNECT_RETRY_TIME ) + { + if (conn != NULL) + { + PQfinish(conn); + conn = NULL; + } + return (PGconn *)NULL; + } + + if(PQstatus(conn) == CONNECTION_BAD && h_errno==2) + { + show_error("gethostbyname() failed. sleep and retrying..."); + usleep(PGR_SEND_WAIT_MSEC); + cnt ++; + } + else if(!strncasecmp(PQerrorMessage(conn),"FATAL: Sorry, too many clients already",30) || + !strncasecmp(PQerrorMessage(conn),"FATAL: Non-superuser connection limit",30) ) + { + usleep(PGR_SEND_WAIT_MSEC); + show_error("Connection overflow. 
sleep and retrying..."); + cnt ++; + } + else if(!strncasecmp(PQerrorMessage(conn),"FATAL: The database system is starting up",40) ) + { +#ifdef PRINT_DEBUG + show_debug("waiting for starting up..."); +#endif + usleep(PGR_SEND_WAIT_MSEC); + } + else + { +#ifdef PRINT_DEBUG + show_error("%s:Retry. h_errno is %d,reason is '%s'",func,h_errno,PQerrorMessage(conn)); +#endif + + usleep(PGR_SEND_WAIT_MSEC); + cnt ++; + } + } + return conn; +} + +static TransactionTbl * +setTransactionTbl(HostTbl * host_ptr, ReplicateHeader * header) +{ + char * func = "setTransactionTbl()"; + TransactionTbl * ptr = NULL; + TransactionTbl work ; + char port[8]; + char * hostName = NULL; + char * dbName = NULL; + char * userName = NULL; + char * password = NULL; + char * md5Salt = NULL; + char * cryptSalt = NULL; + + if ((host_ptr == NULL) || (header == NULL)) + { + return (TransactionTbl *)NULL; + } + dbName = (char *)header->dbName; + snprintf(port,sizeof(port),"%d", host_ptr->port); + userName = (char *)(header->userName); + password = (char *)(header->password); + md5Salt = (char *)(header->md5Salt); + cryptSalt = (char *)(header->cryptSalt); + hostName = (char *)(host_ptr->resolvedName); + + ptr = getTransactionTbl(host_ptr,header); + if (ptr != NULL) + { + ptr->transaction_count = 0; + ptr->conn = PGRcreateConn(hostName,port,dbName,userName,password,md5Salt,cryptSalt); + if (ptr->conn == NULL) + { + show_error("%s:Transaction is pooling but PGRcreateConn failed",func); + deleteTransactionTbl(host_ptr, header); + PGRset_host_status(host_ptr,DB_TBL_ERROR); + ptr = NULL; + } + return ptr; + } + + memset(&work,0,sizeof(work)); + strncpy(work.host, hostName, sizeof(work.host)); + strncpy(work.srcHost, header->from_host, sizeof(work.srcHost)); + work.hostIP = PGRget_ip_by_name(hostName); + work.port = host_ptr->port; + work.srcHostIP = PGRget_ip_by_name(header->from_host); + work.pid = ntohs(header->pid); + strncpy(work.dbName,header->dbName,sizeof(work.dbName)); + work.conn = 
PGRcreateConn(hostName,port,dbName,userName,password,md5Salt,cryptSalt); + if (work.conn == NULL) + { +#ifdef PRINT_DEBUG + show_debug("%s: %s@%s is not ready",func,port,hostName); +#endif + return (TransactionTbl *)NULL; + } + work.useFlag = DB_TBL_USE ; + work.in_transaction = false; + work.transaction_count = 0; + ptr = insertTransactionTbl(host_ptr,&work); + if (ptr == (TransactionTbl *)NULL) + { + show_error("%s:insertTransactionTbl failed",func); + return (TransactionTbl *)NULL; + } + return ptr; +} + +static TransactionTbl * +insertTransactionTbl( HostTbl * host_ptr, TransactionTbl * datap) +{ + char * func = "insertTransactionTbl()"; + TransactionTbl * workp = NULL; + + pthread_mutex_lock(&transaction_table_mutex); + if ((host_ptr == (HostTbl *)NULL) || (datap == (TransactionTbl*)NULL)) + { + show_error("%s:host table or transaction table is NULL",func); + pthread_mutex_unlock(&transaction_table_mutex); + + return (TransactionTbl *)NULL; + } + if (Transaction_Tbl_Begin == NULL) + { + if (PGRinit_transaction_table() != STATUS_OK) + { + pthread_mutex_unlock(&transaction_table_mutex); + + return (TransactionTbl *)NULL; + } + } + + workp = (TransactionTbl *)malloc(sizeof(TransactionTbl)); + memset(workp,0,sizeof(TransactionTbl)); + Transaction_Tbl_End = workp; + workp->hostIP = datap->hostIP; + workp->port = datap->port; + workp->pid = datap->pid; + workp->srcHostIP = datap->srcHostIP; + strncpy(workp->host,datap->host,sizeof(workp->host)); + strncpy(workp->srcHost,datap->srcHost,sizeof(workp->srcHost)); + strncpy(workp->dbName,datap->dbName,sizeof(workp->dbName)); + workp->conn = datap->conn; + workp->useFlag = DB_TBL_USE; + workp->lock = STATUS_OK; + workp->in_transaction =datap->in_transaction; + workp->transaction_count =datap->transaction_count; + DLAddTail(Transaction_Tbl_Begin, DLNewElem(workp)); + + pthread_mutex_unlock(&transaction_table_mutex); + + return workp; +} + +static TransactionTbl * +getTransactionTbl( HostTbl * host_ptr, ReplicateHeader * 
header) +{ + Dlelem * ptr = NULL; + unsigned int host_ip,srcHost_ip; + unsigned short pid = 0; + + if (Transaction_Tbl_Begin == (Dllist *) NULL) + { + return (TransactionTbl * )NULL; + } + if ((host_ptr == (HostTbl *)NULL) || + (header == (ReplicateHeader *)NULL)) + { + return (TransactionTbl * )NULL; + } + host_ip = get_host_ip_from_tbl(host_ptr->resolvedName); + if (host_ip == 0) + { + host_ip = PGRget_ip_by_name(host_ptr->resolvedName); + } + srcHost_ip = get_srcHost_ip_from_tbl(header->from_host); + if (srcHost_ip == 0) + { + srcHost_ip = PGRget_ip_by_name(header->from_host); + } + pid = ntohs(header->pid); + + pthread_mutex_lock(&transaction_table_mutex); + + ptr = DLGetHead(Transaction_Tbl_Begin); + while (ptr) + { + TransactionTbl *transaction = DLE_VAL(ptr); + if ((transaction->useFlag == DB_TBL_USE) && + (transaction->hostIP == host_ip) && + (transaction->port == host_ptr->port) && + (transaction->srcHostIP == srcHost_ip) && + (!strncasecmp(transaction->dbName,header->dbName,sizeof(transaction->dbName))) && + (transaction->pid == pid)) + { + pthread_mutex_unlock(&transaction_table_mutex); + return transaction; + } + ptr = DLGetSucc(ptr); + } + pthread_mutex_unlock(&transaction_table_mutex); + + return (TransactionTbl * )NULL; +} + +static void +deleteTransactionTbl(HostTbl * host_ptr,ReplicateHeader * header) +{ + TransactionTbl *ptr = NULL; + Dlelem *elem; + + ptr = getTransactionTbl(host_ptr,header); + + pthread_mutex_lock(&transaction_table_mutex); + + if (ptr != NULL) + { + /* + if (ptr->in_transaction) + { + if (host_ptr->transaction_count > 0) + host_ptr->transaction_count--; + } + */ + + if (ptr->conn != NULL) + { + PQfinish(ptr->conn); + } + elem = DLGetHead(Transaction_Tbl_Begin); + while (elem) + { + TransactionTbl *transaction = DLE_VAL(elem); + if (transaction == ptr) { + free(ptr); + DLRemove(elem); + DLFreeElem(elem); + pthread_mutex_unlock(&transaction_table_mutex); + return; + } + elem = DLGetSucc(elem); + } + } + 
pthread_mutex_unlock(&transaction_table_mutex); +} + +static HostTbl * +deleteHostTbl(HostTbl * ptr) +{ + if (ptr != (HostTbl*)NULL) + { + memset(ptr,0,sizeof(HostTbl)); + } + return ++ptr; +} + +HostTbl * +PGRadd_HostTbl(HostTbl *conf_data, int useFlag) +{ + HostTbl * ptr = NULL; + int cnt = 0; + + ptr = PGRget_HostTbl(conf_data->resolvedName, conf_data->port); + if (ptr != (HostTbl*)NULL) + { + PGRset_host_status(ptr,useFlag); + return ptr; + } + + ptr = Host_Tbl_Begin; + cnt = 1; + while (ptr->useFlag != DB_TBL_END) + { + if (ptr->useFlag == DB_TBL_FREE) + { + break; + } + ptr ++; + cnt ++; + } + if (cnt >= MAX_DB_SERVER) + { + return (HostTbl*)NULL; + } + if (ptr->useFlag == DB_TBL_END) + { + (ptr + 1) -> useFlag = DB_TBL_END; + } + memset(ptr,0,sizeof(HostTbl)); + ptr->hostNum = cnt; + memcpy(ptr->hostName,conf_data->hostName,sizeof(ptr->hostName)); + memcpy(ptr->resolvedName,conf_data->resolvedName,sizeof(ptr->resolvedName)); + ptr->port = conf_data->port; + ptr->recoveryPort = conf_data->recoveryPort; + ptr->transaction_count = 0; + PGRset_host_status(ptr,useFlag); + + return ptr; +} + +HostTbl * +PGRget_master(void) +{ + HostTbl * host_tbl = NULL; + + host_tbl = Host_Tbl_Begin; + while(host_tbl->useFlag != DB_TBL_END) + { + if (host_tbl->useFlag == DB_TBL_USE) + { + return host_tbl; + } + host_tbl ++; + } + return (HostTbl *)NULL; +} + +void +PGRset_recovery_status(int status) +{ + if (RecoverySemID <= 0) + return; + PGRsem_lock(RecoverySemID,SEM_NUM_OF_RECOVERY); + if (Recovery_Status_Inf != (RecoveryStatusInf *)NULL) + { + Recovery_Status_Inf->recovery_status = status; + + } + PGRsem_unlock(RecoverySemID,SEM_NUM_OF_RECOVERY); +} + +int +PGRget_recovery_status(void) +{ + int status = -1; + + if (RecoverySemID <= 0) + return -1; + PGRsem_lock(RecoverySemID, SEM_NUM_OF_RECOVERY); + if (Recovery_Status_Inf != (RecoveryStatusInf *)NULL) + { + status = Recovery_Status_Inf->recovery_status; + } + PGRsem_unlock(RecoverySemID, SEM_NUM_OF_RECOVERY); + return 
status; + +} + +static void +set_transaction_status(int status) +{ + if (RecoverySemID <= 0) + return ; + PGRsem_lock(RecoverySemID, SEM_NUM_OF_RECOVERY); + if (Recovery_Status_Inf != (RecoveryStatusInf *)NULL) + { + Recovery_Status_Inf->recovery_status = status; + } + PGRsem_unlock(RecoverySemID, SEM_NUM_OF_RECOVERY); +} + +#if 0 +static int +get_transaction_status(void) +{ + int status = 0; + + if (RecoverySemID <= 0) + return 0; + PGRsem_lock(RecoverySemID, SEM_NUM_OF_RECOVERY); + if (Recovery_Status_Inf != (RecoveryStatusInf *)NULL) + { + status = Recovery_Status_Inf->recovery_status; + PGRsem_unlock(RecoverySemID, SEM_NUM_OF_RECOVERY); + return status; + } + PGRsem_unlock(RecoverySemID, SEM_NUM_OF_RECOVERY); + return 0; +} +#endif + +int +PGRcheck_recovered_host(void) +{ + char * func = "PGRcheck_recovered_host()"; + HostTbl * ptr = NULL; + int rtn = STATUS_OK; + + if (RecoverySemID <= 0) + return STATUS_ERROR; + PGRsem_lock(RecoverySemID, SEM_NUM_OF_RECOVERY); + if (Recovery_Status_Inf != (RecoveryStatusInf *)NULL) + { + if (Recovery_Status_Inf->useFlag != DB_TBL_FREE) + { + ptr = PGRadd_HostTbl((HostTbl *)&(Recovery_Status_Inf->target_host),Recovery_Status_Inf->useFlag); + if (ptr == (HostTbl *) NULL) + { + show_error("%s:PGRadd_HostTbl failed",func); + rtn = STATUS_ERROR; + } + Recovery_Status_Inf->useFlag = DB_TBL_FREE; + memset((HostTbl *)&(Recovery_Status_Inf->target_host),0,sizeof(HostTbl)); + + } + } + PGRsem_unlock(RecoverySemID, SEM_NUM_OF_RECOVERY); + return rtn; +} + +int +PGRset_recovered_host(HostTbl * target, int useFlag) +{ + if (RecoverySemID <= 0) + return -1; + PGRsem_lock(RecoverySemID, SEM_NUM_OF_RECOVERY); + if (Recovery_Status_Inf != (RecoveryStatusInf *)NULL) + { + Recovery_Status_Inf->useFlag = useFlag; + if (target != (HostTbl*)NULL) + { + memcpy((HostTbl *)&(Recovery_Status_Inf->target_host),target,sizeof(HostTbl)); + PGRset_host_status(target,useFlag); + } + + } + PGRsem_unlock(RecoverySemID, SEM_NUM_OF_RECOVERY); + return 0; +} + 
+static bool +is_master_in_recovery(char * host , int port,int recovery_status) +{ + HostTbl * master = NULL; + + int status = PGRget_recovery_status(); + if (status == RECOVERY_CLEARED) + { + master = PGRget_master(); + if (master == (HostTbl *)NULL) + { + return false; + } + return (PGRis_same_host(host, port , master->hostName, master->port)); + } + return false; +} + +int +PGRinit_recovery(void) +{ + char * func = "PGRinit_recovery()"; + int size = 0; + union semun sem_arg; + int i = 0; + + if ((RecoverySemID = semget(IPC_PRIVATE,4,IPC_CREAT | IPC_EXCL | 0600)) < 0) + { + show_error("%s:semget() failed. (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + for ( i = 0 ; i < 4 ; i ++) + { + semctl(RecoverySemID, i, GETVAL, sem_arg); + sem_arg.val = 1; + semctl(RecoverySemID, i, SETVAL, sem_arg); + } + + size = sizeof(RecoveryStatusInf); + RecoveryShmid = shmget(IPC_PRIVATE,size,IPC_CREAT | IPC_EXCL | 0600); + if (RecoveryShmid < 0) + { + show_error("%s:shmget() failed. (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + Recovery_Status_Inf = (RecoveryStatusInf *)shmat(RecoveryShmid,0,0); + if (Recovery_Status_Inf == (RecoveryStatusInf *)-1) + { + show_error("%s:shmat() failed. (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + memset(Recovery_Status_Inf,0,size); + Recovery_Status_Inf->check_point = PGR_CHECK_POINT ; + + size = sizeof(unsigned int); + ReplicateSerializationShmid = shmget(IPC_PRIVATE,size,IPC_CREAT | IPC_EXCL | 0600); + if (ReplicateSerializationShmid < 0) + { + show_error("%s:shmget() failed. (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + + PGR_ReplicateSerializationID = (unsigned int *)shmat(ReplicateSerializationShmid,0,0); + if( PGR_ReplicateSerializationID == (unsigned int *)-1) { + show_error("%s:shmat() failed. 
(%s)",func,strerror(errno)); + return STATUS_ERROR; + } + memset(PGR_ReplicateSerializationID,0,size); + PGRset_recovery_status(RECOVERY_INIT); + PGRset_recovered_host((HostTbl *)NULL, DB_TBL_FREE); + set_transaction_status(0); + + /* + * create message queue + */ + RecoveryMsgShmid = shmget(IPC_PRIVATE,size,IPC_CREAT | IPC_EXCL | 0600); + if (RecoveryMsgShmid < 0) + { + show_error("%s:shmget() failed. (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + + RecoveryMsgid = (int *)shmat(RecoveryMsgShmid,0,0); + /* shmat() reports failure as (void *)-1, so test for that value as the other attach checks in this function do */ + if( RecoveryMsgid == (int *)-1) { + show_error("%s:shmat() failed. (%s)",func,strerror(errno)); + /* leave no invalid pointer behind for the cleanup code that dereferences a non-NULL RecoveryMsgid */ + RecoveryMsgid = NULL; + return STATUS_ERROR; + } + *RecoveryMsgid = msgget (IPC_PRIVATE, 00666 | IPC_CREAT ); + if (*RecoveryMsgid < 0) + { + show_error("%s:msgget() failed. (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + + + return STATUS_OK; +} + +static void +clearHostTbl(void) +{ + + HostTbl * ptr = NULL; + + if (Host_Tbl_Begin == NULL) + return; + /* normal socket close */ + ptr = Host_Tbl_Begin; + while(ptr && ptr->useFlag != DB_TBL_END) + { + ptr = deleteHostTbl(ptr); + } +} + +void +PGRexit_subprocess(int signo) +{ + exit_signo = signo; + PGRreplicate_exit(1); +} + +void +PGRreplicate_exit(int exit_status) +{ + char fname[256]; + int rtn = 0; + sigset_t mask; + + sigemptyset(&mask); + sigaddset(&mask, SIGTERM); + sigaddset(&mask, SIGINT); + sigaddset(&mask, SIGQUIT); + sigaddset(&mask, SIGCHLD); + sigprocmask(SIG_BLOCK, &mask, NULL); + + kill (0, exit_signo); + + child_wait(0); + + if (RidFp != NULL) + { + rewind(RidFp); + if (Recovery_Status_Inf != NULL) + { + PGRwrite_log_file(RidFp,"%u",Recovery_Status_Inf->replication_id); + } + fflush(RidFp); + fclose(RidFp); + RidFp = NULL; + } + + if (ReplicateSock > 0) + close(ReplicateSock); + + /* recovery status clear */ + /* the semaphore set can exist while the status segment was never attached, so check the pointer as well */ + if (RecoverySemID > 0 && Recovery_Status_Inf != NULL) + Recovery_Status_Inf->recovery_status = RECOVERY_INIT; + + /* normal socket close */ + clearHostTbl(); + + if (Host_Tbl_Begin != (HostTbl *)NULL) + { + rtn = shmdt((char 
*)Host_Tbl_Begin); + shmctl(HostTblShmid,IPC_RMID,(struct shmid_ds *)NULL); + } + + if (Cascade_Tbl != (ReplicateServerInfo *)NULL) + { + rtn = shmdt((char *)Cascade_Tbl); + shmctl(CascadeTblShmid,IPC_RMID,(struct shmid_ds *)NULL); + } + + if (Cascade_Inf != (CascadeInf *)NULL) + { + rtn = shmdt((char *)Cascade_Inf); + shmctl(CascadeInfShmid,IPC_RMID,(struct shmid_ds *)NULL); + } + + if (Commit_Log_Tbl != (CommitLogInf *)NULL) + { + rtn = shmdt((char *)Commit_Log_Tbl); + shmctl(CommitLogShmid,IPC_RMID,(struct shmid_ds *)NULL); + } + + if (Recovery_Status_Inf != (RecoveryStatusInf *)NULL) + { + rtn = shmdt((char *)Recovery_Status_Inf); + shmctl(RecoveryShmid,IPC_RMID,(struct shmid_ds *)NULL); + } + if (PGR_ReplicateSerializationID!=NULL) + { + shmdt(PGR_ReplicateSerializationID); + shmctl(ReplicateSerializationShmid,IPC_RMID,(struct shmid_ds *)NULL); + } + + if (RecoveryMsgid) + { + if (*RecoveryMsgid >= 0) + msgctl(*RecoveryMsgid,IPC_RMID,(struct msqid_ds *)NULL); + + shmdt(RecoveryMsgid); + shmctl(RecoveryMsgShmid, IPC_RMID, NULL); + } + + if (StatusFp != NULL) + { + fflush(StatusFp); + fclose(StatusFp); + StatusFp = NULL; + } + if (LogFp != NULL) + { + fflush(LogFp); + fclose(LogFp); + LogFp = NULL; + } + + if (PGR_Result != NULL) + { + free(PGR_Result); + PGR_Result = NULL; + } + if (PGR_Response_Inf != NULL) + { + free(PGR_Response_Inf); + PGR_Response_Inf = NULL; + } + + if (LoadBalanceTbl != NULL) + { + free(LoadBalanceTbl); + LoadBalanceTbl = NULL; + } + + if (PGR_Log_Header != NULL) + { + free(PGR_Log_Header); + PGR_Log_Header = NULL; + } + + if (PGR_Send_Query_ID != NULL) + { + free(PGR_Send_Query_ID); + PGR_Send_Query_ID = NULL; + } + + if (CascadeSemID > 0) + { + sem_quit(CascadeSemID); + CascadeSemID = 0; + } + if (SemID > 0) + { + sem_quit(SemID); + SemID = 0; + } + if (RecoverySemID > 0) + { + sem_quit(RecoverySemID); + RecoverySemID = 0; + } + if (VacuumSemID > 0) + { + sem_quit(VacuumSemID); + } + + snprintf(fname, sizeof(fname), "%s/%s", 
PGR_Write_Path, PGREPLICATE_PID_FILE); + unlink(fname); + + /* close socket between rlog process */ + + if (Replicateion_Log->r_log_sock >= 0) + { + close(Replicateion_Log->r_log_sock); + Replicateion_Log->r_log_sock = -1; + } + if (Replicateion_Log->RLog_Sock_Path != NULL) + { + unlink(Replicateion_Log->RLog_Sock_Path); + free(Replicateion_Log->RLog_Sock_Path); + Replicateion_Log->RLog_Sock_Path = NULL; + } + + if (ResolvedName != NULL) + { + free(ResolvedName); + ResolvedName = NULL; + } + exit(exit_status); +} + +static int +send_cluster_status_to_load_balance(HostTbl * host_ptr,int status) +{ + RecoveryPacket packet; + int rtn = 0; + + memset(&packet,0,sizeof(RecoveryPacket)); + packet.packet_no = htons(status); + strncpy(packet.hostName,host_ptr->hostName,sizeof(packet.hostName)); + packet.port = htons(host_ptr->port); + rtn = PGRsend_load_balance_packet(&packet); + return rtn; +} + +int +PGRset_host_status(HostTbl * host_ptr,int status) +{ + if (host_ptr == NULL) + { + return STATUS_ERROR; + } + if (host_ptr->useFlag != status) + { + host_ptr->useFlag = status; + if (status == DB_TBL_ERROR ) + { + host_ptr->transaction_count = 0; + send_cluster_status_to_load_balance(host_ptr,RECOVERY_ERROR_CONNECTION); + } + write_host_status_file(host_ptr); + } + return STATUS_OK; +} + +static void +write_host_status_file(HostTbl * host_ptr) +{ + switch( host_ptr->useFlag) + { + case DB_TBL_FREE: + PGRwrite_log_file(StatusFp,"port(%d) host:%s free", + host_ptr->port, + host_ptr->hostName); + break; + case DB_TBL_INIT: + PGRwrite_log_file(StatusFp,"port(%d) host:%s initialize", + host_ptr->port, + host_ptr->hostName); + break; + case DB_TBL_USE: + PGRwrite_log_file(StatusFp,"port(%d) host:%s start use", + host_ptr->port, + host_ptr->hostName); + break; + case DB_TBL_ERROR: + PGRwrite_log_file(StatusFp,"port(%d) host:%s error", + host_ptr->port, + host_ptr->hostName); + break; + case DB_TBL_END: + PGRwrite_log_file(StatusFp,"port(%d) host:%s end", + host_ptr->port, + 
host_ptr->hostName); + break; + } +} + +static int +check_result( PGresult * res ) +{ + int status = 0; + + status = PQresultStatus(res); + if ((status == PGRES_NONFATAL_ERROR ) || + (status == PGRES_FATAL_ERROR )) + { + return STATUS_ERROR; + } + return STATUS_OK; +} + +static bool +compare_results(int *results, int size, int source_id) +{ + int i, prev = 0; + + for (i = 0; i < size; i++) + { + if (i != source_id) + { + prev = results[i]; + break; + } + } + + for (; i < size; i++) + { + if (i == source_id) + continue; + if (prev != results[i]) + return false; + prev = results[i]; + } + return true; +} + +/*-------------------------------------------------- + * SYMBOL + * PGRsend_replicate_packet_to_server() + * NOTES + * Send query data to the cluster DB and recieve result data. + * ARGS + * HostTbl * host_ptr: the record of cluster DB table (target) + * ReplicateHeader * header: header data + * char *query: query data + * char * result: returned result data + * RETURN + * STATUS_OK: OK + * STATUS_ERROR: NG + * STATUS_LOCK_CONFLICT: Lock conflicted + *--------------------------------------------------- + */ +int +PGRsend_replicate_packet_to_server( HostTbl * host_ptr, ReplicateHeader * header, char *query , char * result,unsigned int replicationId, bool recovery) +{ + char * func = "PGRsend_replicate_packet_to_server()"; + TransactionTbl * transaction_tbl = NULL; + char *database = NULL; + char port[8]; + char *userName = NULL; + char * password = NULL; + char * host = NULL; + char * md5Salt = NULL; + char * cryptSalt = NULL; + int rtn = 0; + int current_cluster = 0; + int query_size = 0; + + if ((query == NULL) || (header == NULL)) + { + show_error("%s: query is broken",func); + return STATUS_ERROR; + } + query_size = ntohl(header->query_size); + if (query_size < 0) + { + show_error("%s: query size is broken",func); + return STATUS_ERROR; + } + if (host_ptr == NULL) + { + return STATUS_ERROR; + } + + if (PGR_Response_Inf != NULL) + { + current_cluster = 
PGR_Response_Inf->current_cluster; + } + + /* + * set up the connection + */ + database = (char *)header->dbName; + snprintf(port,sizeof(port),"%d", host_ptr->port); + userName = (char *)(header->userName); + password = (char *)(header->password); + md5Salt = (char *)(header->md5Salt); + cryptSalt = (char *)(header->cryptSalt); + host = (char *)(host_ptr->resolvedName); + /* + * get the transaction table data + * it has the connection data with each cluster DB + */ + transaction_tbl = getTransactionTbl(host_ptr,header); + /* + * if the transaction process is new one, + * create connection data and add the transaction table + */ + if (transaction_tbl == (TransactionTbl *)NULL) + { + if (recovery == true) + { + int cnt = 0; + while(transaction_tbl == (TransactionTbl *)NULL) + { + transaction_tbl = setTransactionTbl(host_ptr, header); + if (cnt > RECOVERY_TIMEOUT) + { + break; + } + cnt ++; + sleep(1); + } + } + else + { + transaction_tbl = setTransactionTbl(host_ptr, header); + } + if (transaction_tbl == (TransactionTbl *)NULL) + { + show_error("%s:setTransactionTbl failed",func); + if ( header->cmdSts != CMD_STS_NOTICE ) + { + PGRset_host_status(host_ptr,DB_TBL_ERROR); + } + return STATUS_ERROR; + } + StartReplication[current_cluster] = true; + } + else + { + /* + * re-use the connection data + */ + if ((transaction_tbl->conn != (PGconn *)NULL) && + (transaction_tbl->conn->sock > 0)) + { + StartReplication[current_cluster] = false; + } + else + { + if (transaction_tbl->conn != (PGconn *)NULL) + { + PQfinish(transaction_tbl->conn); + transaction_tbl->conn = NULL; + } + transaction_tbl->conn = PGRcreateConn(host,port,database,userName,password,md5Salt,cryptSalt); + StartReplication[current_cluster] = true; + } + } + if(header->cmdSts==CMD_STS_OTHER && + header->cmdType==CMD_TYPE_CONNECTION_CLOSE) + { + check_delete_transaction(host_ptr, header); + return STATUS_OK; + } +#ifdef PRINT_DEBUG + show_debug("%s:connect db:%s port:%s user:%s host:%s query:%s", + func, 
database,port,userName,host,query); +#endif + rtn = send_replicate_packet_to_server( transaction_tbl, current_cluster, host_ptr, header, query ,result ,replicationId, recovery); + return rtn; +} + +static int +send_replicate_packet_to_server( TransactionTbl * transaction_tbl, int current_cluster, HostTbl * host_ptr, ReplicateHeader * header, char *query , char * result,unsigned int replicationId, bool recovery) +{ + char * func = "send_replicate_packet_to_server()"; + PGconn * conn = (PGconn *)NULL; + PGresult * res = (PGresult *)NULL; + char sync_command[256]; + bool sync_command_flg = false; + char * str = NULL; + int rtn = 0; + int query_size = 0; + int hostNum = 0; + StringInfoData input_message; + + if (( transaction_tbl == (TransactionTbl *)NULL) || + ( host_ptr == (HostTbl *) NULL) || + (header == (ReplicateHeader *) NULL) || + (query == NULL) || + ( result == NULL)) + { + show_error("%s:unexpected NULL variable",func); + return STATUS_ERROR; + } + + query_size = ntohl(header->query_size); + if (query_size < 0) + { + show_error("%s: query size is broken",func); + return STATUS_ERROR; + } + +/* + if(header->cmdSts == CMD_STS_OTHER && + header->cmdType == CMD_TYPE_CONNECTION_CLOSE) + { + check_delete_transaction(host_ptr,header); + return STATUS_OK; + } +*/ + conn = transaction_tbl->conn; + if (conn == NULL) + { + show_error("%s:[%d@%s] may be down",func,host_ptr->port,host_ptr->hostName); + if ( header->cmdSts != CMD_STS_NOTICE ) + { + PGRset_host_status(host_ptr,DB_TBL_ERROR); + } + return STATUS_ERROR; + } + hostNum = host_ptr->hostNum; + + /* + * When the query is transaction query... 
+ */ + if (is_need_sync_time(header) == true) + { + if (transaction_tbl->transaction_count >1 ) + { + sync_command_flg = false; + } + else + { + sync_command_flg = true; + } + } + if ((header->cmdSts == CMD_STS_TRANSACTION ) || + (header->cmdSts == CMD_STS_SET_SESSION_AUTHORIZATION )) + { + if ((header->cmdSts == CMD_STS_TRANSACTION ) && + ((header->cmdType != CMD_TYPE_BEGIN) || + (transaction_tbl->transaction_count >1 ))) + { + sync_command_flg = false; + } + } + + /* + * execute query + */ + + if (header->rlog > 0 ) + { + + if (is_executed_query( conn, header) == true) + { + return STATUS_OK; + } + else + { +#ifdef PRINT_DEBUG + show_debug("%s:check replication log issue , id=%d,rlog=%d,query=%s status=not_replicated",func,ntohl(header->replicate_id),header->rlog,query); +#endif + } + } + if (( header->cmdSts != CMD_STS_NOTICE ) && + ( header->cmdSts != CMD_STS_PREPARE ) && + ((sync_command_flg == true) || + (StartReplication[current_cluster] == true))) + { + snprintf(sync_command,sizeof(sync_command), + "SELECT %s(%d,%u,%u,%u,%d,%u) ", + PGR_SYSTEM_COMMAND_FUNC, + PGR_SET_CURRENT_TIME_FUNC_NO, + (unsigned int)ntohl(header->tv.tv_sec), + (unsigned int)ntohl(header->tv.tv_usec), + (unsigned int)ntohl(PGR_Log_Header->replicate_id), + PGR_Response_Inf->response_mode, + *PGR_ReplicateSerializationID); +#ifdef PRINT_DEBUG + show_debug("%s:sync_command(%s)",func,sync_command); +#endif + res = PQexec(conn, sync_command); + if (res != NULL) + PQclear(res); + StartReplication[current_cluster] = false; + } + + res = NULL; + if ((header->cmdType == CMD_TYPE_COPY_DATA) || + (header->cmdType == CMD_TYPE_COPY_DATA_END)) + { + /* copy data replication */ + rtn =PQputnbytes(conn, query,query_size); + if (header->cmdType == CMD_TYPE_COPY_DATA_END) + { + rtn = PQendcopy(conn); + if (rtn == 1) /* failed */ + { + if (transaction_tbl->conn != NULL) + { + PQfinish(transaction_tbl->conn); + transaction_tbl->conn = (PGconn *)NULL; + StartReplication[current_cluster] = true; + } + } + } 
+ *(PGR_Send_Query_ID + hostNum ) = ntohl(header->query_id); + return STATUS_OK; + } + else if (header->cmdSts == CMD_STS_LARGE_OBJECT) + { + replicate_lo(conn, header,(LOArgs *)query); + return STATUS_OK; + } + + else if (header->cmdSts == CMD_STS_PREPARE) + { + + if ( !PGR_Parse_Session_Started) + { + snprintf(sync_command,sizeof(sync_command), + "SELECT %s(%d,%u,%u,%u,%d,%u) ", + PGR_SYSTEM_COMMAND_FUNC, + PGR_SET_CURRENT_TIME_FUNC_NO, + (unsigned int)ntohl(header->tv.tv_sec), + (unsigned int)ntohl(header->tv.tv_usec), + (unsigned int)ntohl(PGR_Log_Header->replicate_id), + PGR_Response_Inf->response_mode, + *PGR_ReplicateSerializationID); + res = PQexec(conn, sync_command); + if (res != NULL) + { + PQclear(res); + res = NULL; + } + while ((res = PQgetResult(conn)) != NULL) + { + if (res->resultStatus == PGRES_COPY_IN) + { + PQclear(res); + return STATUS_ERROR; + } + else if (res->resultStatus == PGRES_COPY_OUT) + { + conn->asyncStatus = PGASYNC_BUSY; + } + else if (conn->status == CONNECTION_BAD) + { + PQclear(res); + return STATUS_ERROR; + } + PQclear(res); + } + } + set_string_info(&input_message,header,query); + switch (header->cmdType) + { + case CMD_TYPE_P_PARSE : + if (send_p_parse(conn, &input_message) != STATUS_OK) + { + pqHandleSendFailure(conn); + PGR_Parse_Session_Started = false; + return STATUS_ERROR; + } + break; + case CMD_TYPE_P_BIND : + if (send_p_bind(conn, &input_message) != STATUS_OK) + { + pqHandleSendFailure(conn); + PGR_Parse_Session_Started = false; + return STATUS_ERROR; + } + break; + case CMD_TYPE_P_DESCRIBE : + if (send_p_describe(conn, &input_message) != STATUS_OK) + { + pqHandleSendFailure(conn); + PGR_Parse_Session_Started = false; + return STATUS_ERROR; + } + break; + case CMD_TYPE_P_EXECUTE : + if (send_p_execute(conn,&input_message) != STATUS_OK) + { + pqHandleSendFailure(conn); + PGR_Parse_Session_Started = false; + return STATUS_ERROR; + } + break; + case CMD_TYPE_P_SYNC : + if (send_p_sync(conn, &input_message) != STATUS_OK) 
+ { + pqHandleSendFailure(conn); + PGR_Parse_Session_Started = false; + return STATUS_ERROR; + } + break; + case CMD_TYPE_P_CLOSE : + if (send_p_close(conn, &input_message) != STATUS_OK) + { + pqHandleSendFailure(conn); + PGR_Parse_Session_Started = false; + return STATUS_ERROR; + } + break; + default : + break; + } + return STATUS_OK; + } + else + { + if (transaction_tbl->lock != STATUS_OK) + { +#ifdef PRINT_DEBUG + show_debug("%s:[%d]transaction_tbl->lock is [%d]",func,current_cluster,transaction_tbl->lock ); +#endif + transaction_tbl->lock = STATUS_OK; + } + snprintf(sync_command,sizeof(sync_command), + "SELECT %s(%d,%u,%u,%d) ", + PGR_SYSTEM_COMMAND_FUNC, + PGR_SET_CURRENT_REPLICATION_QUERY_ID_NO, + replicationId, + 0, + PGR_Response_Inf->response_mode); + res = PQexec(conn, sync_command); + if (res != NULL) + { + PQclear(res); + res = NULL; + } + res = PQexec(conn, query); + rtn = check_result(res); +#ifdef PRINT_DEBUG + show_debug("%s:PQexec send :%s",func,query); +#endif + + } + + if (res == NULL) + { + StartReplication[current_cluster] = true; + return STATUS_ERROR; + } + + str = PQcmdStatus(res); +#ifdef PRINT_DEBUG + show_debug("%s:PQexec returns :%s",func,str); +#endif + if ((str == NULL) || (*str == '\0')) + { + if ((result != NULL) && (res != NULL) && (res->errMsg != NULL)) + { + snprintf(result,PGR_MESSAGE_BUFSIZE,"E%s",res->errMsg); + } + else + { + strcpy(result,"E"); + } + StartReplication[current_cluster] = true; + } + else + { + if (!strncasecmp(str,PGR_LOCK_CONFLICT_NOTICE_CMD,strlen(PGR_LOCK_CONFLICT_NOTICE_CMD))) + { +#ifdef PRINT_DEBUG + show_debug("%s:LOCK CONFLICT from PQexec",func); +#endif + if (res != NULL) + PQclear(res); + + transaction_tbl->lock = STATUS_LOCK_CONFLICT; + return STATUS_LOCK_CONFLICT; + } + else if (!strncasecmp(str,PGR_DEADLOCK_DETECT_NOTICE_CMD,strlen(PGR_DEADLOCK_DETECT_NOTICE_CMD))) + { +#ifdef PRINT_DEBUG + show_debug("%s:DEADLOCK DETECTED from PQexec",func); +#endif + if (res != NULL) + PQclear(res); + 
transaction_tbl->lock = STATUS_DEADLOCK_DETECT; + return STATUS_DEADLOCK_DETECT; + } + snprintf(result,PGR_MESSAGE_BUFSIZE,"C%s",str); + } + if (res != NULL) + PQclear(res); + + /* set send query id */ + *(PGR_Send_Query_ID + hostNum ) = ntohl(header->query_id); + + /* + * if the query is end transaction process... + */ + check_delete_transaction(host_ptr,header); + + return STATUS_OK; +} + +static int +check_delete_transaction (HostTbl * host_ptr, ReplicateHeader * header) +{ + char *database = NULL; + + if ((host_ptr == NULL) || (header == NULL)) + { + return STATUS_ERROR; + } + database = (char *)header->dbName; + if(header->cmdSts == CMD_STS_OTHER && + header->cmdType == CMD_TYPE_CONNECTION_CLOSE) + { + notice_abort(host_ptr, header); + deleteTransactionTbl(host_ptr,header); + } + + delete_template(host_ptr, header); + return STATUS_OK; +} + +static void +check_transaction_status(ReplicateHeader * header, + TransactionTbl *transaction) +{ + if (header == (ReplicateHeader *)NULL) + { + return; + } + if (header->cmdSts == CMD_STS_TRANSACTION ) + { + if (header->cmdType == CMD_TYPE_BEGIN ) + { + if (transaction != NULL) + { + transaction->in_transaction = true; + transaction->transaction_count ++; + } + } + else if ((header->cmdType == CMD_TYPE_COMMIT) || + (header->cmdType == CMD_TYPE_ROLLBACK)) + { + if (transaction != NULL) + { + if (transaction->transaction_count > 0) + { + transaction->transaction_count --; + } + if (transaction->transaction_count == 0) + { + transaction->in_transaction = false; + } + } + } + } + else + { + if ( header->cmdType == CMD_TYPE_COPY ) + { + if (transaction != NULL) + { + transaction->exec_copy = true; + } + } + else if (header->cmdType == CMD_TYPE_COPY_DATA_END) + { + if (transaction != NULL) + { + transaction->exec_copy = false; + } + } + } +} + +static HostTbl * +check_host_transaction_status(ReplicateHeader * header, + HostTbl *host) +{ + int recovery_status = 0; + + if ((header == (ReplicateHeader *)NULL) || (host == (HostTbl 
*)NULL))
+	{
+		return NULL;
+	}
+	if (header->cmdType == CMD_TYPE_BEGIN )
+	{
+		host->transaction_count++;
+	}
+	else if ((header->cmdType == CMD_TYPE_COMMIT) ||
+			 (header->cmdType == CMD_TYPE_ROLLBACK))
+	{
+		if (host->transaction_count > 0)
+			host->transaction_count--;
+	}
+
+	recovery_status = PGRget_recovery_status();
+	if ((recovery_status == RECOVERY_PREPARE_START) &&
+		(host->transaction_count > 0))
+	{
+		PGRset_recovery_status(RECOVERY_WAIT_CLEAN);
+	}
+	else if ((recovery_status == RECOVERY_PREPARE_START) &&
+			 (host->transaction_count==0))
+	{
+		PGRset_recovery_status(RECOVERY_CLEARED);
+	}
+	else if ((recovery_status == RECOVERY_WAIT_CLEAN) &&
+			 (host->transaction_count==0))
+	{
+		PGRset_recovery_status(RECOVERY_CLEARED);
+	}
+	return host;
+}
+
+/*
+ * create_queue_file()
+ *	Open a new, uniquely named queue file under PGR_Data_Path
+ *	("<RECOVERY_QUEUE_FILE>_<sec>.<usec>", append mode), post its name
+ *	on the recovery message queue and record it in
+ *	Recovery_Status_Inf->write_file.
+ *
+ *	Returns the open stream, or NULL when the message queue is not
+ *	available, on allocation/open failure, or when msgsnd() fails (in
+ *	which case the message queue is removed and re-created).
+ */
+static FILE *
+create_queue_file(void)
+{
+	char * func = "create_queue_file()";
+	FILE * fp = NULL;
+	struct timeval tv;
+	char fname[FILENAME_MAX_LENGTH];
+	int size = 0;
+	int rtn = 0;
+	RecoveryQueueFile * msg = NULL;
+
+	if ((RecoveryMsgid == NULL) || (*RecoveryMsgid < 0))
+	{
+		return (FILE *)NULL;
+	}
+	/* create uniq file name */
+	gettimeofday(&tv,NULL);
+	memset(fname,0,sizeof(fname));
+	snprintf(fname,sizeof(fname),"%s/%s_%u.%u",
+			 PGR_Data_Path,
+			 RECOVERY_QUEUE_FILE,
+			 (uint32_t)tv.tv_sec,
+			 (uint32_t)tv.tv_usec);
+
+	size = sizeof(fname) + sizeof(RecoveryQueueFile);
+	msg = (RecoveryQueueFile *)malloc(size);
+	if (msg == NULL)
+	{
+		show_error("%s:malloc() failed. reason: %s", func, strerror(errno));
+		return (FILE *)NULL;
+	}
+	memset(msg,0,size);
+	msg->mtype = RECOVERY_FILE_MTYPE;
+	strncpy(msg->mdata,fname,sizeof(fname));
+
+	fp = fopen(fname,"a");
+	if (fp == NULL)
+	{
+		show_error("%s:fopen failed: (%s)",func,strerror(errno));
+		free(msg);
+		return (FILE *)NULL;
+	}
+
+	rtn = msgsnd(*RecoveryMsgid, msg, sizeof(fname), IPC_NOWAIT);
+	if (rtn < 0)
+	{
+		show_error("%s:msgsnd failed. reason: %s", func, strerror(errno));
+		free(msg);
+		fclose(fp);
+		msgctl(*RecoveryMsgid, IPC_RMID, NULL);
+		*RecoveryMsgid = msgget (IPC_PRIVATE, 00666 | IPC_CREAT );
+		return (FILE *)NULL;
+	}
+	/* msgsnd() has copied the message; the local buffer is no longer needed */
+	free(msg);
+
+	strncpy(Recovery_Status_Inf->write_file,fname,sizeof(Recovery_Status_Inf->write_file));
+	return fp;
+}
+
+/*
+ * add_queue_file()
+ *	Append size bytes of data to the stream in QueueFp and add size to
+ *	Recovery_Status_Inf->file_size.
+ *
+ *	When the write fails the stream is closed and a fresh queue file is
+ *	requested from create_queue_file(); this is repeated while the
+ *	retry counter does not exceed MAX_RETRY_TIMES.  On STATUS_ERROR
+ *	after a failed write QueueFp is NULL.  A size of 0 is a successful
+ *	no-op.
+ */
+static int
+add_queue_file(char * data,int size)
+{
+	int cnt = 0;
+
+	if ((QueueFp == NULL) || (data == NULL) || (size < 0))
+	{
+		return STATUS_ERROR;
+	}
+	if (size == 0)
+	{
+		/* fwrite() returns 0 for an empty item; do not mistake it for an error */
+		return STATUS_OK;
+	}
+	/* QueueFp may be NULL here when the previous retry could not open a file */
+	while ((QueueFp == NULL) || (fwrite(data, size,1,QueueFp) != 1))
+	{
+		if (QueueFp != NULL)
+		{
+			fclose(QueueFp);
+			QueueFp = NULL;
+		}
+		if (cnt > MAX_RETRY_TIMES)
+		{
+			return STATUS_ERROR;
+		}
+		QueueFp = create_queue_file();
+		if (QueueFp != NULL)
+		{
+			/* write_file now names an empty file */
+			Recovery_Status_Inf->file_size = 0;
+		}
+		cnt ++;
+	}
+	Recovery_Status_Inf->file_size += size;
+	return STATUS_OK;
+}
+
+/*
+ * PGRset_queue()
+ *	Store one replication request (header followed by query_size bytes
+ *	of query) in the current recovery queue file.
+ *
+ *	The work is done while holding semaphore SEM_NUM_OF_RECOVERY_QUEUE
+ *	of RecoverySemID.  A new queue file is started when none is
+ *	recorded in Recovery_Status_Inf->write_file or when the record
+ *	would push the file past MAX_QUEUE_FILE_SIZE; otherwise the
+ *	recorded file is reopened in append mode.  header->replicate_id is
+ *	overwritten with the current serialization id before writing.
+ *
+ *	QueueFp is always closed and reset to NULL before returning, so a
+ *	later call never closes a stale stream.
+ *
+ *	Returns STATUS_OK or STATUS_ERROR.
+ */
+int
+PGRset_queue(ReplicateHeader * header,char * query)
+{
+	char * func = "PGRset_queue()";
+	int header_size = 0;
+	int query_size = 0;
+
+	if ((Recovery_Status_Inf == NULL) || (header == NULL))
+	{
+		show_error("%s:header is null",func);
+		return STATUS_ERROR;
+	}
+
+	query_size = ntohl(header->query_size);
+	if (query_size < 0)
+	{
+		show_error("%s:query size less than 0",func);
+		return STATUS_ERROR;
+	}
+	header_size = sizeof(ReplicateHeader);
+
+	if (RecoverySemID <= 0)
+	{
+		show_error("%s:RecoverySemID is not initialized",func);
+		return STATUS_ERROR;
+	}
+	PGRsem_lock(RecoverySemID, SEM_NUM_OF_RECOVERY_QUEUE);
+	/* check existance of queue file */
+	if (Recovery_Status_Inf->write_file[0] == '\0')
+	{
+		/* create new queue file */
+		Recovery_Status_Inf->file_size = 0;
+		QueueFp = create_queue_file();
+	}
+	else
+	{
+		/* check size of queue file */
+		if (Recovery_Status_Inf->file_size + header_size + query_size > MAX_QUEUE_FILE_SIZE)
+		{
+			/* if the file size is over the limit, create new queue file */
+			memset(Recovery_Status_Inf->write_file,0,sizeof(Recovery_Status_Inf->write_file));
+			if (QueueFp != NULL)
+			{
+				fclose(QueueFp);
+				QueueFp = NULL;
+			}
+			Recovery_Status_Inf->file_size = 0;
+			QueueFp = create_queue_file();
+		}
+		else
+		{
+			QueueFp= fopen(Recovery_Status_Inf->write_file,"a");
+		}
+	}
+	if (QueueFp == (FILE *)NULL)
+	{
+		PGRsem_unlock(RecoverySemID, SEM_NUM_OF_RECOVERY_QUEUE);
+		show_error("%s:QueueFp open failed. error is %s",func,strerror(errno));
+		return STATUS_ERROR;
+	}
+	header->replicate_id = htonl(*PGR_ReplicateSerializationID);
+	if (add_queue_file((char *)header,header_size) != STATUS_OK)
+	{
+		if (QueueFp != NULL)
+		{
+			fclose(QueueFp);
+			QueueFp = NULL;
+		}
+		PGRsem_unlock(RecoverySemID, SEM_NUM_OF_RECOVERY_QUEUE);
+		show_error("%s:header add failed into queue file",func);
+		return STATUS_ERROR;
+	}
+	if (query_size > 0)
+	{
+		if (add_queue_file((char *)query,query_size) != STATUS_OK)
+		{
+			if (QueueFp != NULL)
+			{
+				fclose(QueueFp);
+				QueueFp = NULL;
+			}
+			PGRsem_unlock(RecoverySemID, SEM_NUM_OF_RECOVERY_QUEUE);
+			show_error("%s:queue add failed into queue file",func);
+			return STATUS_ERROR;
+		}
+	}
+	fflush(QueueFp);
+	fclose(QueueFp);
+	QueueFp = NULL;
+	PGRsem_unlock(RecoverySemID, SEM_NUM_OF_RECOVERY_QUEUE);
+
+	return STATUS_OK;
+}
+
+HostTbl *
+PGRget_HostTbl(char * resolvedName, int port)
+{
+	HostTbl * ptr = NULL;
+	int len = 0;
+
+	if (Host_Tbl_Begin == NULL)
+	{
+		return NULL;
+	}
+	len = strlen(resolvedName);
+	ptr = Host_Tbl_Begin;
+	if (len > sizeof(ptr->resolvedName))
+	{
+		len = sizeof(ptr->resolvedName);
+	}
+	while(ptr->useFlag != DB_TBL_END)
+	{
+		if ((!
memcmp(ptr->resolvedName,resolvedName,len)) && + (ptr->port == port)) + { + return ptr; + } + ptr ++; + } + return (HostTbl*)NULL; +} + +static void +sem_quit(int semid) +{ + semctl(semid, 0, IPC_RMID); +} + +void +PGRclear_connections(void) +{ + Dlelem *ptr = NULL; + + pthread_mutex_lock(&transaction_table_mutex); + ptr = DLGetHead(Transaction_Tbl_Begin); + while (ptr) + { + TransactionTbl *transaction = DLE_VAL(ptr); + if (transaction->conn != NULL) + { + PQfinish(transaction->conn); + transaction->conn = NULL; + } + ptr = DLGetSucc(ptr); + } + pthread_mutex_unlock(&transaction_table_mutex); +} + +void +PGRdestroy_transaction_table(void) +{ + Dlelem *ptr = NULL, *next; + pthread_mutex_lock(&transaction_table_mutex); + ptr = DLGetHead(Transaction_Tbl_Begin); + while (ptr) + { + next = DLGetSucc(ptr); + DLRemove(ptr); + DLFreeElem(ptr); + ptr = next; + } + DLFreeList(Transaction_Tbl_Begin); + Transaction_Tbl_Begin = NULL; + pthread_mutex_unlock(&transaction_table_mutex); +} + +static bool +is_need_sync_time(ReplicateHeader * header) +{ + bool rtn = false; + + if (header->cmdSts == CMD_STS_PREPARE) + { + rtn = false; + } + else if ((header->cmdType == CMD_TYPE_COPY) || + (header->cmdType == CMD_TYPE_COPY_DATA) || + (header->cmdType == CMD_TYPE_COPY_DATA_END)) + { + rtn = false; + } + if ((header->cmdSts == CMD_STS_QUERY ) && + ((header->cmdType == CMD_TYPE_INSERT) || + (header->cmdType == CMD_TYPE_UPDATE) || + (header->cmdType == CMD_TYPE_DELETE) || + (header->cmdType == CMD_TYPE_SET) || + (header->cmdType == CMD_TYPE_EXECUTE))) + { + rtn = true; + } + else + { + if ((header->cmdType == CMD_TYPE_COPY) || + (header->cmdType == CMD_TYPE_SELECT) || + (header->cmdType == CMD_TYPE_VACUUM) || + (header->cmdType == CMD_TYPE_ANALYZE) || + (header->cmdType == CMD_TYPE_BEGIN)) + { + rtn = true; + } + if ((header->cmdSts == CMD_STS_TRANSACTION ) && + (header->cmdType != CMD_TYPE_BEGIN)) + { + rtn = false; + } + } + return rtn; +} + +static bool 
+is_need_wait_answer(ReplicateHeader * header) +{ + bool rtn = false; + + if (header->cmdSts == CMD_STS_PREPARE) + { + rtn = false; + } + else if ((header->cmdType == CMD_TYPE_COPY) || + (header->cmdType == CMD_TYPE_COPY_DATA) || + (header->cmdType == CMD_TYPE_COPY_DATA_END)) + { + rtn = false; + } + else if ((header->cmdSts == CMD_STS_QUERY ) && + ((header->cmdType == CMD_TYPE_INSERT) || + (header->cmdType == CMD_TYPE_UPDATE) || + (header->cmdType == CMD_TYPE_DELETE) || + (header->cmdType == CMD_TYPE_VACUUM) || + (header->cmdType == CMD_TYPE_ANALYZE) || + (header->cmdType == CMD_TYPE_EXECUTE))) + { + rtn = true; + } + else if ((header->cmdSts == CMD_STS_TRANSACTION ) || + (header->cmdSts == CMD_STS_SET_SESSION_AUTHORIZATION ) || + (header->cmdSts == CMD_STS_TEMP_TABLE ) || + (header->cmdType == CMD_TYPE_SELECT)) + { + rtn = true; + } + + return rtn; +} + +static void +delete_template(HostTbl * ptr, ReplicateHeader * header) +{ + if ((ptr == (HostTbl *)NULL ) || + (header == (ReplicateHeader *)NULL) ) + { + return; + } + + if ((! strncmp(header->dbName,"template1",9)) || + (! 
strncmp(header->dbName,"template0",9))) + { + if ((header->cmdSts != CMD_STS_TRANSACTION ) && + ( header->cmdSts != CMD_STS_SET_SESSION_AUTHORIZATION ) && + ( header->cmdSts != CMD_STS_TEMP_TABLE )) + { + deleteTransactionTbl(ptr,header); + } + } +} + +/*-------------------------------------------------------------------- + * SYMBOL + * check_copy_command() + * NOTES + * check the query which it is copy command or not + * when the query is 'copy from', set 'stdin' after 'from' + * ARGS + * char * query: query strings(I) + * RETURN + * copy command : changed copy command + * other command : NULL + *-------------------------------------------------------------------- + */ +static char * +check_copy_command(char * query) +{ + char * p; + char * p1, *p2, *wp; + char * buf; + int size; + + if (query == NULL) + return NULL; + size = strlen(query) + strlen(" stdin "); + p = p1 = query; + wp = strstr(p,"FROM"); + if (wp == NULL) + wp = strstr(p,"from"); + + if (wp != NULL) + { + p = wp + strlen("FROM"); + *p = '\0'; + p ++; + while ((isspace(*p)) && (*p != '\0')) p++; + while ((!isspace(*p)) && (*p != '\0')) p++; + p2 = p; + buf = malloc(size); + if (buf == NULL) + { + return NULL; + } + snprintf(buf,size,"%s stdin %s",p1,p2); + return buf; + } + return NULL; +} + +static int +next_replication_id(void) +{ + char * func = "next_replication_id()"; + + if (Recovery_Status_Inf == (RecoveryStatusInf *)NULL) + { + show_error("%s: Recovery_Status_Inf is NULL",func); + return -1; + } + Recovery_Status_Inf->replication_id ++; + Recovery_Status_Inf->check_point --; + return (Recovery_Status_Inf->replication_id); +} + +static void +check_replication_id(void) +{ + char * func = "check_replication_id()"; + + if (Recovery_Status_Inf == (RecoveryStatusInf *)NULL) + { + show_error("%s: Recovery_Status_Inf is NULL",func); + return ; + } + if (Recovery_Status_Inf->check_point < 0) + { + Recovery_Status_Inf->check_point = PGR_CHECK_POINT ; + rewind(RidFp); + 
PGRwrite_log_file(RidFp,"%u",Recovery_Status_Inf->replication_id + PGR_CHECK_POINT ); + } +} + +int +PGRset_replication_id(uint32_t id) +{ + Recovery_Status_Inf->replication_id = id; + return (Recovery_Status_Inf->replication_id); +} + +int +PGRdo_replicate(int sock,ReplicateHeader *header, char * query) +{ + + char * func = "PGRdo_replicate()"; + + struct timeval tv; + int status = STATUS_OK; + int recovery_status = 0; + char * query_string = NULL; + + if (header->cmdType == CMD_TYPE_COPY) + { + query_string = check_copy_command(query); + if (query_string == NULL) + { + return LOOP_CONTINUE; + } + } + else + { + query_string = query; + if (header->cmdType == CMD_TYPE_SET) + { + if (is_autocommit_off(query_string) == true) + { + PGR_AutoCommit = false; + } + else if (is_autocommit_on(query_string) == true) + { + PGR_AutoCommit = true; + } + } + } + header->isAutoCommit=PGR_AutoCommit ? 1 : 0; + gettimeofday(&tv,NULL); + header->tv.tv_sec = htonl(tv.tv_sec); + header->tv.tv_usec = htonl(tv.tv_usec); +#ifdef PRINT_DEBUG + show_debug("%s:query :: %s",func,query_string); +#endif + + /* set query id */ + header->query_id = htonl(PGRget_next_query_id()); + + /* save header for logging */ + if (is_need_sync_time(header) == true) + { + if (PGR_Log_Header != NULL) + { + memcpy(PGR_Log_Header,header,sizeof(ReplicateHeader)); + if (header->rlog == 0) + { + PGR_Log_Header->replicate_id = htonl(next_replication_id()); + } + } + } + /* check rlog */ + if (header->rlog == CONNECTION_SUSPENDED_TYPE ) + { + if (PGRget_rlog_header(header) == STATUS_OK) + { + header->rlog = CONNECTION_SUSPENDED_TYPE; + + } + } + + /* check recovery mode */ + + recovery_status = PGRget_recovery_status(); + PGRcheck_recovered_host(); + + /* send replication packet */ + status = PGRreplicate_packet_send( header,query_string,sock,recovery_status); + + if ((header->cmdType == CMD_TYPE_COPY) && + (query_string != NULL)) + { + free(query_string); + query_string = NULL; + } + + if (status == STATUS_ABORTED ) 
+ { +#ifdef PRINT_DEBUG + show_debug("%s:status is STATUS_ABORTED",func); +#endif + return LOOP_END; + } + if (status == STATUS_DEADLOCK_DETECT) + { +#ifdef PRINT_DEBUG + show_debug("%s:status is STATUS_DEADLOCK_DETECT",func); +#endif + return LOOP_END; + } + return LOOP_CONTINUE; +} + +/*-------------------------------------------------------------------- + * SYMBOL + * PGRreturn_result() + * NOTES + * Return result of execution + * ARGS + * int dest: socket of destination server (I) + * char *result: result data(I) + * int wait: wait flag (I) + * RETURN + * OK: STATUS_OK + * NG: STATUS_ERROR + * NG: STATUS_LOCK_CONFLICT + * NG: STATUS_DEADLOCK_DETECT + *-------------------------------------------------------------------- + */ +int +PGRreturn_result(int dest, char * result, int wait) +{ + char * func = "PGRreturn_result()"; + fd_set wmask; + struct timeval timeout; + int rtn = 0; + char * send_ptr = NULL; + int send_size= 0; + int buf_size = 0; + int s = 0; + int status = 0; + int flag = 0; + + if (result == NULL) + { + show_error("%s:result is not initialize",func); + return STATUS_ERROR; + } + if (dest < 0) + { + return STATUS_ERROR; + } + send_ptr = result; + buf_size = PGR_MESSAGE_BUFSIZE; + if (buf_size < 1) + buf_size = 1; + + /* + * Wait for something to happen. 
+ */ +#ifdef MSG_DONTWAIT + flag |= MSG_DONTWAIT; +#endif +#ifdef MSG_NOSIGNAL + flag |= MSG_NOSIGNAL; +#endif + + for (;;) + { + timeout.tv_sec = PGR_Replication_Timeout; + timeout.tv_usec = 0; + + FD_ZERO(&wmask); + FD_SET(dest,&wmask); + + rtn = select(dest+1, (fd_set *)NULL, &wmask, (fd_set *)NULL, &timeout); + if (rtn < 0) + { + if (errno == EINTR || errno == EAGAIN) + continue; + + show_error("%s:select failed ,errno is %s",func , strerror(errno)); + return STATUS_ERROR; + } + else if (rtn && FD_ISSET(dest, &wmask)) + { + s = send(dest,send_ptr + send_size,buf_size - send_size ,flag); + if (s < 0) + { + if (errno == EINTR || errno == EAGAIN) + continue; + else + { + show_error("%s:send error: %d(%s)", func, errno, strerror(errno)); + memset(send_ptr, 0, PGR_MESSAGE_BUFSIZE); + return STATUS_ERROR; + } + } + else if (s > 0) + { + send_size += s; + if (send_size == buf_size) + { + + status = STATUS_OK; + if (wait == PGR_WAIT_ANSWER) + { + status = read_answer(dest); + } + return status; + } + } + else /* s == 0 */ + { + show_error("%s:unexpected EOF", func); + memset(send_ptr, 0, PGR_MESSAGE_BUFSIZE); + return STATUS_ERROR; + } + } + } + memset(send_ptr, 0, PGR_MESSAGE_BUFSIZE); + return STATUS_ERROR; +} + +/*-------------------------------------------------------------------- + * SYMBOL + * read_answer() + * NOTES + * Receive answer packet + * ARGS + * int dest: socket of destination server (I) + * RETURN + * OK: STATUS_OK + * NG: STATUS_ERROR + * NG: STATUS_LOCK_CONFLICT + * NG: STATUS_DEADLOCK_DETECT + *-------------------------------------------------------------------- + */ +static int +read_answer(int dest) +{ + char * func = "read_answer()"; + fd_set rmask; + struct timeval timeout; + int rtn; + ReplicateHeader header; + char * answer = NULL; + int status = STATUS_ERROR; + + for(;;) + { + if (answer != NULL) + { + free(answer); + answer = NULL; + } + timeout.tv_sec = PGR_Replication_Timeout; + timeout.tv_usec = 0; + FD_ZERO(&rmask); + 
FD_SET(dest,&rmask); + rtn = select(dest+1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout); + if (rtn < 0) + { + if (errno == EINTR || errno == EAGAIN) + continue; + + show_error("%s:select failed ,errno is %s",func , strerror(errno)); + return STATUS_ERROR; + } + else if (rtn && FD_ISSET(dest, &rmask)) + { + memset(&header,0,sizeof(ReplicateHeader)); + answer = PGRread_packet(dest,&header); + if (answer == NULL) + { + status = STATUS_ERROR; + break; + } + if ((header.cmdSts != CMD_STS_RESPONSE) && + (header.cmdSts != CMD_STS_NOTICE)) + { + show_error("%s:none response packet received",func); + free(answer); + answer = NULL; + status = STATUS_ERROR; + break; + } +#ifdef PRINT_DEBUG + show_debug("%s:answer[%s]",func,answer); +#endif + if (answer != NULL) + { + if (!strncasecmp(answer,PGR_QUERY_DONE_NOTICE_CMD,strlen(PGR_QUERY_DONE_NOTICE_CMD))) + { +#ifdef PRINT_DEBUG + show_debug("%s:QUERY DONE",func); +#endif + status = STATUS_OK; + } + else if (!strncasecmp(answer,PGR_QUERY_ABORTED_NOTICE_CMD,strlen(PGR_QUERY_ABORTED_NOTICE_CMD))) + { +#ifdef PRINT_DEBUG + show_debug("%s:QUERY ABORTED",func); +#endif + status = STATUS_ABORTED; + } + else if (!strncasecmp(answer,PGR_LOCK_CONFLICT_NOTICE_CMD,strlen(PGR_LOCK_CONFLICT_NOTICE_CMD))) + { +#ifdef PRINT_DEBUG + show_debug("%s:LOCK CONFLICT !!",func); +#endif + status = STATUS_LOCK_CONFLICT; + } + else if (!strncasecmp(answer,PGR_DEADLOCK_DETECT_NOTICE_CMD,strlen(PGR_DEADLOCK_DETECT_NOTICE_CMD))) + { +#ifdef PRINT_DEBUG + show_debug("%s:DEADLOCK DETECT !!",func); +#endif + status = STATUS_DEADLOCK_DETECT; + } + free(answer); + answer = NULL; + } + return status; + } + } + return status; +} + +/*-------------------------------------------------- + * SYMBOL + * PGRreplicate_packet_send() + * NOTES + * Send query to each cluster DB servers and return result. 
+ *  ARGS
+ *    ReplicateHeader * header : packet header (I)
+ *    char * query : query for replication (I)
+ *    int dest : destination socket for return result (I)
+ *  RETURN
+ *    OK : STATUS_OK
+ *    NG : STATUS_ERROR
+ *    DEADLOCK : STATUS_DEADLOCK_DETECT
+ *---------------------------------------------------
+ */
+int
+PGRreplicate_packet_send( ReplicateHeader * header, char * query,int dest,int recovery_status) {
+	return replicate_packet_send_internal(header,query,dest,recovery_status,false);
+}
+
+
+/*--------------------------------------------------
+ * SYMBOL
+ *     replicate_packet_send_internal()
+ * NOTES
+ *     Walk the host table, start one thread_send_cluster() thread for
+ *     every usable cluster DB other than the source host, wait for all
+ *     of them, and answer the source host through thread_send_source().
+ *     For a SELECT that is not a PREPARE the source host is answered
+ *     before the other threads are joined.
+ *     Unless isHeldLock is set, the work is done while holding the
+ *     replication semaphore (SemID, or VacuumSemID when
+ *     is_need_queue_jump() says so).
+ * ARGS
+ *     ReplicateHeader * header : packet header (I)
+ *     char * query : query for replication (I)
+ *     int dest : destination socket for return result (I)
+ *     int recovery_status : passed to is_master_in_recovery() (I)
+ *     bool isHeldLock : true when the caller already holds the
+ *                       semaphore, so it is neither taken nor released (I)
+ * RETURN
+ *     STATUS_OK : the host table was processed
+ *     STATUS_ERROR : no host table is registered
+ *     STATUS_SKIP_REPLICATE : rlog query not yet executed in the origin
+ *---------------------------------------------------
+ */
+int
+replicate_packet_send_internal(ReplicateHeader * header, char * query,int dest,int recovery_status,bool isHeldLock)
+{
+	char * func = "replicate_packet_send_internal()";
+	HostTbl * host_ptr = (HostTbl*)NULL;
+	int status = STATUS_OK;
+	int sem_cnt = 0;
+	int sem_id = 0;
+	char *database = NULL;
+	char port[8];
+	char *userName = NULL;
+	char *password = NULL;
+	char * md5Salt = NULL;
+	char * cryptSalt = NULL;
+	char * host = NULL;
+
+	pthread_attr_t attr;
+	int rc = 0;
+	int t = 0;
+	int t_cnt = 0;
+	int source_t_cnt = -1;	/* slot of the source host, -1 if none matched */
+	int transaction_count = 0;
+	int *results_from_thread = NULL;
+	bool reliable_mode = true;
+
+	pthread_t thread[MAX_DB_SERVER];
+	ThreadArgInf thread_arg[MAX_DB_SERVER];
+	bool thread_started[MAX_DB_SERVER];	/* true only if pthread_create succeeded */
+
+
+#ifdef PRINT_DEBUG
+	show_debug("cmdSts=%c",header->cmdSts);
+	if(header->cmdType!='\0')
+		show_debug("cmdType=%c",header->cmdType);
+	show_debug("rlog=%d",header->rlog);
+	show_debug("port=%d",ntohs(header->port));
+	show_debug("pid=%d",ntohs(header->pid));
+	show_debug("from_host=%s",header->from_host);
+	show_debug("dbName=%s",header->dbName);
+	show_debug("userName=%s",header->userName);
+	show_debug("recieve sec=%u",ntohl(header->tv.tv_sec));
+	show_debug("recieve usec=%u",ntohl(header->tv.tv_usec));
+	show_debug("query_size=%d",ntohl(header->query_size));
+	show_debug("request_id=%d",ntohl(header->request_id));
+	show_debug("replicate_id=%d",ntohl(header->replicate_id));
+	show_debug("recovery_status=%d",recovery_status);
+	if (header->cmdSts != CMD_STS_PREPARE)
+		show_debug("query=%s",query);
+
+#endif
+
+	/* check rlog type */
+	if (header->rlog == FROM_R_LOG_TYPE)
+	{
+		if (is_executed_query_in_origin(header) == false)
+		{
+#ifdef PRINT_DEBUG
+			show_debug("this query is not yet done in source cluster db. so it wait for receive re-replicate request");
+#endif
+			/* wait re-replicate request */
+			return STATUS_SKIP_REPLICATE;
+		}
+	}
+	/*
+	 * loop while registrated cluster DB exist
+	 */
+	if (Host_Tbl_Begin == NULL)
+	{
+		return STATUS_ERROR;
+	}
+	host_ptr = Host_Tbl_Begin;
+	PGR_Response_Inf->current_cluster = 0;
+	sem_cnt = 1;
+
+	if (is_need_queue_jump(header,query) == false)
+	{
+		sem_id = SemID;
+	}
+	else
+	{
+		sem_id = VacuumSemID;
+	}
+	if(!isHeldLock) {
+#ifdef PRINT_DEBUG
+		show_debug("sem_lock [%d] req",sem_cnt);
+#endif
+
+		PGRsem_lock(sem_id,sem_cnt);
+#ifdef PRINT_DEBUG
+		show_debug("sem_lock [%d] got it",sem_cnt);
+#endif
+	}
+	/* NOTE: no return is allowed from here on until the semaphore is released */
+	++*PGR_ReplicateSerializationID;
+
+	/* set replication log */
+	if (is_need_use_rlog(header) == true)
+	{
+		PGRset_rlog(header,query);
+	}
+
+	pthread_attr_init(&attr);
+	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+	PGR_Response_Inf->current_cluster = 0;
+	t_cnt = 0;
+	while(host_ptr->useFlag != DB_TBL_END)
+	{
+		/*
+		 * check the status of the cluster DB
+		 */
+		if ((host_ptr->useFlag != DB_TBL_USE) &&
+			(host_ptr->useFlag != DB_TBL_INIT))
+		{
+			host_ptr ++;
+			continue;
+		}
+		/*
+		 * skip loop during recover and the host name is master DB
+		 */
+		if (is_master_in_recovery(host_ptr->hostName, host_ptr->port,recovery_status) == true)
+		{
+			if (PGRset_queue(header,query) != STATUS_OK)
+			{
+				show_error("%s:failed to put query to queue.abort to recovery",func);
+				PGRset_recovery_status(RECOVERY_INIT);
+			}
+#ifdef PRINT_DEBUG
+			show_debug("%s master is using for recovery",func);
+#endif
+			host_ptr ++;
+			continue;
+		}
+		/*
+		 * thread[], thread_arg[] and thread_started[] hold MAX_DB_SERVER
+		 * entries; never write past them
+		 */
+		if (t_cnt >= MAX_DB_SERVER)
+		{
+			show_error("%s:more than %d cluster servers, remaining servers are skipped",func,MAX_DB_SERVER);
+			break;
+		}
+		host_ptr = check_host_transaction_status(header, host_ptr);
+		/*
+		 * compare with the host name and the exceptional host name
+		 */
+		thread_arg[t_cnt].header = header;
+		thread_arg[t_cnt].query = query;
+		thread_arg[t_cnt].dest = dest;
+		thread_arg[t_cnt].host_ptr = host_ptr;
+		thread_arg[t_cnt].current_cluster = t_cnt;
+		thread_arg[t_cnt].transaction_tbl = (TransactionTbl *)NULL;
+		/* thread_send_source() reads this field; do not leave it as stack garbage */
+		thread_arg[t_cnt].transaction_count = 0;
+		thread_started[t_cnt] = false;
+
+		if (PGRis_same_host(header->from_host,ntohs(header->port),host_ptr->resolvedName, host_ptr->port) == true)
+		{
+#ifdef PRINT_DEBUG
+			show_debug("source host");
+#endif
+			/* replication to source cluster db (no thread is started for it) */
+			source_t_cnt = t_cnt;
+
+			if (header->rlog == FROM_R_LOG_TYPE )
+			{
+#ifdef PRINT_DEBUG
+				show_debug("%s: This simple query was suspended. Therefore this query is not re-replicated to source cluster db.",func);
+#endif
+			}
+			check_transaction_status(header, thread_arg[t_cnt].transaction_tbl);
+			t_cnt++;
+		}
+		/* replication to other cluster db */
+		else
+		{
+			if ((header->rlog == CONNECTION_SUSPENDED_TYPE ) &&
+				(header->cmdSts == CMD_STS_TRANSACTION) )
+			{
+#ifdef PRINT_DEBUG
+				show_debug("%s: This transaction query was suspended. Therefore this query is not replicated to other cluster dbs.",func);
+#endif
+			}
+			else
+			{
+				/*
+				 * get the transaction table data
+				 * it has the connection data with each cluster DB
+				 */
+				thread_arg[t_cnt].transaction_tbl = getTransactionTbl(host_ptr,header);
+				/*
+				 * if the transaction process is new one,
+				 * create connection data and add the transaction table
+				 */
+				if (thread_arg[t_cnt].transaction_tbl == (TransactionTbl *)NULL)
+				{
+					thread_arg[t_cnt].transaction_tbl = setTransactionTbl(host_ptr, header);
+					if (thread_arg[t_cnt].transaction_tbl == (TransactionTbl *)NULL)
+					{
+						show_error("%s:setTransactionTbl failed",func);
+						if ( header->cmdSts != CMD_STS_NOTICE )
+						{
+							PGRset_host_status(host_ptr,DB_TBL_ERROR);
+						}
+						host_ptr ++;
+						continue;
+					}
+					StartReplication[t_cnt] = true;
+				}
+				else
+				{
+					/*
+					 * re-use the connection data
+					 */
+					if ((thread_arg[t_cnt].transaction_tbl->conn != (PGconn *)NULL) &&
+						(thread_arg[t_cnt].transaction_tbl->conn->sock > 0))
+					{
+						StartReplication[t_cnt] = false;
+					}
+					else
+					{
+						/* drop the stale connection before opening a new one */
+						if (thread_arg[t_cnt].transaction_tbl->conn != (PGconn *)NULL)
+						{
+							PQfinish(thread_arg[t_cnt].transaction_tbl->conn);
+							thread_arg[t_cnt].transaction_tbl->conn = NULL;
+						}
+
+						database = (char *)(header->dbName);
+						snprintf(port,sizeof(port),"%d", host_ptr->port);
+						userName = (char *)(header->userName);
+						password = (char *)(header->password);
+						md5Salt = (char *)(header->md5Salt);
+						cryptSalt = (char *)(header->cryptSalt);
+						host = (char *)(host_ptr->hostName);
+
+						thread_arg[t_cnt].transaction_tbl->conn = PGRcreateConn(host,port,database,userName,password,md5Salt,cryptSalt);
+						StartReplication[t_cnt] = true;
+#ifdef PRINT_DEBUG
+						show_debug("%s:connect db:%s port:%s user:%s host:%s query:%s",
+							func, database,port,userName,host,query);
+#endif
+					}
+				}
+				check_transaction_status(header, thread_arg[t_cnt].transaction_tbl);
+				transaction_count = thread_arg[t_cnt].transaction_tbl->transaction_count;
+				rc = pthread_create(&thread[t_cnt], &attr, thread_send_cluster, (void*)&thread_arg[t_cnt]);
+
+				if (rc)
+				{
+					/* the slot stays counted but must not be joined later */
+					show_error("pthread_create error");
+				}
+				else
+				{
+					thread_started[t_cnt] = true;
+				}
+				t_cnt++;
+			}
+		}
+		/*
+		 * send replication query to each cluster server
+		 */
+		if (host_ptr->useFlag != DB_TBL_USE)
+		{
+			PGRset_host_status(host_ptr,DB_TBL_USE);
+		}
+
+		host_ptr++;
+		PGR_Response_Inf->current_cluster ++;
+		status = STATUS_OK;
+	}
+
+	/* When the query is SELECT, source cluster would not need to wait other cluster's result */
+	if ((header->cmdType == CMD_TYPE_SELECT) && (header->cmdSts != CMD_STS_PREPARE))
+	{
+		/* source_t_cnt stays -1 when the sender is not in the host table */
+		if (source_t_cnt >= 0)
+		{
+			thread_send_source( (void*)&thread_arg[source_t_cnt]);
+		}
+		reliable_mode = false;
+	}
+
+	pthread_attr_destroy(&attr);
+
+	if (t_cnt > 0)
+	{
+		results_from_thread = malloc(t_cnt * sizeof(int));
+		if (results_from_thread == NULL)
+		{
+			/* keep going: the threads still have to be joined */
+			show_error("%s:malloc failed: (%s)",func,strerror(errno));
+		}
+	}
+	for ( t = 0 ; t < t_cnt; t++ )
+	{
+		void *thread_ret = NULL;
+		/* the source slot has no thread; compare_results() is given its index */
+		int thread_status = STATUS_OK;
+
+		if (t != source_t_cnt)
+		{
+			if (thread_started[t] == false)
+			{
+				thread_status = STATUS_ERROR;
+			}
+			else
+			{
+				/*
+				 * pthread_join() reports failure through its return value
+				 * (not errno) and stores a full pointer, so it must not be
+				 * handed the address of an int. A joined thread is already
+				 * released; it must not be detached afterwards.
+				 */
+				rc = pthread_join(thread[t], &thread_ret);
+				if (rc != 0)
+				{
+					show_error("%s:pthread_join failed: (%s)",func,strerror(rc));
+					thread_status = STATUS_ERROR;
+				}
+				else
+				{
+					/* thread_send_cluster() exits with an int cast to a pointer */
+					thread_status = (int)(long)thread_ret;
+				}
+			}
+		}
+		if (results_from_thread != NULL)
+		{
+			results_from_thread[t] = thread_status;
+		}
+	}
+
+	if (results_from_thread != NULL)
+	{
+		if (compare_results(results_from_thread, t_cnt, source_t_cnt) == false)
+			show_error("query results discrepancy between cluster servers: %s", query);
+		free(results_from_thread);
+	}
+
+	/*
+	 * send replication query to source cluster server.
+	 */
+	if (source_t_cnt >= 0)
+	{
+		thread_arg[source_t_cnt].transaction_count = transaction_count;
+		if (reliable_mode == true)
+		{
+			thread_send_source( (void*)&thread_arg[source_t_cnt]);
+		}
+	}
+	/* unset replication log */
+	if (is_need_use_rlog(header) == true)
+	{
+		PGRunset_rlog(header,query);
+	}
+
+	check_replication_id();
+	if (header->cmdSts == CMD_STS_PREPARE)
+	{
+		if (header->cmdType != CMD_TYPE_P_SYNC)
+		{
+			if (PGR_Parse_Session_Started == false)
+			{
+				PGR_Parse_Session_Started = true;
+			}
+		}
+	}
+	else
+	{
+		PGR_Parse_Session_Started = false;
+	}
+
+	if(!isHeldLock) {
+#ifdef PRINT_DEBUG
+		show_debug("sem_unlock[%d]",sem_cnt);
+#endif
+		PGRsem_unlock(sem_id,sem_cnt);
+	}
+
+	return status;
+}
+
+static void *
+thread_send_source(void * arg)
+{
+	char * func = "thread_send_source()";
+	ThreadArgInf * thread_arg = NULL;
+	ReplicateHeader * header = (ReplicateHeader*)NULL;
+	char * query = NULL;
+	int dest = 0;
+	HostTbl * host_ptr = (HostTbl*)NULL;
+	int status = STATUS_OK;
+	int transaction_count = 0;
+	char result[PGR_MESSAGE_BUFSIZE];
+	bool sync_command_flg = false;
+
+	if (arg == NULL)
+	{
+		show_error("%s:arg is NULL",func);
+		status = STATUS_ERROR;
+		pthread_exit((void *) status);
+	}
+	thread_arg = (ThreadArgInf *)arg;
+	header = thread_arg->header;
+	query = thread_arg->query;
+	dest = thread_arg->dest;
+	host_ptr = thread_arg->host_ptr;
+	transaction_count = thread_arg->transaction_count;
+
+	if(header->cmdSts==CMD_STS_OTHER &&
+		header->cmdType==CMD_TYPE_CONNECTION_CLOSE)
+	{
+		return (void *)0;
+	}
+
+	if (header->rlog == FROM_R_LOG_TYPE )
+	{
+		/* It is not necessary to return rlog to source DB. */
+#ifdef PRINT_DEBUG
+		show_debug("%s: It is not necessary to return rlog to source DB",func);
+#endif
+		status = STATUS_OK;
+		return (void *)status;
+	}
+
+	/**
+	 * NOTE:
+	 * We can use PGR_ReplicateSerializationID here , because
+	 * all queries from cluster server isn't recovery query.
+ * + */ + if (is_need_sync_time(header) == true) + { + if (transaction_count >1 ) + { + sync_command_flg = false; + } + else + { + sync_command_flg = true; + } + } + if (sync_command_flg == true) + { + snprintf(result,PGR_MESSAGE_BUFSIZE, + "%d,%u,%u,%u,%d,%u", + PGR_SET_CURRENT_TIME_FUNC_NO, + (unsigned int)ntohl(header->tv.tv_sec), + (unsigned int)ntohl(header->tv.tv_usec), + (unsigned int)ntohl(PGR_Log_Header->replicate_id), + PGR_Response_Inf->response_mode, + *PGR_ReplicateSerializationID); + } + else + { + snprintf(result,PGR_MESSAGE_BUFSIZE, + "%d,%u,%u,%d", + PGR_SET_CURRENT_REPLICATION_QUERY_ID_NO, + *PGR_ReplicateSerializationID, + 0, + PGR_Response_Inf->response_mode); + } + /* execute query in the exceptional host */ + /* it is not use replication */ + if (is_need_wait_answer(header) == true) + { + status = PGRreturn_result(dest,result, PGR_WAIT_ANSWER); + } + else + { + status = PGRreturn_result(dest, result, PGR_NOWAIT_ANSWER); + } + + /* + if (status == STATUS_ERROR ) + { + show_error("%s: %s[%d] should be down ",func,host_ptr->hostName,host_ptr->port); + PGRset_host_status(host_ptr,DB_TBL_ERROR); + } + */ + + /* delete server table when query use template db */ + if (PGR_Response_Inf->response_mode != PGR_RELIABLE_MODE) + { + delete_template(host_ptr,header); + } +#ifdef PRINT_DEBUG + show_debug("end thread_send_source()"); +#endif + return (void *)0; +} + +static void * +thread_send_cluster(void * arg) +{ + char * func = "thread_send_cluster()"; + ThreadArgInf * thread_arg = NULL; + ReplicateHeader * header = (ReplicateHeader*)NULL; + char * query = NULL; + int dest = 0; + HostTbl * host_ptr = (HostTbl*)NULL; + int rtn = 0; + int status = STATUS_OK; + TransactionTbl * transaction_tbl = (TransactionTbl *)NULL; + int current_cluster = 0; + char result[PGR_MESSAGE_BUFSIZE]; + +#ifdef PRINT_DEBUG + show_debug("start thread_send_cluster()"); +#endif + if (arg == NULL) + { + show_error("%s:arg is NULL",func); + status = STATUS_ERROR; + 
pthread_exit((void *) status); + } + + thread_arg = (ThreadArgInf *)arg; + header = thread_arg->header; + query = thread_arg->query; + dest = thread_arg->dest; + host_ptr = thread_arg->host_ptr; + transaction_tbl = thread_arg->transaction_tbl; + current_cluster = thread_arg->current_cluster; + + + if(header->cmdSts==CMD_STS_OTHER && + header->cmdType==CMD_TYPE_CONNECTION_CLOSE) + { + check_delete_transaction(host_ptr, header); + return (void *)0; + } + + rtn = send_replicate_packet_to_server( transaction_tbl, current_cluster, host_ptr, header, query , result,*PGR_ReplicateSerializationID, false); + +#ifdef PRINT_DEBUG + show_debug("%s:return value from send_replicate_packet_to_server() is %d",func,rtn); +#endif + if (rtn == STATUS_ABORTED) + { + snprintf(result,PGR_MESSAGE_BUFSIZE,"%d", PGR_NOTICE_ABORT_FUNC_NO); + status = PGRreturn_result(dest, result, PGR_NOWAIT_ANSWER); + status = STATUS_ABORTED; + pthread_exit((void *) status); + } + /* delete server table when query use template db */ + delete_template(host_ptr,header); +#ifdef PRINT_DEBUG + show_debug("%s:pthread_exit[%d]",func,current_cluster ); +#endif + + pthread_exit((void *) rtn); +} + +/*-------------------------------------------------- + * SYMBOL + * PGRreplicate_packet_send_each_server() + * NOTES + * Send query to a cluster DB server and return result. 
+ * ARGS + * HostTbl * ptr : cluster server info table (I) + * bool return_response : flag for return result(I) + * ReplicateHeader * header: header data (I) + * char * query : query data (I) + * int dest : socket of destination server(I) + * RETURN + * OK : STATUS_OK + * NG : STATUS_ERROR + *--------------------------------------------------- + */ +int +PGRreplicate_packet_send_each_server( HostTbl * ptr, bool return_response, ReplicateHeader * header, char * query,int dest) +{ + char * func = "PGRreplicate_packet_send_each_server()"; + char * host; + int rtn; + + host = ptr->hostName; + /* + * send query to cluster DB + */ + if (PGR_Result == NULL) + { + show_error("%s:PGR_Result is not initialize",func); + return STATUS_ERROR; + } + + rtn = PGRsend_replicate_packet_to_server( ptr, header,query,PGR_Result, dest, false); + + return rtn; +} + +/*-------------------------------------------------- + * SYMBOL + * PGRread_packet() + * NOTES + * Read packet data and send the query to each cluster DB. + * The packet data has header data and query data. + * ARGS + * int sock : socket (I) + * ReplicateHeader *header : header data (O) + * RETURN + * OK: pointer of read query + * NG: NULL + *--------------------------------------------------- + */ +char * +PGRread_packet(int sock, ReplicateHeader *header) +{ + char * func = "PGRread_packet()"; + int r =0; + int cnt = 0; + char * read_ptr = NULL; + int read_size = 0; + int header_size = 0; + char * query = NULL; + fd_set rmask; + struct timeval timeout; + int rtn; + + if (header == NULL) + { + return NULL; + } + memset(header,0,sizeof(ReplicateHeader)); + read_ptr = (char*)header; + header_size = sizeof(ReplicateHeader); + cnt = 0; + + for (;;){ + /* + * read header data + */ + + timeout.tv_sec = 1; + timeout.tv_usec = 0; + + /* + * Wait for something to happen. 
+ */ + FD_ZERO(&rmask); + FD_SET(sock,&rmask); + rtn = select(sock+1, &rmask, (fd_set *)NULL,(fd_set *)NULL, &timeout); + + if (rtn < 0) + { + if (errno == EINTR || errno == EAGAIN) + continue; + + show_error("%s:select failed ,errno is %s",func , strerror(errno)); + return NULL; + } + + if (rtn && FD_ISSET(sock, &rmask)) + { + r = recv(sock,read_ptr + read_size ,header_size - read_size, MSG_WAITALL); + /* + r = recv(sock,read_ptr + read_size ,header_size - read_size, 0); + */ + if (r < 0) + { + show_error("%s:recv failed: (%s)",func,strerror(errno)); + if (errno == EINTR || errno == EAGAIN) + continue; + else + { + show_error("%s:recv failed: (%s)",func,strerror(errno)); + return NULL; + } + } + else if (r > 0) + { + read_size += r; + if ( read_size == header_size) + { + query = PGRread_query(sock,header); + return query; + } + } + else if (r == 0) + { + return NULL; + } + } + } + return NULL; +} + +char * +PGRread_query(int sock, ReplicateHeader *header) +{ + char * func = "PGRread_query()"; + int r =0; + int cnt = 0; + char * read_ptr; + int read_size = 0; + int query_size = 0; + char * query = NULL; + + query_size = ntohl(header->query_size); + if (query_size < 0) + { + show_error("%s:receive size less than 0",func); + return NULL; + } + query = malloc(query_size+4); + if (query == NULL) + { + /* + * buffer allocation failed + */ + show_error("%s:malloc failed: (%s)",func,strerror(errno)); + return NULL; + } + memset(query,0,query_size+4); + if (query_size == 0) + { + return query; + } + read_size = 0; + cnt = 0; + read_ptr = (char *)query; + for (;;) + { + /* + * read query data + */ + + /*r = recv(sock,read_ptr + read_size ,query_size - read_size, MSG_WAITALL); */ + r = recv(sock,read_ptr + read_size ,query_size - read_size, 0); + if (r < 0) + { + if (errno == EINTR || errno == EAGAIN) + continue; + else + { + show_error("%s:recv failed: (%s)",func,strerror(errno)); + free(query); + query = NULL; + return NULL; + } + } + else if (r > 0) + { + read_size += r; 
+			if ( read_size == query_size)
+			{
+				return query;
+			}
+		}
+		else /* r == 0 */
+		{
+			show_error("%s:unexpected EOF", func);
+			free(query);
+			query = NULL;
+			return NULL;
+		}
+	}
+	free(query);
+	query = NULL;
+	return NULL;
+}
+
+/*
+ * Shared worker for is_autocommit_off() / is_autocommit_on().
+ *
+ * Upper-cases at most sizeof(buf)-2 leading characters of 'query' and
+ * returns true when that copy contains "AUTOCOMMIT" with 'value' at or
+ * after it. 'value' must be given in upper case.
+ * Returns false for a NULL query.
+ */
+static bool
+autocommit_setting_matches(const char * query, const char * value)
+{
+	int i;
+	char buf[256];
+	const char * p = NULL;
+
+	if ((query == NULL) || (value == NULL))
+		return false;
+	memset(buf,0,sizeof(buf));
+	for (i = 0; (query[i] != '\0') && (i < (int)(sizeof(buf) - 2)); i++)
+	{
+		/*
+		 * toupper() is only defined for EOF and unsigned char values;
+		 * bytes of multibyte text are negative when char is signed.
+		 */
+		buf[i] = (char) toupper((unsigned char) query[i]);
+	}
+	p = strstr(buf,"AUTOCOMMIT");
+	if (p == NULL)
+	{
+		return false;
+	}
+	/* only a value that follows the keyword counts as its setting */
+	if (strstr(p,value) == NULL)
+	{
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Return true when 'query' mentions AUTOCOMMIT followed by OFF
+ * (case-insensitive, first 254 characters only).
+ */
+static bool
+is_autocommit_off(char * query)
+{
+	return autocommit_setting_matches(query,"OFF");
+}
+
+/*
+ * Return true when 'query' mentions AUTOCOMMIT followed by ON
+ * (case-insensitive, first 254 characters only).
+ */
+static bool
+is_autocommit_on(char * query)
+{
+	return autocommit_setting_matches(query,"ON");
+}
+
+/*
+ * Look up 'host' (case-insensitive) in the transaction table and return
+ * the hostIP stored with the first matching entry, or 0 when the table
+ * does not exist, 'host' is NULL, or nothing matches.
+ */
+static unsigned int
+get_host_ip_from_tbl(char * host)
+{
+	Dlelem * ptr = NULL;
+	unsigned int ip = 0;
+
+	if (host == NULL)
+	{
+		return 0;
+	}
+	pthread_mutex_lock(&transaction_table_mutex);
+	if (Transaction_Tbl_Begin != NULL)
+	{
+		for (ptr = DLGetHead(Transaction_Tbl_Begin); ptr != NULL; ptr = DLGetSucc(ptr))
+		{
+			TransactionTbl *transaction = DLE_VAL(ptr);
+			if (!strncasecmp(transaction->host,host,sizeof(transaction->host)))
+			{
+				/* copy the value while the table is still locked */
+				ip = transaction->hostIP;
+				break;
+			}
+		}
+	}
+	pthread_mutex_unlock(&transaction_table_mutex);
+
+	return ip;
+}
+
+/*
+ * Look up 'srcHost' (case-insensitive) in the transaction table and
+ * return the srcHostIP stored with the first matching entry, or 0 when
+ * the table does not exist, 'srcHost' is NULL, or nothing matches.
+ */
+static unsigned int
+get_srcHost_ip_from_tbl(char * srcHost)
+{
+	Dlelem * ptr = NULL;
+	unsigned int ip = 0;
+
+	if (srcHost == NULL)
+	{
+		return 0;
+	}
+	pthread_mutex_lock(&transaction_table_mutex);
+	if (Transaction_Tbl_Begin != NULL)
+	{
+		for (ptr = DLGetHead(Transaction_Tbl_Begin); ptr != NULL; ptr = DLGetSucc(ptr))
+		{
+			TransactionTbl *transaction = DLE_VAL(ptr);
+			if (!strncasecmp(transaction->srcHost,srcHost,sizeof(transaction->srcHost)))
+			{
+				/* copy the value while the table is still locked */
+				ip = transaction->srcHostIP;
+				break;
+			}
+		}
+	}
+	pthread_mutex_unlock(&transaction_table_mutex);
+
+	return ip;
+}
+
+/*
+ * Advance the global query id and return it. The counter restarts
+ * from 1 once it has reached PGR_MAX_QUERY_ID, so 0 is never returned.
+ */
+unsigned int
+PGRget_next_query_id(void)
+{
+	if (PGR_Query_ID >= PGR_MAX_QUERY_ID)
+	{
+		PGR_Query_ID = 0;
+	}
+	PGR_Query_ID ++;
+	return PGR_Query_ID;
+}
+
+
+/*
+ * Build the "replication server started" system call query and send it
+ * through PGRreplicate_packet_send() against database template1.
+ * Nothing is sent when hostName or userName is NULL/empty, or when
+ * portNumber or recoveryPortNumber is 0.
+ */
+void
+PGRnotice_replication_server(char * hostName, unsigned short portNumber,unsigned short recoveryPortNumber, unsigned short lifecheckPortNumber, char * userName)
+{
+	char * func ="PGRnotice_replication_server()";
+	ReplicateHeader header;
+	char query[PGR_MESSAGE_BUFSIZE];
+
+	if (((hostName == NULL) || (*hostName == 0)) ||
+		((userName == NULL) || (*userName == 0)) ||
+		((portNumber == 0) || (recoveryPortNumber == 0)))
+	{
+#ifdef PRINT_DEBUG
+		/* either name may be NULL here; never hand NULL to %s */
+		show_debug("%s: can not connect server[%s][%s][%d][%d]",func,
+			(hostName != NULL) ? hostName : "",
+			(userName != NULL) ? userName : "",
+			portNumber,recoveryPortNumber);
+#endif
+		return;
+	}
+	memset(&header,0,sizeof(ReplicateHeader));
+	memset(query,0,sizeof(query));
+	snprintf(query,sizeof(query)-1,"SELECT %s(%d,'%s',%d,%d,%d)",
+		PGR_SYSTEM_COMMAND_FUNC,
+		PGR_STARTUP_REPLICATION_SERVER_FUNC_NO,
+		hostName,
+		portNumber,
+		recoveryPortNumber,
+		lifecheckPortNumber);
+	header.cmdSys = CMD_SYS_CALL;
+	header.cmdSts = CMD_STS_NOTICE;
+	header.query_size = htonl(strlen(query));
+	header.query_id = htonl(PGRget_next_query_id());
+	/*
+	 * header was zeroed above; copying at most size-1 bytes keeps the
+	 * terminating NUL even when the source string is too long
+	 */
+	strncpy(header.from_host,hostName,sizeof(header.from_host)-1);
+	strncpy(header.userName,userName,sizeof(header.userName)-1);
+	strcpy(header.dbName,"template1");
+	PGRreplicate_packet_send( &header, query, NOTICE_SYSTEM_CALL_TYPE ,RECOVERY_INIT);
+}
+
+static bool
+is_need_use_rlog(ReplicateHeader * header)
+{
+	bool rtn = false;
+	if ((Cascade_Inf->useFlag != DB_TBL_USE) ||
+		(PGR_Use_Replication_Log != true) ||
+		(header->rlog > 0))
+	{
+		rtn=false;
+	}
+	else if ((header->cmdSts == CMD_STS_QUERY ) &&
+		((header->cmdType == CMD_TYPE_INSERT) ||
+		 (header->cmdType == CMD_TYPE_UPDATE) ||
+		 (header->cmdType == CMD_TYPE_DELETE) ||
+		 (header->cmdType == CMD_TYPE_EXECUTE)))
+	{
+		rtn = true;
+	}
+	else
+	{
+		if ((header->cmdSts == CMD_STS_TRANSACTION ) &&
+			(header->cmdType == CMD_TYPE_COMMIT))
+		{
+			rtn = true;
+		}
+	}
+	return rtn;
+}
+
+/*
+ * (Re)create the global transaction list: an existing list is freed
+ * with DLFreeList() and replaced by a new empty one.
+ * Always returns STATUS_OK.
+ */
+int
+PGRinit_transaction_table(void)
+{
+	if (Transaction_Tbl_Begin != NULL)
+	{
+		DLFreeList(Transaction_Tbl_Begin);
+	}
+
+	Transaction_Tbl_Begin = DLNewList();
+
+	return STATUS_OK;
+}
+
+/*
+ * Case-insensitive substring test built on strncasecmp().
+ * Returns false when either argument is NULL.
+ */
+static bool
+query_contains_ci(const char * text, const char * word)
+{
+	size_t word_len = 0;
+
+	if ((text == NULL) || (word == NULL))
+	{
+		return false;
+	}
+	word_len = strlen(word);
+	for (; *text != '\0'; text++)
+	{
+		if (strncasecmp(text,word,word_len) == 0)
+		{
+			return true;
+		}
+	}
+	return false;
+}
+
+/*
+ * Decide which semaphore replicate_packet_send_internal() takes:
+ * true selects VacuumSemID, false selects SemID.
+ * True is returned for a NULL header, and for a VACUUM/ANALYZE query
+ * whose text does not contain "full" in any letter case.
+ */
+static bool
+is_need_queue_jump( ReplicateHeader * header,char *query)
+{
+	if (header == NULL)
+	{
+		return true;
+	}
+
+	if (header->cmdSts == CMD_STS_QUERY)
+	{
+		if ((header->cmdType == CMD_TYPE_VACUUM ) ||
+			(header->cmdType == CMD_TYPE_ANALYZE ))
+		{
+			/*
+			 * without the query text FULL cannot be ruled out,
+			 * so stay in the normal queue
+			 */
+			if ((query != NULL) &&
+				(query_contains_ci(query,"full") == false))
+			{
+				return true;
+			}
+		}
+	}
+	return false;
+}
+
+
+/*
+ * Ask the cluster DB the query came from (matched by from_host/port in
+ * the host table) whether it has already executed this query.
+ * Returns false when no host table is registered, the origin is not a
+ * usable host, or no connection can be set up; otherwise the answer of
+ * is_executed_query(). The transaction table entry used for the check
+ * is deleted again before returning the answer.
+ */
+static bool
+is_executed_query_in_origin( ReplicateHeader *header )
+{
+	char *database = NULL;
+	char port[8];
+	char *userName = NULL;
+	char *password = NULL;
+	char * md5Salt = NULL;
+	char * cryptSalt = NULL;
+	char * host = NULL;
+	HostTbl * host_ptr = (HostTbl*)NULL;
+	TransactionTbl * transaction_tbl = (TransactionTbl*)NULL;
+	PGconn * conn = (PGconn *)NULL;
+	bool result = false;
+
+	if (Host_Tbl_Begin == NULL)
+	{
+		/*
+		 * this function returns bool: a status code such as
+		 * STATUS_ERROR is non-zero and would read as "already executed"
+		 */
+		return false;
+	}
+	host_ptr = Host_Tbl_Begin;
+	while(host_ptr->useFlag != DB_TBL_END)
+	{
+		/*
+		 * check the status of the cluster DB
+		 */
+		if (host_ptr->useFlag != DB_TBL_USE)
+		{
+			host_ptr ++;
+			continue;
+		}
+		if (PGRis_same_host(header->from_host,ntohs(header->port),host_ptr->hostName, host_ptr->port) == true)
+		{
+			break;
+		}
+		host_ptr ++;
+	}
+	if (host_ptr->useFlag == DB_TBL_END)
+	{
+		return false;
+	}
+	/*
+	 * set up the connection
+	 */
+	transaction_tbl = getTransactionTbl(host_ptr,header);
+	if (transaction_tbl == (TransactionTbl *)NULL)
+	{
+		transaction_tbl = setTransactionTbl(host_ptr, header);
+		if (transaction_tbl == (TransactionTbl *)NULL)
+		{
+			return false;
+		}
+	}
+	else
+	{
+		if ((transaction_tbl->conn == (PGconn *)NULL) ||
+			(transaction_tbl->conn->sock <= 0))
+		{
+			/* release a dead connection instead of overwriting the pointer */
+			if (transaction_tbl->conn != (PGconn *)NULL)
+			{
+				PQfinish(transaction_tbl->conn);
+				transaction_tbl->conn = NULL;
+			}
+			database = (char *)(header->dbName);
+			snprintf(port,sizeof(port),"%d", host_ptr->port);
+			userName = (char *)(header->userName);
+			password = (char *)(header->password);
+			md5Salt = (char *)(header->md5Salt);
+			cryptSalt = (char *)(header->cryptSalt);
+			host = (char *)(host_ptr->hostName);
+			transaction_tbl->conn = PGRcreateConn(host,port,database,userName,password,md5Salt,cryptSalt);
+		}
+	}
+	conn = transaction_tbl->conn;
+	if (conn == NULL)
+	{
+		return false;
+	}
+
+	result = is_executed_query( conn, header);
+	deleteTransactionTbl(host_ptr,header);
+	return result;
+}
+
+/*
+ * Run the PGR_QUERY_CONFIRM_ANSWER_FUNC_NO system call on 'conn' for
+ * the query identified by the header's timestamp and replicate_id.
+ * Returns true only when the command status starts with
+ * PGR_ALREADY_REPLICATED_NOTICE_CMD (case-insensitive).
+ */
+static bool
+is_executed_query( PGconn *conn, ReplicateHeader * header)
+{
+	PGresult * res = (PGresult *)NULL;
+	char sync_command[PGR_MESSAGE_BUFSIZE];
+	char * str = NULL;
+	bool executed = false;
+
+	snprintf(sync_command,sizeof(sync_command),
+		"SELECT %s(%d,%u,%u,%u,%d) ",
+		PGR_SYSTEM_COMMAND_FUNC,
+		PGR_QUERY_CONFIRM_ANSWER_FUNC_NO,
+		(unsigned int)ntohl(header->tv.tv_sec),
+		(unsigned int)ntohl(header->tv.tv_usec),
+		(unsigned int)ntohl(header->replicate_id),
+		PGR_Response_Inf->response_mode);
+
+	res = PQexec(conn, sync_command);
+	if (res != NULL)
+	{
+		str = PQcmdStatus(res);
+		if ((str != NULL) &&
+			(!strncasecmp(str,PGR_ALREADY_REPLICATED_NOTICE_CMD,strlen(PGR_ALREADY_REPLICATED_NOTICE_CMD))))
+		{
+			executed = true;
+		}
+		PQclear(res);
+	}
+	return executed;
+}
+
+/*
+ * Replay one large object operation on 'conn'. header->cmdType selects
+ * the libpq call; its arguments are taken from 'query' (integers in
+ * network byte order, data/file name in query->buf).
+ * Returns STATUS_OK when the libpq call succeeds and STATUS_ERROR when
+ * it fails or an argument is NULL. An unrecognised cmdType is not
+ * replayed and reports STATUS_OK.
+ */
+static int
+replicate_lo( PGconn * conn, ReplicateHeader * header, LOArgs * query)
+{
+	int status = STATUS_OK;
+	int mode = 0;
+	Oid lobjId = 0;
+	int fd = 0;
+	char * buf = NULL;
+	char * filename = NULL;
+	size_t len = 0;
+	int written = 0;
+	int offset = 0;
+	int whence = 0;
+
+	if ((conn == (PGconn *)NULL) || (query == (LOArgs *)NULL) || (header == (ReplicateHeader *)NULL))
+	{
+		return STATUS_ERROR;
+	}
+	switch (header->cmdType)
+	{
+		case CMD_TYPE_LO_IMPORT :
+			filename = query->buf;
+			if (lo_import(conn, filename) > 0 )
+			{
+				status = STATUS_OK;
+			}
+			else
+			{
+				status = STATUS_ERROR;
+			}
+			break;
+		case CMD_TYPE_LO_CREATE :
+			mode = (int)ntohl(query->arg1);
+			if (lo_creat(conn, mode) > 0)
+			{
+				status = STATUS_OK;
+			}
+			else
+			{
+				status = STATUS_ERROR;
+			}
+			break;
+		case CMD_TYPE_LO_OPEN :
+			lobjId = (Oid)ntohl(query->arg1);
+			mode = (int)ntohl(query->arg2);
+			/* lo_open() returns -1 on failure; descriptor 0 is a valid result */
+			if (lo_open(conn, lobjId, mode) >= 0)
+			{
+				status = STATUS_OK;
+			}
+			else
+			{
+				status = STATUS_ERROR;
+			}
+			break;
+		case CMD_TYPE_LO_WRITE :
+			fd = (int)ntohl(query->arg1);
+			len = (int)ntohl(query->arg2);
+			buf = query->buf;
+			/* check for the -1 failure value before comparing with the unsigned length */
+			written = lo_write(conn, fd, buf, len);
+			if ((written >= 0) && ((size_t)written == len))
+			{
+				status = STATUS_OK;
+			}
+			else
+			{
+				status = STATUS_ERROR;
+			}
+			break;
+		case CMD_TYPE_LO_LSEEK :
+			fd = (int)ntohl(query->arg1);
+			offset = (int)ntohl(query->arg2);
+			whence = (int)ntohl(query->arg3);
+			if (lo_lseek(conn, fd, offset, whence) >= 0)
+			{
+				status = STATUS_OK;
+			}
+			else
+			{
+				status = STATUS_ERROR;
+			}
+			break;
+		case CMD_TYPE_LO_CLOSE :
+			fd = (int)ntohl(query->arg1);
+			if (lo_close(conn, fd) == 0)
+			{
+				status = STATUS_OK;
+			}
+			else
+			{
+				status = STATUS_ERROR;
+			}
+			break;
+		case CMD_TYPE_LO_UNLINK :
+			lobjId = (Oid)ntohl(query->arg1);
+			if (lo_unlink(conn,lobjId) >= 0)
+			{
+				status = STATUS_OK;
+			}
+			else
+			{
+				status = STATUS_ERROR;
+			}
+			break;
+		default :
+			break;
+	}
+	return status;
+}
+
+static int
+send_func(HostTbl * host_ptr,ReplicateHeader * header, char * func,char * result)
+{
+	char * f ="send_func()";
+	char *database = NULL;
+	char port[8];
+	char *userName = NULL;
+	char *password = NULL;
+	char * md5Salt = NULL;
+	char * cryptSalt = NULL;
+	char * host = NULL;
+	char * str = NULL;
+	TransactionTbl * transaction_tbl = (TransactionTbl *)NULL;
+	PGresult * res = (PGresult *)NULL;
+	PGconn * conn = (PGconn *)NULL;
+	int rtn = 0;
+	int current_cluster = 0;
+
+	if ((host_ptr == (HostTbl *)NULL) ||
+ (header == (ReplicateHeader *)NULL) || + (func == NULL) || + (result == NULL)) + { + return STATUS_ERROR; + } + /* + * set up the connection + */ + database = (char *)header->dbName; + snprintf(port,sizeof(port),"%d", host_ptr->port); + userName = (char *)(header->userName); + password = (char *)(header->password); + md5Salt = (char *)(header->md5Salt); + cryptSalt = (char *)(header->cryptSalt); + host = (char *)(host_ptr->hostName); + if (PGR_Response_Inf != NULL) + { + current_cluster = PGR_Response_Inf->current_cluster; + } + + /* + * get the transaction table data + * it has the connection data with each cluster DB + */ + transaction_tbl = getTransactionTbl(host_ptr,header); + /* + * if the transaction process is new one, + * create connection data and add the transaction table + */ + if (transaction_tbl == (TransactionTbl *)NULL) + { + transaction_tbl = setTransactionTbl(host_ptr, header); + if (transaction_tbl == (TransactionTbl *)NULL) + { + StartReplication[current_cluster] = true; + show_error("%s:setTransactionTbl failed",f); + if ( header->cmdSts != CMD_STS_NOTICE ) + { + PGRset_host_status(host_ptr,DB_TBL_ERROR); + } + return STATUS_ERROR; + } + } + else + { + /* + * re-use the connection data + */ + if ((transaction_tbl->conn != (PGconn *)NULL) && + (transaction_tbl->conn->sock > 0)) + { + StartReplication[current_cluster] = false; + } + else + { + if (transaction_tbl->conn != (PGconn *)NULL) + { + PQfinish(transaction_tbl->conn); + } + transaction_tbl->conn = PGRcreateConn(host,port,database,userName,password,md5Salt,cryptSalt); + StartReplication[current_cluster] = true; + } + } + conn = transaction_tbl->conn; + + if (conn == NULL) + { + show_error("%s:[%d@%s] may be down",f,host_ptr->port,host_ptr->hostName); + if ( header->cmdSts != CMD_STS_NOTICE ) + { + StartReplication[current_cluster] = true; + PGRset_host_status(host_ptr,DB_TBL_ERROR); + } + return STATUS_ERROR; + } + res = PQexec(conn, func); + if (res == NULL) + { + 
StartReplication[current_cluster] = true; + return STATUS_ERROR; + } + str = PQcmdStatus(res); + if ((str == NULL) || (*str == '\0')) + { + rtn = STATUS_ERROR; + } + else + { + snprintf(result, PGR_MESSAGE_BUFSIZE, "%s",str); + rtn = STATUS_OK; + } + if (res != NULL) + PQclear(res); + return rtn; +} + +static uint32_t +get_oid(HostTbl * host_ptr,ReplicateHeader * header) +{ + char sync_command[PGR_MESSAGE_BUFSIZE]; + char result[PGR_MESSAGE_BUFSIZE]; + + memset(result,0,sizeof(result)); + snprintf(sync_command,sizeof(sync_command), + "SELECT %s(%d)", + PGR_SYSTEM_COMMAND_FUNC, PGR_GET_OID_FUNC_NO); + if (send_func(host_ptr, header, sync_command, result) == STATUS_OK) + { + return (strtoul(result, NULL, 10)); + } + return 0; +} + +static int +set_oid(HostTbl * host_ptr,ReplicateHeader * header, uint32_t oid) +{ + char sync_command[PGR_MESSAGE_BUFSIZE]; + char result[PGR_MESSAGE_BUFSIZE]; + + memset(result,0,sizeof(result)); + snprintf(sync_command,sizeof(sync_command), + "SELECT %s(%d,%u)", + PGR_SYSTEM_COMMAND_FUNC, + PGR_SET_OID_FUNC_NO, + oid); + return ( send_func(host_ptr, header, sync_command, result) ); +} + +/* + * sync oid during cluster DB's + */ +int +PGRsync_oid(ReplicateHeader *header) +{ + HostTbl * host_ptr = (HostTbl*)NULL; + uint32_t max_oid = 0; + uint32_t oid = 0; + int recovery_status = 0; + + /* get current oid of all cluster db's */ + host_ptr = Host_Tbl_Begin; + if (host_ptr == (HostTbl *)NULL) + { + return STATUS_ERROR; + } + recovery_status = PGRget_recovery_status(); + while(host_ptr->useFlag != DB_TBL_END) + { + /* + * check the status of the cluster DB + */ + if (host_ptr->useFlag != DB_TBL_USE) + { + host_ptr ++; + continue; + } + /* + * skip loop during recover and the host name is master DB + */ + if (is_master_in_recovery(host_ptr->hostName, host_ptr->port,recovery_status) == true) + { + host_ptr ++; + continue; + } + oid = get_oid(host_ptr,header); + if (max_oid < oid ) + { + max_oid = oid; + } + host_ptr ++; + } + if (max_oid <= 0) 
+ return STATUS_ERROR; + + /* set oid in cluster db */ + host_ptr = Host_Tbl_Begin; + while(host_ptr->useFlag != DB_TBL_END) + { + /* + * check the status of the cluster DB + */ + if (host_ptr->useFlag != DB_TBL_USE) + { + host_ptr ++; + continue; + } + /* + * skip loop during recover and the host name is master DB + */ + if (is_master_in_recovery(host_ptr->hostName, host_ptr->port,recovery_status) == true) + { + host_ptr ++; + continue; + } + set_oid(host_ptr,header,max_oid); + host_ptr ++; + } + + return STATUS_OK; +} + +int +PGRload_replication_id(void) +{ + char * func = "PGRload_replication_id()"; + char buf[256]; + char *p; + + if (Recovery_Status_Inf == (RecoveryStatusInf *)NULL) + { + show_error("%s: Recovery_Status_Inf is NULL",func); + return STATUS_ERROR; + } + if (RidFp == (FILE *)NULL) + { + show_error("%s: replication id file is not open",func); + return STATUS_ERROR; + } + rewind(RidFp); + if (fgets(buf,sizeof(buf),RidFp) == NULL) + { + Recovery_Status_Inf->replication_id = 0; + } + else + { + p = strrchr(buf,' '); + if (p != NULL) + { + p++; + Recovery_Status_Inf->replication_id = (uint32_t) atol(p); + } + else + { + Recovery_Status_Inf->replication_id = 0; + } + } + return STATUS_OK; +} + +static int +notice_abort(HostTbl * host_ptr,ReplicateHeader * header) +{ + char sync_command[PGR_MESSAGE_BUFSIZE]; + char result[PGR_MESSAGE_BUFSIZE]; + + memset(result,0,sizeof(result)); + snprintf(sync_command,sizeof(sync_command), + "SELECT %s(%d)", + PGR_SYSTEM_COMMAND_FUNC, + PGR_NOTICE_ABORT_FUNC_NO); + return ( send_func(host_ptr, header, sync_command, result) ); +} + +static int +send_p_parse (PGconn * conn, StringInfo input_message) +{ + const char *stmt_name; + const char *query_string; + int numParams; + Oid paramTypes; + + /* get name,query */ + stmt_name = pq_getmsgstring(input_message); + query_string = pq_getmsgstring(input_message); + /* send name,query */ + if (pqPutMsgStart('P', false, conn) < 0 || + pqPuts(stmt_name, conn) < 0 || + 
pqPuts(query_string, conn) < 0) + { + return STATUS_ERROR; + } + /* get number of parameter */ + numParams = pq_getmsgint(input_message, 2); + /* send number of parameter */ + if (pqPutInt(numParams, 2, conn) < 0) + { + return STATUS_ERROR; + } + if (numParams > 0) + { + int i; + for (i = 0; i < numParams; i++) + { + paramTypes = pq_getmsgint(input_message, 4); + if (pqPutInt(paramTypes, 4, conn) < 0) + { + return STATUS_ERROR; + } + } + } + if (pqPutMsgEnd(conn) < 0) + { + return STATUS_ERROR; + } + return STATUS_OK; +} + +static int +send_p_bind (PGconn * conn, StringInfo input_message) +{ + const char *portal_name; + const char *stmt_name; + int numPFormats; + int16 pformats; + int numParams; + int numRFormats; + int16 rformats; + int i; + + /* Get&Send the fixed part of the message */ + portal_name = pq_getmsgstring(input_message); + stmt_name = pq_getmsgstring(input_message); + if (pqPutMsgStart('B', false, conn) < 0 || + pqPuts(portal_name, conn) < 0 || + pqPuts(stmt_name, conn) < 0) + { + return STATUS_ERROR; + } + + /* Get&Send the parameter format codes */ + numPFormats = pq_getmsgint(input_message, 2); + if (pqPutInt(numPFormats, 2, conn) < 0) + { + return STATUS_ERROR; + } + if (numPFormats > 0) + { + for (i = 0; i < numPFormats; i++) + { + pformats = pq_getmsgint(input_message, 2); + if (pqPutInt(pformats, 2, conn) < 0) + { + return STATUS_ERROR; + } + } + } + + /* Get&Send the parameter value count */ + numParams = pq_getmsgint(input_message, 2); + if (pqPutInt(numParams, 2, conn) < 0) + { + return STATUS_ERROR; + } + if (numParams > 0) + { + int32 plength; + for (i = 0 ; i < numParams ; i ++) + { + plength = pq_getmsgint(input_message, 4); + if (plength != -1) + { + const char *pvalue = pq_getmsgbytes(input_message, plength); + if (pqPutInt(plength, 4, conn) < 0 || + pqPutnchar(pvalue, plength, conn) < 0) + { + return STATUS_ERROR; + } + } + else + { + if (pqPutInt(plength, 4, conn) < 0) + { + return STATUS_ERROR; + } + } + } + } + + /* Get&Send the 
result format codes */ + numRFormats = pq_getmsgint(input_message, 2); + if (pqPutInt(numRFormats, 2, conn) < 0 ) + { + return STATUS_ERROR; + } + if (numRFormats > 0) + { + for (i = 0; i < numRFormats; i++) + { + rformats = pq_getmsgint(input_message, 2); + if (pqPutInt(rformats, 2, conn) < 0) + { + return STATUS_ERROR; + } + } + } + if (pqPutMsgEnd(conn) < 0) + { + return STATUS_ERROR; + } + return STATUS_OK; +} + +static int +send_p_describe (PGconn * conn, StringInfo input_message) +{ + + int describe_type; + const char *describe_target; + + describe_type = pq_getmsgbyte(input_message); + describe_target = pq_getmsgstring(input_message); + + /* construct the Describe Portal message */ + if (pqPutMsgStart('D', false, conn) < 0 || + pqPutc(describe_type, conn) < 0 || + pqPuts(describe_target, conn) < 0 || + pqPutMsgEnd(conn) < 0) + { + return STATUS_ERROR; + } + return STATUS_OK; +} + +static int +send_p_execute (PGconn * conn, StringInfo input_message) +{ + const char *portal_name; + int max_rows; + + portal_name = pq_getmsgstring(input_message); + max_rows = pq_getmsgint(input_message, 4); + /* construct the Execute message */ + if (pqPutMsgStart('E', false, conn) < 0 || + pqPuts(portal_name, conn) < 0 || + pqPutInt(max_rows, 4, conn) < 0 || + pqPutMsgEnd(conn) < 0) + { + return STATUS_ERROR; + } + return STATUS_OK; +} + +static int +send_p_sync (PGconn * conn, StringInfo input_message) +{ + PGresult *result; + PGresult *lastResult; + + /* construct the Sync message */ + if (pqPutMsgStart('S', false, conn) < 0 || + pqPutMsgEnd(conn) < 0) + { + return STATUS_ERROR; + } + /* remember we are using extended query protocol */ + conn->queryclass = PGQUERY_EXTENDED; + + /* + * Give the data a push. In nonblock mode, don't complain if we're unable + * to send it all; PQgetResult() will do any additional flushing needed. + */ + if (pqFlush(conn) < 0) + { + return STATUS_ERROR; + } + + /* OK, it's launched! 
*/ + conn->asyncStatus = PGASYNC_BUSY; + + lastResult = NULL; + while ((result = PQgetResult(conn)) != NULL) + { + if (lastResult) + { + if (lastResult->resultStatus == PGRES_FATAL_ERROR && + result->resultStatus == PGRES_FATAL_ERROR) + { + PQclear(result); + result = lastResult; + } + else + PQclear(lastResult); + } + lastResult = result; + if (result->resultStatus == PGRES_COPY_IN || + result->resultStatus == PGRES_COPY_OUT || + conn->status == CONNECTION_BAD) + break; + } + if (lastResult != NULL) + { + PQclear(lastResult); + } + return STATUS_OK; +} + +static int +send_p_close (PGconn * conn, StringInfo input_message) +{ + + int close_type; + const char *close_target; + + close_type = pq_getmsgbyte(input_message); + close_target = pq_getmsgstring(input_message); + if (pqPutMsgStart('C', false, conn) < 0 || + pqPutc(close_type, conn) < 0 || + pqPuts(close_target, conn) < 0 || + pqPutMsgEnd(conn) < 0) + { + return STATUS_ERROR; + } + return STATUS_OK; +} +static void +set_string_info(StringInfo input_message, ReplicateHeader * header, char * query) +{ + int len; + len = ntohl(header->query_size); + input_message->data = query; + input_message->maxlen = len; + input_message->len = len -1; + input_message->cursor = 0; +} diff -aruN postgresql-8.2.4/src/pgcluster/pgrp/rlog.c pgcluster-1.7.0rc7/src/pgcluster/pgrp/rlog.c --- postgresql-8.2.4/src/pgcluster/pgrp/rlog.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/pgrp/rlog.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,1260 @@ +/*-------------------------------------------------------------------- + * FILE: + * rlog.c + * + * NOTE: + * This file is composed of the functions to call with the source + * at pgreplicate for replicate ahead log. 
+ * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + *-------------------------------------------------------------------- + */ +#ifdef USE_REPLICATION + +#include "postgres.h" +#include "postgres_fe.h" + +#include +#include +#include +#include +#include +#ifdef HAVE_FCNTL_H +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_NETINET_TCP_H +#include +#endif +#include +#include + +#ifdef HAVE_CRYPT_H +#include +#endif + +#ifdef MULTIBYTE +#include "mb/pg_wchar.h" +#endif + +#include "libpq-fe.h" +#include "libpq-int.h" +#include "fe-auth.h" +#include "access/xact.h" +#include "replicate_com.h" +#include "pgreplicate.h" + +static int RLog_Recv_Sock = -1; +/*-------------------------------------- + * PROTOTYPE DECLARATION + *-------------------------------------- + */ +static int set_query_log(ReplicateHeader * header, char * query); +static QueryLogType * get_query_log_by_header(ReplicateHeader * header); +static QueryLogType * get_query_log(ReplicateHeader * header); +static void delete_query_log(ReplicateHeader * header); +static int set_commit_log(ReplicateHeader * header); +static CommitLogInf * get_commit_log(ReplicateHeader * header); +static void delete_commit_log(ReplicateHeader * header); +static bool was_committed_transaction(ReplicateHeader * header); +static int create_recv_rlog_socket(void); +static int do_rlog(int fd); +static int recv_message(int sock,char * buf, int len); +static int send_message(int sock, char * msg, int len); +static void exit_rlog(int sig); +static int reconfirm_commit(ReplicateHeader * header); +#if 0 +static int PGRget_sync_data(ReplicateHeader *header); +static int PGRdelete_sync_data(ReplicateHeader *header); +#endif /* #if 0 */ + + +int +PGRwrite_rlog(ReplicateHeader * header, char * query) +{ + char * func = "PGRwrite_rlog()"; + + if (header == NULL) + { + show_error("%s:header is null",func); + return STATUS_ERROR; + } + switch 
(header->cmdSts) + { + case CMD_STS_QUERY: +#ifdef PRINT_DEBUG + show_debug("%s:set_query_log",func); +#endif + set_query_log(header,query); + break; + case CMD_STS_DELETE_QUERY: +#ifdef PRINT_DEBUG + show_debug("%s:delete_query_log",func); +#endif + delete_query_log(header); + break; + case CMD_STS_TRANSACTION: + if (header->cmdType == CMD_TYPE_COMMIT) + { +#ifdef PRINT_DEBUG + show_debug("%s:set_commit_log call",func); +#endif + set_commit_log(header); + } + break; + case CMD_STS_DELETE_TRANSACTION: + if (header->cmdType == CMD_TYPE_COMMIT) + { +#ifdef PRINT_DEBUG + show_debug("%s:delete_commit_log call",func); +#endif + delete_commit_log(header); + } + break; + default: + show_error("%s:unknown status %c",func,header->cmdSts); + break; + } + return STATUS_OK; +} + +ReplicateHeader * +PGRget_requested_query(ReplicateHeader * header) +{ + QueryLogType * query_log = NULL; + + if (Query_Log_Top == NULL) + { + return NULL; + } + query_log = Query_Log_Top; + while(query_log != (QueryLogType *)NULL) + { + if ((query_log->header->request_id == header->request_id) && + (query_log->header->pid == header->pid) && + (query_log->header->port == header->port) && + (!strncmp(query_log->header->from_host,header->from_host,sizeof(header->from_host)))) + { + return query_log->header; + } + query_log = (QueryLogType *)(query_log->next); + } + return (ReplicateHeader *)NULL; +} + +static int +set_query_log(ReplicateHeader * header, char * query) +{ + char * func = "set_query_log()"; + int size = 0; + QueryLogType * query_log = NULL; + + if (Query_Log_Top == NULL) + { + Query_Log_Top = (QueryLogType *)malloc(sizeof(QueryLogType)); + if (Query_Log_Top == (QueryLogType *)NULL) + { + show_error("%s:malloc failed: (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + Query_Log_Top->next = NULL; + Query_Log_Top->last = NULL; + Query_Log_End = Query_Log_Top; + Query_Log_End->next = NULL; + Query_Log_End->last = NULL; + query_log = Query_Log_Top; + } + else + { + query_log = 
(QueryLogType *)malloc(sizeof(QueryLogType)); + if (query_log == (QueryLogType *)NULL) + { + show_error("%s:malloc failed: (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + Query_Log_End->next = (char *)query_log; + query_log->last = (char *)Query_Log_End; + query_log->next = NULL; + Query_Log_End = query_log; + } + query_log->header = (ReplicateHeader *)malloc(sizeof(ReplicateHeader)); + if (query_log->header == (ReplicateHeader *)NULL) + { + show_error("%s:malloc failed: (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + size = ntohl(header->query_size); + + query_log->query = (char *)malloc(size+4); + if (query_log->query == (char *)NULL) + { + show_error("%s:malloc failed: (%s)",func,strerror(errno)); + return STATUS_ERROR; + } + memset(query_log->query,0,size+4); + memcpy(query_log->header,header,sizeof(ReplicateHeader)); + query_log->header->rlog = FROM_R_LOG_TYPE ; + memcpy(query_log->query,query,size); + + return STATUS_OK; +} + +static QueryLogType * +get_query_log_by_header(ReplicateHeader * header) +{ + QueryLogType * query_log = NULL; + + if (Query_Log_Top == NULL) + { + return (QueryLogType *)NULL; + } + query_log = Query_Log_Top; + show_debug("get_query_log_by_header:header is %d,%d,%d,%s", + header->request_id, + header->pid, + header->port, + header->from_host); + + while(query_log != (QueryLogType *)NULL) + { + show_debug("get_query_log_by_header:comparing to %d,%d,%d,%s", + query_log->header->request_id, + query_log->header->pid, + query_log->header->port, + query_log->header->from_host); + + if ((query_log->header->request_id == header->request_id) && + (query_log->header->pid == header->pid) && + (query_log->header->port == header->port) && + (!strncmp(query_log->header->from_host,header->from_host,sizeof(header->from_host)))) + { + return query_log; + } + query_log = (QueryLogType *)(query_log->next); + } + return (QueryLogType *)NULL; +} + +static QueryLogType * +get_query_log(ReplicateHeader * header) +{ + QueryLogType * 
query_log = NULL; + + if (Query_Log_Top == NULL) + { + return NULL; + } + query_log = Query_Log_Top; + while(query_log != (QueryLogType *)NULL) + { + show_debug("get_qurey_log: comparing in log is %d,header is %d",query_log->header->replicate_id,header->replicate_id); + if (query_log->header->replicate_id == header->replicate_id) + { + return query_log; + } + query_log = (QueryLogType *)(query_log->next); + } + return (QueryLogType*)NULL; +} + +static void +delete_query_log(ReplicateHeader * header) +{ + QueryLogType * query_log = NULL; + QueryLogType * last = NULL; + QueryLogType * next = NULL; + + query_log = get_query_log(header); + + if (query_log == NULL) + { + return ; + } + last = (QueryLogType *)query_log->last; + next = (QueryLogType *)query_log->next; + + /* change link */ + if (last != (QueryLogType *)NULL) + { + last->next = (char *)next; + } + else + { + Query_Log_Top = next; + } + if (next != (QueryLogType *)NULL) + { + next->last = (char *)last; + } + else + { + Query_Log_End = last; + } + + /* delete contents */ + if (query_log->header != NULL) + { + free(query_log->header); + } + if (query_log->query != NULL) + { + free(query_log->query); + } + free(query_log); +} + +static int +set_commit_log(ReplicateHeader * header) +{ + + CommitLogInf * commit_log = NULL; + ReplicateHeader * c_header; + + if (Commit_Log_Tbl == NULL) + { + return STATUS_ERROR; + } + commit_log = Commit_Log_Tbl + 1; + while ( commit_log->inf.useFlag != DB_TBL_END ) + { + if (commit_log->inf.useFlag != DB_TBL_USE) + { + commit_log->inf.useFlag = DB_TBL_USE; + c_header = &(commit_log->header); + memcpy(c_header,header,sizeof(ReplicateHeader)); + Commit_Log_Tbl->inf.commit_log_num ++; + break; + } + commit_log ++; + } + return STATUS_OK; +} + +static CommitLogInf * +get_commit_log(ReplicateHeader * header) +{ + CommitLogInf * commit_log = NULL; + ReplicateHeader * c_header; + int cnt = 0; + + if (Commit_Log_Tbl == NULL) + { + return (CommitLogInf *)NULL; + } + commit_log = 
Commit_Log_Tbl + 1; + while ( commit_log->inf.useFlag != DB_TBL_END ) + { + if (commit_log->inf.useFlag == DB_TBL_USE) + { + cnt ++; + c_header = &(commit_log->header); + if (c_header == NULL) + { + commit_log ++; + continue; + } + if (c_header->replicate_id == header->replicate_id) + { + return commit_log; + } + } + else + { + } + if (cnt >= Commit_Log_Tbl->inf.commit_log_num) + { + break; + } + commit_log ++; + } + return (CommitLogInf *)NULL; +} + +static void +delete_commit_log(ReplicateHeader * header) +{ + CommitLogInf * commit_log = NULL; + + commit_log = get_commit_log(header); + if (commit_log != NULL) + { + memset(&(commit_log->header),0,sizeof(commit_log->header)); + commit_log->inf.useFlag = DB_TBL_INIT; + Commit_Log_Tbl->inf.commit_log_num --; + } +} + +static bool +was_committed_transaction(ReplicateHeader * header) +{ + CommitLogInf * commit_log = NULL; + + commit_log = get_commit_log(header); + if (commit_log != NULL) + { + return true; + } + return false; +} + +void +PGRreconfirm_commit(int sock, ReplicateHeader * header) +{ + int result = PGR_NOT_YET_COMMIT; + + if (Replicateion_Log == NULL) + { + return ; + } + + if (Replicateion_Log->r_log_sock > 0) + { + close(Replicateion_Log->r_log_sock ); + Replicateion_Log->r_log_sock = -1; + } + Replicateion_Log->r_log_sock = PGRcreate_send_rlog_socket(); + if (Replicateion_Log->r_log_sock == -1) + return; + + header->query_size = 0; + PGRsend_rlog_packet(Replicateion_Log->r_log_sock,header,""); + PGRrecv_rlog_result(Replicateion_Log->r_log_sock,&result, sizeof(result)); + + + close(Replicateion_Log->r_log_sock ); + Replicateion_Log->r_log_sock = -1; + + snprintf(PGR_Result,PGR_MESSAGE_BUFSIZE,"%d,%d", PGR_TRANSACTION_CONFIRM_ANSWER_FUNC_NO,result); + + PGRreturn_result(sock, PGR_Result,PGR_NOWAIT_ANSWER); +} + +static int +reconfirm_commit(ReplicateHeader * header) +{ + char * func = "reconfirm_commit()"; + int result = PGR_NOT_YET_COMMIT; + + /* check the transaction was committed */ + if 
(was_committed_transaction(header) == true) + { + result = PGR_ALREADY_COMMITTED; +#ifdef PRINT_DEBUG + show_debug("%s:PGR_ALREADY_COMMITTED",func); +#endif + } + else + { +#ifdef PRINT_DEBUG + show_debug("%s:PGR_NOT_YET_COMMIT",func); +#endif + } + return result; +} + +void +PGRset_rlog(ReplicateHeader * header, char * query) +{ + char * func = "PGRset_rlog()"; + int status = STATUS_OK; + bool send_flag = false; + + if (PGR_Log_Header == NULL) + { + return; + } + switch (header->cmdSts) + { + case CMD_STS_QUERY: + send_flag = true; + break; + case CMD_STS_TRANSACTION: + if (header->cmdType == CMD_TYPE_COMMIT) + { + send_flag = true; + PGR_Log_Header->cmdType = header->cmdType; + PGR_Log_Header->query_size = htonl(strlen(query)); + } + break; + } + if (send_flag != true) + { + show_error("%s:send_flag is false",func); + return; + } + PGR_Log_Header->cmdSys = CMD_SYS_LOG; + if (Cascade_Inf->useFlag == DB_TBL_USE) + { + /* save log data in remote server */ + show_debug("%s:set rlog %s",func,query); + status = PGRsend_lower_cascade(PGR_Log_Header, query); + if (status == STATUS_OK) { + status=PGRwait_notice_rlog_done(); + } + if (status != STATUS_OK) + { +#ifdef PRINT_DEBUG + show_debug("%s:PGRsend_lower_cascade failed",func); +#endif + PGRwrite_rlog(PGR_Log_Header, query); + } + } + else + { + /* save log data in local server */ + PGRwrite_rlog(PGR_Log_Header, query); + } +} + +void +PGRunset_rlog(ReplicateHeader * header, char * query) +{ + int status = STATUS_OK; + bool send_flag = false; + + if (PGR_Log_Header == NULL) + { + return; + } + switch (header->cmdSts) + { + case CMD_STS_QUERY: + send_flag = true; + PGR_Log_Header->cmdSts = CMD_STS_DELETE_QUERY; + break; + case CMD_STS_TRANSACTION: + if (PGR_Log_Header->cmdType == CMD_TYPE_COMMIT) + { + PGR_Log_Header->cmdSts = CMD_STS_DELETE_TRANSACTION; + PGR_Log_Header->query_size = htonl(strlen(query)); + send_flag = true; + } + break; + } + if (send_flag != true) + { + return; + } + PGR_Log_Header->cmdSys = 
CMD_SYS_LOG; + if (Cascade_Inf->useFlag == DB_TBL_USE) + { + /* save log data in remote server */ + show_debug("unset rlog %s",query); + + status = PGRsend_lower_cascade(PGR_Log_Header, query); + if (status == STATUS_OK) + { + status=PGRwait_notice_rlog_done(); + } + if (status != STATUS_OK) + { +#ifdef PRINT_DEBUG + show_debug("PGRsend_lower_cascade recv failed"); +#endif + PGRwrite_rlog(PGR_Log_Header, query); + } + } + else + { + /* save log data in local server */ + PGRwrite_rlog(PGR_Log_Header, query); + } +} + +int +PGRresend_rlog_to_db(void) +{ + char *func="PGRresend_rlog_to_db"; + QueryLogType * query_log = NULL; + QueryLogType * next = NULL; + int status = STATUS_OK; + int dest = 0; + + show_debug("%s:enter.",func); + + query_log = Query_Log_Top; + + while (query_log != NULL) + { + + + show_debug("%s:processing qlog,query=%s",func,query_log->query); + if (query_log->header->rlog != FROM_R_LOG_TYPE ) + { + query_log = (QueryLogType *)query_log->next; + continue; + } + status = replicate_packet_send_internal(query_log->header,query_log->query, dest,RECOVERY_INIT,false); + show_debug("%s:status=%d",func,status); + + if (status == STATUS_SKIP_REPLICATE ) + { + Query_Log_Top = query_log; + query_log = (QueryLogType *)query_log->next; + } + else + { + if (query_log->header != NULL) + { + free(query_log->header ); + } + if (query_log->query != NULL) + { + free(query_log->query ); + } + next = (QueryLogType *)query_log->next; + free(query_log); + query_log = next; + Query_Log_Top = query_log; + } + if (query_log != NULL) + { + Query_Log_End = (QueryLogType *)query_log->next; + } + else + { + Query_Log_End = (QueryLogType *)NULL; + } + } + + show_debug("%s:exit.",func); + + return STATUS_OK; +} + +pid_t +PGR_RLog_Main(void) +{ + char * func = "PGR_RLog_Main()"; + int afd = -1; + int rtn; + struct sockaddr addr; + socklen_t addrlen; + pid_t pid = 0; + pid_t pgid = 0; + + extern int fork_wait_time; + + if (Replicateion_Log == NULL) + { + 
show_error("%s:Replicateion_Log is NULL",func); + return -1; + } + pgid = getpgid(0); + if ((pid = fork()) != 0 ) + { + return pid; + } + PGRsignal(SIGTERM,exit_rlog); + PGRsignal(SIGINT,exit_rlog); + PGRsignal(SIGQUIT,exit_rlog); + PGRsignal(SIGPIPE,SIG_IGN); + + if (PGRinit_transaction_table() != STATUS_OK) + { + show_error("RLog process transaction table memory allocate failed"); + return -1; + } + + setpgid(0,pgid); + RLog_Recv_Sock = create_recv_rlog_socket(); + if(RLog_Recv_Sock == -1) + { + show_error("rlog socket creation failure.quit all process."); + kill(pgreplicate_pid, SIGINT); + exit_rlog(0); + } + + if (fork_wait_time > 0) { +#ifdef PRINT_DEBUG + show_debug("rlog process: wait fork(): pid = %d", getpid()); +#endif + sleep(fork_wait_time); + } + + for (;;) + { + fd_set rmask; + struct timeval timeout; + + timeout.tv_sec = PGR_Replication_Timeout; + timeout.tv_usec = 0; + + Idle_Flag = IDLE_MODE ; + if (Exit_Request) + { + exit_rlog(0); + } + /* + * Wait for something to happen. + */ + FD_ZERO(&rmask); + FD_SET(RLog_Recv_Sock,&rmask); + rtn = select(RLog_Recv_Sock+1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout); + if (rtn < 0) + { + if (errno == EINTR || errno == EAGAIN) + continue; + } + else if (rtn && FD_ISSET(RLog_Recv_Sock, &rmask)) + { + Idle_Flag = BUSY_MODE ; + addrlen = sizeof(addr); + afd = accept(RLog_Recv_Sock, &addr, &addrlen); + if (afd < 0) + { + continue; + } + else + { + do_rlog(afd); + close(afd); + } + } + } + exit(0); +} + +static int +create_recv_rlog_socket(void) +{ + char * func = "create_recv_socket()"; + struct sockaddr_un addr; + int fd; + int status; + int len; + + /* set unix domain socket path */ + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd == -1) + { + show_error("%s:Failed to create UNIX domain socket. 
reason: %s",func, strerror(errno)); + return -1; + } + memset((char *) &addr, 0, sizeof(addr)); + ((struct sockaddr *)&addr)->sa_family = AF_UNIX; + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/.s.PGRLOG.%d", + PGR_Write_Path, + Replicateion_Log->RLog_Port_Number); +fprintf(stderr,"addr.sun_path[%s]\n",addr.sun_path); + if (Replicateion_Log->RLog_Sock_Path == NULL) + { + Replicateion_Log->RLog_Sock_Path = strdup(addr.sun_path); +fprintf(stderr,"Replicateion_Log->RLog_Sock_Path[%s]\n",Replicateion_Log->RLog_Sock_Path); + } + len = sizeof(struct sockaddr_un); + status = bind(fd, (struct sockaddr *)&addr, len); + if (status == -1) + { + show_error("%s: bind() failed. reason: %s", func, strerror(errno)); + return -1; + } + + if (chmod(addr.sun_path, 0770) == -1) + { + show_error("%s: chmod() failed. reason: %s", func, strerror(errno)); + return -1; + } + + status = listen(fd, 1000000); + if (status < 0) + { + show_error("%s: listen() failed. reason: %s", func, strerror(errno)); + return -1; + } + return fd; +} + +int +PGRcreate_send_rlog_socket(void) +{ + char * func = "create_recv_socket()"; + struct sockaddr_un addr; + int fd; + int len; + + /* set unix domain socket path */ + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd == -1) + { + show_error("%s:Failed to create UNIX domain socket. 
reason: %s",func, strerror(errno)); + return -1; + } + memset((char *) &addr, 0, sizeof(addr)); + ((struct sockaddr *)&addr)->sa_family = AF_UNIX; + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/.s.PGRLOG.%d", + PGR_Write_Path, + Replicateion_Log->RLog_Port_Number); + if (Replicateion_Log->RLog_Sock_Path == NULL) + { + Replicateion_Log->RLog_Sock_Path = strdup(addr.sun_path); + } + len = sizeof(struct sockaddr_un); + if (connect(fd, (struct sockaddr *)&addr, len) < 0) + { + close(fd); + return -1; + } + return fd; +} + +static int +do_rlog(int fd) +{ + char * func = "do_rlog()"; + QueryLogType * query_log = NULL; + ReplicateHeader header; + char * query = NULL; + int status = STATUS_OK; + + memset(&header,0,sizeof(header)); + query = PGRread_packet(fd, &header); + show_debug("%s:got result:cmdSys='%c'",func,header.cmdSys); + if (header.cmdSys == 0) + { + return STATUS_ERROR; + } + switch (header.cmdSys) + { + case CMD_SYS_REPLICATE: + if (header.cmdSts != CMD_STS_DELETE_QUERY) + { + query_log = get_query_log_by_header(&header); + if (query_log != (QueryLogType*)NULL) + { + memcpy(&header,query_log->header,sizeof(ReplicateHeader)); + } + send_message(fd,(char *)&header,sizeof(ReplicateHeader)); + header.cmdSts = CMD_STS_DELETE_QUERY; + PGRwrite_rlog(&header, NULL); + } + else + { + status = PGRwrite_rlog((ReplicateHeader*)&header,(char *)NULL); + send_message(fd,(char *)&status,sizeof(status)); + } + break; + case CMD_SYS_LOG: + status = PGRwrite_rlog((ReplicateHeader*)&header, query); + send_message(fd,(char *)&status,sizeof(status)); + break; + case CMD_SYS_CALL: + if (header.cmdSts == CMD_STS_TRANSACTION_ABORT) + { +#ifdef PRINT_DEBUG + show_debug("%s: CMD_STS_TRANSACTION_ABORT",func); +#endif + status = reconfirm_commit(&header); + } + else if (header.cmdSts == CMD_STS_QUERY_SUSPEND) + { +#ifdef PRINT_DEBUG + show_debug("%s: CMD_STS_QUERY_SUSPEND",func); +#endif + // status = PGRresend_rlog_to_db(); + } +#ifdef PRINT_DEBUG + show_debug("%s: SYS_CALL process 
done",func); +#endif + send_message(fd,(char *)&status,sizeof(status)); + break; + } + show_debug("%s:process result done:cmdSys='%c'",func,header.cmdSys); + return STATUS_OK; +} + +#if 0 +static int +PGRget_sync_data(ReplicateHeader *header) +{ + ReplicateHeader rlog_header; + + if (header == NULL) + { + return STATUS_ERROR; + } + + + if (Replicateion_Log->r_log_sock > 0) + { + close(Replicateion_Log->r_log_sock ); + Replicateion_Log->r_log_sock = -1; + } + Replicateion_Log->r_log_sock = PGRcreate_send_rlog_socket(); + if (Replicateion_Log->r_log_sock == -1) + return STATUS_ERROR; + + memset(&rlog_header,0,sizeof(ReplicateHeader)); + send_message( Replicateion_Log->r_log_sock, (char *)header,sizeof(ReplicateHeader)); + recv_message( Replicateion_Log->r_log_sock, (char *)&rlog_header,sizeof(ReplicateHeader)); + if (rlog_header.cmdSts != 0) + { + close(Replicateion_Log->r_log_sock ); + Replicateion_Log->r_log_sock = -1; + memcpy(header,&rlog_header, sizeof(ReplicateHeader)); + return STATUS_OK; + } + + close(Replicateion_Log->r_log_sock ); + Replicateion_Log->r_log_sock = -1; + + return STATUS_ERROR; + +} + +static int +PGRdelete_sync_data(ReplicateHeader *header) +{ + int status; + char cmdSts; + + cmdSts = header->cmdSts; + header->cmdSts = CMD_STS_DELETE_QUERY; + if (header == NULL) + { + return STATUS_ERROR; + } + + if (Replicateion_Log->r_log_sock > 0) + { + close(Replicateion_Log->r_log_sock ); + Replicateion_Log->r_log_sock = -1; + } + Replicateion_Log->r_log_sock = PGRcreate_send_rlog_socket(); + if (Replicateion_Log->r_log_sock == -1) + return STATUS_ERROR; + + send_message( Replicateion_Log->r_log_sock, (char *)header,sizeof(ReplicateHeader)); + recv_message( Replicateion_Log->r_log_sock, (char *)&status,sizeof(status)); + header->cmdSts = cmdSts; + + close(Replicateion_Log->r_log_sock ); + Replicateion_Log->r_log_sock = -1; + + return status; + +} +#endif /* #if 0 */ + +int +PGRsend_rlog_packet(int sock,ReplicateHeader * header, const char * query_string) 
+{
+	/*
+	 * Build one contiguous packet (header followed by query_size bytes of
+	 * query text) and hand it to send_message().  The buffer is allocated
+	 * with 4 spare, zero-filled bytes that are not transmitted.
+	 * Returns the send_message() status, or STATUS_ERROR on bad arguments
+	 * or allocation failure.
+	 */
+	char * buf = NULL;
+	int buf_size = 0;
+	int header_size = 0;
+	int query_size = 0;
+	int rtn = 0;
+
+	/* check parameter */
+	if ((sock <= 0) || (header == NULL))
+	{
+		return STATUS_ERROR;
+	}
+	if (query_string != NULL)
+	{
+		query_size = ntohl(header->query_size);
+	}
+	if (query_size < 0)
+	{
+		/* a negative size would make the buffer smaller than the header */
+		return STATUS_ERROR;
+	}
+	header_size = sizeof(ReplicateHeader);
+	buf_size = header_size + query_size + 4;
+	buf = (char *)malloc(buf_size);
+	if (buf == (char *)NULL)
+	{
+		return STATUS_ERROR;
+	}
+	memset(buf,0,buf_size);
+	buf_size -= 4;
+	memcpy(buf,header,header_size);
+	if (query_size > 0)
+	{
+		/*
+		 * Copy exactly query_size bytes: only header_size + query_size
+		 * bytes are sent and the tail of buf is already zeroed, so reading
+		 * one byte past the declared size of query_string is unnecessary.
+		 */
+		memcpy((char *)(buf+header_size),query_string,query_size);
+	}
+	rtn = send_message(sock,buf,buf_size);
+	free(buf);
+	return rtn;
+}
+
+/*
+ * PGRrecv_rlog_result()
+ *
+ * Wait until 'sock' becomes readable and then read exactly 'size' bytes
+ * into 'result' via recv_message().
+ *
+ * Returns the recv_message() result (the byte count on success, -1 on
+ * failure), or -1 on bad arguments or a select() error.  A select() that
+ * is interrupted (EINTR/EAGAIN) or times out is simply retried, so the
+ * call blocks until data arrives or a real error occurs.
+ */
+int
+PGRrecv_rlog_result(int sock,void * result, int size)
+{
+	char *func = "PGRrecv_rlog_result";
+	fd_set	rmask;
+	struct timeval timeout;
+	int rtn;
+
+	if ((sock < 0) || (result == (void *)NULL) || (size <= 0))
+	{
+		return -1;
+	}
+
+	/*
+	 * Wait for something to happen.
+	 */
+	rtn = 1;
+	for (;;)
+	{
+		/* select() may modify the timeout, so re-arm it every round */
+		timeout.tv_sec = PGR_Replication_Timeout;
+		timeout.tv_usec = 0;
+
+		FD_ZERO(&rmask);
+		FD_SET(sock,&rmask);
+		rtn = select(sock+1, &rmask, (fd_set *)NULL, (fd_set *)NULL, &timeout);
+		if (rtn < 0)
+		{
+			/*
+			 * Only a real failure ends the wait.  The previous test used
+			 * "||", which is true for every errno value and therefore
+			 * turned an interrupted select() into an error.
+			 */
+			if (errno != EINTR && errno != EAGAIN)
+			{
+				show_error("%s: select() failed (%s)",func,strerror(errno));
+				return -1;
+			}
+		}
+		else if (rtn && FD_ISSET(sock, &rmask))
+		{
+			return (recv_message(sock, (char*)result, size));
+		}
+	}
+	return -1;
+}
+
+
+static int
+recv_message(int sock,char * buf, int len)
+{
+	char *func = "recv_message";
+	int cnt = 0;
+	int r = 0;
+	char * read_ptr;
+	int read_size = 0;
+	cnt = 0;
+	read_ptr = buf;
+
+	for (;;)
+	{
+		r = recv(sock,read_ptr + read_size ,len - read_size, 0);
+		if (r < 0)
+		{
+			if (errno == EINTR || errno == EAGAIN)
+				continue;
+			else
+			{
+				show_error("%s:recv failed: %d(%s)",func, errno, strerror(errno));
+				return -1;
+			}
+		}
+		else if (r > 0)
+		{
+			read_size += r;
+			if (read_size == len)
+				return read_size;
+		}
+		else /* r == 0 */
+		{
show_error("%s:unexpected EOF", func); + return -1; + } + } + return -1; +} + +int +PGRsend_rlog_to_local(ReplicateHeader * header,char * query) +{ + int status = STATUS_OK; + + if (Replicateion_Log == NULL) + { + return STATUS_ERROR; + } + + if (Replicateion_Log->r_log_sock > 0) + { + close(Replicateion_Log->r_log_sock ); + Replicateion_Log->r_log_sock = -1; + } + + Replicateion_Log->r_log_sock = PGRcreate_send_rlog_socket(); + if (Replicateion_Log->r_log_sock == -1) + return STATUS_ERROR; + + show_debug("send_to_local %s",query); + status = PGRsend_rlog_packet(Replicateion_Log->r_log_sock,header,query); + show_debug("send_to_local result is %d,errno=%d(%s)",status,errno ,strerror(errno)); + + if (status != STATUS_ERROR) + { + PGRrecv_rlog_result(Replicateion_Log->r_log_sock,&status, sizeof(status)); + } + + close(Replicateion_Log->r_log_sock ); + Replicateion_Log->r_log_sock = -1; + + return status; +} + +int +PGRget_rlog_header(ReplicateHeader * header) +{ + int status = STATUS_OK; + ReplicateHeader rlog_header; + + if ((Replicateion_Log == NULL) || + (header == NULL)) + { + return STATUS_ERROR; + } + + if (Replicateion_Log->r_log_sock > 0) + { + close(Replicateion_Log->r_log_sock ); + Replicateion_Log->r_log_sock = -1; + } + Replicateion_Log->r_log_sock = PGRcreate_send_rlog_socket(); + if (Replicateion_Log->r_log_sock == -1) + return STATUS_ERROR; + + memcpy(&rlog_header,header,sizeof(ReplicateHeader)); + rlog_header.cmdSys = CMD_SYS_REPLICATE; + rlog_header.query_size = 0; + status =PGRsend_rlog_packet(Replicateion_Log->r_log_sock,&rlog_header,""); + if (status != STATUS_ERROR) + { + status = PGRrecv_rlog_result(Replicateion_Log->r_log_sock,&rlog_header, sizeof(ReplicateHeader)); + if (status > 0) + { + memcpy(header,&rlog_header,sizeof(ReplicateHeader)); + status = STATUS_OK; + } + else + { + status = STATUS_ERROR; + } + } + + close(Replicateion_Log->r_log_sock ); + Replicateion_Log->r_log_sock = -1; + + return status; +} + +static int +send_message(int 
sock, char * msg, int len) +{ + char * func = "send_message()"; + fd_set wmask; + struct timeval timeout; + int rtn = 0; + char * send_ptr = NULL; + int send_size= 0; + int buf_size = 0; + int s = 0; + int flag = 0; + + if ((msg == NULL) || (len <= 0) || (sock <= 0)) + { + return STATUS_ERROR; + } + send_ptr = msg; + buf_size = len; + + /* + * Send all `len` bytes of `msg`, waiting in select() until the socket is writable; returns STATUS_OK or STATUS_ERROR. + */ +#ifdef MSG_DONTWAIT + flag |= MSG_DONTWAIT; +#endif +#ifdef MSG_NOSIGNAL + flag |= MSG_NOSIGNAL; +#endif + + for (;;) + { + timeout.tv_sec = PGR_Replication_Timeout; + timeout.tv_usec = 0; + + FD_ZERO(&wmask); + FD_SET(sock,&wmask); + rtn = select(sock+1, (fd_set *)NULL, &wmask, (fd_set *)NULL, &timeout); + + if (rtn < 0 ) + { + if (errno == EAGAIN || errno == EINTR) + continue; + + show_error("%s:send-select error: %d(%s)",func,errno,strerror(errno)); + return STATUS_ERROR; + } + else if (rtn && FD_ISSET(sock, &wmask)) /* logical AND: FD_ISSET is only guaranteed to be non-zero, not 1 */ + { + s = send(sock,send_ptr + send_size,buf_size - send_size ,flag); + if (s < 0) + { + if (errno == EINTR || errno == EAGAIN) + continue; + else + { + show_error("%s:send error: %d(%s)",func,errno,strerror(errno)); + memset(send_ptr, 0, len); + return STATUS_ERROR; + } + } + else if (s == 0) + { + show_error("%s:unexpected EOF", func); + memset(send_ptr, 0, len); + return STATUS_ERROR; + } + else /* s > 0 */ + { + send_size += s; + if (send_size == buf_size) + { + return STATUS_OK; + } + } + } + } + show_error("%s:send-select unknown error: %d(%s)", + func,errno,strerror(errno)); + return STATUS_ERROR; +} + +static void +exit_rlog(int sig) +{ + sigset_t mask; + + Exit_Request = true; + if (sig == SIGTERM) + { + if (Idle_Flag == BUSY_MODE) + { + return; + } + } + + sigemptyset(&mask); + sigaddset(&mask, SIGINT); + sigaddset(&mask, SIGTERM); + sigaddset(&mask, SIGQUIT); + sigprocmask(SIG_BLOCK, &mask, NULL); + + if (RLog_Recv_Sock >= 0) + { + close(RLog_Recv_Sock); + RLog_Recv_Sock = -1; + } + if ((Replicateion_Log != NULL) && (Replicateion_Log->RLog_Sock_Path != NULL)) /* the signal may arrive before the log area exists */ + { + 
unlink(Replicateion_Log->RLog_Sock_Path); + free(Replicateion_Log->RLog_Sock_Path); + } + exit(0); +} +#endif /* USE_REPLICATION */ diff -aruN postgresql-8.2.4/src/pgcluster/tool/Makefile pgcluster-1.7.0rc7/src/pgcluster/tool/Makefile --- postgresql-8.2.4/src/pgcluster/tool/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/tool/Makefile 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,32 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/pgcluster/tool +# +#------------------------------------------------------------------------- + +subdir = src/pgcluster/tool +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS= pgcbench.o + +# EXTRA_OBJS = $(top_builddir)/src/backend/libpq/replicate_com.o + +override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) -DBINDIR=\"$(bindir)\" + +all: pgcbench + +pgcbench: $(OBJS) $(libpq_builddir)/libpq.a + $(CC) $(CFLAGS) $(OBJS) $(EXTRA_OBJS) $(libpq) $(LDFLAGS) $(LIBS) -o $@ + +install: all installdirs + $(INSTALL_PROGRAM) pgcbench$(X) $(DESTDIR)$(bindir)/pgcbench$(X) + +installdirs: + $(mkinstalldirs) $(DESTDIR)$(bindir) + +uninstall: + rm -f $(addprefix $(DESTDIR)$(bindir)/, pgcbench$(X)) + +clean distclean maintainer-clean: + rm -f pgcbench$(X) $(OBJS) diff -aruN postgresql-8.2.4/src/pgcluster/tool/README.jp pgcluster-1.7.0rc7/src/pgcluster/tool/README.jp --- postgresql-8.2.4/src/pgcluster/tool/README.jp 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/tool/README.jp 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,296 @@ +$Id$ + +■ pgcbench とは + +pgcbench は PGCluster のベンチマークテストを行うプログラムです。もちろ +ん、PGCluster だけではなく PostgreSQL のベンチマークを実施することもで +きます。 + +pgcbench は SELECT、UPDATE、INSERT を含むトランザクションを実行し、全 +体の実行時間と実際に完了したであろうトランザクションの数から 1 秒間に +実行できたトランザクション数 (tps) を表示します。処理の対象となるテー +ブルはデフォルトでは 10 万行のデータを含みます。 + +実際に表示は以下のようになります。 + + number of clients: 4 + number of transactions actually processed: 100 + run time 
(sec) = 4.416423 + tps = 22.642759 (including connections establishing) + +※ 注意 + + pgcbench では、トランザクションが実際に完了したかどうかに関係なく、 + 最初に指定されたトランザクションの数をもとに tps を求めているため、 + ベンチマークが途中で終了してしまった場合、表示される tps が信用でき + ない可能性があることに注意してください。 + +なお、pgcbench は pgbench という PostgreSQL 用に書かれたベンチマークテ +ストを行なうプログラムをもとに作成されました。 + + +■ pgbench との違い + +・ マルチユーザ環境の実現方法 + + pgbench が libpq の非同期処理機能を使ったシングルプロセスによってシ + ミュレートしているのに対して、pgcbench では fork を使ったマルチプロ + セスによって実現しています。これは、シングルプロセスではロックが発 + 生すると、PGCluster が止まってしまうことを避けるためです。 + +・ オプションの変更 + + pgcbench には、PGCluster のベンチマークテストを行なうのに便利な、ト + ランザクションの内容を指定するオプションが、いくつか追加されていま + す。 + + +■ pgcbench のインストール + +1. PGCluster を configure、make します。 + + pgcbench のインストールだけが目的であれば、PGCluster のすべてをコン + パイルする必要はありません。PGCluster ソースのトップディレクトリで + configure をした後、src/interfaces/libpq で make all を実行すれば準 + 備完了です。 + +2. このディレクトリ (src/pgcluster/tool) で make を実行します。 + + そうすると、pgcbench という実行プログラムができます。そのまま実行し + ても構いませんし、make install を実行して PGCluster の標準実行プロ + グラムディレクトリ (デフォルトでは /usr/local/pgsql/bin) にインストー + ルすることもできます。 + + +■ pgcbench の使い方 + + pgcbench [オプション] [データベース名] + +データベース名を省略すると、ログイン名と同じデータベース名を指定したも +のと見なします。なお、pgcbench でデフォルトのベンチマークを実施するた +めには、後述の -i オプションを使用してデータベースをあらかじめ初期化し +ておく必要があります。 + +pgcbench にはいろいろなオプションがあります。 + +-h ホスト名 + + PostgreSQL のデータベースデーモン postmaster の動作しているホスト名 + を指定します。省略すると環境変数 PGHOST に設定したホスト名が指定さ + れます。PGHOST も指定されていないと自ホストに Unix ドメインソケット + で接続します。 + +-p ポート番号 + + postmaster の使用するポート番号を指定します。省略すると環境変数 + PGPORT に設定したポート番号が指定されます。PGPORT も指定されていな + いと 5432 が指定されたものと見なします。 + +-c クライアント数 + + 同時実行クライアント数を指定します。省略時は 1 となります。pgcbench + は同時実行クライアントごとにファイルディスクリプタを使用するので、 + 使用可能ファイルディスクリプタ数を越えるクライアント数は指定できま + せん。使用可能ファイルディスクリプタ数は limit や ulimit コマンドで + 確認することができます。 + +-t トランザクション数 + + トランザクション数を指定します。各クライアントが実行するトランザク + ション数はこれをクライアント数で割った数となります。省略時は 10 と + なります。 + +-s スケーリングファクター + + -i オプションとともに使用します。スケーリングファクターは 1 以上の + 整数です。スケーリングファクターを変えることにより、テストの対象と + なるテーブルの大きさが 10 万 × スケーリングファクターになります。 + デフォルトのスケーリングファクターは 1 です。 + +-u ログイン名 + + DB 
ユーザのログイン名を指定します。省略すると環境変数 PGUSER に設定 + したログイン名が指定されます。 + +-P パスワード + + パスワードを指定します。なお、このオプションを使うと、パスワードを + ps コマンドで見られるなど、セキュリティホールになる可能性があるので、 + テスト用にのみお使い下さい。 + +-n + + このオプションを指定すると、ベンチマーク開始前に VACUUM と history + テーブルのクリアを実行しません。 + +-v + + このオプションを指定すると、ベンチマーク開始前に VACUUM と history + テーブルのクリアを実行します。-v と -n を省略すると、最小限の + VACUUM などを実行します。すなわち、history テーブルのクリアと、 + branches、tellers、history テーブルの VACUUM を実行します。これは、 + VACUUM の時間を最小限にしながら、パフォーマンスに影響するゴミ掃除を + 効果的に実行します。通常は -v と -n を省略することを推奨します。 + +-I + + 挿入のみのトランザクションを実行します。挿入スピードを測定したいと + きに使います。 + +-U + + 更新のみのトランザクションを実行します。更新スピードを測定したいと + きに使います。 + +-S + + 検索のみのトランザクションを実行します。検索スピードを測定したいと + きに使います。 + +-f ファイル名 + + トランザクションの内容が記述されたファイル名を指定します。このオプ + ションを指定すると、ファイルに記述された内容のトランザクションを実 + 行します。ベンチマークで使用するテーブルはあらかじめ初期化しておく + 必要があります。入力ファイルのフォーマットは後述します。 + +-T + + BEGIN と END で囲まれたトランザクションブロック内でトランザクション + を実行します。 + +-C + + このオプションを指定すると、最初に確立したコネクションを使い回すの + ではなく、各トランザクションごとに DB への接続を行います。コネクショ + ンのオーバーへッドを測定するのに有効です。 + +-l + + 個々のトランザクションの実行時間を記録します。記録先はカレントディ + レクトリ以下の pgbench_log.xxx というファイルです。ファイルのフォー + マットは以下のようになります。時間はマイクロ秒単位です。 + + <クライアント ID> <トランザクション番号> <時間> + +-d + + デバッグオプション。様々な情報が表示されます。 + + +■ データベースの初期化 + +pgcbench でデフォルトのベンチマークテストを実施するためには、あらかじ +めデータベースを初期化し、テストデータを作る必要があります。 + + pgcbench -i [データベース名] + +これにより以下のテーブルが作成されます (スケーリングファクターが 1 の +場合)。 + + テーブル名 | 行数 + ------------+-------- + branches | 1 + tellers | 10 + accounts | 100000 + history | 0 + +※ 注意 + + 同じ名前のテーブルがあると削除されてしまうのでご注意下さい。 + +スケーリングファクターを 10、100、1000 などに変更すると、上記行数はそ +れに応じて 10 倍、100 倍、1000 倍になります。例えば、スケーリングファ +クターを 100 とすると以下のようになります。 + + テーブル名 | 行数 + ------------+---------- + branches | 100 + tellers | 1000 + accounts | 10000000 + history | 0 + + +■ 入力ファイルのフォーマット + +pgcbench では、-f オプションを指定してトランザクションに含まれる SQL +コマンドの内容を記述したファイルを読み込むことができます。入力ファイル +には 1 行につき 1 つのコマンドを記述します。空行は無視され、二重ハイフ +ンで始まる行はコメントを意味します。 + +コマンドには、SQL コマンドに加え、バックスラッシュで始まるメタコマンド +を記述することができます。メタコマンドは pgcbench 自身によって実行され +ます。メタコマンドの形式はバックスラッシュ、その直後にコマンドの動詞、 
+その次に引数が続きます。動詞コマンドと引数、またそれぞれの引数は空白文 +字によって区別されます。 + +今のところ、以下のメタコマンドが定義されています。 + +\setrandom name min max + + 最小値 min と最大値 max の間の値を取る乱数を、name 変数に設定します。 + +変数に乱数を設定するには、\setrandom メタコマンドを使用して以下のよう +に記述します。 + + \setrandom aid 1 100000 + +これは、変数 aid に 1 から 100000 の間の乱数を設定します。また、変数の +値を SQL コマンドに埋め込むには、以下のようにその名前の前にコロンを付 +けます。 + + SELECT abalance FROM accounts WHERE aid = :aid + +例えば、TPC-B に似たベンチマークを行うには、以下のようにトランザクショ +ンの内容をファイルに記述し、-f オプションによってそのファイルを指定し +て pgcbench を実行します。 + + \setrandom aid 1 100000 + \setrandom bid 1 1 + \setrandom tid 1 10 + \setrandom delta 1 1000 + BEGIN + UPDATE accounts SET abalance = abalance + :delta WHERE aid = :aid + SELECT abalance FROM accounts WHERE aid = :aid + UPDATE tellers SET tbalance = tbalance + :delta WHERE tid = :tid + UPDATE branches SET bbalance = bbalance + :delta WHERE bid = :bid + INSERT INTO history (tid, bid, aid, delta, mtime) VALUES (:tid, :bid, :aid, :delta, 'now') + +なお、この例では、-i オプションを使って初期化したデータベース (スケー +リングファクターが 1 の場合) に対してベンチマークを行うことを仮定して +います。 + + +■ トランザクションの定義 + +pgcbench のデフォルトのベンチマークでは、以下の SQL コマンドを全部完了 +して 1 トランザクションと数えています。 + +1. SELECT abalance FROM accounts WHERE aid = :aid + + :aid は 1 からスケーリングファクター × 10 万までの値を取る乱数です。 + ここでは 1 件だけ検索されます。以下、乱数の値はそれぞれこのトランザ + クションの中では同じ値を使います。 + +2. UPDATE accounts SET abalance = abalance + :delta WHERE aid = :aid + + :delta は 1 から 1000 までの値を取る乱数です。 + +3. SELECT abalance FROM accounts WHERE aid = :aid + +4. INSERT INTO history (tid, bid, aid, delta, mtime) VALUES (:tid, :bid, :aid, :delta, 'now') + + :tid は 1 からスケーリングファクター × 10 までの値を取る乱数、:bid + は 1 からスケーリングファクターまでの値を取る乱数です。 + +5. INSERT INTO history (tid, bid, aid, delta, mtime) VALUES (:tid, :bid, :aid, :delta, 'now') + +6. INSERT INTO history (tid, bid, aid, delta, mtime) VALUES (:tid, :bid, :aid, :delta, 'now') + +7. 
SELECT abalance FROM accounts WHERE aid = :aid + +-T オプションを指定してトランザクションブロック内でトランザクションを +実行する場合、1 を BEGINに、7 を END に置き換えた SQL コマンドが実行さ +れます。また、トランザクションとして実行される SQL コマンドは、-I オプ +ション (挿入のみ) であれば 4、-U (更新のみ) であれば 2、-S (検索のみ) +であれば 1 となります。 diff -aruN postgresql-8.2.4/src/pgcluster/tool/pgcbench.c pgcluster-1.7.0rc7/src/pgcluster/tool/pgcbench.c --- postgresql-8.2.4/src/pgcluster/tool/pgcbench.c 1970-01-01 01:00:00.000000000 +0100 +++ pgcluster-1.7.0rc7/src/pgcluster/tool/pgcbench.c 2007-02-18 22:52:17.000000000 +0100 @@ -0,0 +1,1625 @@ +/* + * pgbench: a simple benchmark program for PGCluster + * This program was written based on pgbench by Tatsuo Ishii. + * + * Portions Copyright (c) 2003-2006, Atsushi Mitani + * Portions Copyright (c) 2000-2006, Tatsuo Ishii + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that the above copyright notice appear in all + * copies and that both that copyright notice and this permission + * notice appear in supporting documentation, and that the name of the + * author not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. The author makes no representations about the + * suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + */ +#include "postgres_fe.h" + +#include "libpq-fe.h" + +#include + +#ifdef WIN32 +#include "win32.h" +#else +#include +#include + +#ifdef HAVE_GETOPT_H +#include +#endif + +#ifdef HAVE_SYS_SELECT_H +#include +#endif + +/* for getrlimit */ +#include +#endif /* ! 
WIN32 */ + +#include +#include + +#include +#include + +extern char *optarg; +extern int optind; + +#ifdef WIN32 +#undef select +#endif + + +/******************************************************************** + * some configurable parameters */ + +#define MAXCLIENTS 4096 /* max number of clients allowed */ + +int nclients = 1; /* default number of simulated clients */ +int nxacts = 10; /* default number of transactions per + * clients */ + +/* + * scaling factor. for example, tps = 10 will make 1000000 tuples of + * accounts table. + */ +int tps = 1; + +/* + * end of configurable parameters + *********************************************************************/ + +#define nbranches 1 +#define ntellers 10 +#define naccounts 100000 + +#define SELECT_ONLY (1) +#define INSERT_ONLY (2) +#define UPDATE_ONLY (3) +#define WITH_TRANSACTION (4) +#define TPC_B_LIKE (5) +#define CUSTOM_QUERY (6) + +#define SQL_COMMAND 1 +#define META_COMMAND 2 + +FILE *LOGFILE = NULL; + +bool use_log = false; /* log transaction latencies to a file */ + +int remains; /* number of remaining clients */ + +int is_connect; /* establish connection for each + * transaction */ + +char *pghost = ""; +char *pgport = NULL; +char *pgoptions = NULL; +char *pgtty = NULL; +char *login = NULL; +char *pwd = NULL; +char *dbName; + +typedef struct +{ + char *name; + char *value; +} Variable; + +typedef struct +{ + PGconn *con; /* connection handle to DB */ + int id; /* client No. */ + int state; /* state No. 
*/ + int cnt; /* xacts count */ + int ecnt; /* error count */ + int maxAct; + int listen; /* 0 indicates that an async query has + * been sent */ + int aid; /* account id for this transaction */ + int bid; /* branch id for this transaction */ + int tid; /* teller id for this transaction */ + int delta; + int abalance; + void *variables; + struct timeval txn_begin; /* used for measuring latencies */ +} CState; + +typedef struct +{ + int type; + int argc; + char **argv; +} Command; + +Command **commands = NULL; + +static void +usage(void) +{ + fprintf(stderr, "usage: pgcbench [-h hostname][-p port][-c nclients][-t ntransactions][-s scaling_factor][-I(insert only)][-U(update only)][-S(select only)][-f filename][-u login][-P password][-d(debug)][dbname]\n"); + fprintf(stderr, "(initialize mode): pgcbench -i [-h hostname][-p port][-s scaling_factor][-u login][-P password][-d(debug)][dbname]\n"); +} + +/* random number generator: returns an integer in [min, max] inclusive; callers must pass min <= max */ +static int +getrand(int min, int max ) +{ + + return (min + (int) (((double) max - min + 1.0) * rand() / (RAND_MAX + 1.0))); /* scale by the span (max - min + 1), not by max, so the result never exceeds max */ +} + +/* set up a connection to the backend */ +static PGconn * +doConnect(void) +{ + PGconn *con; + PGresult *res; + + con = PQsetdbLogin(pghost, pgport, pgoptions, pgtty, dbName, + login, pwd); + if (con == NULL) + { + fprintf(stderr, "Connection to database '%s' failed.\n", dbName); + fprintf(stderr, "Memory allocatin problem?\n"); + return (NULL); + } + + if (PQstatus(con) == CONNECTION_BAD) + { + fprintf(stderr, "Connection to database '%s' failed.\n", dbName); + + if (PQerrorMessage(con)) + fprintf(stderr, "%s", PQerrorMessage(con)); + else + fprintf(stderr, "No explanation from the backend\n"); + + return (NULL); + } + + res = PQexec(con, "SET search_path = public"); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + fprintf(stderr, "%s", PQerrorMessage(con)); + exit(1); + } + PQclear(res); + + return (con); +} + +/* throw away response from backend */ +static void +discard_response(CState * state) +{ + PGresult *res; + + do + 
{ + res = PQgetResult(state->con); + if (res) + PQclear(res); + } while (res); +} + +/* check to see if the SQL result was good */ +static int +check(CState * st, PGresult *res, int good) +{ + if (res && PQresultStatus(res) != good) + { + fprintf(stderr, "aborted in state %d: %s", st->state, PQerrorMessage(st->con)); + PQfinish(st->con); + st->con = NULL; + return (-1); + } + return (0); /* OK */ +} + +static int +compareVariables(const void *v1, const void *v2) +{ + return strcmp(((Variable *)v1)->name, ((Variable *)v2)->name); +} + +static char * +getVariable(CState * st, char *name) +{ + Variable key = { name }, *var; + + var = tfind(&key, &st->variables, compareVariables); + if (var != NULL) + return (*(Variable **)var)->value; + else + return NULL; +} + +static int +putVariable(CState * st, char *name, char *value) +{ + Variable key = { name }, *var; + + var = tfind(&key, &st->variables, compareVariables); + if (var == NULL) + { + if ((var = malloc(sizeof(Variable))) == NULL) + return false; + + var->name = NULL; + var->value = NULL; + + if ((var->name = strdup(name)) == NULL + || (var->value = strdup(value)) == NULL + || tsearch(var, &st->variables, compareVariables) == NULL) + { + free(var->name); + free(var->value); + free(var); + return false; + } + } + else + { + free((*(Variable **)var)->value); + if (((*(Variable **)var)->value = strdup(value)) == NULL) + return false; + } + + return true; +} + +static char * +assignVariables(CState * st, char *sql) +{ + int i, j; + char *p, *name, *val; + void *tmp; + + i = 0; + while ((p = strchr(&sql[i], ':')) != NULL) + { + i = j = p - sql; + do + i++; + while (isalnum(sql[i]) != 0 || sql[i] == '_'); + if (i == j + 1) + continue; + + name = malloc(i - j); + if (name == NULL) + return NULL; + memcpy(name, &sql[j + 1], i - (j + 1)); + name[i - (j + 1)] = '\0'; + val = getVariable(st, name); + free(name); + if (val == NULL) + continue; + + if (strlen(val) > i - j) + { + tmp = realloc(sql, strlen(sql) - (i - j) + 
strlen(val) + 1); + if (tmp == NULL) + { + free(sql); + return NULL; + } + sql = tmp; + } + + if (strlen(val) != i - j) + memmove(&sql[j + strlen(val)], &sql[i], strlen(&sql[i]) + 1); + + strncpy(&sql[j], val, strlen(val)); + + if (strlen(val) < i - j) + { + tmp = realloc(sql, strlen(sql) + 1); + if (tmp == NULL) + { + free(sql); + return NULL; + } + sql = tmp; + } + + i = j + strlen(val); + } + + return sql; +} + +/* process a transaction */ +static void +doMix(CState * st, int debug, int ttype) +{ + char sql[256]; + PGresult *res; + + if (st->listen) + { /* are we receiver? */ + if (debug) + fprintf(stderr, "client receiving\n"); + if (!PQconsumeInput(st->con)) + { /* there's something wrong */ + fprintf(stderr, "Client aborted in state %d. Probably the backend died while processing.\n", st->state); + PQfinish(st->con); + st->con = NULL; + return; + } + if (PQisBusy(st->con)) + return; /* don't have the whole result yet */ + + switch (st->state) + { + case 0: /* response to "begin" */ + res = PQgetResult(st->con); + if (ttype == WITH_TRANSACTION) + { + if (check(st, res, PGRES_COMMAND_OK)) + return; + } + else + { + if (check(st, res, PGRES_TUPLES_OK)) + return; + } + PQclear(res); + discard_response(st); + break; + case 1: /* response to "update accounts..." */ + res = PQgetResult(st->con); + if (check(st, res, PGRES_COMMAND_OK)) + return; + PQclear(res); + discard_response(st); + break; + case 2: /* response to "select abalance ..." */ + res = PQgetResult(st->con); + if (check(st, res, PGRES_TUPLES_OK)) + return; + PQclear(res); + discard_response(st); + break; + case 3: /* response to "update tellers ..." */ + res = PQgetResult(st->con); + if (check(st, res, PGRES_COMMAND_OK)) + return; + PQclear(res); + discard_response(st); + break; + case 4: /* response to "update branches ..." */ + res = PQgetResult(st->con); + if (check(st, res, PGRES_COMMAND_OK)) + return; + PQclear(res); + discard_response(st); + break; + case 5: /* response to "insert into history ..." 
*/ + res = PQgetResult(st->con); + if (check(st, res, PGRES_COMMAND_OK)) + return; + PQclear(res); + discard_response(st); + break; + case 6: /* response to "end" */ + + /* + * transaction finished: record the time it took in the + * log + */ + if (use_log) + { + double diff; + struct timeval now; + + gettimeofday(&now, NULL); + diff = (int) (now.tv_sec - st->txn_begin.tv_sec) * 1000000.0 + + (int) (now.tv_usec - st->txn_begin.tv_usec); + + fprintf(LOGFILE, "%d %d %.0f\n", st->id, st->cnt, diff); + } + + res = PQgetResult(st->con); + if (ttype == WITH_TRANSACTION) + { + if (check(st, res, PGRES_COMMAND_OK)) + return; + } + else + { + if (check(st, res, PGRES_TUPLES_OK)) + return; + } + PQclear(res); + discard_response(st); + + if (is_connect) + { + PQfinish(st->con); + st->con = NULL; + } + if (++st->cnt >= st->maxAct) + { + remains--; /* I've done */ + if (st->con != NULL) + { + PQfinish(st->con); + st->con = NULL; + } + return; + } + break; + } + + /* increment state counter */ + st->state++; + if (st->state > 6) + { + st->state = 0; + remains--; /* I've done */ + } + } + + if (st->con == NULL) + { + if ((st->con = doConnect()) == NULL) + { + fprintf(stderr, "Client aborted in establishing connection.\n"); + remains--; /* I've aborted */ + PQfinish(st->con); + st->con = NULL; + return; + } + } + + switch (st->state) + { + case 0: /* about to start */ + if (ttype == WITH_TRANSACTION) + { + strcpy(sql, "begin"); + } + else + { + st->aid = getrand(1, naccounts * tps); + snprintf(sql, 256, "select abalance from accounts where aid = %d", st->aid); + } + st->aid = getrand(1, naccounts * tps); + st->bid = getrand(1, nbranches * tps); + st->tid = getrand(1, ntellers * tps); + st->delta = getrand(1, 1000); + if (use_log) + gettimeofday(&(st->txn_begin), NULL); + break; + case 1: + snprintf(sql, 256, "update accounts set abalance = abalance + %d where aid = %d\n", st->delta, st->aid); + break; + case 2: + snprintf(sql, 256, "select abalance from accounts where aid = %d", 
st->aid); + break; + case 3: + if (ttype == 0) + { + snprintf(sql, 256, "update tellers set tbalance = tbalance + %d where tid = %d\n", + st->delta, st->tid); + break; + } + case 4: + if (ttype == 0) + { + snprintf(sql, 256, "update branches set bbalance = bbalance + %d where bid = %d", st->delta, st->bid); + break; + } + case 5: + snprintf(sql, 256, "insert into history(tid,bid,aid,delta,mtime) values(%d,%d,%d,%d,'now')", + st->tid, st->bid, st->aid, st->delta); + break; + case 6: + if (ttype == WITH_TRANSACTION) + { + strcpy(sql, "end"); + } + else + { + st->aid = getrand(1, naccounts * tps); + snprintf(sql, 256, "select abalance from accounts where aid = %d", st->aid); + } + break; + } + + if (debug) + fprintf(stderr, "client sending %s\n", sql); + + if (PQsendQuery(st->con, sql) == 0) + { + if (debug) + fprintf(stderr, "PQsendQuery(%s)failed\n", sql); + st->ecnt++; + } + else + { + st->listen++; /* flags that should be listened */ + } +} + +/* process a select only transaction */ +static void +doOne(CState * st, int debug, int ttype ) +{ + char sql[256]; + PGresult *res; + + if (st->listen) + { /* are we receiver? */ + if (debug) + fprintf(stderr, "client receiving\n"); + if (!PQconsumeInput(st->con)) + { /* there's something wrong */ + fprintf(stderr, "Client aborted in state %d. Probably the backend died while processing.\n", st->state); + remains--; /* I've aborted */ + PQfinish(st->con); + st->con = NULL; + return; + } + if (PQisBusy(st->con)) + return; /* don't have the whole result yet */ + + switch (st->state) + { + case 0: /* response to "select abalance ..." 
*/ + res = PQgetResult(st->con); + if (ttype == SELECT_ONLY) + { + if (check(st, res, PGRES_TUPLES_OK)) + return; + } + else + { + if (check(st, res, PGRES_COMMAND_OK)) + return; + } + PQclear(res); + discard_response(st); + + if (is_connect) + { + PQfinish(st->con); + st->con = NULL; + } + + if (++st->cnt >= st->maxAct) + { + remains--; /* I've done */ + if (st->con != NULL) + { + PQfinish(st->con); + st->con = NULL; + } + return; + } + break; + } + + /* increment state counter */ + st->state++; + if (st->state > 0) + { + st->state = 0; + remains--; /* I've done */ + } + } + + if (st->con == NULL) + { + if ((st->con = doConnect()) == NULL) + { + fprintf(stderr, "Client aborted in establishing connection.\n"); + PQfinish(st->con); + st->con = NULL; + return; + } + } + + switch (st->state) + { + case 0: + st->aid = getrand(1, naccounts * tps); + st->bid = getrand(1, nbranches * tps); + st->tid = getrand(1, ntellers * tps); + st->delta = getrand(1, 1000); + if ( ttype == SELECT_ONLY) + { + snprintf(sql, 256, "select abalance from accounts where aid = %d", st->aid); + } + if ( ttype == UPDATE_ONLY) + { + snprintf(sql, 256, "update accounts set abalance = abalance + %d where aid = %d\n", st->delta, st->aid); + } + if ( ttype == INSERT_ONLY) + { + snprintf(sql, 256, "insert into history(tid,bid,aid,delta,mtime) values(%d,%d,%d,%d,'now')", + st->tid, st->bid, st->aid, st->delta); + } + break; + } + + if (debug) + fprintf(stderr, "client sending %s\n", sql); + + if (PQsendQuery(st->con, sql) == 0) + { + if (debug) + fprintf(stderr, "PQsendQuery(%s)failed\n", sql); + st->ecnt++; + } + else + { + st->listen++; /* flags that should be listened */ + } +} + +static void +doCustom(CState * st, int debug, int ttype ) +{ + PGresult *res; + + if (st->listen) + { /* are we receiver? 
*/ + if (commands[st->state]->type == SQL_COMMAND) + { + if (debug) + fprintf(stderr, "client receiving\n"); + if (!PQconsumeInput(st->con)) + { /* there's something wrong */ + fprintf(stderr, "Client aborted in state %d. Probably the backend died while processing.\n", st->state); + PQfinish(st->con); + st->con = NULL; + return; + } + if (PQisBusy(st->con)) + return; /* don't have the whole result yet */ + } + + /* + * transaction finished: record the time it took in the + * log + */ + if (use_log && commands[st->state + 1] == NULL) + { + double diff; + struct timeval now; + + gettimeofday(&now, NULL); + diff = (int) (now.tv_sec - st->txn_begin.tv_sec) * 1000000.0 + + (int) (now.tv_usec - st->txn_begin.tv_usec); + + fprintf(LOGFILE, "%d %d %.0f\n", st->id, st->cnt, diff); + } + + if (commands[st->state]->type == SQL_COMMAND) + { + res = PQgetResult(st->con); + if (strncasecmp(commands[st->state]->argv[0], "select", 6) != 0) + { + if (check(st, res, PGRES_COMMAND_OK)) + return; + } + else + { + if (check(st, res, PGRES_TUPLES_OK)) + return; + } + PQclear(res); + discard_response(st); + } + + if (commands[st->state + 1] == NULL) + { + if (is_connect) + { + PQfinish(st->con); + st->con = NULL; + } + if (++st->cnt >= st->maxAct) + { + remains--; /* I've done */ + if (st->con != NULL) + { + PQfinish(st->con); + st->con = NULL; + } + return; + } + } + + /* increment state counter */ + st->state++; + if (commands[st->state] == NULL) + { + st->state = 0; + remains--; /* I've done */ + } + } + + if (st->con == NULL) + { + if ((st->con = doConnect()) == NULL) + { + fprintf(stderr, "Client aborted in establishing connection.\n"); + remains--; /* I've aborted */ + PQfinish(st->con); + st->con = NULL; + return; + } + } + + if (use_log && st->state == 0) + gettimeofday(&(st->txn_begin), NULL); + + if (commands[st->state]->type == SQL_COMMAND) + { + char *sql; + + if ((sql = strdup(commands[st->state]->argv[0])) == NULL + || (sql = assignVariables(st, sql)) == NULL) + { + 
fprintf(stderr, "out of memory\n"); + st->ecnt++; + return; + } + + if (debug) + fprintf(stderr, "client sending %s\n", sql); + + if (PQsendQuery(st->con, sql) == 0) + { + if (debug) + fprintf(stderr, "PQsendQuery(%s)failed\n", sql); + st->ecnt++; + } + else + { + st->listen++; /* flags that should be listened */ + } + + free(sql); + } + else if (commands[st->state]->type == META_COMMAND) + { + int argc = commands[st->state]->argc, i; + char **argv = commands[st->state]->argv; + + if (debug) + { + fprintf(stderr, "client executing \\%s", argv[0]); + for (i = 1; i < argc; i++) + fprintf(stderr, " %s", argv[i]); + fprintf(stderr, "\n"); + } + + if (strcasecmp(argv[0], "setrandom") == 0) + { + char *val; + + if ((val = malloc(strlen(argv[3]) + 1)) == NULL) + { + fprintf(stderr, "%s: out of memory\n", argv[0]); + st->ecnt++; + return; + } + + sprintf(val, "%d", getrand(atoi(argv[2]), atoi(argv[3]))); + + if (putVariable(st, argv[1], val) == false) + { + fprintf(stderr, "%s: out of memory\n", argv[0]); + free(val); + st->ecnt++; + return; + } + + free(val); + st->listen++; + } + } +} + +/* discard connections */ +static void +disconnect_all(CState * state) +{ + if (state->con) + PQfinish(state->con); +} + +/* create tables and setup data */ +static void +init(void) +{ + PGconn *con; + PGresult *res; + static char *DDLs[] = { + "drop table branches", + "create table branches(bid int not null,bbalance int,filler char(88))", + "drop table tellers", + "create table tellers(tid int not null,bid int,tbalance int,filler char(84))", + "drop table accounts", + "create table accounts(aid int not null,bid int,abalance int,filler char(84))", + "drop table history", + "create table history(tid int,bid int,aid int,delta int,mtime timestamp,filler char(22))"}; + static char *DDLAFTERs[] = { + "alter table branches add primary key (bid)", + "alter table tellers add primary key (tid)", + "alter table accounts add primary key (aid)"}; + + + char sql[256]; + + int i; + + if ((con = 
doConnect()) == NULL) + exit(1); + + for (i = 0; i < (sizeof(DDLs) / sizeof(char *)); i++) + { + res = PQexec(con, DDLs[i]); + if (strncmp(DDLs[i], "drop", 4) && PQresultStatus(res) != PGRES_COMMAND_OK) + { + fprintf(stderr, "%s", PQerrorMessage(con)); + exit(1); + } + PQclear(res); + } + + res = PQexec(con, "begin"); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + fprintf(stderr, "%s", PQerrorMessage(con)); + exit(1); + } + PQclear(res); + + for (i = 0; i < nbranches * tps; i++) + { + snprintf(sql, 256, "insert into branches(bid,bbalance) values(%d,0)", i + 1); + res = PQexec(con, sql); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + fprintf(stderr, "%s", PQerrorMessage(con)); + exit(1); + } + PQclear(res); + } + + for (i = 0; i < ntellers * tps; i++) + { + snprintf(sql, 256, "insert into tellers(tid,bid,tbalance) values (%d,%d,0)" + ,i + 1, i / ntellers + 1); + res = PQexec(con, sql); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + fprintf(stderr, "%s", PQerrorMessage(con)); + exit(1); + } + PQclear(res); + } + + res = PQexec(con, "end"); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + fprintf(stderr, "%s", PQerrorMessage(con)); + exit(1); + } + PQclear(res); + + /* + * occupy accounts table with some data + */ + fprintf(stderr, "creating tables...\n"); + for (i = 0; i < naccounts * tps; i++) + { + int j = i + 1; + + if (j % 10000 == 1) + { + res = PQexec(con, "copy accounts from stdin"); + if (PQresultStatus(res) != PGRES_COPY_IN) + { + fprintf(stderr, "%s", PQerrorMessage(con)); + exit(1); + } + PQclear(res); + } + + snprintf(sql, 256, "%d\t%d\t%d\t\n", j, i / naccounts + 1, 0); + if (PQputline(con, sql)) + { + fprintf(stderr, "PQputline failed\n"); + exit(1); + } + + if (j % 10000 == 0) + { + /* + * every 10000 tuples, we commit the copy command. 
this should
+			 * avoid generating too much WAL logs
+			 */
+			fprintf(stderr, "%d tuples done.\n", j);
+			if (PQputline(con, "\\.\n"))
+			{
+				fprintf(stderr, "very last PQputline failed\n");
+				exit(1);
+			}
+
+			if (PQendcopy(con))
+			{
+				fprintf(stderr, "PQendcopy failed\n");
+				exit(1);
+			}
+
+#ifdef NOT_USED
+
+			/*
+			 * do a checkpoint to purge the old WAL logs
+			 */
+			res = PQexec(con, "checkpoint");
+			if (PQresultStatus(res) != PGRES_COMMAND_OK)
+			{
+				fprintf(stderr, "%s", PQerrorMessage(con));
+				exit(1);
+			}
+			PQclear(res);
+#endif   /* NOT_USED */
+		}
+	}
+	fprintf(stderr, "set primary key...\n");
+	for (i = 0; i < (sizeof(DDLAFTERs) / sizeof(char *)); i++)
+	{
+		res = PQexec(con, DDLAFTERs[i]);
+		if (PQresultStatus(res) != PGRES_COMMAND_OK)
+		{
+			fprintf(stderr, "%s", PQerrorMessage(con));
+			exit(1);
+		}
+		PQclear(res);
+	}
+
+	/* vacuum */
+	fprintf(stderr, "vacuum...");
+	res = PQexec(con, "vacuum analyze");
+	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+	{
+		fprintf(stderr, "%s", PQerrorMessage(con));
+		exit(1);
+	}
+	PQclear(res);
+	fprintf(stderr, "done.\n");
+
+	PQfinish(con);
+}
+
+/*
+ * process_file
+ *
+ * Read a custom script from "filename" ("-" selects stdin) and store it, one
+ * Command per non-empty line, in the global NULL-terminated "commands" array.
+ *
+ * Blank lines and lines starting with "--" are skipped.  A line starting
+ * with a backslash becomes a META_COMMAND whose words are split on
+ * whitespace; only "setrandom" is accepted.  Every other line becomes an
+ * SQL_COMMAND holding the line text in argv[0].
+ *
+ * Returns true on success.  On any failure a message is printed, everything
+ * allocated so far is released, "commands" is reset to NULL and false is
+ * returned.
+ */
+static int
+process_file(char *filename)
+{
+	const char	delim[] = " \f\n\r\t\v";
+
+	FILE	   *fd;
+	int			lineno,
+				i,
+				j;
+	char		buf[BUFSIZ],
+			   *p,
+			   *tok;
+	void	   *tmp;
+
+	if (strcmp(filename, "-") == 0)
+		fd = stdin;
+	else if ((fd = fopen(filename, "r")) == NULL)
+	{
+		fprintf(stderr, "%s: %s\n", strerror(errno), filename);
+		return false;
+	}
+
+	fprintf(stderr, "processing file...\n");
+
+	lineno = 1;
+	i = 0;
+	while (fgets(buf, sizeof(buf), fd) != NULL)
+	{
+		if ((p = strchr(buf, '\n')) != NULL)
+			*p = '\0';
+		p = buf;
+		/* isspace() is only defined for unsigned char values and EOF */
+		while (isspace((unsigned char) *p))
+			p++;
+		if (*p == '\0' || strncmp(p, "--", 2) == 0)
+		{
+			lineno++;
+			continue;
+		}
+
+		if ((tmp = realloc(commands, sizeof(Command *) * (i + 1))) == NULL)
+		{
+			/* slot i does not exist; clean up from the last valid entry */
+			i--;
+			goto error;
+		}
+		commands = tmp;
+
+		if ((commands[i] = malloc(sizeof(Command))) == NULL)
+			goto error;
+
+		commands[i]->argv = NULL;
+		commands[i]->argc = 0;
+
+		if (*p == '\\')
+		{
+			commands[i]->type = META_COMMAND;
+
+			j = 0;
+			tok = strtok(++p, delim);
+			while (tok != NULL)
+			{
+				tmp = realloc(commands[i]->argv, sizeof(char *) * (j + 1));
+				if (tmp == NULL)
+					goto error;
+				commands[i]->argv = tmp;
+
+				if ((commands[i]->argv[j] = strdup(tok)) == NULL)
+					goto error;
+
+				commands[i]->argc++;
+
+				j++;
+				tok = strtok(NULL, delim);
+			}
+
+			/* a lone backslash yields no words, so argv is still NULL */
+			if (commands[i]->argc == 0)
+			{
+				fprintf(stderr, "%s: %d: missing command name after \\\n", filename, lineno);
+				goto error;
+			}
+
+			if (strcasecmp(commands[i]->argv[0], "setrandom") == 0)
+			{
+				int			min,
+							max;
+
+				if (commands[i]->argc < 4)
+				{
+					fprintf(stderr, "%s: %d: \\%s: missing argument\n", filename, lineno, commands[i]->argv[0]);
+					goto error;
+				}
+
+				for (j = 4; j < commands[i]->argc; j++)
+					fprintf(stderr, "%s: %d: \\%s: extra argument \"%s\" ignored\n", filename, lineno, commands[i]->argv[0], commands[i]->argv[j]);
+
+				if ((min = atoi(commands[i]->argv[2])) < 0)
+				{
+					fprintf(stderr, "%s: %d: \\%s: invalid minimum number %s\n", filename, lineno, commands[i]->argv[0], commands[i]->argv[2]);
+					goto error;
+				}
+
+				if ((max = atoi(commands[i]->argv[3])) < min || max > RAND_MAX)
+				{
+					fprintf(stderr, "%s: %d: \\%s: invalid maximum number %s\n", filename, lineno, commands[i]->argv[0], commands[i]->argv[3]);
+					goto error;
+				}
+			}
+			else
+			{
+				fprintf(stderr, "%s: %d: invalid command \\%s\n", filename, lineno, commands[i]->argv[0]);
+				goto error;
+			}
+		}
+		else
+		{
+			commands[i]->type = SQL_COMMAND;
+
+			if ((commands[i]->argv = malloc(sizeof(char *))) == NULL)
+				goto error;
+
+			if ((commands[i]->argv[0] = strdup(p)) == NULL)
+				goto error;
+
+			commands[i]->argc++;
+		}
+
+		i++;
+		lineno++;
+	}
+	fclose(fd);
+	fd = NULL;					/* keeps the error path from closing it again */
+
+	/* one extra slot for the terminating NULL entry */
+	if ((tmp = realloc(commands, sizeof(Command *) * (i + 1))) == NULL)
+	{
+		/* only entries 0 .. i-1 exist; never look at slot i */
+		i--;
+		goto error;
+	}
+	commands = tmp;
+
+	commands[i] = NULL;
+
+	return true;
+
+error:
+	if (errno == ENOMEM)
+		fprintf(stderr, "%s: %d: out of memory\n", filename, lineno);
+
+	if (fd != NULL)
+		fclose(fd);
+
+	if (commands == NULL)
+		return false;
+
+	/* release every entry from the current one down to the first */
+	while (i >= 0)
+	{
+		if (commands[i] != NULL)
+		{
+			for (j = 0; j < commands[i]->argc; j++)
+				free(commands[i]->argv[j]);
+
+			free(commands[i]->argv);
+			free(commands[i]);
+		}
+
+		i--;
+	}
+	free(commands);
+	commands = NULL;			/* do not leave a dangling global pointer */
+
+	return false;
+}
+
+/* transaction type codes used from here to the end of the file */
+#define SELECT_ONLY (1)
+#define INSERT_ONLY (2)
+#define UPDATE_ONLY (3)
+#define WITH_TRANSACTION (4)
+
+/*
+ * printResults
+ *
+ * Print the benchmark summary to stdout.
+ *
+ * ttype selects the transaction type label, normal_xacts is reported as the
+ * number of processed transactions, and the run time and tps figures are
+ * both derived from the interval tv1 .. tv3.  tv2 is accepted but not used.
+ */
+static void
+printResults(
+			 int ttype, int normal_xacts,
+			 struct timeval * tv1, struct timeval * tv2,
+			 struct timeval * tv3)
+{
+	double		elapsed_usec;
+	double		t1,
+				t2;
+	char	   *s;
+
+	elapsed_usec = (tv3->tv_sec - tv1->tv_sec) * 1000000.0 + (tv3->tv_usec - tv1->tv_usec);
+
+	t1 = elapsed_usec / 1000000.0;				/* run time in seconds */
+	t2 = normal_xacts * 1000000.0 / elapsed_usec;	/* transactions per second */
+
+	switch (ttype)
+	{
+		case 0:
+			s = "TPC-B (sort of)";
+			break;
+		case SELECT_ONLY:
+			s = "SELECT only";
+			break;
+		case INSERT_ONLY:
+			s = "INSERT only";
+			break;
+		case UPDATE_ONLY:
+			s = "UPDATE only";
+			break;
+		case CUSTOM_QUERY:
+			s = "Custom query";
+			break;
+		default:
+			s = "Mix query";
+			break;
+	}
+
+	printf("transaction type: %s\n", s);
+	printf("scaling factor: %d\n", tps);
+	printf("number of clients: %d\n", nclients);
+	printf("number of transactions actually processed: %d\n", normal_xacts);
+	printf("run time (sec) = %f \n", t1);
+	printf("tps = %f (including connections establishing)\n", t2);
+}
+
+/* Hand the client to the driver routine that matches the transaction type. */
+static void
+runClientStep(CState *state, int debug, int ttype)
+{
+	switch (ttype)
+	{
+		case WITH_TRANSACTION:
+		case TPC_B_LIKE:
+			doMix(state, debug, ttype);
+			break;
+		case CUSTOM_QUERY:
+			doCustom(state, debug, ttype);
+			break;
+		default:
+			doOne(state, debug, ttype);
+			break;
+	}
+}
+
+/*
+ * doChild
+ *
+ * Body of one forked benchmark client.
+ *
+ * clientId identifies the client and perturbs its random seed.  min and max
+ * bound this client's share of the work: the global "remains" starts at max
+ * and the loop ends once it drops below min or the connection is gone.
+ * debug and ttype are passed through to the driver routines.
+ *
+ * Returns 1 after the loop ends; exits the process on connection, PQsocket
+ * or select() failure.
+ */
+static int
+doChild(int clientId, int min, int max, int debug, int ttype)
+{
+	CState		state;			/* status of this client */
+
+	struct timeval tv1;			/* start up time, used for the random seed */
+	fd_set		input_mask;
+	int			nsocks = 0;		/* return from select(2) */
+	int			sock = 0;
+
+	gettimeofday(&tv1, NULL);
+	srand((unsigned int) tv1.tv_usec + clientId);
+
+	memset((char *) &state, 0, sizeof(CState));
+	/* make connections to the database */
+	state.id = clientId;
+	if ((state.con = doConnect()) == NULL)
+		exit(1);
+
+	state.maxAct = max - min + 1;
+	/* send start up queries in async manner */
+	runClientStep(&state, debug, ttype);
+
+	remains = max;
+	for (;;)
+	{
+		int			is_meta;
+
+		if (remains < min || !state.con)
+			break;
+
+		/* meta commands involve no backend traffic, so nothing to wait for */
+		is_meta = (ttype == CUSTOM_QUERY
+				   && commands[state.state]->type == META_COMMAND);
+
+		FD_ZERO(&input_mask);
+
+		if (!is_meta)
+		{
+			sock = PQsocket(state.con);
+
+			if (sock < 0)
+			{
+				fprintf(stderr, "Client %d: PQsocket failed\n", clientId);
+				disconnect_all(&state);
+				exit(1);
+			}
+			FD_SET(sock, &input_mask);
+
+			if ((nsocks = select(sock + 1, &input_mask, (fd_set *) NULL,
+								 (fd_set *) NULL, (struct timeval *) NULL)) < 0)
+			{
+				/* keep the reason: disconnect_all() may overwrite errno */
+				int			save_errno = errno;
+
+				if (save_errno == EINTR)
+					continue;
+				/* must be something wrong */
+				disconnect_all(&state);
+				fprintf(stderr, "select failed: %s\n", strerror(save_errno));
+				exit(1);
+			}
+			else if (nsocks == 0)
+			{					/* timeout */
+				fprintf(stderr, "select timeout\n");
+				fprintf(stderr, "client %d:state %d cnt %d ecnt %d listen %d\n",
+						clientId, state.state, state.cnt, state.ecnt, state.listen);
+				exit(0);
+			}
+		}
+
+		/*
+		 * ok, backend returns reply (or the next command is a meta command).
+		 * The socket is only tested when it was validated above.
+		 */
+		if (is_meta || FD_ISSET(sock, &input_mask))
+			runClientStep(&state, debug, ttype);
+	}
+	disconnect_all(&state);
+	return 1;
+}
+
+/*
+ * doClient
+ *
+ * Fork one child per client, giving each a contiguous slice [min, max] of
+ * the nxacts transactions (the first nxacts % nclients children get one
+ * extra), then wait until every child has terminated.
+ *
+ * Returns 1 when all children were started, 0 when a fork() failed.
+ */
+static int
+doClient(int debug, int ttype)
+{
+	pid_t		pid;
+	int			i;
+	int			min,
+				max;
+	int			base,
+				mo;
+	int			started_all = 1;
+
+	base = nxacts / nclients;
+	mo = nxacts % nclients;
+	min = max = 0;
+
+	/*
+	 * Children inherit a copy of our stdio buffers and flush them in exit();
+	 * empty the buffers first so pending output is not written repeatedly.
+	 */
+	fflush(NULL);
+
+	for (i = 0; i < nclients; i++)
+	{
+		min = max + 1;
+		max += base;
+		if (mo > 0)
+		{
+			max += 1;
+			mo--;
+		}
+		pid = fork();
+		if (pid < 0)
+		{
+			fprintf(stderr, "fork failed for client %d: %s\n", i, strerror(errno));
+			started_all = 0;
+			break;
+		}
+		if (pid == 0)
+		{
+			doChild(i, min, max, debug, ttype);
+			exit(0);
+		}
+	}
+
+	/* reap every child; a signal interrupting wait() is not the end */
+	for (;;)
+	{
+		if (wait(NULL) < 0 && errno != EINTR)
+			break;
+	}
+	return started_all;
+}
+
+/* Run a statement that returns no rows; print the error and exit on failure. */
+static void
+execCommandOrExit(PGconn *con, const char *sql)
+{
+	PGresult   *res = PQexec(con, sql);
+
+	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+	{
+		fprintf(stderr, "%s", PQerrorMessage(con));
+		exit(1);
+	}
+	PQclear(res);
+}
+
+/*
+ * main
+ *
+ * Parse the environment and the command line, optionally initialize the
+ * database (-i), load a custom script (-f) or read the scaling factor and
+ * vacuum the benchmark tables, then run the clients and print the results.
+ *
+ * Returns 0 on success; every failure path exits with status 1.
+ */
+int
+main(int argc, char **argv)
+{
+	int			c;
+	int			is_init_mode = 0;		/* initialize mode? */
+	int			is_no_vacuum = 0;		/* no vacuum at all before testing? */
+	int			is_full_vacuum = 0;		/* do full vacuum before testing? */
+	int			debug = 0;		/* debug flag */
+	int			ttype = TPC_B_LIKE;		/* transaction type */
+	char	   *filename = NULL;
+
+	struct timeval tv1;			/* start up time */
+	struct timeval tv2;			/* after establishing all connections to
+								 * the backend */
+	struct timeval tv3;			/* end time */
+
+#if !(defined(__CYGWIN__) || defined(__MINGW32__))
+	struct rlimit rlim;
+#endif
+
+	PGconn	   *con;
+	PGresult   *res;
+	char	   *env;
+
+	/* each variable is independent: PGUSER must not depend on PGPORT */
+	if ((env = getenv("PGHOST")) != NULL && *env != '\0')
+		pghost = env;
+	if ((env = getenv("PGPORT")) != NULL && *env != '\0')
+		pgport = env;
+	if ((env = getenv("PGUSER")) != NULL && *env != '\0')
+		login = env;
+
+	/* 'N' is in the option string but has no case below: it ends in usage() */
+	while ((c = getopt(argc, argv, "ih:nvp:dc:t:s:u:P:CNSlTUIf:")) != -1)
+	{
+		switch (c)
+		{
+			case 'i':
+				is_init_mode++;
+				break;
+			case 'h':
+				pghost = optarg;
+				break;
+			case 'n':
+				is_no_vacuum++;
+				break;
+			case 'v':
+				is_full_vacuum++;
+				break;
+			case 'p':
+				pgport = optarg;
+				break;
+			case 'd':
+				debug++;
+				break;
+			case 'S':
+				ttype = SELECT_ONLY;
+				break;
+			case 'I':
+				ttype = INSERT_ONLY;
+				break;
+			case 'U':
+				ttype = UPDATE_ONLY;
+				break;
+			case 'T':
+				ttype = WITH_TRANSACTION;
+				break;
+			case 'c':
+				nclients = atoi(optarg);
+				if (nclients <= 0 || nclients > MAXCLIENTS)
+				{
+					fprintf(stderr, "invalid number of clients: %d\n", nclients);
+					exit(1);
+				}
+#if !(defined(__CYGWIN__) || defined(__MINGW32__))
+#ifdef RLIMIT_NOFILE			/* most platform uses RLIMIT_NOFILE */
+				if (getrlimit(RLIMIT_NOFILE, &rlim) == -1)
+				{
+#else							/* but BSD doesn't ... */
+				if (getrlimit(RLIMIT_OFILE, &rlim) == -1)
+				{
+#endif   /* HAVE_RLIMIT_NOFILE */
+					fprintf(stderr, "getrlimit failed. reason: %s\n", strerror(errno));
+					exit(1);
+				}
+				if (rlim.rlim_cur <= (nclients + 2))
+				{
+					fprintf(stderr, "You need at least %d open files resource but you are only allowed to use %ld.\n", nclients + 2, (long) rlim.rlim_cur);
+					fprintf(stderr, "Use limit/ulimt to increase the limit before using pgbench.\n");
+					exit(1);
+				}
+#endif   /* #if !(defined(__CYGWIN__) || defined(__MINGW32__)) */
+				break;
+			case 'C':
+				is_connect = 1;
+				break;
+			case 's':
+				tps = atoi(optarg);
+				if (tps <= 0)
+				{
+					fprintf(stderr, "invalid scaling factor: %d\n", tps);
+					exit(1);
+				}
+				break;
+			case 't':
+				nxacts = atoi(optarg);
+				if (nxacts <= 0)
+				{
+					fprintf(stderr, "invalid number of transactions: %d\n", nxacts);
+					exit(1);
+				}
+				break;
+			case 'u':
+				login = optarg;
+				break;
+			case 'P':
+				pwd = optarg;
+				break;
+			case 'l':
+				use_log = true;
+				break;
+			case 'f':
+				ttype = CUSTOM_QUERY;
+				filename = optarg;
+				break;
+			default:
+				usage();
+				exit(1);
+				break;
+		}
+	}
+
+	/* database name: argument, then PGDATABASE, then the login name */
+	if (argc > optind)
+		dbName = argv[optind];
+	else
+	{
+		if ((env = getenv("PGDATABASE")) != NULL && *env != '\0')
+			dbName = env;
+		else if (login != NULL && *login != '\0')
+			dbName = login;
+		else
+			dbName = "";
+	}
+
+	if (is_init_mode)
+	{
+		init();
+		exit(0);
+	}
+
+	if (use_log)
+	{
+		char		logpath[64];
+
+		snprintf(logpath, sizeof(logpath), "pgbench_log.%d", (int) getpid());
+		LOGFILE = fopen(logpath, "w");
+
+		if (LOGFILE == NULL)
+		{
+			fprintf(stderr, "Couldn't open logfile \"%s\": %s\n", logpath, strerror(errno));
+			exit(1);
+		}
+	}
+
+	if (debug)
+	{
+		printf("pghost: %s pgport: %s nclients: %d nxacts: %d dbName: %s\n",
+			   pghost, pgport, nclients, nxacts, dbName);
+	}
+
+	/* opening connection... */
+	con = doConnect();
+	if (con == NULL)
+		exit(1);
+
+	if (PQstatus(con) == CONNECTION_BAD)
+	{
+		fprintf(stderr, "Connection to database '%s' failed.\n", dbName);
+		fprintf(stderr, "%s", PQerrorMessage(con));
+		exit(1);
+	}
+
+	if (ttype == CUSTOM_QUERY)
+	{
+		PQfinish(con);
+		if (process_file(filename) == false)
+			exit(1);
+	}
+	else
+	{
+		/*
+		 * get the scaling factor that should be same as count(*) from
+		 * branches...
+		 */
+		res = PQexec(con, "select count(*) from branches");
+		if (PQresultStatus(res) != PGRES_TUPLES_OK)
+		{
+			fprintf(stderr, "%s", PQerrorMessage(con));
+			exit(1);
+		}
+		tps = atoi(PQgetvalue(res, 0, 0));
+		if (tps < 0)
+		{
+			fprintf(stderr, "count(*) from branches invalid (%d)\n", tps);
+			exit(1);
+		}
+		PQclear(res);
+
+		if (!is_no_vacuum)
+		{
+			fprintf(stderr, "starting vacuum...");
+			execCommandOrExit(con, "vacuum branches");
+			execCommandOrExit(con, "vacuum tellers");
+			execCommandOrExit(con, "delete from history");
+			execCommandOrExit(con, "vacuum history");
+			fprintf(stderr, "end.\n");
+
+			if (is_full_vacuum)
+			{
+				fprintf(stderr, "starting full vacuum...");
+				execCommandOrExit(con, "vacuum analyze accounts");
+				fprintf(stderr, "end.\n");
+			}
+		}
+		PQfinish(con);
+	}
+
+	/* set random seed */
+	gettimeofday(&tv1, NULL);
+	srand((unsigned int) tv1.tv_usec);
+	/* get start up time */
+	gettimeofday(&tv1, NULL);
+	/* time after connections set up */
+	gettimeofday(&tv2, NULL);
+
+	doClient(debug, ttype);
+
+	/* get end time */
+	gettimeofday(&tv3, NULL);
+	printResults(ttype, nxacts, &tv1, &tv2, &tv3);
+	if (LOGFILE)
+		fclose(LOGFILE);
+
+	/* a completed run is a success; non-zero is reserved for failures */
+	return 0;
+}
diff -aruN postgresql-8.2.4/src/pgcluster/tool/pgcbench.sh pgcluster-1.7.0rc7/src/pgcluster/tool/pgcbench.sh
--- postgresql-8.2.4/src/pgcluster/tool/pgcbench.sh	1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/pgcluster/tool/pgcbench.sh	2007-02-18 22:52:17.000000000 +0100
@@ -0,0 +1,41 @@
+#! /bin/bash
+#
+# Wrapper around pgcbench. It vacuums the benchmark tables and, when a
+# script is given with -f, replaces every `expr` in it with the result of
+# evaluating expr in perl (with $tps set to the number of rows in
+# "branches") before piping the script to pgcbench on stdin.
+
+set -e
+
+opts=()
+while getopts ih:nvp:dc:t:s:u:P:CNSlTUIf: opt; do
+	case $opt in
+		f)
+			filename=$OPTARG
+			;;
+		\?)
+			# getopts has already reported the unknown option
+			exit 1
+			;;
+		*)
+			# forward the option, plus its argument when getopts set one
+			opts=("${opts[@]}" "-$opt" ${OPTARG+"$OPTARG"})
+			;;
+	esac
+done
+shift $(($OPTIND - 1))
+dbname=$1
+
+tps=$(psql -At -c "SELECT count(*) FROM branches" ${dbname:+"$dbname"})
+
+vacuumdb -t branches ${dbname:+"$dbname"}
+vacuumdb -t tellers ${dbname:+"$dbname"}
+psql -c "DELETE FROM history" ${dbname:+"$dbname"}
+vacuumdb -t history ${dbname:+"$dbname"}
+
+if [ -z "$filename" ]; then
+	pgcbench "${opts[@]}" "$@"
+else
+	perl -pe "BEGIN { \$tps = $tps } s/\`([^\`]+)\`/eval \$1/eg" "$filename" \
+		| pgcbench "${opts[@]}" -f - "$@"
+fi
diff -aruN postgresql-8.2.4/src/pgcluster/tool/tpc-b_like.sql pgcluster-1.7.0rc7/src/pgcluster/tool/tpc-b_like.sql
--- postgresql-8.2.4/src/pgcluster/tool/tpc-b_like.sql	1970-01-01 01:00:00.000000000 +0100
+++ pgcluster-1.7.0rc7/src/pgcluster/tool/tpc-b_like.sql	2007-02-18 22:52:17.000000000 +0100
@@ -0,0 +1,11 @@
+\setrandom aid 1 `100000 * $tps`
+\setrandom bid 1 `1 * $tps`
+\setrandom tid 1 `10 * $tps`
+\setrandom delta 1 1000
+BEGIN
+UPDATE accounts SET abalance = abalance + :delta WHERE aid = :aid
+SELECT abalance FROM accounts WHERE aid = :aid
+UPDATE tellers SET tbalance = tbalance + :delta WHERE tid = :tid
+UPDATE branches SET bbalance = bbalance + :delta WHERE bid = :bid
+INSERT INTO history (tid, bid, aid, delta, mtime) VALUES (:tid, :bid, :aid, :delta, current_timestamp)
+END