--- /dev/null
+*** dbinc_auto/int_def.in 2002/09/03 17:27:19 1.70
+--- dbinc_auto/int_def.in 2002/09/18 19:01:43
+***************
+*** 1172,1177 ****
+--- 1172,1178 ----
+ #define __txn_force_abort __txn_force_abort@DB_VERSION_UNIQUE_NAME@
+ #define __txn_preclose __txn_preclose@DB_VERSION_UNIQUE_NAME@
+ #define __txn_reset __txn_reset@DB_VERSION_UNIQUE_NAME@
++ #define __txn_updateckp __txn_updateckp@DB_VERSION_UNIQUE_NAME@
+ #define __txn_regop_log __txn_regop_log@DB_VERSION_UNIQUE_NAME@
+ #define __txn_regop_getpgnos __txn_regop_getpgnos@DB_VERSION_UNIQUE_NAME@
+ #define __txn_regop_print __txn_regop_print@DB_VERSION_UNIQUE_NAME@
+*** dbinc_auto/txn_ext.h 2002/09/03 17:27:20 1.32
+--- dbinc_auto/txn_ext.h 2002/09/18 19:01:43
+***************
+*** 20,25 ****
+--- 20,26 ----
+ int __txn_force_abort __P((DB_ENV *, u_int8_t *));
+ int __txn_preclose __P((DB_ENV *));
+ int __txn_reset __P((DB_ENV *));
++ void __txn_updateckp __P((DB_ENV *, DB_LSN *));
+ int __txn_regop_log __P((DB_ENV *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t, int32_t));
+ int __txn_regop_getpgnos __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ int __txn_regop_print __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+*** rep/rep_record.c 2002/09/11 19:39:11 1.111
+--- rep/rep_record.c 2002/09/18 19:01:58
+***************
+*** 1198,1203 ****
+--- 1198,1206 ----
+ * replica get flushed now and again.
+ */
+ ret = dbenv->log_flush(dbenv, &ckp_lsn);
++ /* Update the last_ckp in the txn region. */
++ if (ret == 0)
++ __txn_updateckp(dbenv, &rp->lsn);
+ break;
+ case DB___txn_regop:
+ if (!F_ISSET(dbenv, DB_ENV_REP_LOGSONLY))
+*** txn/txn.c 2002/08/29 17:41:17 11.179
+--- txn/txn.c 2002/09/18 19:02:05
+***************
+*** 1209,1226 ****
+ return (ret);
+ }
+
+! /*
+! * We want to make sure last_ckp only moves forward; since
+! * we drop locks above and in log_put, it's possible
+! * for two calls to __txn_ckp_log to finish in a different
+! * order from how they were called.
+! */
+! R_LOCK(dbenv, &mgr->reginfo);
+! if (log_compare(&region->last_ckp, &ckp_lsn) < 0) {
+! region->last_ckp = ckp_lsn;
+! (void)time(&region->time_ckp);
+! }
+! R_UNLOCK(dbenv, &mgr->reginfo);
+ }
+ return (0);
+ }
+--- 1209,1215 ----
+ return (ret);
+ }
+
+! __txn_updateckp(dbenv, &ckp_lsn);
+ }
+ return (0);
+ }
+***************
+*** 1403,1406 ****
+--- 1392,1428 ----
+ DB_ASSERT(LOGGING_ON(dbenv));
+ return (__txn_recycle_log(dbenv,
+ NULL, &scrap, 0, TXN_MINIMUM, TXN_MAXIMUM));
++ }
++
++ /*
++ * __txn_updateckp --
++ * Advance last_ckp in the transaction region to *lsnp (and stamp
++ * time_ckp) if *lsnp is newer. Called at the end of a normal checkpoint
++ * and when a replication client receives a checkpoint record.
++ *
++ * PUBLIC: void __txn_updateckp __P((DB_ENV *, DB_LSN *));
++ */
++ void
++ __txn_updateckp(dbenv, lsnp)
++ DB_ENV *dbenv;
++ DB_LSN *lsnp;
++ {
++ DB_TXNMGR *mgr;
++ DB_TXNREGION *region;
++
++ mgr = dbenv->tx_handle;
++ region = mgr->reginfo.primary;
++
++ /*
++ * We want to make sure last_ckp only moves forward; since the
++ * checkpoint code drops locks before calling us and in log_put,
++ * it's possible for two calls to __txn_ckp_log to finish in a
++ * different order from how they were called.
++ */
++ R_LOCK(dbenv, &mgr->reginfo);
++ if (log_compare(&region->last_ckp, lsnp) < 0) {
++ region->last_ckp = *lsnp;
++ (void)time(&region->time_ckp);
++ }
++ R_UNLOCK(dbenv, &mgr->reginfo);
+ }
--- /dev/null
+*** mp/mp_fopen.c.orig 2002/08/26 15:22:01 11.90
+--- mp/mp_fopen.c 2002/09/27 15:40:36
+***************
+*** 345,350 ****
+--- 345,367 ----
+ }
+
+ /*
++ * Figure out the file's size.
++ *
++ * !!!
++ * We can't use off_t's here, or in any code in the mainline library
++ * for that matter. (We have to use them in the os stubs, of course,
++ * as there are system calls that take them as arguments.) The reason
++ * is some customers build in environments where an off_t is 32-bits,
++ * but still run where offsets are 64-bits, and they pay us a lot of
++ * money.
++ */
++ if ((ret = __os_ioinfo(
++ dbenv, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) {
++ __db_err(dbenv, "%s: %s", rpath, db_strerror(ret));
++ goto err;
++ }
++
++ /*
+ * Get the file id if we weren't given one. Generated file id's
+ * don't use timestamps, otherwise there'd be no chance of any
+ * other process joining the party.
+***************
+*** 470,475 ****
+--- 487,493 ----
+ F_SET(mfp, MP_DIRECT);
+ if (LF_ISSET(DB_EXTENT))
+ F_SET(mfp, MP_EXTENT);
++ F_SET(mfp, MP_CAN_MMAP);
+
+ if (path == NULL)
+ F_SET(mfp, MP_TEMP);
+***************
+*** 479,499 ****
+ * and find the number of the last page in the file, all the
+ * time being careful not to overflow 32 bits.
+ *
+- * !!!
+- * We can't use off_t's here, or in any code in the mainline
+- * library for that matter. (We have to use them in the os
+- * stubs, of course, as there are system calls that take them
+- * as arguments.) The reason is that some customers build in
+- * environments where an off_t is 32-bits, but still run where
+- * offsets are 64-bits, and they pay us a lot of money.
+- */
+- if ((ret = __os_ioinfo(
+- dbenv, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) {
+- __db_err(dbenv, "%s: %s", rpath, db_strerror(ret));
+- goto err;
+- }
+-
+- /*
+ * During verify or recovery, we might have to cope with a
+ * truncated file; if the file size is not a multiple of the
+ * page size, round down to a page, we'll take care of the
+--- 497,502 ----
+***************
+*** 582,588 ****
+ * compiler will perpetrate, doing the comparison in a portable way is
+ * flatly impossible. Hope that mmap fails if the file is too large.
+ */
+! #define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 Mb. */
+ if (F_ISSET(mfp, MP_CAN_MMAP)) {
+ if (path == NULL)
+ F_CLR(mfp, MP_CAN_MMAP);
+--- 585,591 ----
+ * compiler will perpetrate, doing the comparison in a portable way is
+ * flatly impossible. Hope that mmap fails if the file is too large.
+ */
+! #define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 MB. */
+ if (F_ISSET(mfp, MP_CAN_MMAP)) {
+ if (path == NULL)
+ F_CLR(mfp, MP_CAN_MMAP);
--- /dev/null
+*** dbinc/os.h.orig 2002/03/27 04:34:55 11.14
+--- dbinc/os.h 2002/09/26 18:10:10
+***************
+*** 22,29 ****
+ int fd; /* POSIX file descriptor. */
+ char *name; /* File name. */
+
+ u_int32_t log_size; /* XXX: Log file size. */
+! u_int32_t pagesize; /* XXX: Page size. */
+
+ #define DB_FH_NOSYNC 0x01 /* Handle doesn't need to be sync'd. */
+ #define DB_FH_UNLINK 0x02 /* Unlink on close */
+--- 22,34 ----
+ int fd; /* POSIX file descriptor. */
+ char *name; /* File name. */
+
++ u_int32_t pagesize; /* Underlying page size. */
++
+ u_int32_t log_size; /* XXX: Log file size. */
+!
+! u_int32_t pgno; /* Last seek. */
+! u_int32_t pgsize;
+! u_int32_t offset;
+
+ #define DB_FH_NOSYNC 0x01 /* Handle doesn't need to be sync'd. */
+ #define DB_FH_UNLINK 0x02 /* Unlink on close */
+*** os/os_rw.c.orig 2002/07/12 18:56:52 11.24
+--- os/os_rw.c 2002/09/16 20:46:14 11.25
+***************
+*** 35,40 ****
+--- 35,43 ----
+ {
+ int ret;
+
++ /* Check for illegal usage. */
++ DB_ASSERT(F_ISSET(db_iop->fhp, DB_FH_VALID) && db_iop->fhp->fd != -1);
++
+ #if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
+ switch (op) {
+ case DB_IO_READ:
+***************
+*** 95,100 ****
+--- 98,106 ----
+ int ret;
+ u_int8_t *taddr;
+
++ /* Check for illegal usage. */
++ DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1);
++
+ for (taddr = addr,
+ offset = 0; offset < len; taddr += nr, offset += nr) {
+ retry: if ((nr = DB_GLOBAL(j_read) != NULL ?
+***************
+*** 131,136 ****
+--- 137,145 ----
+ ssize_t nw;
+ int ret;
+ u_int8_t *taddr;
++
++ /* Check for illegal usage. */
++ DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1);
+
+ for (taddr = addr,
+ offset = 0; offset < len; taddr += nw, offset += nw)
+*** os/os_rw.c.orig 2002/09/16 20:46:14 11.25
+--- os/os_rw.c 2002/09/26 18:10:20
+***************
+*** 13,18 ****
+--- 13,19 ----
+
+ #ifndef NO_SYSTEM_INCLUDES
+ #include <sys/types.h>
++ #include <sys/stat.h>
+
+ #include <string.h>
+ #include <unistd.h>
+***************
+*** 20,25 ****
+--- 21,31 ----
+
+ #include "db_int.h"
+
++ #ifdef HAVE_FILESYSTEM_NOTZERO
++ static int __os_zerofill __P((DB_ENV *, DB_FH *));
++ #endif
++ static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
++
+ /*
+ * __os_io --
+ * Do an I/O.
+***************
+*** 49,54 ****
+--- 55,64 ----
+ case DB_IO_WRITE:
+ if (DB_GLOBAL(j_write) != NULL)
+ goto slow;
++ #ifdef HAVE_FILESYSTEM_NOTZERO
++ if (__os_fs_notzero())
++ goto slow;
++ #endif
+ *niop = pwrite(db_iop->fhp->fd, db_iop->buf,
+ db_iop->bytes, (off_t)db_iop->pgno * db_iop->pagesize);
+ break;
+***************
+*** 133,145 ****
+ size_t len;
+ size_t *nwp;
+ {
+ size_t offset;
+ ssize_t nw;
+ int ret;
+ u_int8_t *taddr;
+
+! /* Check for illegal usage. */
+! DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1);
+
+ for (taddr = addr,
+ offset = 0; offset < len; taddr += nw, offset += nw)
+--- 143,189 ----
+ size_t len;
+ size_t *nwp;
+ {
++ /* Check for illegal usage. */
++ DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1);
++
++ #ifdef HAVE_FILESYSTEM_NOTZERO
++ /* Zero-fill as necessary. */
++ if (__os_fs_notzero()) {
++ int ret;
++ if ((ret = __os_zerofill(dbenv, fhp)) != 0)
++ return (ret);
++ }
++ #endif
++ return (__os_physwrite(dbenv, fhp, addr, len, nwp));
++ }
++
++ /*
++ * __os_physwrite --
++ * Physical write to a file handle.
++ */
++ static int
++ __os_physwrite(dbenv, fhp, addr, len, nwp)
++ DB_ENV *dbenv;
++ DB_FH *fhp;
++ void *addr;
++ size_t len;
++ size_t *nwp;
++ {
+ size_t offset;
+ ssize_t nw;
+ int ret;
+ u_int8_t *taddr;
+
+! #if defined(HAVE_FILESYSTEM_NOTZERO) && defined(DIAGNOSTIC)
+! if (__os_fs_notzero()) {
+! struct stat sb;
+! off_t cur_off;
+!
+! DB_ASSERT(fstat(fhp->fd, &sb) != -1 &&
+! (cur_off = lseek(fhp->fd, (off_t)0, SEEK_CUR)) != -1 &&
+! cur_off <= sb.st_size);
+! }
+! #endif
+
+ for (taddr = addr,
+ offset = 0; offset < len; taddr += nw, offset += nw)
+***************
+*** 155,157 ****
+--- 199,288 ----
+ *nwp = len;
+ return (0);
+ }
++
++ #ifdef HAVE_FILESYSTEM_NOTZERO
++ /*
++ * __os_zerofill --
++ * Zero-fill the file from its current end up to the offset of the
++ * handle's last seek (fhp->pgno * fhp->pgsize + fhp->offset).
++ *
++ * On some systems, pages allocated by writing past end-of-file are
++ * not zeroed, and recovery could theoretically be fooled by a page
++ * of garbage. To avoid this we write the intervening pages and
++ * flush them: without the sync, a later page allocation might reach
++ * the disk first and a badly timed crash would leave this page as
++ * one allocated only by a write past it in the file.
++ * Returns 0 on success, else the error from the failing os call.
++ */
++ static int
++ __os_zerofill(dbenv, fhp)
++ DB_ENV *dbenv;
++ DB_FH *fhp;
++ {
++ off_t stat_offset, write_offset;
++ size_t blen, nw;
++ u_int32_t bytes, mbytes;
++ int group_sync, need_free, ret;
++ u_int8_t buf[8 * 1024], *bp;
++
++ /* Calculate the byte offset of the next write (the last seek). */
++ write_offset = (off_t)fhp->pgno * fhp->pgsize + fhp->offset;
++
++ /* Stat the file to get its current size as mbytes + bytes. */
++ if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0)
++ return (ret);
++ stat_offset = (off_t)mbytes * MEGABYTE + bytes;
++
++ /* Nothing to do if the file already reaches the write offset. */
++ if (stat_offset >= write_offset)
++ return (0);
++
++ /* Allocate a 64KB zeroed buffer for big gaps, else use local buf. */
++ #undef ZF_LARGE_WRITE
++ #define ZF_LARGE_WRITE (64 * 1024)
++ if (write_offset - stat_offset > ZF_LARGE_WRITE) {
++ if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &bp)) != 0)
++ return (ret);
++ blen = ZF_LARGE_WRITE;
++ need_free = 1;
++ } else {
++ bp = buf;
++ blen = sizeof(buf);
++ need_free = 0;
++ memset(buf, 0, sizeof(buf));
++ }
++
++ /* Seek to the current end of the file. */
++ if ((ret = __os_seek(
++ dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0)
++ goto err;
++
++ /*
++ * Hash is the only access method that allocates groups of pages. Hash
++ * uses the existence of the last page in a group to signify the entire
++ * group is OK; so, write all the pages but the last one in the group,
++ * flush them to disk, then write the last one to disk and flush it.
++ */
++ for (group_sync = 0; stat_offset < write_offset; group_sync = 1) { /* group_sync is 0 on the first pass only. */
++ if (write_offset - stat_offset <= blen) {
++ blen = (size_t)(write_offset - stat_offset); /* Final, possibly short, chunk. */
++ if (group_sync && (ret = __os_fsync(dbenv, fhp)) != 0)
++ goto err;
++ }
++ if ((ret = __os_physwrite(dbenv, fhp, bp, blen, &nw)) != 0)
++ goto err;
++ stat_offset += blen;
++ }
++ if ((ret = __os_fsync(dbenv, fhp)) != 0)
++ goto err;
++
++ /* Seek back to where we started (the intended write offset). */
++ mbytes = (u_int32_t)(write_offset / MEGABYTE);
++ bytes = (u_int32_t)(write_offset % MEGABYTE);
++ ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET);
++
++ err: if (need_free) /* bp was allocated only for large fills. */
++ __os_free(dbenv, bp);
++ return (ret);
++ }
++ #endif
+*** os/os_seek.c.orig Mon Jul 15 22:03:38 2002
+--- os/os_seek.c Thu Sep 26 14:13:52 2002
+***************
+*** 68,74 ****
+ } while (ret == EINTR);
+ }
+
+! if (ret != 0)
+ __db_err(dbenv, "seek: %lu %d %d: %s",
+ (u_long)pgsize * pageno + relative,
+ isrewind, db_whence, strerror(ret));
+--- 68,78 ----
+ } while (ret == EINTR);
+ }
+
+! if (ret == 0) {
+! fhp->pgsize = pgsize;
+! fhp->pgno = pageno;
+! fhp->offset = relative;
+! } else
+ __db_err(dbenv, "seek: %lu %d %d: %s",
+ (u_long)pgsize * pageno + relative,
+ isrewind, db_whence, strerror(ret));
+*** os_win32/os_rw.c.orig 2002/08/06 04:56:19 11.28
+--- os_win32/os_rw.c 2002/09/26 18:10:20
+***************
+*** 20,25 ****
+--- 20,30 ----
+
+ #include "db_int.h"
+
++ #ifdef HAVE_FILESYSTEM_NOTZERO
++ static int __os_zerofill __P((DB_ENV *, DB_FH *));
++ #endif
++ static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
++
+ /*
+ * __os_io --
+ * Do an I/O.
+***************
+*** 54,59 ****
+--- 59,68 ----
+ case DB_IO_WRITE:
+ if (DB_GLOBAL(j_write) != NULL)
+ goto slow;
++ #ifdef HAVE_FILESYSTEM_NOTZERO
++ if (__os_fs_notzero())
++ goto slow;
++ #endif
+ if (!WriteFile(db_iop->fhp->handle,
+ db_iop->buf, (DWORD)db_iop->bytes, &nbytes, &over))
+ goto slow;
+***************
+*** 149,154 ****
+--- 158,185 ----
+ size_t len;
+ size_t *nwp;
+ {
++ int ret;
++
++ #ifdef HAVE_FILESYSTEM_NOTZERO
++ /* Zero-fill as necessary. */
++ if (__os_fs_notzero() && (ret = __os_zerofill(dbenv, fhp)) != 0)
++ return (ret);
++ #endif
++ return (__os_physwrite(dbenv, fhp, addr, len, nwp));
++ }
++
++ /*
++ * __os_physwrite --
++ * Physical write to a file handle.
++ */
++ static int
++ __os_physwrite(dbenv, fhp, addr, len, nwp)
++ DB_ENV *dbenv;
++ DB_FH *fhp;
++ void *addr;
++ size_t len;
++ size_t *nwp;
++ {
+ size_t offset;
+ DWORD nw;
+ int ret;
+***************
+*** 180,182 ****
+--- 211,300 ----
+ *nwp = len;
+ return (0);
+ }
++
++ #ifdef HAVE_FILESYSTEM_NOTZERO
++ /*
++ * __os_zerofill --
++ * Zero-fill the file from its current end up to the offset of the
++ * handle's last seek (fhp->pgno * fhp->pgsize + fhp->offset).
++ *
++ * On some systems, pages allocated by writing past end-of-file are
++ * not zeroed, and recovery could theoretically be fooled by a page
++ * of garbage. To avoid this we write the intervening pages and
++ * flush them: without the sync, a later page allocation might reach
++ * the disk first and a badly timed crash would leave this page as
++ * one allocated only by a write past it in the file.
++ * Returns 0 on success, else the error from the failing os call.
++ */
++ static int
++ __os_zerofill(dbenv, fhp)
++ DB_ENV *dbenv;
++ DB_FH *fhp;
++ {
++ unsigned __int64 stat_offset, write_offset;
++ size_t blen, nw;
++ u_int32_t bytes, mbytes;
++ int group_sync, need_free, ret;
++ u_int8_t buf[8 * 1024], *bp;
++
++ /* Calculate the byte offset of the next write (the last seek). */
++ write_offset = (unsigned __int64)fhp->pgno * fhp->pgsize + fhp->offset;
++
++ /* Stat the file to get its current size as mbytes + bytes. */
++ if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0)
++ return (ret);
++ stat_offset = (unsigned __int64)mbytes * MEGABYTE + bytes;
++
++ /* Nothing to do if the file already reaches the write offset. */
++ if (stat_offset >= write_offset)
++ return (0);
++
++ /* Allocate a 64KB zeroed buffer for big gaps, else use local buf. */
++ #undef ZF_LARGE_WRITE
++ #define ZF_LARGE_WRITE (64 * 1024)
++ if (write_offset - stat_offset > ZF_LARGE_WRITE) {
++ if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &bp)) != 0)
++ return (ret);
++ blen = ZF_LARGE_WRITE;
++ need_free = 1;
++ } else {
++ bp = buf;
++ blen = sizeof(buf);
++ need_free = 0;
++ memset(buf, 0, sizeof(buf));
++ }
++
++ /* Seek to the current end of the file. */
++ if ((ret = __os_seek(
++ dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0)
++ goto err;
++
++ /*
++ * Hash is the only access method that allocates groups of pages. Hash
++ * uses the existence of the last page in a group to signify the entire
++ * group is OK; so, write all the pages but the last one in the group,
++ * flush them to disk, then write the last one to disk and flush it.
++ */
++ for (group_sync = 0; stat_offset < write_offset; group_sync = 1) { /* group_sync is 0 on the first pass only. */
++ if (write_offset - stat_offset <= blen) {
++ blen = (size_t)(write_offset - stat_offset); /* Final, possibly short, chunk. */
++ if (group_sync && (ret = __os_fsync(dbenv, fhp)) != 0)
++ goto err;
++ }
++ if ((ret = __os_physwrite(dbenv, fhp, bp, blen, &nw)) != 0)
++ goto err;
++ stat_offset += blen;
++ }
++ if ((ret = __os_fsync(dbenv, fhp)) != 0)
++ goto err;
++
++ /* Seek back to where we started (the intended write offset). */
++ mbytes = (u_int32_t)(write_offset / MEGABYTE);
++ bytes = (u_int32_t)(write_offset % MEGABYTE);
++ ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET);
++
++ err: if (need_free) /* bp was allocated only for large fills. */
++ __os_free(dbenv, bp);
++ return (ret);
++ }
++ #endif
+*** os_win32/os_seek.c.orig 2002/08/06 04:56:20 11.17
+--- os_win32/os_seek.c 2002/09/26 18:10:20
+***************
+*** 79,88 ****
+ __os_win32_errno() : 0;
+ }
+
+! if (ret != 0)
+ __db_err(dbenv, "seek: %lu %d %d: %s",
+ (u_long)pgsize * pageno + relative,
+ isrewind, db_whence, strerror(ret));
+
+ return (ret);
+ }
+--- 79,93 ----
+ __os_win32_errno() : 0;
+ }
+
+! if (ret == 0) {
+! fhp->pgsize = pgsize;
+! fhp->pgno = pageno;
+! fhp->offset = relative;
+! } else {
+ __db_err(dbenv, "seek: %lu %d %d: %s",
+ (u_long)pgsize * pageno + relative,
+ isrewind, db_whence, strerror(ret));
++ }
+
+ return (ret);
+ }