Commit | Line | Data |
---|---|---|
4bc83bc6 AM |
1 | *** dbinc/os.h.orig 2002/03/27 04:34:55 11.14 |
2 | --- dbinc/os.h 2002/09/26 18:10:10 | |
3 | *************** | |
4 | *** 22,29 **** | |
5 | int fd; /* POSIX file descriptor. */ | |
6 | char *name; /* File name. */ | |
7 | ||
8 | u_int32_t log_size; /* XXX: Log file size. */ | |
9 | ! u_int32_t pagesize; /* XXX: Page size. */ | |
10 | ||
11 | #define DB_FH_NOSYNC 0x01 /* Handle doesn't need to be sync'd. */ | |
12 | #define DB_FH_UNLINK 0x02 /* Unlink on close */ | |
13 | --- 22,34 ---- | |
14 | int fd; /* POSIX file descriptor. */ | |
15 | char *name; /* File name. */ | |
16 | ||
17 | + u_int32_t pagesize; /* Underlying page size. */ | |
18 | + | |
19 | u_int32_t log_size; /* XXX: Log file size. */ | |
20 | ! | |
21 | ! u_int32_t pgno; /* Last seek. */ | |
22 | ! u_int32_t pgsize; | |
23 | ! u_int32_t offset; | |
24 | ||
25 | #define DB_FH_NOSYNC 0x01 /* Handle doesn't need to be sync'd. */ | |
26 | #define DB_FH_UNLINK 0x02 /* Unlink on close */ | |
27 | *** os/os_rw.c.orig 2002/07/12 18:56:52 11.24 | |
28 | --- os/os_rw.c 2002/09/16 20:46:14 11.25 | |
29 | *************** | |
30 | *** 35,40 **** | |
31 | --- 35,43 ---- | |
32 | { | |
33 | int ret; | |
34 | ||
35 | + /* Check for illegal usage. */ | |
36 | + DB_ASSERT(F_ISSET(db_iop->fhp, DB_FH_VALID) && db_iop->fhp->fd != -1); | |
37 | + | |
38 | #if defined(HAVE_PREAD) && defined(HAVE_PWRITE) | |
39 | switch (op) { | |
40 | case DB_IO_READ: | |
41 | *************** | |
42 | *** 95,100 **** | |
43 | --- 98,106 ---- | |
44 | int ret; | |
45 | u_int8_t *taddr; | |
46 | ||
47 | + /* Check for illegal usage. */ | |
48 | + DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1); | |
49 | + | |
50 | for (taddr = addr, | |
51 | offset = 0; offset < len; taddr += nr, offset += nr) { | |
52 | retry: if ((nr = DB_GLOBAL(j_read) != NULL ? | |
53 | *************** | |
54 | *** 131,136 **** | |
55 | --- 137,145 ---- | |
56 | ssize_t nw; | |
57 | int ret; | |
58 | u_int8_t *taddr; | |
59 | + | |
60 | + /* Check for illegal usage. */ | |
61 | + DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1); | |
62 | ||
63 | for (taddr = addr, | |
64 | offset = 0; offset < len; taddr += nw, offset += nw) | |
65 | *** os/os_rw.c.orig 2002/09/16 20:46:14 11.25 | |
66 | --- os/os_rw.c 2002/09/26 18:10:20 | |
67 | *************** | |
68 | *** 13,18 **** | |
69 | --- 13,19 ---- | |
70 | ||
71 | #ifndef NO_SYSTEM_INCLUDES | |
72 | #include <sys/types.h> | |
73 | + #include <sys/stat.h> | |
74 | ||
75 | #include <string.h> | |
76 | #include <unistd.h> | |
77 | *************** | |
78 | *** 20,25 **** | |
79 | --- 21,31 ---- | |
80 | ||
81 | #include "db_int.h" | |
82 | ||
83 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
84 | + static int __os_zerofill __P((DB_ENV *, DB_FH *)); | |
85 | + #endif | |
86 | + static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *)); | |
87 | + | |
88 | /* | |
89 | * __os_io -- | |
90 | * Do an I/O. | |
91 | *************** | |
92 | *** 49,54 **** | |
93 | --- 55,64 ---- | |
94 | case DB_IO_WRITE: | |
95 | if (DB_GLOBAL(j_write) != NULL) | |
96 | goto slow; | |
97 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
98 | + if (__os_fs_notzero()) | |
99 | + goto slow; | |
100 | + #endif | |
101 | *niop = pwrite(db_iop->fhp->fd, db_iop->buf, | |
102 | db_iop->bytes, (off_t)db_iop->pgno * db_iop->pagesize); | |
103 | break; | |
104 | *************** | |
105 | *** 133,145 **** | |
106 | size_t len; | |
107 | size_t *nwp; | |
108 | { | |
109 | size_t offset; | |
110 | ssize_t nw; | |
111 | int ret; | |
112 | u_int8_t *taddr; | |
113 | ||
114 | ! /* Check for illegal usage. */ | |
115 | ! DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1); | |
116 | ||
117 | for (taddr = addr, | |
118 | offset = 0; offset < len; taddr += nw, offset += nw) | |
119 | --- 143,189 ---- | |
120 | size_t len; | |
121 | size_t *nwp; | |
122 | { | |
123 | + /* Check for illegal usage. */ | |
124 | + DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1); | |
125 | + | |
126 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
127 | + /* Zero-fill as necessary. */ | |
128 | + if (__os_fs_notzero()) { | |
129 | + int ret; | |
130 | + if ((ret = __os_zerofill(dbenv, fhp)) != 0) | |
131 | + return (ret); | |
132 | + } | |
133 | + #endif | |
134 | + return (__os_physwrite(dbenv, fhp, addr, len, nwp)); | |
135 | + } | |
136 | + | |
137 | + /* | |
138 | + * __os_physwrite -- | |
139 | + * Physical write to a file handle. | |
140 | + */ | |
141 | + static int | |
142 | + __os_physwrite(dbenv, fhp, addr, len, nwp) | |
143 | + DB_ENV *dbenv; | |
144 | + DB_FH *fhp; | |
145 | + void *addr; | |
146 | + size_t len; | |
147 | + size_t *nwp; | |
148 | + { | |
149 | size_t offset; | |
150 | ssize_t nw; | |
151 | int ret; | |
152 | u_int8_t *taddr; | |
153 | ||
154 | ! #if defined(HAVE_FILESYSTEM_NOTZERO) && defined(DIAGNOSTIC) | |
155 | ! if (__os_fs_notzero()) { | |
156 | ! struct stat sb; | |
157 | ! off_t cur_off; | |
158 | ! | |
159 | ! DB_ASSERT(fstat(fhp->fd, &sb) != -1 && | |
160 | ! (cur_off = lseek(fhp->fd, (off_t)0, SEEK_CUR)) != -1 && | |
161 | ! cur_off <= sb.st_size); | |
162 | ! } | |
163 | ! #endif | |
164 | ||
165 | for (taddr = addr, | |
166 | offset = 0; offset < len; taddr += nw, offset += nw) | |
167 | *************** | |
168 | *** 155,157 **** | |
169 | --- 199,288 ---- | |
170 | *nwp = len; | |
171 | return (0); | |
172 | } | |
173 | + | |
174 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
175 | + /* | |
176 | + * __os_zerofill -- | |
177 | + * Zero out bytes in the file. | |
178 | + * | |
179 | + * Pages allocated by writing pages past end-of-file are not zeroed, | |
180 | + * on some systems. Recovery could theoretically be fooled by a page | |
181 | + * showing up that contained garbage. In order to avoid this, we | |
182 | + * have to write the pages out to disk, and flush them. The reason | |
183 | + * for the flush is because if we don't sync, the allocation of another | |
184 | + * page subsequent to this one might reach the disk first, and if we | |
185 | + * crashed at the right moment, leave us with this page as the one | |
186 | + * allocated by writing a page past it in the file. | |
187 | + */ | |
188 | + static int | |
189 | + __os_zerofill(dbenv, fhp) | |
190 | + DB_ENV *dbenv; | |
191 | + DB_FH *fhp; | |
192 | + { | |
193 | + off_t stat_offset, write_offset; | |
194 | + size_t blen, nw; | |
195 | + u_int32_t bytes, mbytes; | |
196 | + int group_sync, need_free, ret; | |
197 | + u_int8_t buf[8 * 1024], *bp; | |
198 | + | |
199 | + /* Calculate the byte offset of the next write. */ | |
200 | + write_offset = (off_t)fhp->pgno * fhp->pgsize + fhp->offset; | |
201 | + | |
202 | + /* Stat the file. */ | |
203 | + if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0) | |
204 | + return (ret); | |
205 | + stat_offset = (off_t)mbytes * MEGABYTE + bytes; | |
206 | + | |
207 | + /* Check if the file is large enough. */ | |
208 | + if (stat_offset >= write_offset) | |
209 | + return (0); | |
210 | + | |
211 | + /* Get a large buffer if we're writing lots of data. */ | |
212 | + #undef ZF_LARGE_WRITE | |
213 | + #define ZF_LARGE_WRITE (64 * 1024) | |
214 | + if (write_offset - stat_offset > ZF_LARGE_WRITE) { | |
215 | + if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &bp)) != 0) | |
216 | + return (ret); | |
217 | + blen = ZF_LARGE_WRITE; | |
218 | + need_free = 1; | |
219 | + } else { | |
220 | + bp = buf; | |
221 | + blen = sizeof(buf); | |
222 | + need_free = 0; | |
223 | + memset(buf, 0, sizeof(buf)); | |
224 | + } | |
225 | + | |
226 | + /* Seek to the current end of the file. */ | |
227 | + if ((ret = __os_seek( | |
228 | + dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0) | |
229 | + goto err; | |
230 | + | |
231 | + /* | |
232 | + * Hash is the only access method that allocates groups of pages. Hash | |
233 | + * uses the existence of the last page in a group to signify the entire | |
234 | + * group is OK; so, write all the pages but the last one in the group, | |
235 | + * flush them to disk, then write the last one to disk and flush it. | |
236 | + */ | |
237 | + for (group_sync = 0; stat_offset < write_offset; group_sync = 1) { | |
238 | + if (write_offset - stat_offset <= blen) { | |
239 | + blen = (size_t)(write_offset - stat_offset); | |
240 | + if (group_sync && (ret = __os_fsync(dbenv, fhp)) != 0) | |
241 | + goto err; | |
242 | + } | |
243 | + if ((ret = __os_physwrite(dbenv, fhp, bp, blen, &nw)) != 0) | |
244 | + goto err; | |
245 | + stat_offset += blen; | |
246 | + } | |
247 | + if ((ret = __os_fsync(dbenv, fhp)) != 0) | |
248 | + goto err; | |
249 | + | |
250 | + /* Seek back to where we started. */ | |
251 | + mbytes = (u_int32_t)(write_offset / MEGABYTE); | |
252 | + bytes = (u_int32_t)(write_offset % MEGABYTE); | |
253 | + ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET); | |
254 | + | |
255 | + err: if (need_free) | |
256 | + __os_free(dbenv, bp); | |
257 | + return (ret); | |
258 | + } | |
259 | + #endif | |
260 | *** os/os_seek.c.orig Mon Jul 15 22:03:38 2002 | |
261 | --- os/os_seek.c Thu Sep 26 14:13:52 2002 | |
262 | *************** | |
263 | *** 68,74 **** | |
264 | } while (ret == EINTR); | |
265 | } | |
266 | ||
267 | ! if (ret != 0) | |
268 | __db_err(dbenv, "seek: %lu %d %d: %s", | |
269 | (u_long)pgsize * pageno + relative, | |
270 | isrewind, db_whence, strerror(ret)); | |
271 | --- 68,78 ---- | |
272 | } while (ret == EINTR); | |
273 | } | |
274 | ||
275 | ! if (ret == 0) { | |
276 | ! fhp->pgsize = pgsize; | |
277 | ! fhp->pgno = pageno; | |
278 | ! fhp->offset = relative; | |
279 | ! } else | |
280 | __db_err(dbenv, "seek: %lu %d %d: %s", | |
281 | (u_long)pgsize * pageno + relative, | |
282 | isrewind, db_whence, strerror(ret)); | |
283 | *** os_win32/os_rw.c.orig 2002/08/06 04:56:19 11.28 | |
284 | --- os_win32/os_rw.c 2002/09/26 18:10:20 | |
285 | *************** | |
286 | *** 20,25 **** | |
287 | --- 20,30 ---- | |
288 | ||
289 | #include "db_int.h" | |
290 | ||
291 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
292 | + static int __os_zerofill __P((DB_ENV *, DB_FH *)); | |
293 | + #endif | |
294 | + static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *)); | |
295 | + | |
296 | /* | |
297 | * __os_io -- | |
298 | * Do an I/O. | |
299 | *************** | |
300 | *** 54,59 **** | |
301 | --- 59,68 ---- | |
302 | case DB_IO_WRITE: | |
303 | if (DB_GLOBAL(j_write) != NULL) | |
304 | goto slow; | |
305 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
306 | + if (__os_fs_notzero()) | |
307 | + goto slow; | |
308 | + #endif | |
309 | if (!WriteFile(db_iop->fhp->handle, | |
310 | db_iop->buf, (DWORD)db_iop->bytes, &nbytes, &over)) | |
311 | goto slow; | |
312 | *************** | |
313 | *** 149,154 **** | |
314 | --- 158,185 ---- | |
315 | size_t len; | |
316 | size_t *nwp; | |
317 | { | |
318 | + int ret; | |
319 | + | |
320 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
321 | + /* Zero-fill as necessary. */ | |
322 | + if (__os_fs_notzero() && (ret = __os_zerofill(dbenv, fhp)) != 0) | |
323 | + return (ret); | |
324 | + #endif | |
325 | + return (__os_physwrite(dbenv, fhp, addr, len, nwp)); | |
326 | + } | |
327 | + | |
328 | + /* | |
329 | + * __os_physwrite -- | |
330 | + * Physical write to a file handle. | |
331 | + */ | |
332 | + static int | |
333 | + __os_physwrite(dbenv, fhp, addr, len, nwp) | |
334 | + DB_ENV *dbenv; | |
335 | + DB_FH *fhp; | |
336 | + void *addr; | |
337 | + size_t len; | |
338 | + size_t *nwp; | |
339 | + { | |
340 | size_t offset; | |
341 | DWORD nw; | |
342 | int ret; | |
343 | *************** | |
344 | *** 180,182 **** | |
345 | --- 211,300 ---- | |
346 | *nwp = len; | |
347 | return (0); | |
348 | } | |
349 | + | |
350 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
351 | + /* | |
352 | + * __os_zerofill -- | |
353 | + * Zero out bytes in the file. | |
354 | + * | |
355 | + * Pages allocated by writing pages past end-of-file are not zeroed, | |
356 | + * on some systems. Recovery could theoretically be fooled by a page | |
357 | + * showing up that contained garbage. In order to avoid this, we | |
358 | + * have to write the pages out to disk, and flush them. The reason | |
359 | + * for the flush is because if we don't sync, the allocation of another | |
360 | + * page subsequent to this one might reach the disk first, and if we | |
361 | + * crashed at the right moment, leave us with this page as the one | |
362 | + * allocated by writing a page past it in the file. | |
363 | + */ | |
364 | + static int | |
365 | + __os_zerofill(dbenv, fhp) | |
366 | + DB_ENV *dbenv; | |
367 | + DB_FH *fhp; | |
368 | + { | |
369 | + unsigned __int64 stat_offset, write_offset; | |
370 | + size_t blen, nw; | |
371 | + u_int32_t bytes, mbytes; | |
372 | + int group_sync, need_free, ret; | |
373 | + u_int8_t buf[8 * 1024], *bp; | |
374 | + | |
375 | + /* Calculate the byte offset of the next write. */ | |
376 | + write_offset = (unsigned __int64)fhp->pgno * fhp->pgsize + fhp->offset; | |
377 | + | |
378 | + /* Stat the file. */ | |
379 | + if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0) | |
380 | + return (ret); | |
381 | + stat_offset = (unsigned __int64)mbytes * MEGABYTE + bytes; | |
382 | + | |
383 | + /* Check if the file is large enough. */ | |
384 | + if (stat_offset >= write_offset) | |
385 | + return (0); | |
386 | + | |
387 | + /* Get a large buffer if we're writing lots of data. */ | |
388 | + #undef ZF_LARGE_WRITE | |
389 | + #define ZF_LARGE_WRITE (64 * 1024) | |
390 | + if (write_offset - stat_offset > ZF_LARGE_WRITE) { | |
391 | + if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &bp)) != 0) | |
392 | + return (ret); | |
393 | + blen = ZF_LARGE_WRITE; | |
394 | + need_free = 1; | |
395 | + } else { | |
396 | + bp = buf; | |
397 | + blen = sizeof(buf); | |
398 | + need_free = 0; | |
399 | + memset(buf, 0, sizeof(buf)); | |
400 | + } | |
401 | + | |
402 | + /* Seek to the current end of the file. */ | |
403 | + if ((ret = __os_seek( | |
404 | + dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0) | |
405 | + goto err; | |
406 | + | |
407 | + /* | |
408 | + * Hash is the only access method that allocates groups of pages. Hash | |
409 | + * uses the existence of the last page in a group to signify the entire | |
410 | + * group is OK; so, write all the pages but the last one in the group, | |
411 | + * flush them to disk, then write the last one to disk and flush it. | |
412 | + */ | |
413 | + for (group_sync = 0; stat_offset < write_offset; group_sync = 1) { | |
414 | + if (write_offset - stat_offset <= blen) { | |
415 | + blen = (size_t)(write_offset - stat_offset); | |
416 | + if (group_sync && (ret = __os_fsync(dbenv, fhp)) != 0) | |
417 | + goto err; | |
418 | + } | |
419 | + if ((ret = __os_physwrite(dbenv, fhp, bp, blen, &nw)) != 0) | |
420 | + goto err; | |
421 | + stat_offset += blen; | |
422 | + } | |
423 | + if ((ret = __os_fsync(dbenv, fhp)) != 0) | |
424 | + goto err; | |
425 | + | |
426 | + /* Seek back to where we started. */ | |
427 | + mbytes = (u_int32_t)(write_offset / MEGABYTE); | |
428 | + bytes = (u_int32_t)(write_offset % MEGABYTE); | |
429 | + ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET); | |
430 | + | |
431 | + err: if (need_free) | |
432 | + __os_free(dbenv, bp); | |
433 | + return (ret); | |
434 | + } | |
435 | + #endif | |
436 | *** os_win32/os_seek.c.orig 2002/08/06 04:56:20 11.17 | |
437 | --- os_win32/os_seek.c 2002/09/26 18:10:20 | |
438 | *************** | |
439 | *** 79,88 **** | |
440 | __os_win32_errno() : 0; | |
441 | } | |
442 | ||
443 | ! if (ret != 0) | |
444 | __db_err(dbenv, "seek: %lu %d %d: %s", | |
445 | (u_long)pgsize * pageno + relative, | |
446 | isrewind, db_whence, strerror(ret)); | |
447 | ||
448 | return (ret); | |
449 | } | |
450 | --- 79,93 ---- | |
451 | __os_win32_errno() : 0; | |
452 | } | |
453 | ||
454 | ! if (ret == 0) { | |
455 | ! fhp->pgsize = pgsize; | |
456 | ! fhp->pgno = pageno; | |
457 | ! fhp->offset = relative; | |
458 | ! } else { | |
459 | __db_err(dbenv, "seek: %lu %d %d: %s", | |
460 | (u_long)pgsize * pageno + relative, | |
461 | isrewind, db_whence, strerror(ret)); | |
462 | + } | |
463 | ||
464 | return (ret); | |
465 | } |