Commit | Line | Data |
---|---|---|
4bc83bc6 AM |
1 | *** dbinc/os.h.orig 2002/03/27 04:34:55 11.14 |
2 | --- dbinc/os.h 2002/09/26 18:10:10 | |
3 | *************** | |
4 | *** 22,29 **** | |
5 | int fd; /* POSIX file descriptor. */ | |
6 | char *name; /* File name. */ | |
7 | ||
8 | u_int32_t log_size; /* XXX: Log file size. */ | |
9 | ! u_int32_t pagesize; /* XXX: Page size. */ | |
10 | ||
11 | #define DB_FH_NOSYNC 0x01 /* Handle doesn't need to be sync'd. */ | |
12 | #define DB_FH_UNLINK 0x02 /* Unlink on close */ | |
13 | --- 22,34 ---- | |
14 | int fd; /* POSIX file descriptor. */ | |
15 | char *name; /* File name. */ | |
16 | ||
17 | + u_int32_t pagesize; /* Underlying page size. */ | |
18 | + | |
19 | u_int32_t log_size; /* XXX: Log file size. */ | |
20 | ! | |
21 | ! u_int32_t pgno; /* Last seek. */ | |
22 | ! u_int32_t pgsize; | |
23 | ! u_int32_t offset; | |
24 | ||
25 | #define DB_FH_NOSYNC 0x01 /* Handle doesn't need to be sync'd. */ | |
26 | #define DB_FH_UNLINK 0x02 /* Unlink on close */ | |
27 | *** os/os_rw.c.orig 2002/07/12 18:56:52 11.24 | |
28 | --- os/os_rw.c 2002/09/16 20:46:14 11.25 | |
29 | *************** | |
30 | *** 35,40 **** | |
31 | --- 35,43 ---- | |
32 | { | |
33 | int ret; | |
34 | ||
35 | + /* Check for illegal usage. */ | |
36 | + DB_ASSERT(F_ISSET(db_iop->fhp, DB_FH_VALID) && db_iop->fhp->fd != -1); | |
37 | + | |
38 | #if defined(HAVE_PREAD) && defined(HAVE_PWRITE) | |
39 | switch (op) { | |
40 | case DB_IO_READ: | |
41 | *************** | |
42 | *** 95,100 **** | |
43 | --- 98,106 ---- | |
44 | int ret; | |
45 | u_int8_t *taddr; | |
46 | ||
47 | + /* Check for illegal usage. */ | |
48 | + DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1); | |
49 | + | |
50 | for (taddr = addr, | |
51 | offset = 0; offset < len; taddr += nr, offset += nr) { | |
52 | retry: if ((nr = DB_GLOBAL(j_read) != NULL ? | |
53 | *************** | |
54 | *** 131,136 **** | |
55 | --- 137,145 ---- | |
56 | ssize_t nw; | |
57 | int ret; | |
58 | u_int8_t *taddr; | |
59 | + | |
60 | + /* Check for illegal usage. */ | |
61 | + DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1); | |
62 | ||
63 | for (taddr = addr, | |
64 | offset = 0; offset < len; taddr += nw, offset += nw) | |
65 | *** os/os_rw.c.orig 2002/09/16 20:46:14 11.25 | |
66 | --- os/os_rw.c 2002/09/26 18:10:20 | |
67 | *************** | |
68 | *** 13,18 **** | |
69 | --- 13,19 ---- | |
70 | ||
71 | #ifndef NO_SYSTEM_INCLUDES | |
72 | #include <sys/types.h> | |
73 | + #include <sys/stat.h> | |
74 | ||
75 | #include <string.h> | |
76 | #include <unistd.h> | |
77 | *************** | |
78 | *** 20,25 **** | |
79 | --- 21,31 ---- | |
80 | ||
81 | #include "db_int.h" | |
82 | ||
83 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
84 | + static int __os_zerofill __P((DB_ENV *, DB_FH *)); | |
85 | + #endif | |
86 | + static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *)); | |
87 | + | |
88 | /* | |
89 | * __os_io -- | |
90 | * Do an I/O. | |
91 | *************** | |
92 | *** 49,54 **** | |
93 | --- 55,64 ---- | |
94 | case DB_IO_WRITE: | |
95 | if (DB_GLOBAL(j_write) != NULL) | |
96 | goto slow; | |
97 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
98 | + if (__os_fs_notzero()) | |
99 | + goto slow; | |
100 | + #endif | |
101 | *niop = pwrite(db_iop->fhp->fd, db_iop->buf, | |
102 | db_iop->bytes, (off_t)db_iop->pgno * db_iop->pagesize); | |
103 | break; | |
104 | *************** | |
105 | *** 133,145 **** | |
106 | size_t len; | |
107 | size_t *nwp; | |
108 | { | |
109 | size_t offset; | |
110 | ssize_t nw; | |
111 | int ret; | |
112 | u_int8_t *taddr; | |
113 | ||
114 | ! /* Check for illegal usage. */ | |
115 | ! DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1); | |
116 | ||
117 | for (taddr = addr, | |
118 | offset = 0; offset < len; taddr += nw, offset += nw) | |
119 | --- 143,189 ---- | |
120 | size_t len; | |
121 | size_t *nwp; | |
122 | { | |
123 | + /* Check for illegal usage. */ | |
124 | + DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1); | |
125 | + | |
126 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
127 | + /* Zero-fill as necessary. */ | |
128 | + if (__os_fs_notzero()) { | |
129 | + int ret; | |
130 | + if ((ret = __os_zerofill(dbenv, fhp)) != 0) | |
131 | + return (ret); | |
132 | + } | |
133 | + #endif | |
134 | + return (__os_physwrite(dbenv, fhp, addr, len, nwp)); | |
135 | + } | |
136 | + | |
137 | + /* | |
138 | + * __os_physwrite -- | |
139 | + * Physical write to a file handle. | |
140 | + */ | |
141 | + static int | |
142 | + __os_physwrite(dbenv, fhp, addr, len, nwp) | |
143 | + DB_ENV *dbenv; | |
144 | + DB_FH *fhp; | |
145 | + void *addr; | |
146 | + size_t len; | |
147 | + size_t *nwp; | |
148 | + { | |
149 | size_t offset; | |
150 | ssize_t nw; | |
151 | int ret; | |
152 | u_int8_t *taddr; | |
153 | ||
154 | ! #if defined(HAVE_FILESYSTEM_NOTZERO) && defined(DIAGNOSTIC) | |
155 | ! if (__os_fs_notzero()) { | |
156 | ! struct stat sb; | |
157 | ! off_t cur_off; | |
158 | ! | |
159 | ! DB_ASSERT(fstat(fhp->fd, &sb) != -1 && | |
160 | ! (cur_off = lseek(fhp->fd, (off_t)0, SEEK_CUR)) != -1 && | |
161 | ! cur_off <= sb.st_size); | |
162 | ! } | |
163 | ! #endif | |
164 | ||
165 | for (taddr = addr, | |
166 | offset = 0; offset < len; taddr += nw, offset += nw) | |
167 | *************** | |
168 | *** 155,157 **** | |
169 | --- 199,288 ---- | |
170 | *nwp = len; | |
171 | return (0); | |
172 | } | |
173 | + | |
174 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
175 | + /* | |
176 | + * __os_zerofill -- | |
177 | + * Zero out bytes in the file. | |
178 | + * | |
179 | + * Pages allocated by writing pages past end-of-file are not zeroed, | |
180 | + * on some systems. Recovery could theoretically be fooled by a page | |
181 | + * showing up that contained garbage. In order to avoid this, we | |
182 | + * have to write the pages out to disk, and flush them. The reason | |
183 | + * for the flush is because if we don't sync, the allocation of another | |
184 | + * page subsequent to this one might reach the disk first, and if we | |
185 | + * crashed at the right moment, leave us with this page as the one | |
186 | + * allocated by writing a page past it in the file. | |
187 | + */ | |
188 | + static int | |
189 | + __os_zerofill(dbenv, fhp) | |
190 | + DB_ENV *dbenv; | |
191 | + DB_FH *fhp; | |
192 | + { | |
193 | + off_t stat_offset, write_offset; | |
194 | + size_t blen, nw; | |
195 | + u_int32_t bytes, mbytes; | |
196 | + int group_sync, need_free, ret; | |
197 | + u_int8_t buf[8 * 1024], *bp; | |
198 | + | |
199 | + /* Calculate the byte offset of the next write. */ | |
200 | + write_offset = (off_t)fhp->pgno * fhp->pgsize + fhp->offset; | |
201 | + | |
202 | + /* Stat the file. */ | |
203 | + if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0) | |
204 | + return (ret); | |
205 | + stat_offset = (off_t)mbytes * MEGABYTE + bytes; | |
206 | + | |
207 | + /* Check if the file is large enough. */ | |
208 | + if (stat_offset >= write_offset) | |
209 | + return (0); | |
210 | + | |
211 | + /* Get a large buffer if we're writing lots of data. */ | |
212 | + #undef ZF_LARGE_WRITE | |
213 | + #define ZF_LARGE_WRITE (64 * 1024) | |
214 | + if (write_offset - stat_offset > ZF_LARGE_WRITE) { | |
215 | + if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &bp)) != 0) | |
216 | + return (ret); | |
217 | + blen = ZF_LARGE_WRITE; | |
218 | + need_free = 1; | |
219 | + } else { | |
220 | + bp = buf; | |
221 | + blen = sizeof(buf); | |
222 | + need_free = 0; | |
223 | + memset(buf, 0, sizeof(buf)); | |
224 | + } | |
225 | + | |
226 | + /* Seek to the current end of the file. */ | |
227 | + if ((ret = __os_seek( | |
228 | + dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0) | |
229 | + goto err; | |
230 | + | |
231 | + /* | |
232 | + * Hash is the only access method that allocates groups of pages. Hash | |
233 | + * uses the existence of the last page in a group to signify the entire | |
234 | + * group is OK; so, write all the pages but the last one in the group, | |
235 | + * flush them to disk, then write the last one to disk and flush it. | |
236 | + */ | |
237 | + for (group_sync = 0; stat_offset < write_offset; group_sync = 1) { | |
238 | + if (write_offset - stat_offset <= blen) { | |
239 | + blen = (size_t)(write_offset - stat_offset); | |
240 | + if (group_sync && (ret = __os_fsync(dbenv, fhp)) != 0) | |
241 | + goto err; | |
242 | + } | |
243 | + if ((ret = __os_physwrite(dbenv, fhp, bp, blen, &nw)) != 0) | |
244 | + goto err; | |
245 | + stat_offset += blen; | |
246 | + } | |
247 | + if ((ret = __os_fsync(dbenv, fhp)) != 0) | |
248 | + goto err; | |
249 | + | |
250 | + /* Seek back to where we started. */ | |
251 | + mbytes = (u_int32_t)(write_offset / MEGABYTE); | |
252 | + bytes = (u_int32_t)(write_offset % MEGABYTE); | |
253 | + ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET); | |
254 | + | |
255 | + err: if (need_free) | |
256 | + __os_free(dbenv, bp); | |
257 | + return (ret); | |
258 | + } | |
259 | + #endif | |
260 | *** os/os_seek.c.orig Mon Jul 15 22:03:38 2002 | |
261 | --- os/os_seek.c Thu Sep 26 14:13:52 2002 | |
262 | *************** | |
263 | *** 68,74 **** | |
264 | } while (ret == EINTR); | |
265 | } | |
266 | ||
267 | ! if (ret != 0) | |
268 | __db_err(dbenv, "seek: %lu %d %d: %s", | |
269 | (u_long)pgsize * pageno + relative, | |
270 | isrewind, db_whence, strerror(ret)); | |
271 | --- 68,78 ---- | |
272 | } while (ret == EINTR); | |
273 | } | |
274 | ||
275 | ! if (ret == 0) { | |
276 | ! fhp->pgsize = pgsize; | |
277 | ! fhp->pgno = pageno; | |
278 | ! fhp->offset = relative; | |
279 | ! } else | |
280 | __db_err(dbenv, "seek: %lu %d %d: %s", | |
281 | (u_long)pgsize * pageno + relative, | |
282 | isrewind, db_whence, strerror(ret)); | |
283 | *** os_win32/os_rw.c.orig 2002/08/06 04:56:19 11.28 | |
284 | --- os_win32/os_rw.c 2002/09/26 18:10:20 | |
285 | *************** | |
286 | *** 20,25 **** | |
287 | --- 20,30 ---- | |
288 | ||
289 | #include "db_int.h" | |
290 | ||
291 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
292 | + static int __os_zerofill __P((DB_ENV *, DB_FH *)); | |
293 | + #endif | |
294 | + static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *)); | |
295 | + | |
296 | /* | |
297 | * __os_io -- | |
298 | * Do an I/O. | |
299 | *************** | |
300 | *** 54,59 **** | |
301 | --- 59,68 ---- | |
302 | case DB_IO_WRITE: | |
303 | if (DB_GLOBAL(j_write) != NULL) | |
304 | goto slow; | |
305 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
306 | + if (__os_fs_notzero()) | |
307 | + goto slow; | |
308 | + #endif | |
309 | if (!WriteFile(db_iop->fhp->handle, | |
310 | db_iop->buf, (DWORD)db_iop->bytes, &nbytes, &over)) | |
311 | goto slow; | |
312 | *************** | |
313 | *** 149,154 **** | |
314 | --- 158,185 ---- | |
315 | size_t len; | |
316 | size_t *nwp; | |
317 | { | |
318 | + int ret; | |
319 | + | |
320 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
321 | + /* Zero-fill as necessary. */ | |
322 | + if (__os_fs_notzero() && (ret = __os_zerofill(dbenv, fhp)) != 0) | |
323 | + return (ret); | |
324 | + #endif | |
325 | + return (__os_physwrite(dbenv, fhp, addr, len, nwp)); | |
326 | + } | |
327 | + | |
328 | + /* | |
329 | + * __os_physwrite -- | |
330 | + * Physical write to a file handle. | |
331 | + */ | |
332 | + static int | |
333 | + __os_physwrite(dbenv, fhp, addr, len, nwp) | |
334 | + DB_ENV *dbenv; | |
335 | + DB_FH *fhp; | |
336 | + void *addr; | |
337 | + size_t len; | |
338 | + size_t *nwp; | |
339 | + { | |
340 | size_t offset; | |
341 | DWORD nw; | |
342 | int ret; | |
343 | *************** | |
344 | *** 180,182 **** | |
345 | --- 211,300 ---- | |
346 | *nwp = len; | |
347 | return (0); | |
348 | } | |
349 | + | |
350 | + #ifdef HAVE_FILESYSTEM_NOTZERO | |
351 | + /* | |
352 | + * __os_zerofill -- | |
353 | + * Zero out bytes in the file. | |
354 | + * | |
355 | + * Pages allocated by writing pages past end-of-file are not zeroed, | |
356 | + * on some systems. Recovery could theoretically be fooled by a page | |
357 | + * showing up that contained garbage. In order to avoid this, we | |
358 | + * have to write the pages out to disk, and flush them. The reason | |
359 | + * for the flush is because if we don't sync, the allocation of another | |
360 | + * page subsequent to this one might reach the disk first, and if we | |
361 | + * crashed at the right moment, leave us with this page as the one | |
362 | + * allocated by writing a page past it in the file. | |
363 | + */ | |
364 | + static int | |
365 | + __os_zerofill(dbenv, fhp) | |
366 | + DB_ENV *dbenv; | |
367 | + DB_FH *fhp; | |
368 | + { | |
369 | + unsigned __int64 stat_offset, write_offset; | |
370 | + size_t blen, nw; | |
371 | + u_int32_t bytes, mbytes; | |
372 | + int group_sync, need_free, ret; | |
373 | + u_int8_t buf[8 * 1024], *bp; | |
374 | + | |
375 | + /* Calculate the byte offset of the next write. */ | |
376 | + write_offset = (unsigned __int64)fhp->pgno * fhp->pgsize + fhp->offset; | |
377 | + | |
378 | + /* Stat the file. */ | |
379 | + if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0) | |
380 | + return (ret); | |
381 | + stat_offset = (unsigned __int64)mbytes * MEGABYTE + bytes; | |
382 | + | |
383 | + /* Check if the file is large enough. */ | |
384 | + if (stat_offset >= write_offset) | |
385 | + return (0); | |
386 | + | |
387 | + /* Get a large buffer if we're writing lots of data. */ | |
388 | + #undef ZF_LARGE_WRITE | |
389 | + #define ZF_LARGE_WRITE (64 * 1024) | |
390 | + if (write_offset - stat_offset > ZF_LARGE_WRITE) { | |
391 | + if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &bp)) != 0) | |
392 | + return (ret); | |
393 | + blen = ZF_LARGE_WRITE; | |
394 | + need_free = 1; | |
395 | + } else { | |
396 | + bp = buf; | |
397 | + blen = sizeof(buf); | |
398 | + need_free = 0; | |
399 | + memset(buf, 0, sizeof(buf)); | |
400 | + } | |
401 | + | |
402 | + /* Seek to the current end of the file. */ | |
403 | + if ((ret = __os_seek( | |
404 | + dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0) | |
405 | + goto err; | |
406 | + | |
407 | + /* | |
408 | + * Hash is the only access method that allocates groups of pages. Hash | |
409 | + * uses the existence of the last page in a group to signify the entire | |
410 | + * group is OK; so, write all the pages but the last one in the group, | |
411 | + * flush them to disk, then write the last one to disk and flush it. | |
412 | + */ | |
413 | + for (group_sync = 0; stat_offset < write_offset; group_sync = 1) { | |
414 | + if (write_offset - stat_offset <= blen) { | |
415 | + blen = (size_t)(write_offset - stat_offset); | |
416 | + if (group_sync && (ret = __os_fsync(dbenv, fhp)) != 0) | |
417 | + goto err; | |
418 | + } | |
419 | + if ((ret = __os_physwrite(dbenv, fhp, bp, blen, &nw)) != 0) | |
420 | + goto err; | |
421 | + stat_offset += blen; | |
422 | + } | |
423 | + if ((ret = __os_fsync(dbenv, fhp)) != 0) | |
424 | + goto err; | |
425 | + | |
426 | + /* Seek back to where we started. */ | |
427 | + mbytes = (u_int32_t)(write_offset / MEGABYTE); | |
428 | + bytes = (u_int32_t)(write_offset % MEGABYTE); | |
429 | + ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET); | |
430 | + | |
431 | + err: if (need_free) | |
432 | + __os_free(dbenv, bp); | |
433 | + return (ret); | |
434 | + } | |
435 | + #endif | |
436 | *** os_win32/os_seek.c.orig 2002/08/06 04:56:20 11.17 | |
437 | --- os_win32/os_seek.c 2002/09/26 18:10:20 | |
438 | *************** | |
439 | *** 79,88 **** | |
440 | __os_win32_errno() : 0; | |
441 | } | |
442 | ||
443 | ! if (ret != 0) | |
444 | __db_err(dbenv, "seek: %lu %d %d: %s", | |
445 | (u_long)pgsize * pageno + relative, | |
446 | isrewind, db_whence, strerror(ret)); | |
447 | ||
448 | return (ret); | |
449 | } | |
450 | --- 79,93 ---- | |
451 | __os_win32_errno() : 0; | |
452 | } | |
453 | ||
454 | ! if (ret == 0) { | |
455 | ! fhp->pgsize = pgsize; | |
456 | ! fhp->pgno = pageno; | |
457 | ! fhp->offset = relative; | |
458 | ! } else { | |
459 | __db_err(dbenv, "seek: %lu %d %d: %s", | |
460 | (u_long)pgsize * pageno + relative, | |
461 | isrewind, db_whence, strerror(ret)); | |
462 | + } | |
463 | ||
464 | return (ret); | |
465 | } |