patch.4.1.24.5

   1 *** dbinc/os.h.orig     2002/03/27 04:34:55     11.14
   2 --- dbinc/os.h  2002/09/26 18:10:10
   3 ***************
   4 *** 22,29 ****
   5         int       fd;                   /* POSIX file descriptor. */
   6         char    *name;                  /* File name. */
   7
   8         u_int32_t log_size;             /* XXX: Log file size. */
   9 !       u_int32_t pagesize;             /* XXX: Page size. */
  10
  11   #define       DB_FH_NOSYNC    0x01            /* Handle doesn't need to be sync'd. */
  12   #define       DB_FH_UNLINK    0x02            /* Unlink on close */
  13 --- 22,34 ----
  14         int       fd;                   /* POSIX file descriptor. */
  15         char    *name;                  /* File name. */
  16
  17 +       u_int32_t pagesize;             /* Underlying page size. */
  18 +
  19         u_int32_t log_size;             /* XXX: Log file size. */
  20 !
  21 !       u_int32_t pgno;                 /* Last seek. */
  22 !       u_int32_t pgsize;
  23 !       u_int32_t offset;
  24
  25   #define       DB_FH_NOSYNC    0x01            /* Handle doesn't need to be sync'd. */
  26   #define       DB_FH_UNLINK    0x02            /* Unlink on close */
  27 *** os/os_rw.c.orig     2002/07/12 18:56:52     11.24
  28 --- os/os_rw.c  2002/09/16 20:46:14     11.25
  29 ***************
  30 *** 35,40 ****
  31 --- 35,43 ----
  32   {
  33         int ret;
  34
  35 +       /* Check for illegal usage. */
  36 +       DB_ASSERT(F_ISSET(db_iop->fhp, DB_FH_VALID) && db_iop->fhp->fd != -1);
  37 +
  38   #if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
  39         switch (op) {
  40         case DB_IO_READ:
  41 ***************
  42 *** 95,100 ****
  43 --- 98,106 ----
  44         int ret;
  45         u_int8_t *taddr;
  46
  47 +       /* Check for illegal usage. */
  48 +       DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1);
  49 +
  50         for (taddr = addr,
  51             offset = 0; offset < len; taddr += nr, offset += nr) {
  52   retry:                if ((nr = DB_GLOBAL(j_read) != NULL ?
  53 ***************
  54 *** 131,136 ****
  55 --- 137,145 ----
  56         ssize_t nw;
  57         int ret;
  58         u_int8_t *taddr;
  59 +
  60 +       /* Check for illegal usage. */
  61 +       DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1);
  62
  63         for (taddr = addr,
  64             offset = 0; offset < len; taddr += nw, offset += nw)
  65 *** os/os_rw.c.orig     2002/09/16 20:46:14     11.25
  66 --- os/os_rw.c  2002/09/26 18:10:20
  67 ***************
  68 *** 13,18 ****
  69 --- 13,19 ----
  70
  71   #ifndef NO_SYSTEM_INCLUDES
  72   #include <sys/types.h>
  73 + #include <sys/stat.h>
  74
  75   #include <string.h>
  76   #include <unistd.h>
  77 ***************
  78 *** 20,25 ****
  79 --- 21,31 ----
  80
  81   #include "db_int.h"
  82
  83 + #ifdef HAVE_FILESYSTEM_NOTZERO
  84 + static int __os_zerofill __P((DB_ENV *, DB_FH *));
  85 + #endif
  86 + static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
  87 +
  88   /*
  89    * __os_io --
  90    *    Do an I/O.
  91 ***************
  92 *** 49,54 ****
  93 --- 55,64 ----
  94         case DB_IO_WRITE:
  95                 if (DB_GLOBAL(j_write) != NULL)
  96                         goto slow;
  97 + #ifdef HAVE_FILESYSTEM_NOTZERO
  98 +               if (__os_fs_notzero())
  99 +                       goto slow;
 100 + #endif
 101                 *niop = pwrite(db_iop->fhp->fd, db_iop->buf,
 102                     db_iop->bytes, (off_t)db_iop->pgno * db_iop->pagesize);
 103                 break;
 104 ***************
 105 *** 133,145 ****
 106         size_t len;
 107         size_t *nwp;
 108   {
 109         size_t offset;
 110         ssize_t nw;
 111         int ret;
 112         u_int8_t *taddr;
 113
 114 !       /* Check for illegal usage. */
 115 !       DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1);
 116
 117         for (taddr = addr,
 118             offset = 0; offset < len; taddr += nw, offset += nw)
 119 --- 143,189 ----
 120         size_t len;
 121         size_t *nwp;
 122   {
 123 +       /* Check for illegal usage. */
 124 +       DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1);
 125 +
 126 + #ifdef HAVE_FILESYSTEM_NOTZERO
 127 +       /* Zero-fill as necessary. */
 128 +       if (__os_fs_notzero()) {
 129 +               int ret;
 130 +               if ((ret = __os_zerofill(dbenv, fhp)) != 0)
 131 +                       return (ret);
 132 +       }
 133 + #endif
 134 +       return (__os_physwrite(dbenv, fhp, addr, len, nwp));
 135 + }
 136 +
 137 + /*
 138 +  * __os_physwrite --
 139 +  *    Physical write to a file handle.
 140 +  */
 141 + static int
 142 + __os_physwrite(dbenv, fhp, addr, len, nwp)
 143 +       DB_ENV *dbenv;
 144 +       DB_FH *fhp;
 145 +       void *addr;
 146 +       size_t len;
 147 +       size_t *nwp;
 148 + {
 149         size_t offset;
 150         ssize_t nw;
 151         int ret;
 152         u_int8_t *taddr;
 153
 154 ! #if defined(HAVE_FILESYSTEM_NOTZERO) && defined(DIAGNOSTIC)
 155 !       if (__os_fs_notzero()) {
 156 !               struct stat sb;
 157 !               off_t cur_off;
 158 !
 159 !               DB_ASSERT(fstat(fhp->fd, &sb) != -1 &&
 160 !                   (cur_off = lseek(fhp->fd, (off_t)0, SEEK_CUR)) != -1 &&
 161 !                   cur_off <= sb.st_size);
 162 !       }
 163 ! #endif
 164
 165         for (taddr = addr,
 166             offset = 0; offset < len; taddr += nw, offset += nw)
 167 ***************
 168 *** 155,157 ****
 169 --- 199,288 ----
 170         *nwp = len;
 171         return (0);
 172   }
 173 +
 174 + #ifdef HAVE_FILESYSTEM_NOTZERO
 175 + /*
 176 +  * __os_zerofill --
 177 +  *    Zero out the bytes between the current end of the file and the
 178 +  *    offset recorded by the last seek.  Returns 0 or an error value.
 179 +  *
 180 +  *    On some systems, pages allocated by writing past end-of-file are
 181 +  *    not zeroed, and recovery could theoretically be fooled by a page
 182 +  *    of garbage showing up.  To avoid this we write the pages out and
 183 +  *    flush them.  The flush is needed because, without it, the
 184 +  *    allocation of a later page might reach the disk first; a crash at
 185 +  *    the right moment would then leave this page as one that was
 186 +  *    allocated only by writing a page past it in the file.
 187 +  */
 188 + static int
 189 + __os_zerofill(dbenv, fhp)
 190 +       DB_ENV *dbenv;
 191 +       DB_FH *fhp;
 192 + {
 193 +       off_t stat_offset, write_offset;
 194 +       size_t blen, nw;
 195 +       u_int32_t bytes, mbytes;
 196 +       int group_sync, need_free, ret;
 197 +       u_int8_t buf[8 * 1024], *bp;
 198 +
 199 +       /* Byte offset of the next write, from the handle's seek fields. */
 200 +       write_offset = (off_t)fhp->pgno * fhp->pgsize + fhp->offset;
 201 +
 202 +       /* Get the current file size, split into megabytes and bytes. */
 203 +       if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0)
 204 +               return (ret);
 205 +       stat_offset = (off_t)mbytes * MEGABYTE + bytes;
 206 +
 207 +       /* Nothing to do if the file already reaches the write offset. */
 208 +       if (stat_offset >= write_offset)
 209 +               return (0);
 210 +
 211 +       /* Allocate a 64KB buffer for big gaps, else use the local 8KB one. */
 212 + #undef        ZF_LARGE_WRITE
 213 + #define       ZF_LARGE_WRITE  (64 * 1024)
 214 +       if (write_offset - stat_offset > ZF_LARGE_WRITE) {
 215 +               if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &bp)) != 0)
 216 +                           return (ret);
 217 +               blen = ZF_LARGE_WRITE;
 218 +               need_free = 1;
 219 +       } else {
 220 +               bp = buf;
 221 +               blen = sizeof(buf);
 222 +               need_free = 0;
 223 +               memset(buf, 0, sizeof(buf));
 224 +       }
 225 +
 226 +       /* Seek to the current end of the file. */
 227 +       if ((ret = __os_seek(
 228 +           dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0)
 229 +               goto err;
 230 +
 231 +       /*
 232 +        * Hash is the only access method that allocates groups of pages, and
 233 +        * it treats the existence of the last page in a group as proof that
 234 +        * the whole group is OK.  So: write everything but the final chunk,
 235 +        * flush that to disk, then write the final chunk and flush it too.
 236 +        */
 237 +       for (group_sync = 0; stat_offset < write_offset; group_sync = 1) {
 238 +               if (write_offset - stat_offset <= blen) {
 239 +                       blen = (size_t)(write_offset - stat_offset);
 240 +                       if (group_sync && (ret = __os_fsync(dbenv, fhp)) != 0)
 241 +                               goto err;
 242 +               }
 243 +               if ((ret = __os_physwrite(dbenv, fhp, bp, blen, &nw)) != 0)
 244 +                       goto err;
 245 +               stat_offset += blen;
 246 +       }
 247 +       if ((ret = __os_fsync(dbenv, fhp)) != 0)
 248 +               goto err;
 249 +
 250 +       /* Seek back to the offset the caller is about to write at. */
 251 +       mbytes = (u_int32_t)(write_offset / MEGABYTE);
 252 +       bytes = (u_int32_t)(write_offset % MEGABYTE);
 253 +       ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET);
 254 +
 255 + err:  if (need_free)
 256 +               __os_free(dbenv, bp);
 257 +       return (ret);
 258 + }
 259 + #endif
 260 *** os/os_seek.c.orig   Mon Jul 15 22:03:38 2002
 261 --- os/os_seek.c        Thu Sep 26 14:13:52 2002
 262 ***************
 263 *** 68,74 ****
 264                 } while (ret == EINTR);
 265         }
 266
 267 !       if (ret != 0)
 268                 __db_err(dbenv, "seek: %lu %d %d: %s",
 269                     (u_long)pgsize * pageno + relative,
 270                     isrewind, db_whence, strerror(ret));
 271 --- 68,78 ----
 272                 } while (ret == EINTR);
 273         }
 274
 275 !       if (ret == 0) {
 276 !               fhp->pgsize = pgsize;
 277 !               fhp->pgno = pageno;
 278 !               fhp->offset = relative;
 279 !       } else
 280                 __db_err(dbenv, "seek: %lu %d %d: %s",
 281                     (u_long)pgsize * pageno + relative,
 282                     isrewind, db_whence, strerror(ret));
 283 *** os_win32/os_rw.c.orig       2002/08/06 04:56:19     11.28
 284 --- os_win32/os_rw.c    2002/09/26 18:10:20
 285 ***************
 286 *** 20,25 ****
 287 --- 20,30 ----
 288
 289   #include "db_int.h"
 290
 291 + #ifdef HAVE_FILESYSTEM_NOTZERO
 292 + static int __os_zerofill __P((DB_ENV *, DB_FH *));
 293 + #endif
 294 + static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
 295 +
 296   /*
 297    * __os_io --
 298    *    Do an I/O.
 299 ***************
 300 *** 54,59 ****
 301 --- 59,68 ----
 302                 case DB_IO_WRITE:
 303                         if (DB_GLOBAL(j_write) != NULL)
 304                                 goto slow;
 305 + #ifdef HAVE_FILESYSTEM_NOTZERO
 306 +                       if (__os_fs_notzero())
 307 +                               goto slow;
 308 + #endif
 309                         if (!WriteFile(db_iop->fhp->handle,
 310                             db_iop->buf, (DWORD)db_iop->bytes, &nbytes, &over))
 311                                 goto slow;
 312 ***************
 313 *** 149,154 ****
 314 --- 158,185 ----
 315         size_t len;
 316         size_t *nwp;
 317   {
 318 +       int ret;
 319 +
 320 + #ifdef HAVE_FILESYSTEM_NOTZERO
 321 +       /* Zero-fill as necessary. */
 322 +       if (__os_fs_notzero() && (ret = __os_zerofill(dbenv, fhp)) != 0)
 323 +               return (ret);
 324 + #endif
 325 +       return (__os_physwrite(dbenv, fhp, addr, len, nwp));
 326 + }
 327 +
 328 + /*
 329 +  * __os_physwrite --
 330 +  *    Physical write to a file handle.
 331 +  */
 332 + static int
 333 + __os_physwrite(dbenv, fhp, addr, len, nwp)
 334 +       DB_ENV *dbenv;
 335 +       DB_FH *fhp;
 336 +       void *addr;
 337 +       size_t len;
 338 +       size_t *nwp;
 339 + {
 340         size_t offset;
 341         DWORD nw;
 342         int ret;
 343 ***************
 344 *** 180,182 ****
 345 --- 211,300 ----
 346         *nwp = len;
 347         return (0);
 348   }
 349 +
 350 + #ifdef HAVE_FILESYSTEM_NOTZERO
 351 + /*
 352 +  * __os_zerofill --
 353 +  *    Zero out the bytes between the current end of the file and the
 354 +  *    offset recorded by the last seek.  Returns 0 or an error value.
 355 +  *
 356 +  *    On some systems, pages allocated by writing past end-of-file are
 357 +  *    not zeroed, and recovery could theoretically be fooled by a page
 358 +  *    of garbage showing up.  To avoid this we write the pages out and
 359 +  *    flush them.  The flush is needed because, without it, the
 360 +  *    allocation of a later page might reach the disk first; a crash at
 361 +  *    the right moment would then leave this page as one that was
 362 +  *    allocated only by writing a page past it in the file.
 363 +  */
 364 + static int
 365 + __os_zerofill(dbenv, fhp)
 366 +       DB_ENV *dbenv;
 367 +       DB_FH *fhp;
 368 + {
 369 +       unsigned __int64 stat_offset, write_offset;
 370 +       size_t blen, nw;
 371 +       u_int32_t bytes, mbytes;
 372 +       int group_sync, need_free, ret;
 373 +       u_int8_t buf[8 * 1024], *bp;
 374 +
 375 +       /* Byte offset of the next write, from the handle's seek fields. */
 376 +       write_offset = (unsigned __int64)fhp->pgno * fhp->pgsize + fhp->offset;
 377 +
 378 +       /* Get the current file size, split into megabytes and bytes. */
 379 +       if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0)
 380 +               return (ret);
 381 +       stat_offset = (unsigned __int64)mbytes * MEGABYTE + bytes;
 382 +
 383 +       /* Nothing to do if the file already reaches the write offset. */
 384 +       if (stat_offset >= write_offset)
 385 +               return (0);
 386 +
 387 +       /* Allocate a 64KB buffer for big gaps, else use the local 8KB one. */
 388 + #undef        ZF_LARGE_WRITE
 389 + #define       ZF_LARGE_WRITE  (64 * 1024)
 390 +       if (write_offset - stat_offset > ZF_LARGE_WRITE) {
 391 +               if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &bp)) != 0)
 392 +                           return (ret);
 393 +               blen = ZF_LARGE_WRITE;
 394 +               need_free = 1;
 395 +       } else {
 396 +               bp = buf;
 397 +               blen = sizeof(buf);
 398 +               need_free = 0;
 399 +               memset(buf, 0, sizeof(buf));
 400 +       }
 401 +
 402 +       /* Seek to the current end of the file. */
 403 +       if ((ret = __os_seek(
 404 +           dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0)
 405 +               goto err;
 406 +
 407 +       /*
 408 +        * Hash is the only access method that allocates groups of pages, and
 409 +        * it treats the existence of the last page in a group as proof that
 410 +        * the whole group is OK.  So: write everything but the final chunk,
 411 +        * flush that to disk, then write the final chunk and flush it too.
 412 +        */
 413 +       for (group_sync = 0; stat_offset < write_offset; group_sync = 1) {
 414 +               if (write_offset - stat_offset <= blen) {
 415 +                       blen = (size_t)(write_offset - stat_offset);
 416 +                       if (group_sync && (ret = __os_fsync(dbenv, fhp)) != 0)
 417 +                               goto err;
 418 +               }
 419 +               if ((ret = __os_physwrite(dbenv, fhp, bp, blen, &nw)) != 0)
 420 +                       goto err;
 421 +               stat_offset += blen;
 422 +       }
 423 +       if ((ret = __os_fsync(dbenv, fhp)) != 0)
 424 +               goto err;
 425 +
 426 +       /* Seek back to the offset the caller is about to write at. */
 427 +       mbytes = (u_int32_t)(write_offset / MEGABYTE);
 428 +       bytes = (u_int32_t)(write_offset % MEGABYTE);
 429 +       ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET);
 430 +
 431 + err:  if (need_free)
 432 +               __os_free(dbenv, bp);
 433 +       return (ret);
 434 + }
 435 + #endif
 436 *** os_win32/os_seek.c.orig     2002/08/06 04:56:20     11.17
 437 --- os_win32/os_seek.c  2002/09/26 18:10:20
 438 ***************
 439 *** 79,88 ****
 440                     __os_win32_errno() : 0;
 441         }
 442
 443 !       if (ret != 0)
 444                 __db_err(dbenv, "seek: %lu %d %d: %s",
 445                     (u_long)pgsize * pageno + relative,
 446                     isrewind, db_whence, strerror(ret));
 447
 448         return (ret);
 449   }
 450 --- 79,93 ----
 451                     __os_win32_errno() : 0;
 452         }
 453
 454 !       if (ret == 0) {
 455 !               fhp->pgsize = pgsize;
 456 !               fhp->pgno = pageno;
 457 !               fhp->offset = relative;
 458 !       } else {
 459                 __db_err(dbenv, "seek: %lu %d %d: %s",
 460                     (u_long)pgsize * pageno + relative,
 461                     isrewind, db_whence, strerror(ret));
 462 +       }
 463
 464         return (ret);
 465   }