+*** dbinc/mp.h.orig 2004-02-02 10:24:53.000000000 -0800
+--- dbinc/mp.h 2004-02-02 10:26:27.000000000 -0800
+***************
+*** 149,154 ****
+--- 149,161 ----
+ * region lock).
+ */
+ DB_MPOOL_STAT stat; /* Per-cache mpool statistics. */
++
++ /*
++ * We track page puts so that we can decide when allocation is never
++ * going to succeed. We don't lock the field; all we care about
++ * is whether it changes.
++ */
++ u_int32_t put_counter; /* Count of page put calls. */
+ };
+
+ struct __db_mpool_hash {
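
Note on the new field: put_counter enables a lock-free progress test. The
allocator (see the mp_alloc.c changes below) snapshots the counter, works for
a while, and concludes that allocation can never succeed only if no put has
advanced it in the meantime. A minimal sketch of the idiom, assuming BDB's
MPOOL type; only put_counter itself comes from the patch, the helper is
illustrative:

    /*
     * Unlocked read is deliberate: we never act on the exact value,
     * only on whether any other thread has changed it since "start".
     */
    static int
    made_progress(MPOOL *c_mp, u_int32_t start)
    {
            return (c_mp->put_counter != start);
    }

    /* Caller: start = c_mp->put_counter; ... try to free buffers ...
     * if (!made_progress(c_mp, start)) giveup = 1; */
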
+*** mp/mp_fput.c.orig 2002-08-13 06:26:41.000000000 -0700
+--- mp/mp_fput.c 2004-02-02 10:22:35.000000000 -0800
+***************
+*** 19,24 ****
+--- 19,26 ----
+ #include "dbinc/db_shash.h"
+ #include "dbinc/mp.h"
+
++ static void __memp_reset_lru __P((DB_ENV *, REGINFO *));
++
+ /*
+ * __memp_fput --
+ * Mpool file put function.
+***************
+*** 198,202 ****
+--- 200,255 ----
+
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
++ /*
++ * On every buffer put we update the buffer generation number and check
++ * for wraparound.
++ */
++ if (++c_mp->lru_count == UINT32_T_MAX)
++ __memp_reset_lru(dbenv, dbmp->reginfo);
++
+ return (0);
+ }
++
++ /*
++ * __memp_reset_lru --
++ * Reset the cache LRU counter.
++ */
++ static void
++ __memp_reset_lru(dbenv, memreg)
++ DB_ENV *dbenv;
++ REGINFO *memreg;
++ {
++ BH *bhp;
++ DB_MPOOL_HASH *hp;
++ MPOOL *c_mp;
++ int bucket;
++
++ c_mp = memreg->primary;
++
++ /*
++ * Update the counter so all future allocations will start at the
++ * bottom.
++ */
++ c_mp->lru_count -= MPOOL_BASE_DECREMENT;
++
++ /* Adjust the priority of every buffer in the system. */
++ for (hp = R_ADDR(memreg, c_mp->htab),
++ bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
++ /*
++ * Skip empty buckets.
++ *
++ * We can check for empty buckets before locking as we
++ * only care if the pointer is zero or non-zero.
++ */
++ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
++ continue;
++
++ MUTEX_LOCK(dbenv, &hp->hash_mutex);
++ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
++ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
++ if (bhp->priority != UINT32_T_MAX &&
++ bhp->priority > MPOOL_BASE_DECREMENT)
++ bhp->priority -= MPOOL_BASE_DECREMENT;
++ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
++ }
++ }
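
The reset itself moves here from mp_alloc.c: the LRU clock now ticks on every
buffer put rather than on every allocation, and __memp_reset_lru no longer has
to drop and re-acquire the region lock around the bucket walk. The wraparound
technique is to shift the clock and every buffer priority down by the same
large constant, which preserves relative ordering. A self-contained sketch,
with BASE_DECREMENT standing in for BDB's MPOOL_BASE_DECREMENT (the stand-in
value here is an assumption):

    #include <stddef.h>
    #include <stdint.h>

    #define BASE_DECREMENT (UINT32_MAX - (UINT32_MAX / 4))  /* stand-in value */

    static void
    reset_lru(uint32_t *lru_count, uint32_t *prio, size_t n)
    {
            size_t i;

            /* Shift the clock down; future puts start at the bottom. */
            *lru_count -= BASE_DECREMENT;

            /*
             * Shift each priority by the same amount so relative order is
             * preserved.  UINT32_MAX marks "never discard" buffers and is
             * left untouched, as are priorities already near the bottom.
             */
            for (i = 0; i < n; ++i)
                    if (prio[i] != UINT32_MAX && prio[i] > BASE_DECREMENT)
                            prio[i] -= BASE_DECREMENT;
    }
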
+*** mp/mp_alloc.c.orig 2002-08-17 07:23:25.000000000 -0700
+--- mp/mp_alloc.c 2004-02-02 10:28:15.000000000 -0800
+***************
+*** 25,31 ****
+ } HS;
+
+ static void __memp_bad_buffer __P((DB_MPOOL_HASH *));
+- static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *));
+
+ /*
+ * __memp_alloc --
+--- 25,30 ----
+***************
+*** 50,57 ****
+ MPOOL *c_mp;
+ MPOOLFILE *bh_mfp;
+ size_t freed_space;
+! u_int32_t buckets, buffers, high_priority, max_na, priority;
+! int aggressive, ret;
+ void *p;
+
+ dbenv = dbmp->dbenv;
+--- 49,57 ----
+ MPOOL *c_mp;
+ MPOOLFILE *bh_mfp;
+ size_t freed_space;
+! u_int32_t buckets, buffers, high_priority, priority, put_counter;
+! u_int32_t total_buckets;
+! int aggressive, giveup, ret;
+ void *p;
+
+ dbenv = dbmp->dbenv;
+***************
+*** 59,76 ****
+ dbht = R_ADDR(memreg, c_mp->htab);
+ hp_end = &dbht[c_mp->htab_buckets];
+
+! buckets = buffers = 0;
+! aggressive = 0;
+
+ c_mp->stat.st_alloc++;
+
+ /*
+- * Get aggressive if we've tried to flush the number of pages as are
+- * in the system without finding space.
+- */
+- max_na = 5 * c_mp->htab_buckets;
+-
+- /*
+ * If we're allocating a buffer, and the one we're discarding is the
+ * same size, we don't want to waste the time to re-integrate it into
+ * the shared memory free list. If the DB_MPOOLFILE argument isn't
+--- 59,71 ----
+ dbht = R_ADDR(memreg, c_mp->htab);
+ hp_end = &dbht[c_mp->htab_buckets];
+
+! buckets = buffers = put_counter = total_buckets = 0;
+! aggressive = giveup = 0;
+! hp_tmp = NULL;
+
+ c_mp->stat.st_alloc++;
+
+ /*
+ * If we're allocating a buffer, and the one we're discarding is the
+ * same size, we don't want to waste the time to re-integrate it into
+ * the shared memory free list. If the DB_MPOOLFILE argument isn't
+***************
+*** 81,99 ****
+ len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;
+
+ R_LOCK(dbenv, memreg);
+-
+- /*
+- * On every buffer allocation we update the buffer generation number
+- * and check for wraparound.
+- */
+- if (++c_mp->lru_count == UINT32_T_MAX)
+- __memp_reset_lru(dbenv, memreg, c_mp);
+-
+ /*
+ * Anything newer than 1/10th of the buffer pool is ignored during
+ * allocation (unless allocation starts failing).
+ */
+- DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10);
+ high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;
+
+ /*
+--- 76,85 ----
+***************
+*** 120,129 ****
+ * We're not holding the region locked here, these statistics
+ * can't be trusted.
+ */
+! if (buckets != 0) {
+! if (buckets > c_mp->stat.st_alloc_max_buckets)
+! c_mp->stat.st_alloc_max_buckets = buckets;
+! c_mp->stat.st_alloc_buckets += buckets;
+ }
+ if (buffers != 0) {
+ if (buffers > c_mp->stat.st_alloc_max_pages)
+--- 106,116 ----
+ * We're not holding the region locked here, these statistics
+ * can't be trusted.
+ */
+! total_buckets += buckets;
+! if (total_buckets != 0) {
+! if (total_buckets > c_mp->stat.st_alloc_max_buckets)
+! c_mp->stat.st_alloc_max_buckets = total_buckets;
+! c_mp->stat.st_alloc_buckets += total_buckets;
+ }
+ if (buffers != 0) {
+ if (buffers > c_mp->stat.st_alloc_max_pages)
+***************
+*** 131,136 ****
+--- 118,129 ----
+ c_mp->stat.st_alloc_pages += buffers;
+ }
+ return (0);
++ } else if (giveup || c_mp->stat.st_pages == 0) {
++ R_UNLOCK(dbenv, memreg);
++
++ __db_err(dbenv,
++ "unable to allocate space from the buffer cache");
++ return (ret);
+ }
+
+ /*
+***************
+*** 138,163 ****
+ * we need. Reset our free-space counter.
+ */
+ freed_space = 0;
+
+ /*
+ * Walk the hash buckets and find the next two with potentially useful
+ * buffers. Free the buffer with the lowest priority from the buckets'
+ * chains.
+ */
+! for (hp_tmp = NULL;;) {
+ /* Check for wrap around. */
+ hp = &dbht[c_mp->last_checked++];
+ if (hp >= hp_end) {
+ c_mp->last_checked = 0;
+!
+! /*
+! * If we've gone through all of the hash buckets, try
+! * an allocation. If the cache is small, the old page
+! * size is small, and the new page size is large, we
+! * might have freed enough memory (but not 3 times the
+! * memory).
+! */
+! goto alloc;
+ }
+
+ /*
+--- 131,154 ----
+ * we need. Reset our free-space counter.
+ */
+ freed_space = 0;
++ total_buckets += buckets;
++ buckets = 0;
+
+ /*
+ * Walk the hash buckets and find the next two with potentially useful
+ * buffers. Free the buffer with the lowest priority from the buckets'
+ * chains.
+ */
+! for (;;) {
+!	/* All pages have been freed; make one last try. */
+! if (c_mp->stat.st_pages == 0)
+! goto alloc;
+!
+ /* Check for wrap around. */
+ hp = &dbht[c_mp->last_checked++];
+ if (hp >= hp_end) {
+ c_mp->last_checked = 0;
+! hp = &dbht[c_mp->last_checked++];
+ }
+
+ /*
+***************
+*** 172,210 ****
+ /*
+ * The failure mode is when there are too many buffers we can't
+ * write or there's not enough memory in the system. We don't
+! * have a metric for deciding if allocation has no possible way
+! * to succeed, so we don't ever fail, we assume memory will be
+! * available if we wait long enough.
+ *
+! * Get aggressive if we've tried to flush 5 times the number of
+! * hash buckets as are in the system -- it's possible we have
+! * been repeatedly trying to flush the same buffers, although
+! * it's unlikely. Aggressive means:
+ *
+ * a: set a flag to attempt to flush high priority buffers as
+ * well as other buffers.
+ * b: sync the mpool to force out queue extent pages. While we
+ * might not have enough space for what we want and flushing
+ * is expensive, why not?
+! * c: sleep for a second -- hopefully someone else will run and
+! * free up some memory. Try to allocate memory too, in case
+! * the other thread returns its memory to the region.
+! * d: look at a buffer in every hash bucket rather than choose
+ * the more preferable of two.
+ *
+ * !!!
+ * This test ignores pathological cases like no buffers in the
+ * system -- that shouldn't be possible.
+ */
+! if ((++buckets % max_na) == 0) {
+! aggressive = 1;
+!
+ R_UNLOCK(dbenv, memreg);
+
+! (void)__memp_sync_int(
+! dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
+!
+! (void)__os_sleep(dbenv, 1, 0);
+
+ R_LOCK(dbenv, memreg);
+ goto alloc;
+--- 163,221 ----
+ /*
+ * The failure mode is when there are too many buffers we can't
+ * write or there's not enough memory in the system. We don't
+!	 * have a way to know whether allocation can ever succeed.
+! * We fail if there were no pages returned to the cache after
+! * we've been trying for a relatively long time.
+ *
+!	 * Get aggressive if we've tried to flush as many hash buckets
+!	 * as are in the system and have not found any more
+! * space. Aggressive means:
+ *
+ * a: set a flag to attempt to flush high priority buffers as
+ * well as other buffers.
+ * b: sync the mpool to force out queue extent pages. While we
+ * might not have enough space for what we want and flushing
+ * is expensive, why not?
+! * c: look at a buffer in every hash bucket rather than choose
+ * the more preferable of two.
++ * d: start to think about giving up.
++ *
++	 * If we get here twice, sleep for a second; hopefully someone
++ * else will run and free up some memory.
++ *
++ * Always try to allocate memory too, in case some other thread
++ * returns its memory to the region.
+ *
+ * !!!
+ * This test ignores pathological cases like no buffers in the
+ * system -- that shouldn't be possible.
+ */
+! if ((++buckets % c_mp->htab_buckets) == 0) {
+! if (freed_space > 0)
+! goto alloc;
+ R_UNLOCK(dbenv, memreg);
+
+! switch (++aggressive) {
+! case 1:
+! break;
+! case 2:
+! put_counter = c_mp->put_counter;
+! /* FALLTHROUGH */
+! case 3:
+! case 4:
+! case 5:
+! case 6:
+! (void)__memp_sync_int(
+! dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
+!
+! (void)__os_sleep(dbenv, 1, 0);
+! break;
+! default:
+! aggressive = 1;
+! if (put_counter == c_mp->put_counter)
+! giveup = 1;
+! break;
+! }
+
+ R_LOCK(dbenv, memreg);
+ goto alloc;
+***************
+*** 277,283 ****
+ * thread may have acquired this buffer and incremented the ref
+ * count after we wrote it, in which case we can't have it.
+ *
+! * If there's a write error, avoid selecting this buffer again
+ * by making it the bucket's least-desirable buffer.
+ */
+ if (ret != 0 || bhp->ref != 0) {
+--- 288,295 ----
+ * thread may have acquired this buffer and incremented the ref
+ * count after we wrote it, in which case we can't have it.
+ *
+! * If there's a write error and we're having problems finding
+! * something to allocate, avoid selecting this buffer again
+ * by making it the bucket's least-desirable buffer.
+ */
+ if (ret != 0 || bhp->ref != 0) {
+***************
+*** 301,306 ****
+--- 313,320 ----
+
+ freed_space += __db_shsizeof(bhp);
+ __memp_bhfree(dbmp, hp, bhp, 1);
++ if (aggressive > 1)
++ aggressive = 1;
+
+ /*
+ * Unlock this hash bucket and re-acquire the region lock. If
+***************
+*** 362,415 ****
+ hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+ }
+
+- /*
+- * __memp_reset_lru --
+- * Reset the cache LRU counter.
+- */
+- static void
+- __memp_reset_lru(dbenv, memreg, c_mp)
+- DB_ENV *dbenv;
+- REGINFO *memreg;
+- MPOOL *c_mp;
+- {
+- BH *bhp;
+- DB_MPOOL_HASH *hp;
+- int bucket;
+-
+- /*
+- * Update the counter so all future allocations will start at the
+- * bottom.
+- */
+- c_mp->lru_count -= MPOOL_BASE_DECREMENT;
+-
+- /* Release the region lock. */
+- R_UNLOCK(dbenv, memreg);
+-
+- /* Adjust the priority of every buffer in the system. */
+- for (hp = R_ADDR(memreg, c_mp->htab),
+- bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+- /*
+- * Skip empty buckets.
+- *
+- * We can check for empty buckets before locking as we
+- * only care if the pointer is zero or non-zero.
+- */
+- if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+- continue;
+-
+- MUTEX_LOCK(dbenv, &hp->hash_mutex);
+- for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+- if (bhp->priority != UINT32_T_MAX &&
+- bhp->priority > MPOOL_BASE_DECREMENT)
+- bhp->priority -= MPOOL_BASE_DECREMENT;
+- MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+- }
+-
+- /* Reacquire the region lock. */
+- R_LOCK(dbenv, memreg);
+- }
+-
+ #ifdef DIAGNOSTIC
+ /*
+ * __memp_check_order --
+--- 376,381 ----
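
Taken together, the mp_alloc.c hunks replace the old "never fail, keep
waiting" policy with an escalation ladder: each time the loop has scanned
every hash bucket without freeing enough space it escalates, and it gives up
entirely only when a full ladder of sync-and-sleep rounds passes without a
single page being put back into the cache. A compilable model of that
decision, restating the patch's switch rather than quoting BDB code (the enum
and function are illustrative):

    enum action { FLUSH_HIGH_PRIO, SYNC_AND_SLEEP, CHECK_FOR_GIVEUP };

    static enum action
    escalate(int *aggressive, int progress_since_snapshot, int *giveup)
    {
            switch (++*aggressive) {
            case 1:
                    return (FLUSH_HIGH_PRIO);  /* Consider high-priority buffers too. */
            case 2:            /* The real code also snapshots put_counter here. */
            case 3: case 4: case 5: case 6:
                    return (SYNC_AND_SLEEP);   /* Flush the mpool, sleep a second. */
            default:
                    *aggressive = 1;           /* Restart the ladder... */
                    if (!progress_since_snapshot)
                            *giveup = 1;       /* ...unless nothing moved at all. */
                    return (CHECK_FOR_GIVEUP);
            }
    }

Note also the small `if (aggressive > 1) aggressive = 1;` hunk: a successful
free drops the ladder back to its first rung, so it only climbs while no
buffer at all can be reclaimed.
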
+*** dbreg/dbreg_rec.c.orig 2002-08-17 07:22:52.000000000 -0700
+--- dbreg/dbreg_rec.c 2003-11-08 10:59:19.000000000 -0800
+***************
+*** 174,192 ****
+ * Typically, closes should match an open which means
+ * that if this is a close, there should be a valid
+ * entry in the dbentry table when we get here,
+! * however there is an exception. If this is an
+ * OPENFILES pass, then we may have started from
+ * a log file other than the first, and the
+ * corresponding open appears in an earlier file.
+! * We can ignore that case, but all others are errors.
+ */
+ dbe = &dblp->dbentry[argp->fileid];
+ if (dbe->dbp == NULL && !dbe->deleted) {
+ /* No valid entry here. */
+! if ((argp->opcode != LOG_CLOSE &&
+! argp->opcode != LOG_RCLOSE) ||
+! (op != DB_TXN_OPENFILES &&
+! op !=DB_TXN_POPENFILES)) {
+ __db_err(dbenv,
+ "Improper file close at %lu/%lu",
+ (u_long)lsnp->file,
+--- 174,193 ----
+ * Typically, closes should match an open which means
+ * that if this is a close, there should be a valid
+ * entry in the dbentry table when we get here,
+! * however there are exceptions. 1. If this is an
+ * OPENFILES pass, then we may have started from
+ * a log file other than the first, and the
+ * corresponding open appears in an earlier file.
+! * 2. If we are undoing an open on an abort or
+! * recovery, it's possible that we failed after
+!			 * writing the log record, but before we actually entered
+! * a handle here.
+ */
+ dbe = &dblp->dbentry[argp->fileid];
+ if (dbe->dbp == NULL && !dbe->deleted) {
+ /* No valid entry here. */
+! if (DB_REDO(op) ||
+! argp->opcode == LOG_CHECKPOINT) {
+ __db_err(dbenv,
+ "Improper file close at %lu/%lu",
+ (u_long)lsnp->file,
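
The rewritten test narrows what counts as a corrupt log: a close record with
no matching table entry is an error only when rolling forward or when it was
generated by a checkpoint; undoing an open that failed between writing the
log record and installing the handle is now tolerated. As a hypothetical
predicate (DB_REDO, db_recops, and LOG_CHECKPOINT are BDB names; the helper
itself is illustrative and assumes the BDB internal headers):

    static int
    improper_close(db_recops op, u_int32_t opcode)
    {
            /*
             * Rolling forward, the matching open must already have been
             * replayed; checkpoint-generated closes must always match an
             * open.
             */
            return (DB_REDO(op) || opcode == LOG_CHECKPOINT);
    }
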
+*** env/env_recover.c.orig.1 2002-08-22 14:52:51.000000000 -0700
+--- env/env_recover.c 2003-11-15 08:20:59.000000000 -0800
+***************
+*** 232,243 ****
+ * we'll still need to do a vtruncate based on information we haven't
+ * yet collected.
+ */
+! if (ret == DB_NOTFOUND) {
+ ret = 0;
+! if (max_lsn == NULL)
+! goto done;
+! }
+! if (ret != 0)
+ goto err;
+
+ hi_txn = txnid;
+--- 232,240 ----
+ * we'll still need to do a vtruncate based on information we haven't
+ * yet collected.
+ */
+! if (ret == DB_NOTFOUND)
+ ret = 0;
+! else if (ret != 0)
+ goto err;
+
+ hi_txn = txnid;
+***************
+*** 331,337 ****
+
+ /* Find a low txnid. */
+ ret = 0;
+! do {
+ /* txnid is after rectype, which is a u_int32. */
+ memcpy(&txnid,
+ (u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid));
+--- 328,334 ----
+
+ /* Find a low txnid. */
+ ret = 0;
+! if (hi_txn != 0) do {
+ /* txnid is after rectype, which is a u_int32. */
+ memcpy(&txnid,
+ (u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid));
+***************
+*** 344,354 ****
+ * There are no transactions and we're not recovering to an LSN (see
+ * above), so there is nothing to do.
+ */
+! if (ret == DB_NOTFOUND) {
+ ret = 0;
+- if (max_lsn == NULL)
+- goto done;
+- }
+
+ /* Reset to the first lsn. */
+ if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0)
+--- 341,348 ----
+ * There are no transactions and we're not recovering to an LSN (see
+ * above), so there is nothing to do.
+ */
+! if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ /* Reset to the first lsn. */
+ if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0)
+***************
+*** 367,372 ****
+--- 361,370 ----
+ txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0)
+ goto err;
+
++ /* If there were no transactions, then we can bail out early. */
++ if (hi_txn == 0 && max_lsn == NULL)
++ goto done;
++
+ /*
+ * Pass #2.
+ *
+***************
+*** 483,488 ****
+--- 481,487 ----
+ if ((ret = __dbreg_close_files(dbenv)) != 0)
+ goto err;
+
++ done:
+ if (max_lsn != NULL) {
+ region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
+
+***************
+*** 538,544 ****
+ __db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
+ __db_err(dbenv, "%s %lx %s [%lu][%lu]",
+ "Maximum transaction ID",
+! ((DB_TXNHEAD *)txninfo)->maxid,
+ "Recovery checkpoint",
+ (u_long)region->last_ckp.file,
+ (u_long)region->last_ckp.offset);
+--- 537,544 ----
+ __db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
+ __db_err(dbenv, "%s %lx %s [%lu][%lu]",
+ "Maximum transaction ID",
+! txninfo == NULL ? TXN_MINIMUM :
+! ((DB_TXNHEAD *)txninfo)->maxid,
+ "Recovery checkpoint",
+ (u_long)region->last_ckp.file,
+ (u_long)region->last_ckp.offset);
+***************
+*** 550,556 ****
+ (u_long)lsn.file, (u_long)lsn.offset, pass);
+ }
+
+- done:
+ err: if (lockid != DB_LOCK_INVALIDID) {
+ if ((t_ret = __rep_unlockpages(dbenv, lockid)) != 0 && ret == 0)
+ ret = t_ret;
+--- 550,555 ----
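
The env_recover.c changes all serve one reordering: recovery used to bail out
before pass #1 when the log held no transactions, leaving files unregistered;
now the OPENFILES pass always runs, the early exit happens only afterwards,
and the done: label moves above the checkpoint bookkeeping so that last_ckp
is maintained on the early path too (the txninfo == NULL guard protects the
verbose message on paths where the transaction list was never built). A
condensed, hypothetical sketch of the new shape (the pass names are made up):

    if ((ret = pass_openfiles(&first_lsn, &last_lsn)) != 0)  /* Pass #1. */
            goto err;

    /* New: exit early only after files are registered. */
    if (hi_txn == 0 && max_lsn == NULL)
            goto done;

    if ((ret = pass_undo()) != 0)      /* Pass #2: roll backward. */
            goto err;
    if ((ret = pass_redo()) != 0)      /* Pass #3: roll forward. */
            goto err;

    done:  /* Checkpoint bookkeeping now runs on both paths. */
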