/*- * See the file LICENSE for redistribution information. * * Copyright (c) 1996-2001 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint static const char revid[] = "$Id: mp_sync.c,v 11.46 2001/07/26 19:53:31 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include #include #endif #include "db_int.h" #include "db_shash.h" #include "mp.h" static int __bhcmp __P((const void *, const void *)); static int __memp_fsync_int __P((DB_MPOOLFILE *)); static int __memp_sballoc __P((DB_ENV *, BH ***, u_int32_t *)); /* * __memp_sync -- * Mpool sync function. * * PUBLIC: int __memp_sync __P((DB_ENV *, DB_LSN *)); */ int __memp_sync(dbenv, lsnp) DB_ENV *dbenv; DB_LSN *lsnp; { BH *bhp, **bharray; DB_MPOOL *dbmp; DB_LSN tlsn; MPOOL *c_mp, *mp; MPOOLFILE *mfp; u_int32_t ar_cnt, ar_max, ccnt, i; int ret, retry_done, retry_need, t_ret, wrote; PANIC_CHECK(dbenv); ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, "memp_sync", DB_INIT_MPOOL); if (lsnp != NULL) ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, "memp_sync", DB_INIT_LOG); dbmp = dbenv->mp_handle; mp = dbmp->reginfo[0].primary; /* * If no LSN is provided, flush the entire cache. * * !!! * Our current behavior is to flush the entire cache, so there's * nothing special we have to do here other than deal with NULL * pointers. */ if (lsnp == NULL) { ZERO_LSN(tlsn); lsnp = &tlsn; } /* * Sync calls are single-threaded so that we don't have multiple * threads, with different checkpoint LSNs, walking the caches * and updating the checkpoint LSNs and how many buffers remain * to be written for the checkpoint. This shouldn't be a problem, * any application that has multiple checkpoint threads isn't what * I'd call trustworthy. */ MUTEX_LOCK(dbenv, &mp->sync_mutex, dbenv->lockfhp); /* * If the application is asking about a previous call to memp_sync(), * and we haven't found any buffers that the application holding the * pin couldn't write, return yes or no based on the current count. * Note, if the application is asking about a LSN *smaller* than one * we've already handled or are currently handling, then we return a * result based on the count for the larger LSN. */ R_LOCK(dbenv, dbmp->reginfo); if (!IS_ZERO_LSN(*lsnp) && !F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) { if (mp->lsn_cnt == 0) { *lsnp = mp->lsn; ret = 0; } else ret = DB_INCOMPLETE; R_UNLOCK(dbenv, dbmp->reginfo); MUTEX_UNLOCK(dbenv, &mp->sync_mutex); return (ret); } /* * Allocate room for a list of buffers, and decide how many buffers * we can pin down. * * !!! * Note: __memp_sballoc has released the region lock if we're not * continuing forward. */ if ((ret = __memp_sballoc(dbenv, &bharray, &ar_max)) != 0 || ar_max == 0) { MUTEX_UNLOCK(dbenv, &mp->sync_mutex); return (ret); } retry_done = 0; retry: retry_need = 0; /* * Start a new checkpoint. * * Save the LSN. We know that it's a new LSN, a retry, or larger than * the one for which we were already doing a checkpoint. (BTW, I don't * expect to see multiple LSN's from the same or multiple processes, * but You Just Never Know. Responding as if they all called with the * largest of the LSNs specified makes everything work.) * * We don't currently use the LSN we save. We could potentially save * the last-written LSN in each buffer header and use it to determine * what buffers need to be written. The problem with this is that it's * sizeof(LSN) more bytes of buffer header. We currently write all the * dirty buffers instead, but with a sufficiently large cache that's * going to be a problem. * * Clear the global count of buffers waiting to be written, walk the * list of files clearing the count of buffers waiting to be written. * * Clear the retry flag. */ mp->lsn = *lsnp; mp->lsn_cnt = 0; for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) mfp->lsn_cnt = 0; F_CLR(mp, MP_LSN_RETRY); /* * Walk each cache's list of buffers and mark all dirty buffers to be * written and all pinned buffers to be potentially written (we can't * know if they'll need to be written until the holder returns them to * the cache). We do this in one pass while holding the region locked * so that processes can't make new buffers dirty, causing us to never * finish. Since the application may have restarted the sync using a * different LSN value, clear any BH_SYNC | BH_SYNC_LOGFLSH flags that * appear leftover from previous calls. * * Keep a count of the total number of buffers we need to write in * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_count. */ for (ar_cnt = 0, i = 0; i < mp->nreg; ++i) { c_mp = dbmp->reginfo[i].primary; /* * Don't pin down more than 80% of the pages in any cache, * otherwise we'll starve threads needing new pages. If * only a small number of pages have been allocated from * the cache we can end up with a silly ccnt. There's no * way we can know exactly how many pages a cache will hold, * so we bound it at something insane -- if there are less * than 10 pages in the cache, someone screwed up pretty badly. */ ccnt = c_mp->stat.st_page_dirty + c_mp->stat.st_page_clean; if (ccnt >= 10) ccnt = (ccnt * 8) / 10; for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) { mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); /* * If this buffer belongs to a temp file, * ignore it. Checkpoint doesn't require * that temporary buffers be flushed and * the underlying buffer write routine may * not be able to write it anyway. */ if (F_ISSET(mfp, MP_TEMP)) continue; /* * Ignore files that aren't involved in DB's * transactional operations during checkpoints. */ if (lsnp != NULL && mfp->lsn_off == -1) continue; F_SET(bhp, BH_SYNC); ++mp->lsn_cnt; ++mfp->lsn_cnt; /* * If the buffer isn't being used, we can write * it immediately, so increment its reference * count to lock it down, and save a reference * to it. * * If we've run out space to store buffer refs, * we're screwed. We don't want to realloc the * array while holding a region lock, so we set * a flag and deal with it later. */ if (bhp->ref == 0 && !F_ISSET(bhp, BH_LOCKED)) { ++bhp->ref; bharray[ar_cnt] = bhp; if (++ar_cnt >= ar_max || ccnt-- == 0) { retry_need = 1; break; } } } else if (F_ISSET(bhp, BH_SYNC)) F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); } if (ar_cnt >= ar_max) break; } /* If there no buffers we can write immediately, we're done. */ if (ar_cnt == 0) { ret = mp->lsn_cnt ? DB_INCOMPLETE : 0; goto done; } R_UNLOCK(dbenv, dbmp->reginfo); /* * Sort the buffers we're going to write immediately. * * We try and write the buffers in file/page order: it should reduce * seeks by the underlying filesystem and possibly reduce the actual * number of writes. */ if (ar_cnt > 1) qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp); /* * Flush the log. We have to ensure the log records reflecting the * changes on the database pages we're writing have already made it * to disk. We usually do that as we write each page, but if we * are going to write a large number of pages, repeatedly acquiring * the log region lock is going to be expensive. Flush the entire * log now, so that sync doesn't require any more log flushes. */ if (LOGGING_ON(dbenv) && (ret = dbenv->log_flush(dbenv, NULL)) != 0) { i = 0; R_LOCK(dbenv, dbmp->reginfo); goto err; } R_LOCK(dbenv, dbmp->reginfo); /* Walk the array, writing buffers. */ for (i = 0; i < ar_cnt; ++i) { bhp = bharray[i]; /* * It's possible for a thread to have acquired the buffer since * we listed it for writing, and even to have written it (or be * in the process of writing it). If the buffer is still dirty, * not currently being written and the reference count is 1, we * are the only ones using it, go ahead and write. Otherwise, * skip the buffer and assume it will be written, if necessary, * when it's returned to the cache. */ if (bhp->ref > 1 || !F_ISSET(bhp, BH_DIRTY) || F_ISSET(bhp, BH_LOCKED)) { --bhp->ref; continue; } mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); /* Write the buffer. */ ret = __memp_bhwrite(dbmp, mfp, bhp, 1, NULL, &wrote); /* Release the buffer. */ --bhp->ref; if (ret == 0 && wrote) continue; /* * Any process syncing the shared memory buffer pool had best * be able to write to any underlying file. Be understanding, * but firm, on this point. */ if (ret == 0) { __db_err(dbenv, "%s: unable to flush page: %lu", __memp_fns(dbmp, mfp), (u_long)bhp->pgno); ret = EPERM; } /* We've already released this buffer. */ ++i; err: /* * On error, clear MPOOL->lsn and set MP_LSN_RETRY so that no * future checkpoint return can depend on this failure. Clear * the buffer's BH_SYNC flag, because it's used to determine * if lsn_cnt values are incremented/decremented. Don't bother * to reset/clear: * * MPOOL->lsn_cnt * MPOOLFILE->lsn_cnt * * they don't make any difference. */ ZERO_LSN(mp->lsn); F_SET(mp, MP_LSN_RETRY); /* Release any buffers we're still pinning down. */ for (; i < ar_cnt; ++i) { bhp = bharray[i]; --bhp->ref; F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); } goto done; } ret = mp->lsn_cnt != 0 ? DB_INCOMPLETE : 0; /* * If there were too many buffers and we're not returning an error, we * re-try the checkpoint once -- since we can clear 80% of the dirty * pages in any single cache, once should be enough. (The only way we * can fail is if our original calculation of the total number of dirty * pages in all the caches was too small to hold even half of the * total dirty pages currently in the cache. Which implies that some * thread of control dirtied a lot of buffers when we released our lock * to allocate the buffer header array.) If we do fail, set the global * retry flag, we have to start from scratch on the next checkpoint. */ if (retry_need) { if (retry_done) { ret = DB_INCOMPLETE; F_SET(mp, MP_LSN_RETRY); } else { retry_done = 1; goto retry; } } done: if (dbmp->extents != 0 && (t_ret = __memp_close_flush_files(dbmp)) != 0 && ret == 0) ret = t_ret; R_UNLOCK(dbenv, dbmp->reginfo); MUTEX_UNLOCK(dbenv, &mp->sync_mutex); __os_free(dbenv, bharray, ar_max * sizeof(BH *)); return (ret); } /* * __memp_fsync -- * Mpool file sync function. * * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *)); */ int __memp_fsync(dbmfp) DB_MPOOLFILE *dbmfp; { DB_ENV *dbenv; DB_MPOOL *dbmp; int is_tmp; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; PANIC_CHECK(dbenv); /* * If this handle doesn't have a file descriptor that's open for * writing, or if the file is a temporary, there's no reason to * proceed further. */ if (F_ISSET(dbmfp, MP_READONLY)) return (0); R_LOCK(dbenv, dbmp->reginfo); is_tmp = F_ISSET(dbmfp->mfp, MP_TEMP) ? 1 : 0; R_UNLOCK(dbenv, dbmp->reginfo); if (is_tmp) return (0); return (__memp_fsync_int(dbmfp)); } /* * __mp_xxx_fh -- * Return a file descriptor for DB 1.85 compatibility locking. * * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **)); */ int __mp_xxx_fh(dbmfp, fhp) DB_MPOOLFILE *dbmfp; DB_FH **fhp; { /* * This is a truly spectacular layering violation, intended ONLY to * support compatibility for the DB 1.85 DB->fd call. * * Sync the database file to disk, creating the file as necessary. * * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3). * The MP_READONLY test isn't interesting because we will either * already have a file descriptor (we opened the database file for * reading) or we aren't readonly (we created the database which * requires write privileges). The MP_TEMP test isn't interesting * because we want to write to the backing file regardless so that * we get a file descriptor to return. */ *fhp = dbmfp->fhp; return (F_ISSET(dbmfp->fhp, DB_FH_VALID) ? 0 : __memp_fsync_int(dbmfp)); } /* * __memp_fsync_int -- * Mpool file internal sync function. */ static int __memp_fsync_int(dbmfp) DB_MPOOLFILE *dbmfp; { BH *bhp, **bharray; DB_ENV *dbenv; DB_MPOOL *dbmp; MPOOL *c_mp, *mp; size_t mf_offset; u_int32_t ar_cnt, ar_max, ccnt, i; int incomplete, ret, retry_done, retry_need, wrote; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; mp = dbmp->reginfo[0].primary; R_LOCK(dbenv, dbmp->reginfo); /* * Allocate room for a list of buffers, and decide how many buffers * we can pin down. * * !!! * Note: __memp_sballoc has released our region lock if we're not * continuing forward. */ if ((ret = __memp_sballoc(dbenv, &bharray, &ar_max)) != 0 || ar_max == 0) return (ret); retry_done = 0; retry: retry_need = 0; /* * Walk each cache's list of buffers and mark all dirty buffers to be * written and all pinned buffers to be potentially written (we can't * know if they'll need to be written until the holder returns them to * the cache). We do this in one pass while holding the region locked * so that processes can't make new buffers dirty, causing us to never * finish. */ mf_offset = R_OFFSET(dbmp->reginfo, dbmfp->mfp); for (ar_cnt = 0, incomplete = 0, i = 0; i < mp->nreg; ++i) { c_mp = dbmp->reginfo[i].primary; /* * Don't pin down more than 80% of the pages in any cache, * otherwise we'll starve threads needing new pages. If * only a small number of pages have been allocated from * the cache we can end up with a silly ccnt. There's no * way we can know exactly how many pages a cache will hold, * so we bound it at something insane -- if there are less * than 10 pages in the cache, someone screwed up pretty badly. */ ccnt = c_mp->stat.st_page_dirty + c_mp->stat.st_page_clean; if (ccnt >= 10) ccnt = (ccnt * 8) / 10; for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { if (!F_ISSET(bhp, BH_DIRTY) || bhp->mf_offset != mf_offset) continue; if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) { incomplete = 1; continue; } /* * If the buffer isn't being used, we can write * it immediately, so increment its reference * count to lock it down, and save a reference * to it. * * If we've run out space to store buffer refs, * we're screwed. We don't want to realloc the * array while holding a region lock, so we set * a flag and deal with it later. */ ++bhp->ref; bharray[ar_cnt] = bhp; if (++ar_cnt >= ar_max || ccnt-- == 0) { retry_need = 1; break; } } if (ar_cnt >= ar_max) break; } /* If there no buffers we can write immediately, we're done. */ if (ar_cnt == 0) { ret = 0; goto done; } R_UNLOCK(dbenv, dbmp->reginfo); /* Sort the buffers we're going to write. */ if (ar_cnt > 1) qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp); R_LOCK(dbenv, dbmp->reginfo); /* Walk the array, writing buffers. */ for (i = 0; i < ar_cnt; ++i) { bhp = bharray[i]; /* * It's possible for a thread to have acquired the buffer since * we listed it for writing, and even to have written it (or be * in the process of writing it). If the buffer is still dirty, * not currently being written and the reference count is 1, we * are the only ones using it, go ahead and write. Otherwise, * skip the buffer and assume it will be written, if necessary, * when it's returned to the cache. */ if (bhp->ref > 1 || !F_ISSET(bhp, BH_DIRTY) || F_ISSET(bhp, BH_LOCKED)) { incomplete = 1; --bhp->ref; continue; } /* Write the buffer. */ ret = __memp_pgwrite(dbmp, dbmfp, bhp, NULL, &wrote); /* Release the buffer. */ --bhp->ref; if (ret == 0) { if (!wrote) incomplete = 1; continue; } /* * On error: * * Release any buffers we're still pinning down. */ while (++i < ar_cnt) --bharray[i]->ref; break; } /* * If there were too many buffers and we're not returning an error, we * re-try the flush once -- since we can clear 80% of the dirty * pages in any single cache, once should be enough. (The only way we * can fail is if our original calculation of the total number of dirty * pages in all the caches was too small to hold even half of the * total dirty pages currently in the cache. Which implies that some * thread of control dirtied a lot of buffers when we released our lock * to allocate the buffer header array.) */ if (retry_need) { if (retry_done) incomplete = 1; else { retry_done = 1; goto retry; } } done: R_UNLOCK(dbenv, dbmp->reginfo); __os_free(dbenv, bharray, ar_max * sizeof(BH *)); /* * Sync the underlying file as the last thing we do, so that the OS * has a maximal opportunity to flush buffers before we request it. * * !!!: * Don't lock the region around the sync, fsync(2) has no atomicity * issues. */ if (ret == 0) ret = incomplete ? DB_INCOMPLETE : __os_fsync(dbenv, dbmfp->fhp); return (ret); } /* * __memp_sballoc -- * Allocate room for a list of buffers. */ static int __memp_sballoc(dbenv, bharrayp, ar_maxp) DB_ENV *dbenv; BH ***bharrayp; u_int32_t *ar_maxp; { DB_MPOOL *dbmp; MPOOL *c_mp, *mp; u_int32_t i, ar_max; int ret; dbmp = dbenv->mp_handle; mp = dbmp->reginfo[0].primary; /* * We don't want to hold the region lock while we write the buffers, * so only lock it while we create a list. * * Walk through the list of caches, figuring out how many buffers * we're going to need. * * Make a point of not holding the region lock across the library * allocation call. */ for (ar_max = 0, i = 0; i < mp->nreg; ++i) { c_mp = dbmp->reginfo[i].primary; ar_max += c_mp->stat.st_page_dirty; } R_UNLOCK(dbenv, dbmp->reginfo); /* If there's nothing to write, we're done. */ if (ar_max == 0) { *ar_maxp = 0; return (0); } /* * We may dirty more buffers while doing the allocation, increase * the array by 25%, or, by 10 in any case. */ ar_max += ar_max / 4 + 10; if ((ret = __os_malloc(dbenv, ar_max * sizeof(BH *), bharrayp)) != 0) return (ret); *ar_maxp = ar_max; R_LOCK(dbenv, dbmp->reginfo); return (0); } /* * __memp_close_flush_files -- * Close files used to flush extents. * * The routine exists because we MUST close extents so they may be removed * when empty. When flushing pages for extents not opened by this process * the extents will get opened by __memp_bhwrite and we close them here. * * PUBLIC: int __memp_close_flush_files __P((DB_MPOOL *)); */ int __memp_close_flush_files(dbmp) DB_MPOOL *dbmp; { DB_MPOOLFILE *dbmfp, *next; int ret; for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); dbmfp != NULL; dbmfp = next) { next = TAILQ_NEXT(dbmfp, q); if (F_ISSET(dbmfp, MP_FLUSH) && F_ISSET(dbmfp->mfp, MP_EXTENT)) if ((ret = __memp_fclose_int(dbmfp, 0, 0)) != 0) return (ret); } return (0); } static int __bhcmp(p1, p2) const void *p1, *p2; { BH *bhp1, *bhp2; bhp1 = *(BH * const *)p1; bhp2 = *(BH * const *)p2; /* Sort by file (shared memory pool offset). */ if (bhp1->mf_offset < bhp2->mf_offset) return (-1); if (bhp1->mf_offset > bhp2->mf_offset) return (1); /* * !!! * Defend against badly written quicksort code calling the comparison * function with two identical pointers (e.g., WATCOM C++ (Power++)). */ if (bhp1->pgno < bhp2->pgno) return (-1); if (bhp1->pgno > bhp2->pgno) return (1); return (0); }