mp_sync.c

/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998, 1999, 2000
 *      Sleepycat Software.  All rights reserved.
 */
#include "config.h"

#ifndef lint
static const char revid[] = "$Id$";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <stdlib.h>
#endif

#ifdef  HAVE_RPC
#include "db_server.h"
#endif

#include "db_int.h"
#include "db_shash.h"
#include "mp.h"

#ifdef HAVE_RPC
#include "gen_client_ext.h"
#include "rpc_client_ext.h"
#endif

static int __bhcmp __P((const void *, const void *));
static int __memp_fsync __P((DB_MPOOLFILE *));
static int __memp_sballoc __P((DB_ENV *, BH ***, u_int32_t *));

/*
 * CDB_memp_sync --
 *      Mpool sync function.
 */
int
CDB_memp_sync(dbenv, lsnp)
        DB_ENV *dbenv;
        DB_LSN *lsnp;
{
        BH *bhp, **bharray;
        DB_MPOOL *dbmp;
        DB_LSN tlsn;
        MPOOL *c_mp, *mp;
        MPOOLFILE *mfp;
        u_int32_t ar_cnt, i, ndirty;
        int ret, retry_done, retry_need, wrote;

#ifdef HAVE_RPC
        if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
                return (__dbcl_memp_sync(dbenv, lsnp));
#endif

        PANIC_CHECK(dbenv);
        ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);

        dbmp = dbenv->mp_handle;
        mp = dbmp->reginfo[0].primary;

        if (!LOGGING_ON(dbenv)) {
                CDB___db_err(dbenv, "CDB_memp_sync: requires logging");
                return (EINVAL);
        }

        /*
         * If no LSN is provided, flush the entire cache.
         *
         * !!!
         * Our current behavior is to flush the entire cache, so there's
         * nothing special we have to do here other than deal with NULL
         * pointers.
         */
        if (lsnp == NULL) {
                ZERO_LSN(tlsn);
                lsnp = &tlsn;
                F_SET(mp, MP_LSN_RETRY);
        }

        /*
         * Sync calls are single-threaded so that we don't have multiple
         * threads, with different checkpoint LSNs, walking the caches
         * and updating the checkpoint LSNs and how many buffers remain
         * to be written for the checkpoint.  This shouldn't be a problem;
         * any application that has multiple checkpoint threads isn't what
         * I'd call trustworthy.
         */
        MUTEX_LOCK(&mp->sync_mutex, dbenv->lockfhp);

        /*
         * If the application is asking about a previous call to CDB_memp_sync(),
         * and we haven't found any buffers that the application holding the
         * pin couldn't write, return yes or no based on the current count.
         * Note that if the application is asking about an LSN *smaller* than
         * one we've already handled or are currently handling, then we return
         * a result based on the count for the larger LSN.
         */
        R_LOCK(dbenv, dbmp->reginfo);
        if (!IS_ZERO_LSN(*lsnp) &&
            !F_ISSET(mp, MP_LSN_RETRY) && CDB_log_compare(lsnp, &mp->lsn) <= 0) {
                if (mp->lsn_cnt == 0) {
                        *lsnp = mp->lsn;
                        ret = 0;
                } else
                        ret = DB_INCOMPLETE;

                R_UNLOCK(dbenv, dbmp->reginfo);
                MUTEX_UNLOCK(&mp->sync_mutex);
                return (ret);
        }

        /*
         * Allocate room for a list of buffers, and decide how many buffers
         * we can pin down.
         *
         * !!!
         * Note: __memp_sballoc has released the region lock if we're not
         * continuing forward.
         */
        if ((ret =
            __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) {
                MUTEX_UNLOCK(&mp->sync_mutex);
                return (ret);
        }

        retry_done = 0;
retry:  retry_need = 0;
        /*
         * Start a new checkpoint.
         *
         * Save the LSN.  We know that it's a new LSN, a retry, or larger than
         * the one for which we were already doing a checkpoint.  (BTW, I don't
         * expect to see multiple LSNs from the same or multiple processes,
         * but You Just Never Know.  Responding as if they all called with the
         * largest of the LSNs specified makes everything work.)
         *
         * We don't currently use the LSN we save.  We could potentially save
         * the last-written LSN in each buffer header and use it to determine
         * what buffers need to be written.  The problem with this is that it's
         * sizeof(LSN) more bytes of buffer header.  We currently write all the
         * dirty buffers instead, but with a sufficiently large cache that's
         * going to be a problem.
         */
        mp->lsn = *lsnp;

        /*
         * Clear the global count of buffers waiting to be written, then walk
         * the list of files, clearing each file's count as well.
         *
         * Clear the retry flag.
         */
        mp->lsn_cnt = 0;
        for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
            mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
                mfp->lsn_cnt = 0;
        F_CLR(mp, MP_LSN_RETRY);

        /*
         * Walk each cache's list of buffers and mark all dirty buffers to be
         * written and all pinned buffers to be potentially written (we can't
         * know if they'll need to be written until the holder returns them to
         * the cache).  We do this in one pass while holding the region locked
         * so that processes can't make new buffers dirty, causing us to never
         * finish.  Since the application may have restarted the sync using a
         * different LSN value, clear any BH_WRITE flags that appear leftover
         * from previous calls.
         *
         * Keep a count of the total number of buffers we need to write in
         * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_cnt.
         */
        for (ar_cnt = 0, i = 0; i < mp->nreg; ++i) {
                c_mp = dbmp->reginfo[i].primary;
                for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
                    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
                        if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
                                F_SET(bhp, BH_WRITE);

                                ++mp->lsn_cnt;

                                mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
                                ++mfp->lsn_cnt;

                                /*
                                 * If the buffer isn't being used, we can write
                                 * it immediately, so increment its reference
                                 * count to lock it down, and save a reference
                                 * to it.
                                 *
                                 * If we've run out of space to store buffer
                                 * refs, we're screwed.  We don't want to
                                 * realloc the array while holding a region
                                 * lock, so we set a flag and deal with it later.
                                 */
                                if (bhp->ref == 0) {
                                        ++bhp->ref;
                                        bharray[ar_cnt] = bhp;

                                        if (++ar_cnt >= ndirty) {
                                                retry_need = 1;
                                                break;
                                        }
                                }
                        } else
                                if (F_ISSET(bhp, BH_WRITE))
                                        F_CLR(bhp, BH_WRITE);
                }
                if (ar_cnt >= ndirty)
                        break;
        }

        /* If there are no buffers we can write immediately, we're done. */
        if (ar_cnt == 0) {
                ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
                goto done;
        }

        R_UNLOCK(dbenv, dbmp->reginfo);

        /*
         * Sort the buffers we're going to write immediately.
         *
         * We try to write the buffers in file/page order: it should reduce
         * seeks by the underlying filesystem and possibly reduce the actual
         * number of writes.
         */
        if (ar_cnt > 1)
                qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);

        R_LOCK(dbenv, dbmp->reginfo);

        /* Walk the array, writing buffers. */
        for (i = 0; i < ar_cnt; ++i) {
                /*
                 * It's possible for a thread to have gotten the buffer since
                 * we listed it for writing.  If the reference count is still
                 * 1, we're the only ones using the buffer, go ahead and write.
                 * If it's >1, then skip the buffer and assume that it will be
                 * written when it's returned to the cache.
                 */
                if (bharray[i]->ref > 1) {
                        --bharray[i]->ref;
                        continue;
                }

                /* Write the buffer. */
                mfp = R_ADDR(dbmp->reginfo, bharray[i]->mf_offset);
                ret = CDB___memp_bhwrite(dbmp, mfp, bharray[i], NULL, &wrote);

                /* Release the buffer. */
                --bharray[i]->ref;

                if (ret == 0 && wrote)
                        continue;

                /*
                 * Any process syncing the shared memory buffer pool had best
                 * be able to write to any underlying file. Be understanding,
                 * but firm, on this point.
                 */
                if (ret == 0) {
                        CDB___db_err(dbenv, "%s: unable to flush page: %lu",
                            CDB___memp_fns(dbmp, mfp), (u_long)bharray[i]->pgno);
                        ret = EPERM;
                }

                /*
                 * On error, clear MPOOL->lsn and set MP_LSN_RETRY so that no
                 * future checkpoint return can depend on this failure.  Don't
                 * bother to reset/clear:
                 *
                 *      MPOOL->lsn_cnt
                 *      MPOOLFILE->lsn_cnt
                 *      buffer BH_WRITE flags
                 *
                 * they don't make any difference.
                 */
                ZERO_LSN(mp->lsn);
                F_SET(mp, MP_LSN_RETRY);

                /* Release any buffers we're still pinning down. */
                while (++i < ar_cnt)
                        --bharray[i]->ref;

                goto done;
        }

        ret = mp->lsn_cnt != 0 ? DB_INCOMPLETE : 0;

        /*
         * If there were too many buffers and we're not returning an error, we
         * re-try the checkpoint once -- since we allocated 80% of the total
         * buffer count, once should be enough. If it still doesn't work, some
         * other thread of control is dirtying buffers as fast as we're writing
         * them, and we might as well give up for now.  In the latter case, set
         * the global retry flag, we'll have to start from scratch on the next
         * checkpoint.
         */
        if (retry_need) {
                if (retry_done) {
                        ret = DB_INCOMPLETE;
                        F_SET(mp, MP_LSN_RETRY);
                } else {
                        retry_done = 1;
                        goto retry;
                }
        }

done:   R_UNLOCK(dbenv, dbmp->reginfo);
        MUTEX_UNLOCK(&mp->sync_mutex);

        CDB___os_free(bharray, ndirty * sizeof(BH *));

        return (ret);
}
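
#ifdef MP_SYNC_EXAMPLE
/*
 * Illustrative sketch only -- not part of the library, guarded by a
 * hypothetical MP_SYNC_EXAMPLE.  It shows one way a caller might drive
 * CDB_memp_sync() from a checkpoint loop: DB_INCOMPLETE means pinned
 * buffers remain unwritten, so the caller waits and retries.  The
 * CDB___os_sleep() call is an assumed helper from the OS abstraction
 * layer; substitute whatever delay primitive is available.
 */
static int
example_checkpoint(dbenv, ckp_lsnp)
        DB_ENV *dbenv;
        DB_LSN *ckp_lsnp;       /* Checkpoint LSN obtained from the log. */
{
        int ret;

        /* Retry while pinned buffers keep the checkpoint incomplete. */
        while ((ret = CDB_memp_sync(dbenv, ckp_lsnp)) == DB_INCOMPLETE)
                (void)CDB___os_sleep(dbenv, 1, 0);
        return (ret);
}
#endif /* MP_SYNC_EXAMPLE */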

/*
 * CDB_memp_fsync --
 *      Mpool file sync function.
 */
int
CDB_memp_fsync(dbmfp)
        DB_MPOOLFILE *dbmfp;
{
        DB_ENV *dbenv;
        DB_MPOOL *dbmp;
        int is_tmp;

        dbmp = dbmfp->dbmp;
        dbenv = dbmp->dbenv;

#ifdef HAVE_RPC
        if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
                return (__dbcl_memp_fsync(dbmfp));
#endif

        PANIC_CHECK(dbenv);

        /*
         * If this handle doesn't have a file descriptor that's open for
         * writing, or if the file is a temporary, there's no reason to
         * proceed further.
         */
        if (F_ISSET(dbmfp, MP_READONLY))
                return (0);

        R_LOCK(dbenv, dbmp->reginfo);
        is_tmp = F_ISSET(dbmfp->mfp, MP_TEMP);
        R_UNLOCK(dbenv, dbmp->reginfo);
        if (is_tmp)
                return (0);

        return (__memp_fsync(dbmfp));
}
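
#ifdef MP_SYNC_EXAMPLE
/*
 * Illustrative sketch only -- not part of the library.  A caller that needs
 * a file's on-disk image to be current (say, before handing the file to an
 * external reader) might flush it like this.  Whether DB_INCOMPLETE is an
 * error is the caller's policy; here it is simply reported separately.
 */
static int
example_flush_file(dbmfp, incompletep)
        DB_MPOOLFILE *dbmfp;
        int *incompletep;       /* Set if pinned buffers remain unwritten. */
{
        int ret;

        *incompletep = 0;
        if ((ret = CDB_memp_fsync(dbmfp)) == DB_INCOMPLETE) {
                *incompletep = 1;
                ret = 0;
        }
        return (ret);
}
#endif /* MP_SYNC_EXAMPLE */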

/*
 * CDB___mp_xxx_fh --
 *      Return a file descriptor for DB 1.85 compatibility locking.
 *
 * PUBLIC: int CDB___mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
 */
int
CDB___mp_xxx_fh(dbmfp, fhp)
        DB_MPOOLFILE *dbmfp;
        DB_FH **fhp;
{
        /*
         * This is a truly spectacular layering violation, intended ONLY to
         * support compatibility for the DB 1.85 DB->fd call.
         *
         * Sync the database file to disk, creating the file as necessary.
         *
         * We skip the MP_READONLY and MP_TEMP tests done by CDB_memp_fsync(3).
         * The MP_READONLY test isn't interesting because we will either
         * already have a file descriptor (we opened the database file for
         * reading) or we aren't readonly (we created the database which
         * requires write privileges).  The MP_TEMP test isn't interesting
         * because we want to write to the backing file regardless so that
         * we get a file descriptor to return.
         */
        *fhp = &dbmfp->fh;
        return (F_ISSET(&dbmfp->fh, DB_FH_VALID) ? 0 : __memp_fsync(dbmfp));
}

/*
 * __memp_fsync --
 *      Mpool file internal sync function.
 */
static int
__memp_fsync(dbmfp)
        DB_MPOOLFILE *dbmfp;
{
        BH *bhp, **bharray;
        DB_ENV *dbenv;
        DB_MPOOL *dbmp;
        MPOOL *c_mp, *mp;
        size_t mf_offset;
        u_int32_t ar_cnt, i, ndirty;
        int incomplete, ret, retry_done, retry_need, wrote;

        dbmp = dbmfp->dbmp;
        dbenv = dbmp->dbenv;
        mp = dbmp->reginfo[0].primary;

        R_LOCK(dbenv, dbmp->reginfo);

        /*
         * Allocate room for a list of buffers, and decide how many buffers
         * we can pin down.
         *
         * !!!
         * Note: __memp_sballoc has released our region lock if we're not
         * continuing forward.
         */
        if ((ret =
            __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0)
                return (ret);

        retry_done = 0;
retry:  retry_need = 0;
        /*
         * Walk each cache's list of buffers and mark all dirty buffers to be
         * written and all pinned buffers to be potentially written (we can't
         * know if they'll need to be written until the holder returns them to
         * the cache).  We do this in one pass while holding the region locked
         * so that processes can't make new buffers dirty, causing us to never
         * finish.
         */
        mf_offset = R_OFFSET(dbmp->reginfo, dbmfp->mfp);
        for (ar_cnt = 0, incomplete = 0, i = 0; i < mp->nreg; ++i) {
                c_mp = dbmp->reginfo[i].primary;
                for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
                    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
                        if (!F_ISSET(bhp, BH_DIRTY) ||
                            bhp->mf_offset != mf_offset)
                                continue;
                        if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
                                incomplete = 1;
                                continue;
                        }

                        /*
                         * If the buffer isn't being used, we can write
                         * it immediately, so increment its reference
                         * count to lock it down, and save a reference
                         * to it.
                         *
                         * If we've run out of space to store buffer refs,
                         * we're screwed.  We don't want to realloc the
                         * array while holding a region lock, so we set
                         * a flag and deal with it later.
                         */
                        ++bhp->ref;
                        bharray[ar_cnt] = bhp;
                        if (++ar_cnt >= ndirty) {
                                retry_need = 1;
                                break;
                        }
                }
                if (ar_cnt >= ndirty)
                        break;
        }

        /* If there are no buffers we can write immediately, we're done. */
        if (ar_cnt == 0) {
                ret = 0;
                goto done;
        }

        R_UNLOCK(dbenv, dbmp->reginfo);

        /* Sort the buffers we're going to write. */
        if (ar_cnt > 1)
                qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);

        R_LOCK(dbenv, dbmp->reginfo);

        /* Walk the array, writing buffers. */
        for (i = 0; i < ar_cnt;) {
                /*
                 * It's possible for a thread to have gotten the buffer since
                 * we listed it for writing.  If the reference count is still
                 * 1, we're the only ones using the buffer, go ahead and write.
                 * If it's >1, then skip the buffer and assume that it will be
                 * written when it's returned to the cache.
                 */
                if (bharray[i]->ref > 1) {
                        incomplete = 1;
                        --bharray[i++]->ref;
                        continue;
                }

                /* Write the buffer. */
                ret = CDB___memp_pgwrite(dbmp, dbmfp, bharray[i], NULL, &wrote);

                /* Release the buffer. */
                --bharray[i++]->ref;

                if (ret == 0) {
                        if (!wrote)
                                incomplete = 1;
                        continue;
                }

                /*
                 * On error:
                 *
                 * Release any buffers we're still pinning down.
                 */
                while (i < ar_cnt)
                        --bharray[i++]->ref;
                break;
        }

        /*
         * If there were too many buffers and we're not returning an error, we
         * re-try the flush once -- since we allocated 80% of the total
         * buffer count, once should be enough. If it still doesn't work, some
         * other thread of control is dirtying buffers as fast as we're writing
         * them, and we might as well give up.
         */
        if (retry_need) {
                if (retry_done)
                        incomplete = 1;
                else {
                        retry_done = 1;
                        goto retry;
                }
        }

done:   R_UNLOCK(dbenv, dbmp->reginfo);

        CDB___os_free(bharray, ndirty * sizeof(BH *));

        /*
         * Sync the underlying file as the last thing we do, so that the OS
         * has a maximal opportunity to flush buffers before we request it.
         *
         * !!!:
         * Don't lock the region around the sync, fsync(2) has no atomicity
         * issues.
         */
        if (ret == 0)
                ret = incomplete ?
                    DB_INCOMPLETE : CDB___os_fsync(dbenv, &dbmfp->fh);

        return (ret);
}

/*
 * __memp_sballoc --
 *      Allocate room for a list of buffers.
 */
static int
__memp_sballoc(dbenv, bharrayp, ndirtyp)
        DB_ENV *dbenv;
        BH ***bharrayp;
        u_int32_t *ndirtyp;
{
        DB_MPOOL *dbmp;
        MPOOL *c_mp, *mp;
        u_int32_t i, nclean, ndirty, maxpin;
        int ret;

        dbmp = dbenv->mp_handle;
        mp = dbmp->reginfo[0].primary;

        /*
         * We don't want to hold the region lock while we write the buffers,
         * so only lock it while we create a list.
         *
         * Walk through the list of caches, figuring out how many buffers
         * we're going to need.
         *
         * Make a point of not holding the region lock across the library
         * allocation call.
         */
        for (nclean = ndirty = 0, i = 0; i < mp->nreg; ++i) {
                c_mp = dbmp->reginfo[i].primary;
                ndirty += c_mp->stat.st_page_dirty;
                nclean += c_mp->stat.st_page_clean;
        }
        R_UNLOCK(dbenv, dbmp->reginfo);
        if (ndirty == 0) {
                *ndirtyp = 0;
                return (0);
        }

        /*
         * We don't want to pin down the entire buffer cache, otherwise we'll
         * starve threads needing new pages.  Don't pin down more than 80% of
         * the cache, making sure that we don't screw up just because only a
         * few pages have been created.
         */
        maxpin = ((ndirty + nclean) * 8) / 10;
        if (maxpin < 10)
                maxpin = 10;

        /*
         * Get a good-sized block of memory to hold buffer pointers; we don't
         * want to run out, but clamp the allocation if it would exceed the
         * number of buffers we're allowed to pin down.
         */
        ndirty += ndirty / 2 + 10;
        if (ndirty > maxpin)
                ndirty = maxpin;
        if ((ret =
            CDB___os_malloc(dbenv, ndirty * sizeof(BH *), NULL, bharrayp)) != 0)
                return (ret);

        *ndirtyp = ndirty;

        R_LOCK(dbenv, dbmp->reginfo);

        return (0);
}
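
/*
 * Worked example of the sizing arithmetic in __memp_sballoc (illustrative
 * figures, not from a real cache): with 1000 dirty and 1000 clean pages,
 * maxpin = (2000 * 8) / 10 = 1600, and the array is sized at
 * 1000 + 1000 / 2 + 10 = 1510 pointers, under the pin limit.  With 1500
 * dirty and only 100 clean pages, the request of 2260 exceeds
 * maxpin = (1600 * 8) / 10 = 1280 and is clamped to 1280, which is why the
 * callers above may need their retry pass.
 */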

static int
__bhcmp(p1, p2)
        const void *p1, *p2;
{
        BH *bhp1, *bhp2;

        bhp1 = *(BH * const *)p1;
        bhp2 = *(BH * const *)p2;

        /* Sort by file (shared memory pool offset). */
        if (bhp1->mf_offset < bhp2->mf_offset)
                return (-1);
        if (bhp1->mf_offset > bhp2->mf_offset)
                return (1);

        /*
         * !!!
         * Defend against badly written quicksort code calling the comparison
         * function with two identical pointers (e.g., WATCOM C++ (Power++)).
         */
        if (bhp1->pgno < bhp2->pgno)
                return (-1);
        if (bhp1->pgno > bhp2->pgno)
                return (1);
        return (0);
}
