bt_search.c

Go to the documentation of this file.
00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1996, 1997, 1998, 1999, 2000
00005  *      Sleepycat Software.  All rights reserved.
00006  */
00007 /*
00008  * Copyright (c) 1990, 1993, 1994, 1995, 1996
00009  *      Keith Bostic.  All rights reserved.
00010  */
00011 /*
00012  * Copyright (c) 1990, 1993, 1994, 1995
00013  *      The Regents of the University of California.  All rights reserved.
00014  *
00015  * This code is derived from software contributed to Berkeley by
00016  * Mike Olson.
00017  *
00018  * Redistribution and use in source and binary forms, with or without
00019  * modification, are permitted provided that the following conditions
00020  * are met:
00021  * 1. Redistributions of source code must retain the above copyright
00022  *    notice, this list of conditions and the following disclaimer.
00023  * 2. Redistributions in binary form must reproduce the above copyright
00024  *    notice, this list of conditions and the following disclaimer in the
00025  *    documentation and/or other materials provided with the distribution.
00026  * 3. Neither the name of the University nor the names of its contributors
00027  *    may be used to endorse or promote products derived from this software
00028  *    without specific prior written permission.
00029  *
00030  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
00031  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00032  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00033  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
00034  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00035  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00036  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00037  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00038  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00039  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00040  * SUCH DAMAGE.
00041  */
00042 
00043 #include "config.h"
00044 
00045 #ifndef lint
00046 static const char revid[] = "$Id: bt__search_8c-source.html,v 1.1 2008/06/08 10:13:44 sebdiaz Exp $";
00047 #endif /* not lint */
00048 
00049 #ifndef NO_SYSTEM_INCLUDES
00050 #include <sys/types.h>
00051 
00052 #include <string.h>
00053 #endif
00054 
00055 #include "db_int.h"
00056 #include "db_page.h"
00057 #include "db_shash.h"
00058 #include "btree.h"
00059 #include "lock.h"
00060 
00061 /*
00062  * CDB___bam_search --
00063  *      Search a btree for a key.
00064  *
00065  * PUBLIC: int CDB___bam_search __P((DBC *,
00066  * PUBLIC:     const DBT *, u_int32_t, int, db_recno_t *, int *));
00067  */
00068 int
00069 CDB___bam_search(dbc, key, flags, stop, recnop, exactp)
00070         DBC *dbc;
00071         const DBT *key;
00072         u_int32_t flags;
00073         int stop, *exactp;
00074         db_recno_t *recnop;
00075 {
00076         BTREE *t;
00077         BTREE_CURSOR *cp;
00078         DB *dbp;
00079         DB_LOCK lock;
00080         PAGE *h;
00081         db_indx_t base, i, indx, lim;
00082         db_lockmode_t lock_mode;
00083         db_pgno_t pg;
00084         db_recno_t recno;
00085         int adjust, cmp, deloffset, ret, stack;
00086         int (*func) __P((const DBT *, const DBT *));
00087 
00088         dbp = dbc->dbp;
00089         cp = (BTREE_CURSOR *)dbc->internal;
00090         t = dbp->bt_internal;
00091         recno = 0;
00092 
00093         BT_STK_CLR(cp);
00094 
00095         /*
00096          * There are several ways we search a btree tree.  The flags argument
00097          * specifies if we're acquiring read or write locks, if we position
00098          * to the first or last item in a set of duplicates, if we return
00099          * deleted items, and if we are locking pairs of pages.  In addition,
00100          * if we're modifying record numbers, we have to lock the entire tree
00101          * regardless.  See btree.h for more details.
00102          *
00103          * If write-locking pages, we need to know whether or not to acquire a
00104          * write lock on a page before getting it.  This depends on how deep it
00105          * is in tree, which we don't know until we acquire the root page.  So,
00106          * if we need to lock the root page we may have to upgrade it later,
00107          * because we won't get the correct lock initially.
00108          *
00109          * Retrieve the root page.
00110          */
00111 try_again:
00112         pg = cp->root;
00113         stack = LF_ISSET(S_STACK) && F_ISSET(cp, C_RECNUM);
00114         lock_mode = stack ? DB_LOCK_WRITE : DB_LOCK_READ;
00115         if ((ret = CDB___db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
00116                 return (ret);
00117         if ((ret = CDB_memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
00118                 /* Did not read it, so we can release the lock */
00119                 (void)__LPUT(dbc, lock);
00120                 return (ret);
00121         }
00122 
00123         /*
00124          * Decide if we need to save this page; if we do, write lock it.
00125          * We deliberately don't lock-couple on this call.  If the tree
00126          * is tiny, i.e., one page, and two threads are busily updating
00127          * the root page, we're almost guaranteed deadlocks galore, as
00128          * each one gets a read lock and then blocks the other's attempt
00129          * for a write lock.
00130          */
00131         if (!stack &&
00132             ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
00133             (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
00134                 (void)CDB_memp_fput(dbp->mpf, h, 0);
00135                 (void)__LPUT(dbc, lock);
00136                 lock_mode = DB_LOCK_WRITE;
00137                 if ((ret = CDB___db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
00138                         return (ret);
00139                 if ((ret = CDB_memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
00140                         /* Did not read it, so we can release the lock */
00141                         (void)__LPUT(dbc, lock);
00142                         return (ret);
00143                 }
00144                 if (!((LF_ISSET(S_PARENT)
00145                     && (u_int8_t)(stop + 1) >= h->level) ||
00146                     (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
00147                         /* Someone else split the root, start over. */
00148                         (void)CDB_memp_fput(dbp->mpf, h, 0);
00149                         (void)__LPUT(dbc, lock);
00150                         goto try_again;
00151                 }
00152                 stack = 1;
00153         }
00154 
00155         /* Choose a comparison function. */
00156         func = F_ISSET(dbc, DBC_OPD) ?
00157             (dbp->dup_compare == NULL ? CDB___bam_defcmp : dbp->dup_compare) :
00158             t->bt_compare;
00159 
00160         for (;;) {
00161                 /*
00162                  * Do a binary search on the current page.  If we're searching
00163                  * a Btree leaf page, we have to walk the indices in groups of
00164                  * two.  If we're searching an internal page or a off-page dup
00165                  * page, they're an index per page item.  If we find an exact
00166                  * match on a leaf page, we're done.
00167                  */
00168                 adjust = TYPE(h) == P_LBTREE ? P_INDX : O_INDX;
00169                 for (base = 0,
00170                     lim = NUM_ENT(h) / (db_indx_t)adjust; lim != 0; lim >>= 1) {
00171                         indx = base + ((lim >> 1) * adjust);
00172                         if ((ret = CDB___bam_cmp(dbp,
00173                             key, h, indx, func, &cmp)) != 0)
00174                                 goto err;
00175                         if (cmp == 0) {
00176                                 if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP)
00177                                         goto found;
00178                                 goto next;
00179                         }
00180                         if (cmp > 0) {
00181                                 base = indx + adjust;
00182                                 --lim;
00183                         }
00184                 }
00185 
00186                 /*
00187                  * No match found.  Base is the smallest index greater than
00188                  * key and may be zero or a last + O_INDX index.
00189                  *
00190                  * If it's a leaf page, return base as the "found" value.
00191                  * Delete only deletes exact matches.
00192                  */
00193                 if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP) {
00194                         *exactp = 0;
00195 
00196                         if (LF_ISSET(S_EXACT))
00197                                 goto notfound;
00198 
00199                         if (LF_ISSET(S_STK_ONLY)) {
00200                                 BT_STK_NUM(dbp->dbenv, cp, h, base, ret);
00201                                 __LPUT(dbc, lock);
00202                                 (void)CDB_memp_fput(dbp->mpf, h, 0);
00203                                 return (ret);
00204                         }
00205 
00206                         /*
00207                          * !!!
00208                          * Possibly returning a deleted record -- DB_SET_RANGE,
00209                          * DB_KEYFIRST and DB_KEYLAST don't require an exact
00210                          * match, and we don't want to walk multiple pages here
00211                          * to find an undeleted record.  This is handled by the
00212                          * calling routine.
00213                          */
00214                         BT_STK_ENTER(dbp->dbenv,
00215                             cp, h, base, lock, lock_mode, ret);
00216                         if (ret != 0)
00217                                 goto err;
00218                         return (0);
00219                 }
00220 
00221                 /*
00222                  * If it's not a leaf page, record the internal page (which is
00223                  * a parent page for the key).  Decrement the base by 1 if it's
00224                  * non-zero so that if a split later occurs, the inserted page
00225                  * will be to the right of the saved page.
00226                  */
00227                 indx = base > 0 ? base - O_INDX : base;
00228 
00229                 /*
00230                  * If we're trying to calculate the record number, sum up
00231                  * all the record numbers on this page up to the indx point.
00232                  */
00233 next:           if (recnop != NULL)
00234                         for (i = 0; i < indx; ++i)
00235                                 recno += GET_BINTERNAL(h, i)->nrecs;
00236 
00237                 pg = GET_BINTERNAL(h, indx)->pgno;
00238 
00239                 if (LF_ISSET(S_STK_ONLY)) {
00240                         if (stop == h->level) {
00241                                 BT_STK_NUM(dbp->dbenv, cp, h, indx, ret);
00242                                 __LPUT(dbc, lock);
00243                                 (void)CDB_memp_fput(dbp->mpf, h, 0);
00244                                 return (ret);
00245                         }
00246                         BT_STK_NUMPUSH(dbp->dbenv, cp, h, indx, ret);
00247                         (void)CDB_memp_fput(dbp->mpf, h, 0);
00248                         if ((ret = CDB___db_lget(dbc,
00249                             LCK_COUPLE, pg, lock_mode, 0, &lock)) != 0) {
00250                                 /*
00251                                  * Discard our lock and return on failure.  This
00252                                  * is OK because it only happens when descending
00253                                  * the tree holding read-locks.
00254                                  */
00255                                 __LPUT(dbc, lock);
00256                                 return (ret);
00257                         }
00258                 } else if (stack) {
00259                         /* Return if this is the lowest page wanted. */
00260                         if (LF_ISSET(S_PARENT) && stop == h->level) {
00261                                 BT_STK_ENTER(dbp->dbenv,
00262                                     cp, h, indx, lock, lock_mode, ret);
00263                                 if (ret != 0)
00264                                         goto err;
00265                                 return (0);
00266                         }
00267                         BT_STK_PUSH(dbp->dbenv,
00268                             cp, h, indx, lock, lock_mode, ret);
00269                         if (ret != 0)
00270                                 goto err;
00271 
00272                         lock_mode = DB_LOCK_WRITE;
00273                         if ((ret =
00274                             CDB___db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
00275                                 goto err;
00276                 } else {
00277                         /*
00278                          * Decide if we want to return a reference to the next
00279                          * page in the return stack.  If so, lock it and never
00280                          * unlock it.
00281                          */
00282                         if ((LF_ISSET(S_PARENT) &&
00283                             (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
00284                             (h->level - 1) == LEAFLEVEL)
00285                                 stack = 1;
00286 
00287                         (void)CDB_memp_fput(dbp->mpf, h, 0);
00288 
00289                         lock_mode = stack &&
00290                             LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
00291                         if ((ret = CDB___db_lget(dbc,
00292                             LCK_COUPLE, pg, lock_mode, 0, &lock)) != 0) {
00293                                 /*
00294                                  * If we fail, discard the lock we held.  This
00295                                  * is OK because this only happens when we are
00296                                  * descending the tree holding read-locks.
00297                                  */
00298                                 __LPUT(dbc, lock);
00299                                 goto err;
00300                         }
00301                 }
00302                 if ((ret = CDB_memp_fget(dbp->mpf, &pg, 0, &h)) != 0)
00303                         goto err;
00304         }
00305         /* NOTREACHED */
00306 
00307 found:  *exactp = 1;
00308 
00309         /*
00310          * If we're trying to calculate the record number, add in the
00311          * offset on this page and correct for the fact that records
00312          * in the tree are 0-based.
00313          */
00314         if (recnop != NULL)
00315                 *recnop = recno + (indx / P_INDX) + 1;
00316 
00317         /*
00318          * If we got here, we know that we have a Btree leaf or off-page
00319          * duplicates page.  If it's a Btree leaf page, we have to handle
00320          * on-page duplicates.
00321          *
00322          * If there are duplicates, go to the first/last one.  This is
00323          * safe because we know that we're not going to leave the page,
00324          * all duplicate sets that are not on overflow pages exist on a
00325          * single leaf page.
00326          */
00327         if (TYPE(h) == P_LBTREE) {
00328                 if (LF_ISSET(S_DUPLAST))
00329                         while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
00330                             h->inp[indx] == h->inp[indx + P_INDX])
00331                                 indx += P_INDX;
00332                 else
00333                         while (indx > 0 &&
00334                             h->inp[indx] == h->inp[indx - P_INDX])
00335                                 indx -= P_INDX;
00336         }
00337 
00338         /*
00339          * Now check if we are allowed to return deleted items; if not, then
00340          * find the next (or previous) non-deleted duplicate entry.  (We do
00341          * not move from the original found key on the basis of the S_DELNO
00342          * flag.)
00343          */
00344         if (LF_ISSET(S_DELNO)) {
00345                 deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0;
00346                 if (LF_ISSET(S_DUPLAST))
00347                         while (B_DISSET(GET_BKEYDATA(
00348                             h, indx + deloffset)->type) && indx > 0 &&
00349                             h->inp[indx] == h->inp[indx - adjust])
00350                                 indx -= adjust;
00351                 else
00352                         while (B_DISSET(GET_BKEYDATA(
00353                             h, indx + deloffset)->type) &&
00354                             indx < (db_indx_t)(NUM_ENT(h) - adjust) &&
00355                             h->inp[indx] == h->inp[indx + adjust])
00356                                 indx += adjust;
00357 
00358                 /*
00359                  * If we weren't able to find a non-deleted duplicate, return
00360                  * DB_NOTFOUND.
00361                  */
00362                 if (B_DISSET(GET_BKEYDATA(h, indx + deloffset)->type))
00363                         goto notfound;
00364         }
00365 
00366         if (LF_ISSET(S_STK_ONLY)) {
00367                 BT_STK_NUM(dbp->dbenv, cp, h, indx, ret);
00368                 __LPUT(dbc, lock);
00369                 (void)CDB_memp_fput(dbp->mpf, h, 0);
00370         } else {
00371                 BT_STK_ENTER(dbp->dbenv, cp, h, indx, lock, lock_mode, ret);
00372                 if (ret != 0)
00373                         goto err;
00374         }
00375         return (0);
00376 
00377 notfound:
00378         /* Keep the page locked for serializability. */
00379         (void)CDB_memp_fput(dbp->mpf, h, 0);
00380         (void)__TLPUT(dbc, lock);
00381         ret = DB_NOTFOUND;
00382 
00383 err:    if (cp->csp > cp->sp) {
00384                 BT_STK_POP(cp);
00385                 CDB___bam_stkrel(dbc, 0);
00386         }
00387         return (ret);
00388 }
00389 
00390 /*
00391  * CDB___bam_stkrel --
00392  *      Release all pages currently held in the stack.
00393  *
00394  * PUBLIC: int CDB___bam_stkrel __P((DBC *, u_int32_t));
00395  */
00396 int
00397 CDB___bam_stkrel(dbc, flags)
00398         DBC *dbc;
00399         u_int32_t flags;
00400 {
00401         BTREE_CURSOR *cp;
00402         DB *dbp;
00403         EPG *epg;
00404         int ret, t_ret;
00405 
00406         dbp = dbc->dbp;
00407         cp = (BTREE_CURSOR *)dbc->internal;
00408 
00409         /*
00410          * Release inner pages first.
00411          *
00412          * The caller must be sure that setting STK_NOLOCK will not effect
00413          * either serializability or recoverability.
00414          */
00415         for (ret = 0, epg = cp->sp; epg <= cp->csp; ++epg) {
00416                 if (epg->page != NULL) {
00417                         if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) {
00418                                 cp->page = NULL;
00419                                 cp->lock.off = LOCK_INVALID;
00420                         }
00421                         if ((t_ret = CDB_memp_fput(
00422                             dbp->mpf, epg->page, 0)) != 0 && ret == 0)
00423                                 ret = t_ret;
00424                 }
00425                 if (epg->lock.off != LOCK_INVALID) {
00426                         if (LF_ISSET(STK_NOLOCK))
00427                                 (void)__LPUT(dbc, epg->lock);
00428                         else
00429                                 (void)__TLPUT(dbc, epg->lock);
00430                 }
00431         }
00432 
00433         /* Clear the stack, all pages have been released. */
00434         BT_STK_CLR(cp);
00435 
00436         return (ret);
00437 }
00438 
00439 /*
00440  * CDB___bam_stkgrow --
00441  *      Grow the stack.
00442  *
00443  * PUBLIC: int CDB___bam_stkgrow __P((DB_ENV *, BTREE_CURSOR *));
00444  */
00445 int
00446 CDB___bam_stkgrow(dbenv, cp)
00447         DB_ENV *dbenv;
00448         BTREE_CURSOR *cp;
00449 {
00450         EPG *p;
00451         size_t entries;
00452         int ret;
00453 
00454         entries = cp->esp - cp->sp;
00455 
00456         if ((ret = CDB___os_calloc(dbenv, entries * 2, sizeof(EPG), &p)) != 0)
00457                 return (ret);
00458         memcpy(p, cp->sp, entries * sizeof(EPG));
00459         if (cp->sp != cp->stack)
00460                 CDB___os_free(cp->sp, entries * sizeof(EPG));
00461         cp->sp = p;
00462         cp->csp = p + entries;
00463         cp->esp = p + entries * 2;
00464         return (0);
00465 }

Generated on Sun Jun 8 10:56:35 2008 for GNUmifluz by  doxygen 1.5.5