bt_delete.c

Go to the documentation of this file.
00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1996, 1997, 1998, 1999, 2000
00005  *      Sleepycat Software.  All rights reserved.
00006  */
00007 /*
00008  * Copyright (c) 1990, 1993, 1994, 1995, 1996
00009  *      Keith Bostic.  All rights reserved.
00010  */
00011 /*
00012  * Copyright (c) 1990, 1993, 1994, 1995
00013  *      The Regents of the University of California.  All rights reserved.
00014  *
00015  * This code is derived from software contributed to Berkeley by
00016  * Mike Olson.
00017  *
00018  * Redistribution and use in source and binary forms, with or without
00019  * modification, are permitted provided that the following conditions
00020  * are met:
00021  * 1. Redistributions of source code must retain the above copyright
00022  *    notice, this list of conditions and the following disclaimer.
00023  * 2. Redistributions in binary form must reproduce the above copyright
00024  *    notice, this list of conditions and the following disclaimer in the
00025  *    documentation and/or other materials provided with the distribution.
00026  * 3. Neither the name of the University nor the names of its contributors
00027  *    may be used to endorse or promote products derived from this software
00028  *    without specific prior written permission.
00029  *
00030  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
00031  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00032  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00033  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
00034  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00035  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00036  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00037  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00038  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00039  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00040  * SUCH DAMAGE.
00041  */
00042 
00043 #include "config.h"
00044 
00045 #ifndef lint
00046 static const char revid[] = "$Id: bt__delete_8c-source.html,v 1.1 2008/06/08 10:13:35 sebdiaz Exp $";
00047 #endif /* not lint */
00048 
00049 #ifndef NO_SYSTEM_INCLUDES
00050 #include <sys/types.h>
00051 
00052 #include <string.h>
00053 #endif
00054 
00055 #include "db_int.h"
00056 #include "db_page.h"
00057 #include "db_shash.h"
00058 #include "btree.h"
00059 #include "lock.h"
00060 
00061 /*
00062  * CDB___bam_delete --
00063  *      Delete the items referenced by a key.
00064  *
00065  * PUBLIC: int CDB___bam_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
00066  */
00067 int
00068 CDB___bam_delete(dbp, txn, key, flags)
00069         DB *dbp;
00070         DB_TXN *txn;
00071         DBT *key;
00072         u_int32_t flags;
00073 {
00074         DBC *dbc;
00075         DBT lkey;
00076         DBT data;
00077         u_int32_t f_init, f_next;
00078         int ret, t_ret;
00079 
00080         PANIC_CHECK(dbp->dbenv);
00081         DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del");
00082 
00083         /* Check for invalid flags. */
00084         if ((ret =
00085             CDB___db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
00086                 return (ret);
00087 
00088         /* Allocate a cursor. */
00089         if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
00090                 return (ret);
00091 
00092         DEBUG_LWRITE(dbc, txn, "bam_delete", key, NULL, flags);
00093 
00094         /*
00095          * Walk a cursor through the key/data pairs, deleting as we go.  Set
00096          * the DB_DBT_USERMEM flag, as this might be a threaded application
00097          * and the flags checking will catch us.  We don't actually want the
00098          * keys or data, so request a partial of length 0.
00099          */
00100         memset(&lkey, 0, sizeof(lkey));
00101         F_SET(&lkey, DB_DBT_USERMEM | DB_DBT_PARTIAL);
00102         memset(&data, 0, sizeof(data));
00103         F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL);
00104 
00105         /*
00106          * If locking (and we haven't already acquired CDB locks), set the
00107          * read-modify-write flag.
00108          */
00109         f_init = DB_SET;
00110         f_next = DB_NEXT_DUP;
00111         if (STD_LOCKING(dbc)) {
00112                 f_init |= DB_RMW;
00113                 f_next |= DB_RMW;
00114         }
00115 
00116         /* Walk through the set of key/data pairs, deleting as we go. */
00117         if ((ret = dbc->c_get(dbc, key, &data, f_init)) != 0)
00118                 goto err;
00119         for (;;) {
00120                 if ((ret = dbc->c_del(dbc, 0)) != 0)
00121                         goto err;
00122                 if ((ret = dbc->c_get(dbc, &lkey, &data, f_next)) != 0) {
00123                         if (ret == DB_NOTFOUND) {
00124                                 ret = 0;
00125                                 break;
00126                         }
00127                         goto err;
00128                 }
00129         }
00130 
00131 err:    /* Discard the cursor. */
00132         if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
00133                 ret = t_ret;
00134 
00135         return (ret);
00136 }
00137 
00138 /*
00139  * CDB___bam_ditem --
00140  *      Delete one or more entries from a page.
00141  *
00142  * PUBLIC: int CDB___bam_ditem __P((DBC *, PAGE *, u_int32_t));
00143  */
00144 int
00145 CDB___bam_ditem(dbc, h, indx)
00146         DBC *dbc;
00147         PAGE *h;
00148         u_int32_t indx;
00149 {
00150         BINTERNAL *bi;
00151         BKEYDATA *bk;
00152         DB *dbp;
00153         u_int32_t nbytes;
00154         int ret;
00155 
00156         dbp = dbc->dbp;
00157 
00158         switch (TYPE(h)) {
00159         case P_IBTREE:
00160                 bi = GET_BINTERNAL(h, indx);
00161                 switch (B_TYPE(bi->type)) {
00162                 case B_DUPLICATE:
00163                 case B_KEYDATA:
00164                         nbytes = BINTERNAL_SIZE(bi->len);
00165                         break;
00166                 case B_OVERFLOW:
00167                         nbytes = BINTERNAL_SIZE(bi->len);
00168                         if ((ret =
00169                             CDB___db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0)
00170                                 return (ret);
00171                         break;
00172                 default:
00173                         return (CDB___db_pgfmt(dbp, PGNO(h)));
00174                 }
00175                 break;
00176         case P_IRECNO:
00177                 nbytes = RINTERNAL_SIZE;
00178                 break;
00179         case P_LBTREE:
00180                 /*
00181                  * If it's a duplicate key, discard the index and don't touch
00182                  * the actual page item.
00183                  *
00184                  * !!!
00185                  * This works because no data item can have an index matching
00186                  * any other index so even if the data item is in a key "slot",
00187                  * it won't match any other index.
00188                  */
00189                 if ((indx % 2) == 0) {
00190                         /*
00191                          * Check for a duplicate after us on the page.  NOTE:
00192                          * we have to delete the key item before deleting the
00193                          * data item, otherwise the "indx + P_INDX" calculation
00194                          * won't work!
00195                          */
00196                         if (indx + P_INDX < (u_int32_t)NUM_ENT(h) &&
00197                             h->inp[indx] == h->inp[indx + P_INDX])
00198                                 return (CDB___bam_adjindx(dbc,
00199                                     h, indx, indx + O_INDX, 0));
00200                         /*
00201                          * Check for a duplicate before us on the page.  It
00202                          * doesn't matter if we delete the key item before or
00203                          * after the data item for the purposes of this one.
00204                          */
00205                         if (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX])
00206                                 return (CDB___bam_adjindx(dbc,
00207                                     h, indx, indx - P_INDX, 0));
00208                 }
00209                 /* FALLTHROUGH */
00210         case P_LDUP:
00211         case P_LRECNO:
00212                 bk = GET_BKEYDATA(h, indx);
00213                 switch (B_TYPE(bk->type)) {
00214                 case B_DUPLICATE:
00215                         nbytes = BOVERFLOW_SIZE;
00216                         break;
00217                 case B_OVERFLOW:
00218                         nbytes = BOVERFLOW_SIZE;
00219                         if ((ret = CDB___db_doff(
00220                             dbc, (GET_BOVERFLOW(h, indx))->pgno)) != 0)
00221                                 return (ret);
00222                         break;
00223                 case B_KEYDATA:
00224                         nbytes = BKEYDATA_SIZE(bk->len);
00225                         break;
00226                 default:
00227                         return (CDB___db_pgfmt(dbp, PGNO(h)));
00228                 }
00229                 break;
00230         default:
00231                 return (CDB___db_pgfmt(dbp, PGNO(h)));
00232         }
00233 
00234         /* Delete the item and mark the page dirty. */
00235         if ((ret = CDB___db_ditem(dbc, h, indx, nbytes)) != 0)
00236                 return (ret);
00237         if ((ret = CDB_memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
00238                 return (ret);
00239 
00240         return (0);
00241 }
00242 
00243 /*
00244  * CDB___bam_adjindx --
00245  *      Adjust an index on the page.
00246  *
00247  * PUBLIC: int CDB___bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int));
00248  */
00249 int
00250 CDB___bam_adjindx(dbc, h, indx, indx_copy, is_insert)
00251         DBC *dbc;
00252         PAGE *h;
00253         u_int32_t indx, indx_copy;
00254         int is_insert;
00255 {
00256         DB *dbp;
00257         db_indx_t copy;
00258         int ret;
00259 
00260         dbp = dbc->dbp;
00261 
00262         /* Log the change. */
00263         if (DB_LOGGING(dbc) &&
00264             (ret = CDB___bam_adj_log(dbp->dbenv, dbc->txn, &LSN(h),
00265             0, dbp->log_fileid, PGNO(h), &LSN(h), indx, indx_copy,
00266             (u_int32_t)is_insert)) != 0)
00267                 return (ret);
00268 
00269         /* Shuffle the indices and mark the page dirty. */
00270         if (is_insert) {
00271                 copy = h->inp[indx_copy];
00272                 if (indx != NUM_ENT(h))
00273                         memmove(&h->inp[indx + O_INDX], &h->inp[indx],
00274                             sizeof(db_indx_t) * (NUM_ENT(h) - indx));
00275                 h->inp[indx] = copy;
00276                 ++NUM_ENT(h);
00277         } else {
00278                 --NUM_ENT(h);
00279                 if (indx != NUM_ENT(h))
00280                         memmove(&h->inp[indx], &h->inp[indx + O_INDX],
00281                             sizeof(db_indx_t) * (NUM_ENT(h) - indx));
00282         }
00283         if ((ret = CDB_memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
00284                 return (ret);
00285 
00286         return (0);
00287 }
00288 
00289 /*
00290  * CDB___bam_dpages --
00291  *      Delete a set of locked pages.
00292  *
00293  * PUBLIC: int CDB___bam_dpages __P((DBC *, EPG *));
00294  */
00295 int
00296 CDB___bam_dpages(dbc, stack_epg)
00297         DBC *dbc;
00298         EPG *stack_epg;
00299 {
00300         BTREE_CURSOR *cp;
00301         DB *dbp;
00302         DBT a, b;
00303         DB_LOCK c_lock, p_lock;
00304         EPG *epg;
00305         PAGE *child, *parent;
00306         db_indx_t nitems;
00307         db_pgno_t pgno, root_pgno;
00308         db_recno_t rcnt;
00309         int done, ret, t_ret;
00310 
00311         dbp = dbc->dbp;
00312         cp = (BTREE_CURSOR *)dbc->internal;
00313 
00314         /*
00315          * We have the entire stack of deletable pages locked.
00316          *
00317          * Btree calls us with a pointer to the beginning of a stack, where
00318          * the first page in the stack is to have a single item deleted, and
00319          * the rest of the pages are to be removed.
00320          *
00321          * Recno calls us with a pointer into the middle of the stack, where
00322          * the referenced page is to have a single item deleted, and pages
00323          * after the stack reference are to be removed.
00324          *
00325          * First, discard any pages that we don't care about.
00326          */
00327         ret = 0;
00328         for (epg = cp->sp; epg < stack_epg; ++epg) {
00329                 if ((t_ret =
00330                     CDB_memp_fput(dbp->mpf, epg->page, 0)) != 0 && ret == 0)
00331                         ret = t_ret;
00332                 (void)__TLPUT(dbc, epg->lock);
00333         }
00334         if (ret != 0)
00335                 goto err;
00336 
00337         /*
00338          * !!!
00339          * There is an interesting deadlock situation here.  We have to relink
00340          * the leaf page chain around the leaf page being deleted.  Consider
00341          * a cursor walking through the leaf pages, that has the previous page
00342          * read-locked and is waiting on a lock for the page we're deleting.
00343          * It will deadlock here.  Before we unlink the subtree, we relink the
00344          * leaf page chain.
00345          */
00346         if ((ret = CDB___db_relink(dbc, DB_REM_PAGE, cp->csp->page, NULL, 1)) != 0)
00347                 goto err;
00348 
00349         /*
00350          * Delete the last item that references the underlying pages that are
00351          * to be deleted, and adjust cursors that reference that page.  Then,
00352          * save that page's page number and item count and release it.  If
00353          * the application isn't retaining locks because it's running without
00354          * transactions, this lets the rest of the tree get back to business
00355          * immediately.
00356          */
00357         if ((ret = CDB___bam_ditem(dbc, epg->page, epg->indx)) != 0)
00358                 goto err;
00359         CDB___bam_ca_di(dbp, PGNO(epg->page), epg->indx, -1);
00360 
00361         pgno = PGNO(epg->page);
00362         nitems = NUM_ENT(epg->page);
00363 
00364         if ((ret = CDB_memp_fput(dbp->mpf, epg->page, 0)) != 0)
00365                 goto err_inc;
00366         (void)__TLPUT(dbc, epg->lock);
00367 
00368         /* Free the rest of the pages in the stack. */
00369         while (++epg <= cp->csp) {
00370                 /*
00371                  * Delete page entries so they will be restored as part of
00372                  * recovery.  We don't need to do cursor adjustment here as
00373                  * the pages are being emptied by definition and so cannot
00374                  * be referenced by a cursor.
00375                  */
00376                 if (NUM_ENT(epg->page) != 0) {
00377                         DB_ASSERT(NUM_ENT(epg->page) == 1);
00378 
00379                         if ((ret = CDB___bam_ditem(dbc, epg->page, epg->indx)) != 0)
00380                                 goto err;
00381                 }
00382 
00383                 if ((ret = CDB___db_free(dbc, epg->page)) != 0)
00384                         goto err_inc;
00385                 (void)__TLPUT(dbc, epg->lock);
00386         }
00387 
00388         if (0) {
00389 err_inc:        ++epg;
00390 err:            for (; epg <= cp->csp; ++epg) {
00391                         (void)CDB_memp_fput(dbp->mpf, epg->page, 0);
00392                         (void)__TLPUT(dbc, epg->lock);
00393                 }
00394                 BT_STK_CLR(cp);
00395                 return (ret);
00396         }
00397         BT_STK_CLR(cp);
00398 
00399         /*
00400          * If we just deleted the next-to-last item from the root page, the
00401          * tree can collapse one or more levels.  While there remains only a
00402          * single item on the root page, write lock the last page referenced
00403          * by the root page and copy it over the root page.  If we can't get a
00404          * write lock, that's okay, the tree just stays deeper than we'd like.
00405          */
00406         root_pgno = cp->root;
00407         if (pgno != root_pgno || nitems != 1)
00408                 return (0);
00409 
00410         for (done = 0; !done;) {
00411                 /* Initialize. */
00412                 parent = child = NULL;
00413                 p_lock.off = c_lock.off = LOCK_INVALID;
00414 
00415                 /* Lock the root. */
00416                 pgno = root_pgno;
00417                 if ((ret =
00418                     CDB___db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &p_lock)) != 0)
00419                         goto stop;
00420                 if ((ret = CDB_memp_fget(dbp->mpf, &pgno, 0, &parent)) != 0)
00421                         goto stop;
00422 
00423                 if (NUM_ENT(parent) != 1)
00424                         goto stop;
00425 
00426                 switch (TYPE(parent)) {
00427                 case P_IBTREE:
00428                         pgno = GET_BINTERNAL(parent, 0)->pgno;
00429                         break;
00430                 case P_IRECNO:
00431                         pgno = GET_RINTERNAL(parent, 0)->pgno;
00432                         break;
00433                 default:
00434                         goto stop;
00435                 }
00436 
00437                 /* Lock the child page. */
00438                 if ((ret =
00439                     CDB___db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &c_lock)) != 0)
00440                         goto stop;
00441                 if ((ret = CDB_memp_fget(dbp->mpf, &pgno, 0, &child)) != 0)
00442                         goto stop;
00443 
00444                 /* Log the change. */
00445                 if (DB_LOGGING(dbc)) {
00446                         memset(&a, 0, sizeof(a));
00447                         a.data = child;
00448                         a.size = dbp->pgsize;
00449                         memset(&b, 0, sizeof(b));
00450                         b.data = P_ENTRY(parent, 0);
00451                         b.size = BINTERNAL_SIZE(((BINTERNAL *)b.data)->len);
00452                         CDB___bam_rsplit_log(dbp->dbenv, dbc->txn,
00453                            &child->lsn, 0, dbp->log_fileid, PGNO(child), &a,
00454                            PGNO(parent), RE_NREC(parent), &b, &parent->lsn);
00455                 }
00456 
00457                 /*
00458                  * Make the switch.
00459                  *
00460                  * One fixup -- internal pages below the top level do not store
00461                  * a record count, so we have to preserve it if we're not
00462                  * converting to a leaf page.  Note also that we are about to
00463                  * overwrite the parent page, including its LSN.  This is OK
00464                  * because the log message we wrote describing this update
00465                  * stores its LSN on the child page.  When the child is copied
00466                  * onto the parent, the correct LSN is copied into place.
00467                  */
00468                 COMPQUIET(rcnt, 0);
00469                 if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
00470                         rcnt = RE_NREC(parent);
00471                 memcpy(parent, child, dbp->pgsize);
00472                 PGNO(parent) = root_pgno;
00473                 if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
00474                         RE_NREC_SET(parent, rcnt);
00475 
00476                 /* Mark the pages dirty. */
00477                 CDB_memp_fset(dbp->mpf, parent, DB_MPOOL_DIRTY);
00478                 CDB_memp_fset(dbp->mpf, child, DB_MPOOL_DIRTY);
00479 
00480                 /* Adjust the cursors. */
00481                 CDB___bam_ca_rsplit(dbp, PGNO(child), root_pgno);
00482 
00483                 /*
00484                  * Free the page copied onto the root page and discard its
00485                  * lock.  (The call to CDB___db_free() discards our reference
00486                  * to the page.)
00487                  */
00488                 (void)CDB___db_free(dbc, child);
00489                 child = NULL;
00490 
00491                 if (0) {
00492 stop:                   done = 1;
00493                 }
00494                 if (p_lock.off != LOCK_INVALID)
00495                         (void)__TLPUT(dbc, p_lock);
00496                 if (parent != NULL)
00497                         CDB_memp_fput(dbp->mpf, parent, 0);
00498                 if (c_lock.off != LOCK_INVALID)
00499                         (void)__TLPUT(dbc, c_lock);
00500                 if (child != NULL)
00501                         CDB_memp_fput(dbp->mpf, child, 0);
00502         }
00503 
00504         return (0);
00505 }

Generated on Sun Jun 8 10:56:34 2008 for GNUmifluz by  doxygen 1.5.5