bt_put.c

Go to the documentation of this file.
00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1996, 1997, 1998, 1999, 2000
00005  *      Sleepycat Software.  All rights reserved.
00006  */
00007 /*
00008  * Copyright (c) 1990, 1993, 1994, 1995, 1996
00009  *      Keith Bostic.  All rights reserved.
00010  */
00011 /*
00012  * Copyright (c) 1990, 1993, 1994, 1995
00013  *      The Regents of the University of California.  All rights reserved.
00014  *
00015  * This code is derived from software contributed to Berkeley by
00016  * Mike Olson.
00017  *
00018  * Redistribution and use in source and binary forms, with or without
00019  * modification, are permitted provided that the following conditions
00020  * are met:
00021  * 1. Redistributions of source code must retain the above copyright
00022  *    notice, this list of conditions and the following disclaimer.
00023  * 2. Redistributions in binary form must reproduce the above copyright
00024  *    notice, this list of conditions and the following disclaimer in the
00025  *    documentation and/or other materials provided with the distribution.
00026  * 3. Neither the name of the University nor the names of its contributors
00027  *    may be used to endorse or promote products derived from this software
00028  *    without specific prior written permission.
00029  *
00030  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
00031  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00032  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00033  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
00034  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00035  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00036  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00037  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00038  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00039  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00040  * SUCH DAMAGE.
00041  */
00042 
00043 #include "config.h"
00044 
00045 #ifndef lint
00046 static const char revid[] = "$Id: bt__put_8c-source.html,v 1.1 2008/06/08 10:13:38 sebdiaz Exp $";
00047 #endif /* not lint */
00048 
00049 #ifndef NO_SYSTEM_INCLUDES
00050 #include <sys/types.h>
00051 
00052 #include <errno.h>
00053 #include <string.h>
00054 #endif
00055 
00056 #include "db_int.h"
00057 #include "db_page.h"
00058 #include "btree.h"
00059 
00060 static int __bam_dup_convert __P((DBC *, PAGE *, u_int32_t));
00061 static int __bam_ovput
00062                __P((DBC *, u_int32_t, db_pgno_t, PAGE *, u_int32_t, DBT *));
00063 
00064 /*
00065  * CDB___bam_iitem --
00066  *      Insert an item into the tree.
00067  *
00068  * PUBLIC: int CDB___bam_iitem __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
00069  */
00070 int
00071 CDB___bam_iitem(dbc, key, data, op, flags)
00072         DBC *dbc;
00073         DBT *key, *data;
00074         u_int32_t op, flags;
00075 {
00076         BKEYDATA *bk, bk_tmp;
00077         BTREE *t;
00078         BTREE_CURSOR *cp;
00079         DB *dbp;
00080         DBT bk_hdr, tdbt;
00081         PAGE *h;
00082         db_indx_t indx;
00083         u_int32_t data_size, have_bytes, need_bytes, needed;
00084         int cmp, bigkey, bigdata, dupadjust, padrec, replace, ret, was_deleted;
00085 
00086         COMPQUIET(bk, NULL);
00087 
00088         dbp = dbc->dbp;
00089         cp = (BTREE_CURSOR *)dbc->internal;
00090         t = dbp->bt_internal;
00091         h = cp->page;
00092         indx = cp->indx;
00093         dupadjust = replace = was_deleted = 0;
00094 
00095         /*
00096          * Fixed-length records with partial puts: it's an error to specify
00097          * anything other simple overwrite.
00098          */
00099         if (F_ISSET(dbp, DB_RE_FIXEDLEN) &&
00100             F_ISSET(data, DB_DBT_PARTIAL) && data->dlen != data->size) {
00101                 data_size = data->size;
00102                 goto len_err;
00103         }
00104 
00105         /*
00106          * Figure out how much space the data will take, including if it's a
00107          * partial record.
00108          *
00109          * Fixed-length records: it's an error to specify a record that's
00110          * longer than the fixed-length, and we never require less than
00111          * the fixed-length record size.
00112          */
00113         data_size = F_ISSET(data, DB_DBT_PARTIAL) ?
00114             CDB___bam_partsize(op, data, h, indx) : data->size;
00115         padrec = 0;
00116         if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
00117                 if (data_size > t->re_len) {
00118 len_err:                CDB___db_err(dbp->dbenv,
00119                             "Length improper for fixed length record %lu",
00120                             (u_long)data_size);
00121                         return (EINVAL);
00122                 }
00123                 if (data_size < t->re_len) {
00124                         padrec = 1;
00125                         data_size = t->re_len;
00126                 }
00127         }
00128 
00129         /*
00130          * Handle partial puts or short fixed-length records: build the
00131          * real record.
00132          */
00133         if (padrec || F_ISSET(data, DB_DBT_PARTIAL)) {
00134                 tdbt = *data;
00135                 if ((ret =
00136                     CDB___bam_build(dbc, op, &tdbt, h, indx, data_size)) != 0)
00137                         return (ret);
00138                 data = &tdbt;
00139         }
00140 
00141         /*
00142          * If the user has specified a duplicate comparison function, return
00143          * an error if DB_CURRENT was specified and the replacement data
00144          * doesn't compare equal to the current data.  This stops apps from
00145          * screwing up the duplicate sort order.  We have to do this after
00146          * we build the real record so that we're comparing the real items.
00147          */
00148         if (op == DB_CURRENT && dbp->dup_compare != NULL) {
00149                 if ((ret = CDB___bam_cmp(dbp, data, h,
00150                      indx + (TYPE(h) == P_LBTREE ? O_INDX : 0),
00151                      dbp->dup_compare, &cmp)) != 0)
00152                         return (ret);
00153                 if (cmp != 0) {
00154                         CDB___db_err(dbp->dbenv,
00155                             "Current data differs from put data");
00156                         return (EINVAL);
00157                 }
00158         }
00159 
00160         /*
00161          * If the key or data item won't fit on a page, we'll have to store
00162          * them on overflow pages.
00163          */
00164         needed = 0;
00165         bigdata = data_size > cp->ovflsize;
00166         switch (op) {
00167         case DB_KEYFIRST:
00168                 /* We're adding a new key and data pair. */
00169                 bigkey = key->size > cp->ovflsize;
00170                 if (bigkey)
00171                         needed += BOVERFLOW_PSIZE;
00172                 else
00173                         needed += BKEYDATA_PSIZE(key->size);
00174                 if (bigdata)
00175                         needed += BOVERFLOW_PSIZE;
00176                 else
00177                         needed += BKEYDATA_PSIZE(data_size);
00178                 break;
00179         case DB_AFTER:
00180         case DB_BEFORE:
00181         case DB_CURRENT:
00182                 /*
00183                  * We're either overwriting the data item of a key/data pair
00184                  * or we're creating a new on-page duplicate and only adding
00185                  * a data item.
00186                  *
00187                  * !!!
00188                  * We're not currently correcting for space reclaimed from
00189                  * already deleted items, but I don't think it's worth the
00190                  * complexity.
00191                  */
00192                 bigkey = 0;
00193                 if (op == DB_CURRENT) {
00194                         bk = GET_BKEYDATA(h,
00195                             indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
00196                         if (B_TYPE(bk->type) == B_KEYDATA)
00197                                 have_bytes = BKEYDATA_PSIZE(bk->len);
00198                         else
00199                                 have_bytes = BOVERFLOW_PSIZE;
00200                         need_bytes = 0;
00201                 } else {
00202                         have_bytes = 0;
00203                         need_bytes = sizeof(db_indx_t);
00204                 }
00205                 if (bigdata)
00206                         need_bytes += BOVERFLOW_PSIZE;
00207                 else
00208                         need_bytes += BKEYDATA_PSIZE(data_size);
00209 
00210                 if (have_bytes < need_bytes)
00211                         needed += need_bytes - have_bytes;
00212                 break;
00213         default:
00214                 return (CDB___db_unknown_flag(dbp->dbenv, "CDB___bam_iitem", op));
00215         }
00216 
00217         /*
00218          * If there's not enough room, or the user has put a ceiling on the
00219          * number of keys permitted in the page, split the page.
00220          *
00221          * XXX
00222          * The t->bt_maxkey test here may be insufficient -- do we have to
00223          * check in the btree split code, so we don't undo it there!?!?
00224          */
00225         if (P_FREESPACE(h) < needed ||
00226             (t->bt_maxkey != 0 && NUM_ENT(h) > t->bt_maxkey))
00227                 return (DB_NEEDSPLIT);
00228 
00229         /*
00230          * The code breaks it up into five cases:
00231          *
00232          * 1. Insert a new key/data pair.
00233          * 2. Append a new data item (a new duplicate).
00234          * 3. Insert a new data item (a new duplicate).
00235          * 4. Delete and re-add the data item (overflow item).
00236          * 5. Overwrite the data item.
00237          */
00238         switch (op) {
00239         case DB_KEYFIRST:               /* 1. Insert a new key/data pair. */
00240                 if (bigkey) {
00241                         if ((ret = __bam_ovput(dbc,
00242                             B_OVERFLOW, PGNO_INVALID, h, indx, key)) != 0)
00243                                 return (ret);
00244                 } else
00245                         if ((ret = CDB___db_pitem(dbc, h, indx,
00246                             BKEYDATA_SIZE(key->size), NULL, key)) != 0)
00247                                 return (ret);
00248 
00249                 CDB___bam_ca_di(dbp, PGNO(h), indx, 1);
00250                 ++indx;
00251                 break;
00252         case DB_AFTER:                  /* 2. Append a new data item. */
00253                 if (TYPE(h) == P_LBTREE) {
00254                         /* Copy the key for the duplicate and adjust cursors. */
00255                         if ((ret =
00256                             CDB___bam_adjindx(dbc, h, indx + P_INDX, indx, 1)) != 0)
00257                                 return (ret);
00258                         CDB___bam_ca_di(dbp, PGNO(h), indx + P_INDX, 1);
00259 
00260                         indx += 3;
00261                         dupadjust = 1;
00262 
00263                         cp->indx += 2;
00264                 } else {
00265                         ++indx;
00266                         CDB___bam_ca_di(dbp, PGNO(h), indx, 1);
00267 
00268                         cp->indx += 1;
00269                 }
00270                 break;
00271         case DB_BEFORE:                 /* 3. Insert a new data item. */
00272                 if (TYPE(h) == P_LBTREE) {
00273                         /* Copy the key for the duplicate and adjust cursors. */
00274                         if ((ret = CDB___bam_adjindx(dbc, h, indx, indx, 1)) != 0)
00275                                 return (ret);
00276                         CDB___bam_ca_di(dbp, PGNO(h), indx, 1);
00277 
00278                         ++indx;
00279                         dupadjust = 1;
00280                 } else
00281                         CDB___bam_ca_di(dbp, PGNO(h), indx, 1);
00282                 break;
00283         case DB_CURRENT:
00284                 if (TYPE(h) == P_LBTREE) {
00285                         ++indx;
00286                         dupadjust = 1;
00287 
00288                         /*
00289                          * In a Btree deleted records aren't counted (deleted
00290                          * records are counted in a Recno because all accesses
00291                          * are based on record number).  If it's a Btree and
00292                          * it's a DB_CURRENT operation overwriting a previously
00293                          * deleted record, increment the record count.
00294                          */
00295                         was_deleted = B_DISSET(bk->type);
00296                 }
00297 
00298                 /*
00299                  * 4. Delete and re-add the data item.
00300                  *
00301                  * If we're changing the type of the on-page structure, or we
00302                  * are referencing offpage items, we have to delete and then
00303                  * re-add the item.  We do not do any cursor adjustments here
00304                  * because we're going to immediately re-add the item into the
00305                  * same slot.
00306                  */
00307                 if (bigdata || B_TYPE(bk->type) != B_KEYDATA) {
00308                         if ((ret = CDB___bam_ditem(dbc, h, indx)) != 0)
00309                                 return (ret);
00310                         break;
00311                 }
00312 
00313                 /* 5. Overwrite the data item. */
00314                 replace = 1;
00315                 break;
00316         default:
00317                 return (CDB___db_unknown_flag(dbp->dbenv, "CDB___bam_iitem", op));
00318         }
00319 
00320         /* Add the data. */
00321         if (bigdata) {
00322                 if ((ret = __bam_ovput(dbc,
00323                     B_OVERFLOW, PGNO_INVALID, h, indx, data)) != 0)
00324                         return (ret);
00325         } else {
00326                 if (LF_ISSET(BI_DELETED)) {
00327                         B_TSET(bk_tmp.type, B_KEYDATA, 1);
00328                         bk_tmp.len = data->size;
00329                         bk_hdr.data = &bk_tmp;
00330                         bk_hdr.size = SSZA(BKEYDATA, data);
00331                         ret = CDB___db_pitem(dbc, h, indx,
00332                             BKEYDATA_SIZE(data->size), &bk_hdr, data);
00333                 } else if (replace)
00334                         ret = CDB___bam_ritem(dbc, h, indx, data);
00335                 else
00336                         ret = CDB___db_pitem(dbc, h, indx,
00337                             BKEYDATA_SIZE(data->size), NULL, data);
00338                 if (ret != 0)
00339                         return (ret);
00340         }
00341         if ((ret = CDB_memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
00342                 return (ret);
00343 
00344         /*
00345          * Adjust the cursors in general.  After that's done, reset the current
00346          * cursor to point to the new item.
00347          */
00348         if (op == DB_CURRENT)
00349                 (void)CDB___bam_ca_delete(dbp, PGNO(h),
00350                     TYPE(h) == P_LBTREE ? indx - O_INDX : indx, 0);
00351         else {
00352                 CDB___bam_ca_di(dbp, PGNO(h), indx, 1);
00353                 cp->indx = TYPE(h) == P_LBTREE ? indx - O_INDX : indx;
00354         }
00355 
00356         /*
00357          * If we've changed the record count, update the tree.  There's no
00358          * need to adjust the count if the operation not performed on the
00359          * current record or when the current record was previously deleted.
00360          */
00361         if (F_ISSET(cp, C_RECNUM) && (op != DB_CURRENT || was_deleted))
00362                 if ((ret = CDB___bam_adjust(dbc, 1)) != 0)
00363                         return (ret);
00364 
00365         /*
00366          * If a Btree leaf page is at least 50% full and we may have added or
00367          * modified a duplicate data item, see if the set of duplicates takes
00368          * up at least 25% of the space on the page.  If it does, move it onto
00369          * its own page.
00370          */
00371         if (dupadjust && P_FREESPACE(h) <= dbp->pgsize / 2) {
00372                 if ((ret = __bam_dup_convert(dbc, h, indx - O_INDX)) != 0)
00373                         return (ret);
00374         }
00375 
00376         /* If we've modified a recno file, set the flag. */
00377         if (dbc->dbtype == DB_RECNO)
00378                 F_SET(t, RECNO_MODIFIED);
00379 
00380         return (ret);
00381 }
00382 
00383 /*
00384  * CDB___bam_partsize --
00385  *      Figure out how much space a partial data item is in total.
00386  *
00387  * PUBLIC: u_int32_t CDB___bam_partsize __P((u_int32_t, DBT *, PAGE *, u_int32_t));
00388  */
00389 u_int32_t
00390 CDB___bam_partsize(op, data, h, indx)
00391         u_int32_t op, indx;
00392         DBT *data;
00393         PAGE *h;
00394 {
00395         BKEYDATA *bk;
00396         u_int32_t nbytes;
00397 
00398         /*
00399          * If the record doesn't already exist, it's simply the data we're
00400          * provided.
00401          */
00402         if (op != DB_CURRENT)
00403                 return (data->doff + data->size);
00404 
00405         /*
00406          * Otherwise, it's the data provided plus any already existing data
00407          * that we're not replacing.
00408          */
00409         bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
00410         nbytes =
00411             B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len;
00412 
00413         /*
00414          * There are really two cases here:
00415          *
00416          * Case 1: We are replacing some bytes that do not exist (i.e., they
00417          * are past the end of the record).  In this case the number of bytes
00418          * we are replacing is irrelevant and all we care about is how many
00419          * bytes we are going to add from offset.  So, the new record length
00420          * is going to be the size of the new bytes (size) plus wherever those
00421          * new bytes begin (doff).
00422          *
00423          * Case 2: All the bytes we are replacing exist.  Therefore, the new
00424          * size is the oldsize (nbytes) minus the bytes we are replacing (dlen)
00425          * plus the bytes we are adding (size).
00426          */
00427         if (nbytes < data->doff + data->dlen)           /* Case 1 */
00428                 return (data->doff + data->size);
00429 
00430         return (nbytes + data->size - data->dlen);      /* Case 2 */
00431 }
00432 
00433 /*
00434  * CDB___bam_build --
00435  *      Build the real record for a partial put, or short fixed-length record.
00436  *
00437  * PUBLIC: int CDB___bam_build __P((DBC *, u_int32_t,
00438  * PUBLIC:     DBT *, PAGE *, u_int32_t, u_int32_t));
00439  */
00440 int
00441 CDB___bam_build(dbc, op, dbt, h, indx, nbytes)
00442         DBC *dbc;
00443         u_int32_t op, indx, nbytes;
00444         DBT *dbt;
00445         PAGE *h;
00446 {
00447         BKEYDATA *bk, tbk;
00448         BOVERFLOW *bo;
00449         BTREE *t;
00450         BTREE_CURSOR *cp;
00451         DB *dbp;
00452         DBT copy;
00453         u_int32_t len, tlen;
00454         u_int8_t *p;
00455         int ret;
00456 
00457         COMPQUIET(bo, NULL);
00458 
00459         dbp = dbc->dbp;
00460         cp = (BTREE_CURSOR *) dbc->internal;
00461         t = dbp->bt_internal;
00462 
00463         /* We use the record data return memory, it's only a short-term use. */
00464         if (dbc->rdata.ulen < nbytes) {
00465                 if ((ret = CDB___os_realloc(dbp->dbenv,
00466                     nbytes, NULL, &dbc->rdata.data)) != 0) {
00467                         dbc->rdata.ulen = 0;
00468                         dbc->rdata.data = NULL;
00469                         return (ret);
00470                 }
00471                 dbc->rdata.ulen = nbytes;
00472         }
00473 
00474         /*
00475          * We use nul or pad bytes for any part of the record that isn't
00476          * specified; get it over with.
00477          */
00478         memset(dbc->rdata.data,
00479            F_ISSET(dbp, DB_RE_FIXEDLEN) ? t->re_pad : 0, nbytes);
00480 
00481         /*
00482          * In the next clauses, we need to do three things: a) set p to point
00483          * to the place at which to copy the user's data, b) set tlen to the
00484          * total length of the record, not including the bytes contributed by
00485          * the user, and c) copy any valid data from an existing record.  If
00486          * it's not a partial put (this code is called for both partial puts
00487          * and fixed-length record padding) or it's a new key, we can cut to
00488          * the chase.
00489          */
00490         if (!F_ISSET(dbt, DB_DBT_PARTIAL) || op != DB_CURRENT) {
00491                 p = (u_int8_t *)dbc->rdata.data + dbt->doff;
00492                 tlen = dbt->doff;
00493                 goto user_copy;
00494         }
00495 
00496         /* Find the current record. */
00497         if (indx < NUM_ENT(h)) {
00498                 bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
00499                 bo = (BOVERFLOW *)bk;
00500         } else {
00501                 bk = &tbk;
00502                 B_TSET(bk->type, B_KEYDATA, 0);
00503                 bk->len = 0;
00504         }
00505         if (B_TYPE(bk->type) == B_OVERFLOW) {
00506                 /*
00507                  * In the case of an overflow record, we shift things around
00508                  * in the current record rather than allocate a separate copy.
00509                  */
00510                 memset(&copy, 0, sizeof(copy));
00511                 if ((ret = CDB___db_goff(dbp, &copy, bo->tlen,
00512                     bo->pgno, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
00513                         return (ret);
00514 
00515                 /* Skip any leading data from the original record. */
00516                 tlen = dbt->doff;
00517                 p = (u_int8_t *)dbc->rdata.data + dbt->doff;
00518 
00519                 /*
00520                  * Copy in any trailing data from the original record.
00521                  *
00522                  * If the original record was larger than the original offset
00523                  * plus the bytes being deleted, there is trailing data in the
00524                  * original record we need to preserve.  If we aren't deleting
00525                  * the same number of bytes as we're inserting, copy it up or
00526                  * down, into place.
00527                  *
00528                  * Use memmove(), the regions may overlap.
00529                  */
00530                 if (bo->tlen > dbt->doff + dbt->dlen) {
00531                         len = bo->tlen - (dbt->doff + dbt->dlen);
00532                         if (dbt->dlen != dbt->size)
00533                                 memmove(p + dbt->size, p + dbt->dlen, len);
00534                         tlen += len;
00535                 }
00536         } else {
00537                 /* Copy in any leading data from the original record. */
00538                 memcpy(dbc->rdata.data,
00539                     bk->data, dbt->doff > bk->len ? bk->len : dbt->doff);
00540                 tlen = dbt->doff;
00541                 p = (u_int8_t *)dbc->rdata.data + dbt->doff;
00542 
00543                 /* Copy in any trailing data from the original record. */
00544                 len = dbt->doff + dbt->dlen;
00545                 if (bk->len > len) {
00546                         memcpy(p + dbt->size, bk->data + len, bk->len - len);
00547                         tlen += bk->len - len;
00548                 }
00549         }
00550 
00551 user_copy:
00552         /*
00553          * Copy in the application provided data -- p and tlen must have been
00554          * initialized above.
00555          */
00556         memcpy(p, dbt->data, dbt->size);
00557         tlen += dbt->size;
00558 
00559         /* Set the DBT to reference our new record. */
00560         dbc->rdata.size = F_ISSET(dbp, DB_RE_FIXEDLEN) ? t->re_len : tlen;
00561         dbc->rdata.dlen = 0;
00562         dbc->rdata.doff = 0;
00563         dbc->rdata.flags = 0;
00564         *dbt = dbc->rdata;
00565         return (0);
00566 }
00567 
00568 /*
00569  * CDB___bam_ritem --
00570  *      Replace an item on a page.
00571  *
00572  * PUBLIC: int CDB___bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *));
00573  */
00574 int
00575 CDB___bam_ritem(dbc, h, indx, data)
00576         DBC *dbc;
00577         PAGE *h;
00578         u_int32_t indx;
00579         DBT *data;
00580 {
00581         BKEYDATA *bk;
00582         DB *dbp;
00583         DBT orig, repl;
00584         db_indx_t cnt, lo, ln, min, off, prefix, suffix;
00585         int32_t nbytes;
00586         int ret;
00587         u_int8_t *p, *t;
00588 
00589         dbp = dbc->dbp;
00590 
00591         /*
00592          * Replace a single item onto a page.  The logic figuring out where
00593          * to insert and whether it fits is handled in the caller.  All we do
00594          * here is manage the page shuffling.
00595          */
00596         bk = GET_BKEYDATA(h, indx);
00597 
00598         /* Log the change. */
00599         if (DB_LOGGING(dbc)) {
00600                 /*
00601                  * We might as well check to see if the two data items share
00602                  * a common prefix and suffix -- it can save us a lot of log
00603                  * message if they're large.
00604                  */
00605                 min = data->size < bk->len ? data->size : bk->len;
00606                 for (prefix = 0,
00607                     p = bk->data, t = data->data;
00608                     prefix < min && *p == *t; ++prefix, ++p, ++t)
00609                         ;
00610 
00611                 min -= prefix;
00612                 for (suffix = 0,
00613                     p = (u_int8_t *)bk->data + bk->len - 1,
00614                     t = (u_int8_t *)data->data + data->size - 1;
00615                     suffix < min && *p == *t; ++suffix, --p, --t)
00616                         ;
00617 
00618                 /* We only log the parts of the keys that have changed. */
00619                 orig.data = (u_int8_t *)bk->data + prefix;
00620                 orig.size = bk->len - (prefix + suffix);
00621                 repl.data = (u_int8_t *)data->data + prefix;
00622                 repl.size = data->size - (prefix + suffix);
00623                 if ((ret = CDB___bam_repl_log(dbp->dbenv, dbc->txn,
00624                     &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h),
00625                     (u_int32_t)indx, (u_int32_t)B_DISSET(bk->type),
00626                     &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0)
00627                         return (ret);
00628         }
00629 
00630         /*
00631          * Set references to the first in-use byte on the page and the
00632          * first byte of the item being replaced.
00633          */
00634         p = (u_int8_t *)h + HOFFSET(h);
00635         t = (u_int8_t *)bk;
00636 
00637         /*
00638          * If the entry is growing in size, shift the beginning of the data
00639          * part of the page down.  If the entry is shrinking in size, shift
00640          * the beginning of the data part of the page up.  Use memmove(3),
00641          * the regions overlap.
00642          */
00643         lo = BKEYDATA_SIZE(bk->len);
00644         ln = BKEYDATA_SIZE(data->size);
00645         if (lo != ln) {
00646                 nbytes = lo - ln;               /* Signed difference. */
00647                 if (p == t)                     /* First index is fast. */
00648                         h->inp[indx] += nbytes;
00649                 else {                          /* Else, shift the page. */
00650                         memmove(p + nbytes, p, t - p);
00651 
00652                         /* Adjust the indices' offsets. */
00653                         off = h->inp[indx];
00654                         for (cnt = 0; cnt < NUM_ENT(h); ++cnt)
00655                                 if (h->inp[cnt] <= off)
00656                                         h->inp[cnt] += nbytes;
00657                 }
00658 
00659                 /* Clean up the page and adjust the item's reference. */
00660                 HOFFSET(h) += nbytes;
00661                 t += nbytes;
00662         }
00663 
00664         /* Copy the new item onto the page. */
00665         bk = (BKEYDATA *)t;
00666         B_TSET(bk->type, B_KEYDATA, 0);
00667         bk->len = data->size;
00668         memcpy(bk->data, data->data, data->size);
00669 
00670         return (0);
00671 }
00672 
00673 /*
00674  * __bam_dup_convert --
00675  *      Check to see if the duplicate set at indx should have its own page.
00676  *      If it should, create it.
00677  */
00678 static int
00679 __bam_dup_convert(dbc, h, indx)
00680         DBC *dbc;
00681         PAGE *h;
00682         u_int32_t indx;
00683 {
00684         BTREE_CURSOR *cp;
00685         BKEYDATA *bk;
00686         DB *dbp;
00687         DBT hdr;
00688         PAGE *dp;
00689         db_indx_t cnt, cpindx, first, sz;
00690         int ret;
00691 
00692         dbp = dbc->dbp;
00693         cp = (BTREE_CURSOR *) dbc->internal;
00694 
00695         /*
00696          * Count the duplicate records and calculate how much room they're
00697          * using on the page.
00698          */
00699         while (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX])
00700                 indx -= P_INDX;
00701         for (cnt = 0, sz = 0, first = indx;; ++cnt, indx += P_INDX) {
00702                 if (indx >= NUM_ENT(h) || h->inp[first] != h->inp[indx])
00703                         break;
00704                 bk = GET_BKEYDATA(h, indx);
00705                 sz += B_TYPE(bk->type) == B_KEYDATA ?
00706                     BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
00707                 bk = GET_BKEYDATA(h, indx + O_INDX);
00708                 sz += B_TYPE(bk->type) == B_KEYDATA ?
00709                     BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
00710         }
00711 
00712         /*
00713          * We have to do these checks when the user is replacing the cursor's
00714          * data item -- if the application replaces a duplicate item with a
00715          * larger data item, it can increase the amount of space used by the
00716          * duplicates, requiring this check.  But that means we may have done
00717          * this check when it wasn't a duplicate item after all.
00718          */
00719         if (cnt == 1)
00720                 return (0);
00721 
00722         /*
00723          * If this set of duplicates is using more than 25% of the page, move
00724          * them off.  The choice of 25% is a WAG, but the value must be small
00725          * enough that we can always split a page without putting duplicates
00726          * on two different pages.
00727          */
00728         if (sz < dbp->pgsize / 4)
00729                 return (0);
00730 
00731         /* Get a new page. */
00732         if ((ret = CDB___db_new(dbc,
00733             ((dbp->dup_compare == NULL ? P_LRECNO : P_LDUP) | dbp->tags), &dp)) != 0)
00734                 return (ret);
00735         P_INIT(dp, dbp->pgsize, dp->pgno,
00736             PGNO_INVALID, PGNO_INVALID, LEAFLEVEL, TYPE(dp), TAGS(dp));
00737 
00738         /*
00739          * Move this set of duplicates off the page.  First points to the first
00740          * key of the first duplicate key/data pair, cnt is the number of pairs
00741          * we're dealing with.
00742          */
00743         memset(&hdr, 0, sizeof(hdr));
00744         for (indx = first + O_INDX, cpindx = 0;;) {
00745                 /* Move cursors referencing the old entry to the new entry. */
00746                 if ((ret = CDB___bam_ca_dup(dbp, first,
00747                     PGNO(h), indx - O_INDX, PGNO(dp), cpindx)) != 0)
00748                         goto err;
00749 
00750                 /*
00751                  * Copy the entry to the new page.  If the off-duplicate page
00752                  * is a Btree page, deleted entries move normally.  If it's a
00753                  * Recno page, deleted entries are discarded.
00754                  */
00755                 bk = GET_BKEYDATA(h, indx);
00756                 hdr.data = bk;
00757                 hdr.size = B_TYPE(bk->type) == B_KEYDATA ?
00758                     BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE;
00759                 if (dbp->dup_compare != NULL || !B_DISSET(bk->type)) {
00760                         if ((ret = CDB___db_pitem(
00761                             dbc, dp, cpindx, hdr.size, &hdr, NULL)) != 0)
00762                                 goto err;
00763                         ++cpindx;
00764                 }
00765 
00766                 /* Delete the data item. */
00767                 if ((ret = CDB___db_ditem(dbc, h, indx, hdr.size)) != 0)
00768                         goto err;
00769                 CDB___bam_ca_di(dbp, PGNO(h), indx, -1);
00770 
00771                 /* Delete all but the first reference to the key. */
00772                 if (--cnt == 0)
00773                         break;
00774                 if ((ret = CDB___bam_adjindx(dbc, h, indx, first, 0)) != 0)
00775                         goto err;
00776                 CDB___bam_ca_di(dbp, PGNO(h), indx, -1);
00777         }
00778 
00779         /* Put in a new data item that points to the duplicates page. */
00780         if ((ret = __bam_ovput(dbc, B_DUPLICATE, dp->pgno, h, indx, NULL)) != 0)
00781                 goto err;
00782 
00783         return (CDB_memp_fput(dbp->mpf, dp, DB_MPOOL_DIRTY));
00784 
00785 err:    (void)CDB___db_free(dbc, dp);
00786         return (ret);
00787 }
00788 
00789 /*
00790  * __bam_ovput --
00791  *      Build an item for an off-page duplicates page or overflow page and
00792  *      insert it on the page.
00793  */
00794 static int
00795 __bam_ovput(dbc, type, pgno, h, indx, item)
00796         DBC *dbc;
00797         u_int32_t type, indx;
00798         db_pgno_t pgno;
00799         PAGE *h;
00800         DBT *item;
00801 {
00802         BOVERFLOW bo;
00803         DBT hdr;
00804         int ret;
00805 
00806         UMRW(bo.unused1);
00807         B_TSET(bo.type, type, 0);
00808         UMRW(bo.unused2);
00809 
00810         /*
00811          * If we're creating an overflow item, do so and acquire the page
00812          * number for it.  If we're creating an off-page duplicates tree,
00813          * we are giving the page number as an argument.
00814          */
00815         if (type == B_OVERFLOW) {
00816                 if ((ret = CDB___db_poff(dbc, item, &bo.pgno)) != 0)
00817                         return (ret);
00818                 bo.tlen = item->size;
00819         } else {
00820                 bo.pgno = pgno;
00821                 bo.tlen = 0;
00822         }
00823 
00824         /* Store the new record on the page. */
00825         memset(&hdr, 0, sizeof(hdr));
00826         hdr.data = &bo;
00827         hdr.size = BOVERFLOW_SIZE;
00828         return (CDB___db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &hdr, NULL));
00829 }

Generated on Sun Jun 8 10:56:35 2008 for GNUmifluz by  doxygen 1.5.5