/[resiprocate]/main/contrib/db/btree/bt_recno.c
ViewVC logotype

Annotation of /main/contrib/db/btree/bt_recno.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4437 - (hide annotations) (download)
Mon Apr 25 05:57:16 2005 UTC (14 years, 7 months ago) by fluffy
Original Path: main/sip/contrib/db/btree/bt_recno.c
File MIME type: text/plain
File size: 35139 byte(s)
windows versionof db
1 fluffy 4437 /*-
2     * See the file LICENSE for redistribution information.
3     *
4     * Copyright (c) 1997-2004
5     * Sleepycat Software. All rights reserved.
6     *
7     * $Id: bt_recno.c,v 11.117 2004/03/28 17:01:01 bostic Exp $
8     */
9    
10     #include "db_config.h"
11    
12     #ifndef NO_SYSTEM_INCLUDES
13     #include <sys/types.h>
14    
15     #include <string.h>
16     #endif
17    
18     #include "db_int.h"
19     #include "dbinc/db_page.h"
20     #include "dbinc/btree.h"
21     #include "dbinc/db_shash.h"
22     #include "dbinc/lock.h"
23    
24     static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
25     static int __ram_source __P((DB *));
26     static int __ram_sread __P((DBC *, db_recno_t));
27     static int __ram_update __P((DBC *, db_recno_t, int));
28    
29     /*
30     * In recno, there are two meanings to the on-page "deleted" flag. If we're
31     * re-numbering records, it means the record was implicitly created. We skip
32     * over implicitly created records if doing a cursor "next" or "prev", and
33     * return DB_KEYEMPTY if they're explicitly requested.. If not re-numbering
34     * records, it means that the record was implicitly created, or was deleted.
35     * We skip over implicitly created or deleted records if doing a cursor "next"
36     * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
37     *
38     * If we're re-numbering records, then we have to detect in the cursor that
39     * a record was deleted, and adjust the cursor as necessary on the next get.
40     * If we're not re-numbering records, then we can detect that a record has
41     * been deleted by looking at the actual on-page record, so we completely
42     * ignore the cursor's delete flag. This is different from the B+tree code.
43     * It also maintains whether the cursor references a deleted record in the
44     * cursor, and it doesn't always check the on-page value.
45     */
46     #define CD_SET(cp) { \
47     if (F_ISSET(cp, C_RENUMBER)) \
48     F_SET(cp, C_DELETED); \
49     }
50     #define CD_CLR(cp) { \
51     if (F_ISSET(cp, C_RENUMBER)) { \
52     F_CLR(cp, C_DELETED); \
53     cp->order = INVALID_ORDER; \
54     } \
55     }
56     #define CD_ISSET(cp) \
57     (F_ISSET(cp, C_RENUMBER) && F_ISSET(cp, C_DELETED) ? 1 : 0)
58    
59     /*
60     * Macros for comparing the ordering of two cursors.
61     * cp1 comes before cp2 iff one of the following holds:
62     * cp1's recno is less than cp2's recno
63     * recnos are equal, both deleted, and cp1's order is less than cp2's
64     * recnos are equal, cp1 deleted, and cp2 not deleted
65     */
66     #define C_LESSTHAN(cp1, cp2) \
67     (((cp1)->recno < (cp2)->recno) || \
68     (((cp1)->recno == (cp2)->recno) && \
69     ((CD_ISSET((cp1)) && CD_ISSET((cp2)) && (cp1)->order < (cp2)->order) || \
70     (CD_ISSET((cp1)) && !CD_ISSET((cp2))))))
71    
72     /*
73     * cp1 is equal to cp2 iff their recnos and delete flags are identical,
74     * and if the delete flag is set their orders are also identical.
75     */
76     #define C_EQUAL(cp1, cp2) \
77     ((cp1)->recno == (cp2)->recno && CD_ISSET((cp1)) == CD_ISSET((cp2)) && \
78     (!CD_ISSET((cp1)) || (cp1)->order == (cp2)->order))
79    
80     /*
81     * Do we need to log the current cursor adjustment?
82     */
83     #define CURADJ_LOG(dbc) \
84     (DBC_LOGGING((dbc)) && (dbc)->txn != NULL && (dbc)->txn->parent != NULL)
85    
86     /*
87     * After a search, copy the found page into the cursor, discarding any
88     * currently held lock.
89     */
90     #define STACK_TO_CURSOR(cp, ret) { \
91     int __t_ret; \
92     (cp)->page = (cp)->csp->page; \
93     (cp)->pgno = (cp)->csp->page->pgno; \
94     (cp)->indx = (cp)->csp->indx; \
95     if ((__t_ret = __TLPUT(dbc, (cp)->lock)) != 0 && (ret) == 0) \
96     ret = __t_ret; \
97     (cp)->lock = (cp)->csp->lock; \
98     (cp)->lock_mode = (cp)->csp->lock_mode; \
99     }
100    
101     /*
102     * __ram_open --
103     * Recno open function.
104     *
105     * PUBLIC: int __ram_open __P((DB *,
106     * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t));
107     */
108     int
109     __ram_open(dbp, txn, name, base_pgno, flags)
110     DB *dbp;
111     DB_TXN *txn;
112     const char *name;
113     db_pgno_t base_pgno;
114     u_int32_t flags;
115     {
116     BTREE *t;
117     DBC *dbc;
118     int ret, t_ret;
119    
120     COMPQUIET(name, NULL);
121     t = dbp->bt_internal;
122    
123     /* Start up the tree. */
124     if ((ret = __bam_read_root(dbp, txn, base_pgno, flags)) != 0)
125     return (ret);
126    
127     /*
128     * If the user specified a source tree, open it and map it in.
129     *
130     * !!!
131     * We don't complain if the user specified transactions or threads.
132     * It's possible to make it work, but you'd better know what you're
133     * doing!
134     */
135     if (t->re_source != NULL && (ret = __ram_source(dbp)) != 0)
136     return (ret);
137    
138     /* If we're snapshotting an underlying source file, do it now. */
139     if (F_ISSET(dbp, DB_AM_SNAPSHOT)) {
140     /* Allocate a cursor. */
141     if ((ret = __db_cursor(dbp, NULL, &dbc, 0)) != 0)
142     return (ret);
143    
144     /* Do the snapshot. */
145     if ((ret = __ram_update(dbc,
146     DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND)
147     ret = 0;
148    
149     /* Discard the cursor. */
150     if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
151     ret = t_ret;
152     }
153    
154     return (ret);
155     }
156    
157     /*
158     * __ram_append --
159     * Recno append function.
160     *
161     * PUBLIC: int __ram_append __P((DBC *, DBT *, DBT *));
162     */
163     int
164     __ram_append(dbc, key, data)
165     DBC *dbc;
166     DBT *key, *data;
167     {
168     BTREE_CURSOR *cp;
169     int ret;
170    
171     cp = (BTREE_CURSOR *)dbc->internal;
172    
173     /*
174     * Make sure we've read in all of the backing source file. If
175     * we found the record or it simply didn't exist, add the
176     * user's record.
177     */
178     ret = __ram_update(dbc, DB_MAX_RECORDS, 0);
179     if (ret == 0 || ret == DB_NOTFOUND)
180     ret = __ram_add(dbc, &cp->recno, data, DB_APPEND, 0);
181    
182     /* Return the record number. */
183     if (ret == 0)
184     ret = __db_retcopy(dbc->dbp->dbenv, key, &cp->recno,
185     sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
186    
187     return (ret);
188     }
189    
190     /*
191     * __ram_c_del --
192     * Recno cursor->c_del function.
193     *
194     * PUBLIC: int __ram_c_del __P((DBC *));
195     */
196     int
197     __ram_c_del(dbc)
198     DBC *dbc;
199     {
200     BKEYDATA bk;
201     BTREE *t;
202     BTREE_CURSOR *cp;
203     DB *dbp;
204     DB_LSN lsn;
205     DBT hdr, data;
206     EPG *epg;
207     int exact, ret, stack, t_ret;
208    
209     dbp = dbc->dbp;
210     cp = (BTREE_CURSOR *)dbc->internal;
211     t = dbp->bt_internal;
212     stack = 0;
213    
214     /*
215     * The semantics of cursors during delete are as follows: in
216     * non-renumbering recnos, records are replaced with a marker
217     * containing a delete flag. If the record referenced by this cursor
218     * has already been deleted, we will detect that as part of the delete
219     * operation, and fail.
220     *
221     * In renumbering recnos, cursors which represent deleted items
222     * are flagged with the C_DELETED flag, and it is an error to
223     * call c_del a second time without an intervening cursor motion.
224     */
225     if (CD_ISSET(cp))
226     return (DB_KEYEMPTY);
227    
228     /* Search the tree for the key; delete only deletes exact matches. */
229     if ((ret = __bam_rsearch(dbc, &cp->recno, S_DELETE, 1, &exact)) != 0)
230     goto err;
231     if (!exact) {
232     ret = DB_NOTFOUND;
233     goto err;
234     }
235     stack = 1;
236    
237     /* Copy the page into the cursor. */
238     STACK_TO_CURSOR(cp, ret);
239     if (ret != 0)
240     goto err;
241    
242     /*
243     * If re-numbering records, the on-page deleted flag can only mean
244     * that this record was implicitly created. Applications aren't
245     * permitted to delete records they never created, return an error.
246     *
247     * If not re-numbering records, the on-page deleted flag means that
248     * this record was implicitly created, or, was deleted at some time.
249     * The former is an error because applications aren't permitted to
250     * delete records they never created, the latter is an error because
251     * if the record was "deleted", we could never have found it.
252     */
253     if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) {
254     ret = DB_KEYEMPTY;
255     goto err;
256     }
257    
258     if (F_ISSET(cp, C_RENUMBER)) {
259     /* Delete the item, adjust the counts, adjust the cursors. */
260     if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
261     goto err;
262     if ((ret = __bam_adjust(dbc, -1)) != 0)
263     goto err;
264     if (__ram_ca(dbc, CA_DELETE) > 0 &&
265     CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp, dbc->txn,
266     &lsn, 0, CA_DELETE, cp->root, cp->recno, cp->order)) != 0)
267     goto err;
268    
269     /*
270     * If the page is empty, delete it.
271     *
272     * We never delete a root page. First, root pages of primary
273     * databases never go away, recno or otherwise. However, if
274     * it's the root page of an off-page duplicates database, then
275     * it can be deleted. We don't delete it here because we have
276     * no way of telling the primary database page holder (e.g.,
277     * the hash access method) that its page element should cleaned
278     * up because the underlying tree is gone. So, we keep the page
279     * around until the last cursor referencing the empty tree is
280     * are closed, and then clean it up.
281     */
282     if (NUM_ENT(cp->page) == 0 && PGNO(cp->page) != cp->root) {
283     /*
284     * We already have a locked stack of pages. However,
285     * there are likely entries in the stack that aren't
286     * going to be emptied by removing the single reference
287     * to the emptied page (or one of its parents).
288     */
289     for (epg = cp->csp; epg >= cp->sp; --epg)
290     if (NUM_ENT(epg->page) > 1)
291     break;
292    
293     /*
294     * We want to delete a single item out of the last page
295     * that we're not deleting.
296     */
297     ret = __bam_dpages(dbc, epg);
298    
299     /*
300     * Regardless of the return from __bam_dpages, it will
301     * discard our stack and pinned page.
302     */
303     stack = 0;
304     cp->page = NULL;
305     }
306     } else {
307     /* Use a delete/put pair to replace the record with a marker. */
308     if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
309     goto err;
310    
311     B_TSET(bk.type, B_KEYDATA, 1);
312     bk.len = 0;
313     memset(&hdr, 0, sizeof(hdr));
314     hdr.data = &bk;
315     hdr.size = SSZA(BKEYDATA, data);
316     memset(&data, 0, sizeof(data));
317     data.data = (void *)"";
318     data.size = 0;
319     if ((ret = __db_pitem(dbc,
320     cp->page, cp->indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
321     goto err;
322     }
323    
324     t->re_modified = 1;
325    
326     err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
327     ret = t_ret;
328    
329     return (ret);
330     }
331    
332     /*
333     * __ram_c_get --
334     * Recno cursor->c_get function.
335     *
336     * PUBLIC: int __ram_c_get
337     * PUBLIC: __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
338     */
339     int
340     __ram_c_get(dbc, key, data, flags, pgnop)
341     DBC *dbc;
342     DBT *key, *data;
343     u_int32_t flags;
344     db_pgno_t *pgnop;
345     {
346     BTREE_CURSOR *cp;
347     DB *dbp;
348     int cmp, exact, ret;
349    
350     COMPQUIET(pgnop, NULL);
351    
352     dbp = dbc->dbp;
353     cp = (BTREE_CURSOR *)dbc->internal;
354    
355     LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
356     retry: switch (flags) {
357     case DB_CURRENT:
358     /*
359     * If we're using mutable records and the deleted flag is
360     * set, the cursor is pointing at a nonexistent record;
361     * return an error.
362     */
363     if (CD_ISSET(cp))
364     return (DB_KEYEMPTY);
365     break;
366     case DB_NEXT_DUP:
367     /*
368     * If we're not in an off-page dup set, we know there's no
369     * next duplicate since recnos don't have them. If we
370     * are in an off-page dup set, the next item assuredly is
371     * a dup, so we set flags to DB_NEXT and keep going.
372     */
373     if (!F_ISSET(dbc, DBC_OPD))
374     return (DB_NOTFOUND);
375     /* FALLTHROUGH */
376     case DB_NEXT_NODUP:
377     /*
378     * Recno databases don't have duplicates, set flags to DB_NEXT
379     * and keep going.
380     */
381     /* FALLTHROUGH */
382     case DB_NEXT:
383     flags = DB_NEXT;
384     /*
385     * If record numbers are mutable: if we just deleted a record,
386     * we have to avoid incrementing the record number so that we
387     * return the right record by virtue of renumbering the tree.
388     */
389     if (CD_ISSET(cp)) {
390     /*
391     * Clear the flag, we've moved off the deleted record.
392     */
393     CD_CLR(cp);
394     break;
395     }
396    
397     if (cp->recno != RECNO_OOB) {
398     ++cp->recno;
399     break;
400     }
401     /* FALLTHROUGH */
402     case DB_FIRST:
403     flags = DB_NEXT;
404     cp->recno = 1;
405     break;
406     case DB_PREV_NODUP:
407     /*
408     * Recno databases don't have duplicates, set flags to DB_PREV
409     * and keep going.
410     */
411     /* FALLTHROUGH */
412     case DB_PREV:
413     flags = DB_PREV;
414     if (cp->recno != RECNO_OOB) {
415     if (cp->recno == 1) {
416     ret = DB_NOTFOUND;
417     goto err;
418     }
419     --cp->recno;
420     break;
421     }
422     /* FALLTHROUGH */
423     case DB_LAST:
424     flags = DB_PREV;
425     if (((ret = __ram_update(dbc,
426     DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND)
427     goto err;
428     if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
429     goto err;
430     if (cp->recno == 0) {
431     ret = DB_NOTFOUND;
432     goto err;
433     }
434     break;
435     case DB_GET_BOTHC:
436     /*
437     * If we're doing a join and these are offpage dups,
438     * we want to keep searching forward from after the
439     * current cursor position. Increment the recno by 1,
440     * then proceed as for a DB_SET.
441     *
442     * Otherwise, we know there are no additional matching
443     * data, as recnos don't have dups. return DB_NOTFOUND.
444     */
445     if (F_ISSET(dbc, DBC_OPD)) {
446     cp->recno++;
447     break;
448     }
449     ret = DB_NOTFOUND;
450     goto err;
451     /* NOTREACHED */
452     case DB_GET_BOTH:
453     case DB_GET_BOTH_RANGE:
454     /*
455     * If we're searching a set of off-page dups, we start
456     * a new linear search from the first record. Otherwise,
457     * we compare the single data item associated with the
458     * requested record for a match.
459     */
460     if (F_ISSET(dbc, DBC_OPD)) {
461     cp->recno = 1;
462     break;
463     }
464     /* FALLTHROUGH */
465     case DB_SET:
466     case DB_SET_RANGE:
467     if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0)
468     goto err;
469     break;
470     default:
471     ret = __db_unknown_flag(dbp->dbenv, "__ram_c_get", flags);
472     goto err;
473     }
474    
475     /*
476     * For DB_PREV, DB_LAST, DB_SET and DB_SET_RANGE, we have already
477     * called __ram_update() to make sure sufficient records have been
478     * read from the backing source file. Do it now for DB_CURRENT (if
479     * the current record was deleted we may need more records from the
480     * backing file for a DB_CURRENT operation), DB_FIRST and DB_NEXT.
481     * (We don't have to test for flags == DB_FIRST, because the switch
482     * statement above re-set flags to DB_NEXT in that case.)
483     */
484     if ((flags == DB_NEXT || flags == DB_CURRENT) && ((ret =
485     __ram_update(dbc, cp->recno, 0)) != 0) && ret != DB_NOTFOUND)
486     goto err;
487    
488     for (;; ++cp->recno) {
489     /* Search the tree for the record. */
490     if ((ret = __bam_rsearch(dbc, &cp->recno,
491     F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND,
492     1, &exact)) != 0)
493     goto err;
494     if (!exact) {
495     ret = DB_NOTFOUND;
496     goto err;
497     }
498    
499     /* Copy the page into the cursor. */
500     STACK_TO_CURSOR(cp, ret);
501     if (ret != 0)
502     goto err;
503    
504     /*
505     * If re-numbering records, the on-page deleted flag means this
506     * record was implicitly created. If not re-numbering records,
507     * the on-page deleted flag means this record was implicitly
508     * created, or, it was deleted at some time. Regardless, we
509     * skip such records if doing cursor next/prev operations or
510     * walking through off-page duplicates, and fail if they were
511     * requested explicitly by the application.
512     */
513     if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type))
514     switch (flags) {
515     case DB_NEXT:
516     case DB_PREV:
517     (void)__bam_stkrel(dbc, STK_CLRDBC);
518     goto retry;
519     case DB_GET_BOTH:
520     case DB_GET_BOTH_RANGE:
521     /*
522     * If we're an OPD tree, we don't care about
523     * matching a record number on a DB_GET_BOTH
524     * -- everything belongs to the same tree. A
525     * normal recno should give up and return
526     * DB_NOTFOUND if the matching recno is deleted.
527     */
528     if (F_ISSET(dbc, DBC_OPD)) {
529     (void)__bam_stkrel(dbc, STK_CLRDBC);
530     continue;
531     }
532     ret = DB_NOTFOUND;
533     goto err;
534     default:
535     ret = DB_KEYEMPTY;
536     goto err;
537     }
538    
539     if (flags == DB_GET_BOTH ||
540     flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
541     if ((ret = __bam_cmp(dbp, data,
542     cp->page, cp->indx, __bam_defcmp, &cmp)) != 0)
543     return (ret);
544     if (cmp == 0)
545     break;
546     if (!F_ISSET(dbc, DBC_OPD)) {
547     ret = DB_NOTFOUND;
548     goto err;
549     }
550     (void)__bam_stkrel(dbc, STK_CLRDBC);
551     } else
552     break;
553     }
554    
555     /* Return the key if the user didn't give us one. */
556     if (!F_ISSET(dbc, DBC_OPD)) {
557     if (flags != DB_GET_BOTH && flags != DB_GET_BOTH_RANGE &&
558     flags != DB_SET && flags != DB_SET_RANGE)
559     ret = __db_retcopy(dbp->dbenv,
560     key, &cp->recno, sizeof(cp->recno),
561     &dbc->rkey->data, &dbc->rkey->ulen);
562     F_SET(key, DB_DBT_ISSET);
563     }
564    
565     /* The cursor was reset, no further delete adjustment is necessary. */
566     err: CD_CLR(cp);
567    
568     return (ret);
569     }
570    
571     /*
572     * __ram_c_put --
573     * Recno cursor->c_put function.
574     *
575     * PUBLIC: int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
576     */
577     int
578     __ram_c_put(dbc, key, data, flags, pgnop)
579     DBC *dbc;
580     DBT *key, *data;
581     u_int32_t flags;
582     db_pgno_t *pgnop;
583     {
584     BTREE_CURSOR *cp;
585     DB *dbp;
586     DB_LSN lsn;
587     int exact, nc, ret, t_ret;
588     u_int32_t iiflags;
589     void *arg;
590    
591     COMPQUIET(pgnop, NULL);
592    
593     dbp = dbc->dbp;
594     cp = (BTREE_CURSOR *)dbc->internal;
595    
596     /*
597     * DB_KEYFIRST and DB_KEYLAST mean different things if they're
598     * used in an off-page duplicate tree. If we're an off-page
599     * duplicate tree, they really mean "put at the beginning of the
600     * tree" and "put at the end of the tree" respectively, so translate
601     * them to something else.
602     */
603     if (F_ISSET(dbc, DBC_OPD))
604     switch (flags) {
605     case DB_KEYFIRST:
606     cp->recno = 1;
607     flags = DB_BEFORE;
608     break;
609     case DB_KEYLAST:
610     if ((ret = __ram_add(dbc,
611     &cp->recno, data, DB_APPEND, 0)) != 0)
612     return (ret);
613     if (CURADJ_LOG(dbc) &&
614     (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0,
615     CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0)
616     return (ret);
617     return (0);
618     default:
619     break;
620     }
621    
622     /*
623     * Handle normal DB_KEYFIRST/DB_KEYLAST; for a recno, which has
624     * no duplicates, these are identical and mean "put the given
625     * datum at the given recno".
626     *
627     * Note that the code here used to be in __ram_put; now, we
628     * go through the access-method-common __db_put function, which
629     * handles DB_NOOVERWRITE, so we and __ram_add don't have to.
630     */
631     if (flags == DB_KEYFIRST || flags == DB_KEYLAST) {
632     ret = __ram_getno(dbc, key, &cp->recno, 1);
633     if (ret == 0 || ret == DB_NOTFOUND)
634     ret = __ram_add(dbc, &cp->recno, data, 0, 0);
635     return (ret);
636     }
637    
638     /*
639     * If we're putting with a cursor that's marked C_DELETED, we need to
640     * take special care; the cursor doesn't "really" reference the item
641     * corresponding to its current recno, but instead is "between" that
642     * record and the current one. Translate the actual insert into
643     * DB_BEFORE, and let the __ram_ca work out the gory details of what
644     * should wind up pointing where.
645     */
646     if (CD_ISSET(cp))
647     iiflags = DB_BEFORE;
648     else
649     iiflags = flags;
650    
651     split: if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0)
652     goto err;
653     /*
654     * An inexact match is okay; it just means we're one record past the
655     * end, which is reasonable if we're marked deleted.
656     */
657     DB_ASSERT(exact || CD_ISSET(cp));
658    
659     /* Copy the page into the cursor. */
660     STACK_TO_CURSOR(cp, ret);
661     if (ret != 0)
662     goto err;
663    
664     ret = __bam_iitem(dbc, key, data, iiflags, 0);
665     t_ret = __bam_stkrel(dbc, STK_CLRDBC);
666    
667     if (t_ret != 0 && (ret == 0 || ret == DB_NEEDSPLIT))
668     ret = t_ret;
669     else if (ret == DB_NEEDSPLIT) {
670     arg = &cp->recno;
671     if ((ret = __bam_split(dbc, arg, NULL)) != 0)
672     goto err;
673     goto split;
674     }
675     if (ret != 0)
676     goto err;
677    
678     switch (flags) { /* Adjust the cursors. */
679     case DB_AFTER:
680     nc = __ram_ca(dbc, CA_IAFTER);
681    
682     /*
683     * We only need to adjust this cursor forward if we truly added
684     * the item after the current recno, rather than remapping it
685     * to DB_BEFORE.
686     */
687     if (iiflags == DB_AFTER)
688     ++cp->recno;
689    
690     /* Only log if __ram_ca found any relevant cursors. */
691     if (nc > 0 && CURADJ_LOG(dbc) &&
692     (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IAFTER,
693     cp->root, cp->recno, cp->order)) != 0)
694     goto err;
695     break;
696     case DB_BEFORE:
697     nc = __ram_ca(dbc, CA_IBEFORE);
698     --cp->recno;
699    
700     /* Only log if __ram_ca found any relevant cursors. */
701     if (nc > 0 && CURADJ_LOG(dbc) &&
702     (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IBEFORE,
703     cp->root, cp->recno, cp->order)) != 0)
704     goto err;
705     break;
706     case DB_CURRENT:
707     /*
708     * We only need to do an adjustment if we actually
709     * added an item, which we only would have done if the
710     * cursor was marked deleted.
711     *
712     * Only log if __ram_ca found any relevant cursors.
713     */
714     if (CD_ISSET(cp) && __ram_ca(dbc, CA_ICURRENT) > 0 &&
715     CURADJ_LOG(dbc) &&
716     (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0,
717     CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0)
718     goto err;
719     break;
720     default:
721     break;
722     }
723    
724     /* Return the key if we've created a new record. */
725     if (!F_ISSET(dbc, DBC_OPD) && (flags == DB_AFTER || flags == DB_BEFORE))
726     ret = __db_retcopy(dbp->dbenv, key, &cp->recno,
727     sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
728    
729     /* The cursor was reset, no further delete adjustment is necessary. */
730     err: CD_CLR(cp);
731    
732     return (ret);
733     }
734    
735     /*
736     * __ram_ca --
737     * Adjust cursors. Returns the number of relevant cursors.
738     *
739     * PUBLIC: int __ram_ca __P((DBC *, ca_recno_arg));
740     */
741     int
742     __ram_ca(dbc_arg, op)
743     DBC *dbc_arg;
744     ca_recno_arg op;
745     {
746     BTREE_CURSOR *cp, *cp_arg;
747     DB *dbp, *ldbp;
748     DB_ENV *dbenv;
749     DBC *dbc;
750     db_recno_t recno;
751     int adjusted, found;
752     u_int32_t order;
753    
754     dbp = dbc_arg->dbp;
755     dbenv = dbp->dbenv;
756     cp_arg = (BTREE_CURSOR *)dbc_arg->internal;
757     recno = cp_arg->recno;
758    
759     found = 0;
760    
761     /*
762     * It only makes sense to adjust cursors if we're a renumbering
763     * recno; we should only be called if this is one.
764     */
765     DB_ASSERT(F_ISSET(cp_arg, C_RENUMBER));
766    
767     MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
768     /*
769     * Adjust the cursors. See the comment in __bam_ca_delete().
770     */
771     /*
772     * If we're doing a delete, we need to find the highest
773     * order of any cursor currently pointing at this item,
774     * so we can assign a higher order to the newly deleted
775     * cursor. Unfortunately, this requires a second pass through
776     * the cursor list.
777     */
778     if (op == CA_DELETE) {
779     order = 1;
780     for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
781     ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
782     ldbp = LIST_NEXT(ldbp, dblistlinks)) {
783     MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
784     for (dbc = TAILQ_FIRST(&ldbp->active_queue);
785     dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
786     cp = (BTREE_CURSOR *)dbc->internal;
787     if (cp_arg->root == cp->root &&
788     recno == cp->recno && CD_ISSET(cp) &&
789     order <= cp->order)
790     order = cp->order + 1;
791     }
792     MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
793     }
794     } else
795     order = INVALID_ORDER;
796    
797     /* Now go through and do the actual adjustments. */
798     for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
799     ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
800     ldbp = LIST_NEXT(ldbp, dblistlinks)) {
801     MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
802     for (dbc = TAILQ_FIRST(&ldbp->active_queue);
803     dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
804     cp = (BTREE_CURSOR *)dbc->internal;
805     if (cp_arg->root != cp->root)
806     continue;
807     ++found;
808     adjusted = 0;
809     switch (op) {
810     case CA_DELETE:
811     if (recno < cp->recno) {
812     --cp->recno;
813     /*
814     * If the adjustment made them equal,
815     * we have to merge the orders.
816     */
817     if (recno == cp->recno && CD_ISSET(cp))
818     cp->order += order;
819     } else if (recno == cp->recno &&
820     !CD_ISSET(cp)) {
821     CD_SET(cp);
822     cp->order = order;
823     }
824     break;
825     case CA_IBEFORE:
826     /*
827     * IBEFORE is just like IAFTER, except that we
828     * adjust cursors on the current record too.
829     */
830     if (C_EQUAL(cp_arg, cp)) {
831     ++cp->recno;
832     adjusted = 1;
833     }
834     goto iafter;
835     case CA_ICURRENT:
836    
837     /*
838     * If the original cursor wasn't deleted, we
839     * just did a replacement and so there's no
840     * need to adjust anything--we shouldn't have
841     * gotten this far. Otherwise, we behave
842     * much like an IAFTER, except that all
843     * cursors pointing to the current item get
844     * marked undeleted and point to the new
845     * item.
846     */
847     DB_ASSERT(CD_ISSET(cp_arg));
848     if (C_EQUAL(cp_arg, cp)) {
849     CD_CLR(cp);
850     break;
851     }
852     /* FALLTHROUGH */
853     case CA_IAFTER:
854     iafter: if (!adjusted && C_LESSTHAN(cp_arg, cp)) {
855     ++cp->recno;
856     adjusted = 1;
857     }
858     if (recno == cp->recno && adjusted)
859     /*
860     * If we've moved this cursor's recno,
861     * split its order number--i.e.,
862     * decrement it by enough so that
863     * the lowest cursor moved has order 1.
864     * cp_arg->order is the split point,
865     * so decrement by one less than that.
866     */
867     cp->order -= (cp_arg->order - 1);
868     break;
869     }
870     }
871     MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);
872     }
873     MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
874    
875     return (found);
876     }
877    
878     /*
879     * __ram_getno --
880     * Check the user's record number, and make sure we've seen it.
881     *
882     * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
883     */
884     int
885     __ram_getno(dbc, key, rep, can_create)
886     DBC *dbc;
887     const DBT *key;
888     db_recno_t *rep;
889     int can_create;
890     {
891     DB *dbp;
892     db_recno_t recno;
893    
894     dbp = dbc->dbp;
895    
896     /* Check the user's record number. */
897     if ((recno = *(db_recno_t *)key->data) == 0) {
898     __db_err(dbp->dbenv, "illegal record number of 0");
899     return (EINVAL);
900     }
901     if (rep != NULL)
902     *rep = recno;
903    
904     /*
905     * Btree can neither create records nor read them in. Recno can
906     * do both, see if we can find the record.
907     */
908     return (dbc->dbtype == DB_RECNO ?
909     __ram_update(dbc, recno, can_create) : 0);
910     }
911    
912     /*
913     * __ram_update --
914     * Ensure the tree has records up to and including the specified one.
915     */
916     static int
917     __ram_update(dbc, recno, can_create)
918     DBC *dbc;
919     db_recno_t recno;
920     int can_create;
921     {
922     BTREE *t;
923     DB *dbp;
924     DBT *rdata;
925     db_recno_t nrecs;
926     int ret;
927    
928     dbp = dbc->dbp;
929     t = dbp->bt_internal;
930    
931     /*
932     * If we can't create records and we've read the entire backing input
933     * file, we're done.
934     */
935     if (!can_create && t->re_eof)
936     return (0);
937    
938     /*
939     * If we haven't seen this record yet, try to get it from the original
940     * file.
941     */
942     if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
943     return (ret);
944     if (!t->re_eof && recno > nrecs) {
945     if ((ret = __ram_sread(dbc, recno)) != 0 && ret != DB_NOTFOUND)
946     return (ret);
947     if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
948     return (ret);
949     }
950    
951     /*
952     * If we can create records, create empty ones up to the requested
953     * record.
954     */
955     if (!can_create || recno <= nrecs + 1)
956     return (0);
957    
958     rdata = &dbc->my_rdata;
959     rdata->flags = 0;
960     rdata->size = 0;
961    
962     while (recno > ++nrecs)
963     if ((ret = __ram_add(dbc,
964     &nrecs, rdata, 0, BI_DELETED)) != 0)
965     return (ret);
966     return (0);
967     }
968    
969     /*
970     * __ram_source --
971     * Load information about the backing file.
972     */
973     static int
974     __ram_source(dbp)
975     DB *dbp;
976     {
977     BTREE *t;
978     char *source;
979     int ret;
980    
981     t = dbp->bt_internal;
982    
983     /* Find the real name, and swap out the one we had before. */
984     if ((ret = __db_appname(dbp->dbenv,
985     DB_APP_DATA, t->re_source, 0, NULL, &source)) != 0)
986     return (ret);
987     __os_free(dbp->dbenv, t->re_source);
988     t->re_source = source;
989    
990     /*
991     * !!!
992     * It's possible that the backing source file is read-only. We don't
993     * much care other than we'll complain if there are any modifications
994     * when it comes time to write the database back to the source.
995     */
996     if ((t->re_fp = fopen(t->re_source, "r")) == NULL) {
997     ret = __os_get_errno();
998     __db_err(dbp->dbenv, "%s: %s", t->re_source, db_strerror(ret));
999     return (ret);
1000     }
1001    
1002     t->re_eof = 0;
1003     return (0);
1004     }
1005    
1006     /*
1007     * __ram_writeback --
1008     * Rewrite the backing file.
1009     *
1010     * PUBLIC: int __ram_writeback __P((DB *));
1011     */
1012     int
1013     __ram_writeback(dbp)
1014     DB *dbp;
1015     {
1016     BTREE *t;
1017     DB_ENV *dbenv;
1018     DBC *dbc;
1019     DBT key, data;
1020     FILE *fp;
1021     db_recno_t keyno;
1022     int ret, t_ret;
1023     u_int8_t delim, *pad;
1024    
1025     t = dbp->bt_internal;
1026     dbenv = dbp->dbenv;
1027     fp = NULL;
1028     pad = NULL;
1029    
1030     /* If the file wasn't modified, we're done. */
1031     if (!t->re_modified)
1032     return (0);
1033    
1034     /* If there's no backing source file, we're done. */
1035     if (t->re_source == NULL) {
1036     t->re_modified = 0;
1037     return (0);
1038     }
1039    
1040     /* Allocate a cursor. */
1041     if ((ret = __db_cursor(dbp, NULL, &dbc, 0)) != 0)
1042     return (ret);
1043    
1044     /*
1045     * Read any remaining records into the tree.
1046     *
1047     * !!!
1048     * This is why we can't support transactions when applications specify
1049     * backing (re_source) files. At this point we have to read in the
1050     * rest of the records from the file so that we can write all of the
1051     * records back out again, which could modify a page for which we'd
1052     * have to log changes and which we don't have locked. This could be
1053     * partially fixed by taking a snapshot of the entire file during the
1054     * DB->open as DB->open is transaction protected. But, if a checkpoint
1055     * occurs then, the part of the log holding the copy of the file could
1056     * be discarded, and that would make it impossible to recover in the
1057     * face of disaster. This could all probably be fixed, but it would
1058     * require transaction protecting the backing source file.
1059     *
1060     * XXX
1061     * This could be made to work now that we have transactions protecting
1062     * file operations. Margo has specifically asked for the privilege of
1063     * doing this work.
1064     */
1065     if ((ret =
1066     __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND)
1067     return (ret);
1068    
1069     /*
1070     * Close any existing file handle and re-open the file, truncating it.
1071     */
1072     if (t->re_fp != NULL) {
1073     if (fclose(t->re_fp) != 0) {
1074     ret = __os_get_errno();
1075     goto err;
1076     }
1077     t->re_fp = NULL;
1078     }
1079     if ((fp = fopen(t->re_source, "w")) == NULL) {
1080     ret = __os_get_errno();
1081     __db_err(dbenv, "%s: %s", t->re_source, db_strerror(ret));
1082     goto err;
1083     }
1084    
1085     /*
1086     * We step through the records, writing each one out. Use the record
1087     * number and the dbp->get() function, instead of a cursor, so we find
1088     * and write out "deleted" or non-existent records. The DB handle may
1089     * be threaded, so allocate memory as we go.
1090     */
1091     memset(&key, 0, sizeof(key));
1092     key.size = sizeof(db_recno_t);
1093     key.data = &keyno;
1094     memset(&data, 0, sizeof(data));
1095     F_SET(&data, DB_DBT_REALLOC);
1096    
1097     /*
1098     * We'll need the delimiter if we're doing variable-length records,
1099     * and the pad character if we're doing fixed-length records.
1100     */
1101     delim = t->re_delim;
1102     for (keyno = 1;; ++keyno) {
1103     switch (ret = __db_get(dbp, NULL, &key, &data, 0)) {
1104     case 0:
1105     if (data.size != 0 &&
1106     fwrite(data.data, 1, data.size, fp) != data.size)
1107     goto write_err;
1108     break;
1109     case DB_KEYEMPTY:
1110     if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
1111     if (pad == NULL) {
1112     if ((ret = __os_malloc(
1113     dbenv, t->re_len, &pad)) != 0)
1114     goto err;
1115     memset(pad, t->re_pad, t->re_len);
1116     }
1117     if (fwrite(pad, 1, t->re_len, fp) != t->re_len)
1118     goto write_err;
1119     }
1120     break;
1121     case DB_NOTFOUND:
1122     ret = 0;
1123     goto done;
1124     default:
1125     goto err;
1126     }
1127     if (!F_ISSET(dbp, DB_AM_FIXEDLEN) &&
1128     fwrite(&delim, 1, 1, fp) != 1) {
1129     write_err: ret = __os_get_errno();
1130     __db_err(dbenv,
1131     "%s: write failed to backing file: %s",
1132     t->re_source, strerror(ret));
1133     goto err;
1134     }
1135     }
1136    
1137     err:
1138     done: /* Close the file descriptor. */
1139     if (fp != NULL && fclose(fp) != 0) {
1140     t_ret = __os_get_errno();
1141     if (ret == 0)
1142     ret = t_ret;
1143     __db_err(dbenv, "%s: %s", t->re_source, db_strerror(t_ret));
1144     }
1145    
1146     /* Discard the cursor. */
1147     if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
1148     ret = t_ret;
1149    
1150     /* Discard memory allocated to hold the data items. */
1151     if (data.data != NULL)
1152     __os_ufree(dbenv, data.data);
1153     if (pad != NULL)
1154     __os_free(dbenv, pad);
1155    
1156     if (ret == 0)
1157     t->re_modified = 0;
1158    
1159     return (ret);
1160     }
1161    
1162     /*
1163     * __ram_sread --
1164     * Read records from a source file.
1165     */
1166     static int
1167     __ram_sread(dbc, top)
1168     DBC *dbc;
1169     db_recno_t top;
1170     {
1171     BTREE *t;
1172     DB *dbp;
1173     DBT data, *rdata;
1174     db_recno_t recno;
1175     size_t len;
1176     int ch, ret, was_modified;
1177    
1178     t = dbc->dbp->bt_internal;
1179     dbp = dbc->dbp;
1180     was_modified = t->re_modified;
1181    
1182     if ((ret = __bam_nrecs(dbc, &recno)) != 0)
1183     return (ret);
1184    
1185     /*
1186     * Use the record key return memory, it's only a short-term use.
1187     * The record data return memory is used by __bam_iitem, which
1188     * we'll indirectly call, so use the key so as not to collide.
1189     */
1190     len = F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_len : 256;
1191     rdata = &dbc->my_rkey;
1192     if (rdata->ulen < len) {
1193     if ((ret = __os_realloc(
1194     dbp->dbenv, len, &rdata->data)) != 0) {
1195     rdata->ulen = 0;
1196     rdata->data = NULL;
1197     return (ret);
1198     }
1199     rdata->ulen = (u_int32_t)len;
1200     }
1201    
1202     memset(&data, 0, sizeof(data));
1203     while (recno < top) {
1204     data.data = rdata->data;
1205     data.size = 0;
1206     if (F_ISSET(dbp, DB_AM_FIXEDLEN))
1207     for (len = t->re_len; len > 0; --len) {
1208     if ((ch = getc(t->re_fp)) == EOF) {
1209     if (data.size == 0)
1210     goto eof;
1211     break;
1212     }
1213     ((u_int8_t *)data.data)[data.size++] = ch;
1214     }
1215     else
1216     for (;;) {
1217     if ((ch = getc(t->re_fp)) == EOF) {
1218     if (data.size == 0)
1219     goto eof;
1220     break;
1221     }
1222     if (ch == t->re_delim)
1223     break;
1224    
1225     ((u_int8_t *)data.data)[data.size++] = ch;
1226     if (data.size == rdata->ulen) {
1227     if ((ret = __os_realloc(dbp->dbenv,
1228     rdata->ulen *= 2,
1229     &rdata->data)) != 0) {
1230     rdata->ulen = 0;
1231     rdata->data = NULL;
1232     return (ret);
1233     } else
1234     data.data = rdata->data;
1235     }
1236     }
1237    
1238     /*
1239     * Another process may have read this record from the input
1240     * file and stored it into the database already, in which
1241     * case we don't need to repeat that operation. We detect
1242     * this by checking if the last record we've read is greater
1243     * or equal to the number of records in the database.
1244     */
1245     if (t->re_last >= recno) {
1246     ++recno;
1247     if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
1248     goto err;
1249     }
1250     ++t->re_last;
1251     }
1252    
1253     if (0) {
1254     eof: t->re_eof = 1;
1255     ret = DB_NOTFOUND;
1256     }
1257     err: if (!was_modified)
1258     t->re_modified = 0;
1259    
1260     return (ret);
1261     }
1262    
1263     /*
1264     * __ram_add --
1265     * Add records into the tree.
1266     */
1267     static int
1268     __ram_add(dbc, recnop, data, flags, bi_flags)
1269     DBC *dbc;
1270     db_recno_t *recnop;
1271     DBT *data;
1272     u_int32_t flags, bi_flags;
1273     {
1274     BTREE_CURSOR *cp;
1275     int exact, ret, stack, t_ret;
1276    
1277     cp = (BTREE_CURSOR *)dbc->internal;
1278    
1279     retry: /* Find the slot for insertion. */
1280     if ((ret = __bam_rsearch(dbc, recnop,
1281     S_INSERT | (flags == DB_APPEND ? S_APPEND : 0), 1, &exact)) != 0)
1282     return (ret);
1283     stack = 1;
1284    
1285     /* Copy the page into the cursor. */
1286     STACK_TO_CURSOR(cp, ret);
1287     if (ret != 0)
1288     goto err;
1289    
1290     /*
1291     * The application may modify the data based on the selected record
1292     * number.
1293     */
1294     if (flags == DB_APPEND && dbc->dbp->db_append_recno != NULL &&
1295     (ret = dbc->dbp->db_append_recno(dbc->dbp, data, *recnop)) != 0)
1296     goto err;
1297    
1298     /*
1299     * Select the arguments for __bam_iitem() and do the insert. If the
1300     * key is an exact match, or we're replacing the data item with a
1301     * new data item, replace the current item. If the key isn't an exact
1302     * match, we're inserting a new key/data pair, before the search
1303     * location.
1304     */
1305     switch (ret = __bam_iitem(dbc,
1306     NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) {
1307     case 0:
1308     /*
1309     * Don't adjust anything.
1310     *
1311     * If we inserted a record, no cursors need adjusting because
1312     * the only new record it's possible to insert is at the very
1313     * end of the tree. The necessary adjustments to the internal
1314     * page counts were made by __bam_iitem().
1315     *
1316     * If we overwrote a record, no cursors need adjusting because
1317     * future DBcursor->get calls will simply return the underlying
1318     * record (there's no adjustment made for the DB_CURRENT flag
1319     * when a cursor get operation immediately follows a cursor
1320     * delete operation, and the normal adjustment for the DB_NEXT
1321     * flag is still correct).
1322     */
1323     break;
1324     case DB_NEEDSPLIT:
1325     /* Discard the stack of pages and split the page. */
1326     (void)__bam_stkrel(dbc, STK_CLRDBC);
1327     stack = 0;
1328    
1329     if ((ret = __bam_split(dbc, recnop, NULL)) != 0)
1330     goto err;
1331    
1332     goto retry;
1333     /* NOTREACHED */
1334     default:
1335     goto err;
1336     }
1337    
1338     err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
1339     ret = t_ret;
1340    
1341     return (ret);
1342     }

webmaster AT resiprocate DOT org
ViewVC Help
Powered by ViewVC 1.1.27