/[resiprocate]/main/contrib/db/btree/bt_recno.c
ViewVC logotype

Contents of /main/contrib/db/btree/bt_recno.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5295 - (show annotations) (download)
Mon Aug 22 00:30:05 2005 UTC (14 years, 2 months ago) by jason
File MIME type: text/plain
File size: 35139 byte(s)
merged 5270:HEAD from b-directory-reorg
1 /*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1997-2004
5 * Sleepycat Software. All rights reserved.
6 *
7 * $Id: bt_recno.c,v 11.117 2004/03/28 17:01:01 bostic Exp $
8 */
9
10 #include "db_config.h"
11
12 #ifndef NO_SYSTEM_INCLUDES
13 #include <sys/types.h>
14
15 #include <string.h>
16 #endif
17
18 #include "db_int.h"
19 #include "dbinc/db_page.h"
20 #include "dbinc/btree.h"
21 #include "dbinc/db_shash.h"
22 #include "dbinc/lock.h"
23
24 static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
25 static int __ram_source __P((DB *));
26 static int __ram_sread __P((DBC *, db_recno_t));
27 static int __ram_update __P((DBC *, db_recno_t, int));
28
29 /*
30 * In recno, there are two meanings to the on-page "deleted" flag. If we're
31 * re-numbering records, it means the record was implicitly created. We skip
32 * over implicitly created records if doing a cursor "next" or "prev", and
33 * return DB_KEYEMPTY if they're explicitly requested.. If not re-numbering
34 * records, it means that the record was implicitly created, or was deleted.
35 * We skip over implicitly created or deleted records if doing a cursor "next"
36 * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
37 *
38 * If we're re-numbering records, then we have to detect in the cursor that
39 * a record was deleted, and adjust the cursor as necessary on the next get.
40 * If we're not re-numbering records, then we can detect that a record has
41 * been deleted by looking at the actual on-page record, so we completely
42 * ignore the cursor's delete flag. This is different from the B+tree code.
43 * It also maintains whether the cursor references a deleted record in the
44 * cursor, and it doesn't always check the on-page value.
45 */
46 #define CD_SET(cp) { \
47 if (F_ISSET(cp, C_RENUMBER)) \
48 F_SET(cp, C_DELETED); \
49 }
50 #define CD_CLR(cp) { \
51 if (F_ISSET(cp, C_RENUMBER)) { \
52 F_CLR(cp, C_DELETED); \
53 cp->order = INVALID_ORDER; \
54 } \
55 }
56 #define CD_ISSET(cp) \
57 (F_ISSET(cp, C_RENUMBER) && F_ISSET(cp, C_DELETED) ? 1 : 0)
58
59 /*
60 * Macros for comparing the ordering of two cursors.
61 * cp1 comes before cp2 iff one of the following holds:
62 * cp1's recno is less than cp2's recno
63 * recnos are equal, both deleted, and cp1's order is less than cp2's
64 * recnos are equal, cp1 deleted, and cp2 not deleted
65 */
66 #define C_LESSTHAN(cp1, cp2) \
67 (((cp1)->recno < (cp2)->recno) || \
68 (((cp1)->recno == (cp2)->recno) && \
69 ((CD_ISSET((cp1)) && CD_ISSET((cp2)) && (cp1)->order < (cp2)->order) || \
70 (CD_ISSET((cp1)) && !CD_ISSET((cp2))))))
71
72 /*
73 * cp1 is equal to cp2 iff their recnos and delete flags are identical,
74 * and if the delete flag is set their orders are also identical.
75 */
76 #define C_EQUAL(cp1, cp2) \
77 ((cp1)->recno == (cp2)->recno && CD_ISSET((cp1)) == CD_ISSET((cp2)) && \
78 (!CD_ISSET((cp1)) || (cp1)->order == (cp2)->order))
79
80 /*
81 * Do we need to log the current cursor adjustment?
82 */
83 #define CURADJ_LOG(dbc) \
84 (DBC_LOGGING((dbc)) && (dbc)->txn != NULL && (dbc)->txn->parent != NULL)
85
86 /*
87 * After a search, copy the found page into the cursor, discarding any
88 * currently held lock.
89 */
90 #define STACK_TO_CURSOR(cp, ret) { \
91 int __t_ret; \
92 (cp)->page = (cp)->csp->page; \
93 (cp)->pgno = (cp)->csp->page->pgno; \
94 (cp)->indx = (cp)->csp->indx; \
95 if ((__t_ret = __TLPUT(dbc, (cp)->lock)) != 0 && (ret) == 0) \
96 ret = __t_ret; \
97 (cp)->lock = (cp)->csp->lock; \
98 (cp)->lock_mode = (cp)->csp->lock_mode; \
99 }
100
101 /*
102 * __ram_open --
103 * Recno open function.
104 *
105 * PUBLIC: int __ram_open __P((DB *,
106 * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t));
107 */
108 int
109 __ram_open(dbp, txn, name, base_pgno, flags)
110 DB *dbp;
111 DB_TXN *txn;
112 const char *name;
113 db_pgno_t base_pgno;
114 u_int32_t flags;
115 {
116 BTREE *t;
117 DBC *dbc;
118 int ret, t_ret;
119
120 COMPQUIET(name, NULL);
121 t = dbp->bt_internal;
122
123 /* Start up the tree. */
124 if ((ret = __bam_read_root(dbp, txn, base_pgno, flags)) != 0)
125 return (ret);
126
127 /*
128 * If the user specified a source tree, open it and map it in.
129 *
130 * !!!
131 * We don't complain if the user specified transactions or threads.
132 * It's possible to make it work, but you'd better know what you're
133 * doing!
134 */
135 if (t->re_source != NULL && (ret = __ram_source(dbp)) != 0)
136 return (ret);
137
138 /* If we're snapshotting an underlying source file, do it now. */
139 if (F_ISSET(dbp, DB_AM_SNAPSHOT)) {
140 /* Allocate a cursor. */
141 if ((ret = __db_cursor(dbp, NULL, &dbc, 0)) != 0)
142 return (ret);
143
144 /* Do the snapshot. */
145 if ((ret = __ram_update(dbc,
146 DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND)
147 ret = 0;
148
149 /* Discard the cursor. */
150 if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
151 ret = t_ret;
152 }
153
154 return (ret);
155 }
156
157 /*
158 * __ram_append --
159 * Recno append function.
160 *
161 * PUBLIC: int __ram_append __P((DBC *, DBT *, DBT *));
162 */
163 int
164 __ram_append(dbc, key, data)
165 DBC *dbc;
166 DBT *key, *data;
167 {
168 BTREE_CURSOR *cp;
169 int ret;
170
171 cp = (BTREE_CURSOR *)dbc->internal;
172
173 /*
174 * Make sure we've read in all of the backing source file. If
175 * we found the record or it simply didn't exist, add the
176 * user's record.
177 */
178 ret = __ram_update(dbc, DB_MAX_RECORDS, 0);
179 if (ret == 0 || ret == DB_NOTFOUND)
180 ret = __ram_add(dbc, &cp->recno, data, DB_APPEND, 0);
181
182 /* Return the record number. */
183 if (ret == 0)
184 ret = __db_retcopy(dbc->dbp->dbenv, key, &cp->recno,
185 sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
186
187 return (ret);
188 }
189
190 /*
191 * __ram_c_del --
192 * Recno cursor->c_del function.
193 *
194 * PUBLIC: int __ram_c_del __P((DBC *));
195 */
196 int
197 __ram_c_del(dbc)
198 DBC *dbc;
199 {
200 BKEYDATA bk;
201 BTREE *t;
202 BTREE_CURSOR *cp;
203 DB *dbp;
204 DB_LSN lsn;
205 DBT hdr, data;
206 EPG *epg;
207 int exact, ret, stack, t_ret;
208
209 dbp = dbc->dbp;
210 cp = (BTREE_CURSOR *)dbc->internal;
211 t = dbp->bt_internal;
212 stack = 0;
213
214 /*
215 * The semantics of cursors during delete are as follows: in
216 * non-renumbering recnos, records are replaced with a marker
217 * containing a delete flag. If the record referenced by this cursor
218 * has already been deleted, we will detect that as part of the delete
219 * operation, and fail.
220 *
221 * In renumbering recnos, cursors which represent deleted items
222 * are flagged with the C_DELETED flag, and it is an error to
223 * call c_del a second time without an intervening cursor motion.
224 */
225 if (CD_ISSET(cp))
226 return (DB_KEYEMPTY);
227
228 /* Search the tree for the key; delete only deletes exact matches. */
229 if ((ret = __bam_rsearch(dbc, &cp->recno, S_DELETE, 1, &exact)) != 0)
230 goto err;
231 if (!exact) {
232 ret = DB_NOTFOUND;
233 goto err;
234 }
235 stack = 1;
236
237 /* Copy the page into the cursor. */
238 STACK_TO_CURSOR(cp, ret);
239 if (ret != 0)
240 goto err;
241
242 /*
243 * If re-numbering records, the on-page deleted flag can only mean
244 * that this record was implicitly created. Applications aren't
245 * permitted to delete records they never created, return an error.
246 *
247 * If not re-numbering records, the on-page deleted flag means that
248 * this record was implicitly created, or, was deleted at some time.
249 * The former is an error because applications aren't permitted to
250 * delete records they never created, the latter is an error because
251 * if the record was "deleted", we could never have found it.
252 */
253 if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) {
254 ret = DB_KEYEMPTY;
255 goto err;
256 }
257
258 if (F_ISSET(cp, C_RENUMBER)) {
259 /* Delete the item, adjust the counts, adjust the cursors. */
260 if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
261 goto err;
262 if ((ret = __bam_adjust(dbc, -1)) != 0)
263 goto err;
264 if (__ram_ca(dbc, CA_DELETE) > 0 &&
265 CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp, dbc->txn,
266 &lsn, 0, CA_DELETE, cp->root, cp->recno, cp->order)) != 0)
267 goto err;
268
269 /*
270 * If the page is empty, delete it.
271 *
272 * We never delete a root page. First, root pages of primary
273 * databases never go away, recno or otherwise. However, if
274 * it's the root page of an off-page duplicates database, then
275 * it can be deleted. We don't delete it here because we have
276 * no way of telling the primary database page holder (e.g.,
277 * the hash access method) that its page element should cleaned
278 * up because the underlying tree is gone. So, we keep the page
279 * around until the last cursor referencing the empty tree is
280 * are closed, and then clean it up.
281 */
282 if (NUM_ENT(cp->page) == 0 && PGNO(cp->page) != cp->root) {
283 /*
284 * We already have a locked stack of pages. However,
285 * there are likely entries in the stack that aren't
286 * going to be emptied by removing the single reference
287 * to the emptied page (or one of its parents).
288 */
289 for (epg = cp->csp; epg >= cp->sp; --epg)
290 if (NUM_ENT(epg->page) > 1)
291 break;
292
293 /*
294 * We want to delete a single item out of the last page
295 * that we're not deleting.
296 */
297 ret = __bam_dpages(dbc, epg);
298
299 /*
300 * Regardless of the return from __bam_dpages, it will
301 * discard our stack and pinned page.
302 */
303 stack = 0;
304 cp->page = NULL;
305 }
306 } else {
307 /* Use a delete/put pair to replace the record with a marker. */
308 if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
309 goto err;
310
311 B_TSET(bk.type, B_KEYDATA, 1);
312 bk.len = 0;
313 memset(&hdr, 0, sizeof(hdr));
314 hdr.data = &bk;
315 hdr.size = SSZA(BKEYDATA, data);
316 memset(&data, 0, sizeof(data));
317 data.data = (void *)"";
318 data.size = 0;
319 if ((ret = __db_pitem(dbc,
320 cp->page, cp->indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
321 goto err;
322 }
323
324 t->re_modified = 1;
325
326 err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
327 ret = t_ret;
328
329 return (ret);
330 }
331
332 /*
333 * __ram_c_get --
334 * Recno cursor->c_get function.
335 *
336 * PUBLIC: int __ram_c_get
337 * PUBLIC: __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
338 */
339 int
340 __ram_c_get(dbc, key, data, flags, pgnop)
341 DBC *dbc;
342 DBT *key, *data;
343 u_int32_t flags;
344 db_pgno_t *pgnop;
345 {
346 BTREE_CURSOR *cp;
347 DB *dbp;
348 int cmp, exact, ret;
349
350 COMPQUIET(pgnop, NULL);
351
352 dbp = dbc->dbp;
353 cp = (BTREE_CURSOR *)dbc->internal;
354
355 LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
356 retry: switch (flags) {
357 case DB_CURRENT:
358 /*
359 * If we're using mutable records and the deleted flag is
360 * set, the cursor is pointing at a nonexistent record;
361 * return an error.
362 */
363 if (CD_ISSET(cp))
364 return (DB_KEYEMPTY);
365 break;
366 case DB_NEXT_DUP:
367 /*
368 * If we're not in an off-page dup set, we know there's no
369 * next duplicate since recnos don't have them. If we
370 * are in an off-page dup set, the next item assuredly is
371 * a dup, so we set flags to DB_NEXT and keep going.
372 */
373 if (!F_ISSET(dbc, DBC_OPD))
374 return (DB_NOTFOUND);
375 /* FALLTHROUGH */
376 case DB_NEXT_NODUP:
377 /*
378 * Recno databases don't have duplicates, set flags to DB_NEXT
379 * and keep going.
380 */
381 /* FALLTHROUGH */
382 case DB_NEXT:
383 flags = DB_NEXT;
384 /*
385 * If record numbers are mutable: if we just deleted a record,
386 * we have to avoid incrementing the record number so that we
387 * return the right record by virtue of renumbering the tree.
388 */
389 if (CD_ISSET(cp)) {
390 /*
391 * Clear the flag, we've moved off the deleted record.
392 */
393 CD_CLR(cp);
394 break;
395 }
396
397 if (cp->recno != RECNO_OOB) {
398 ++cp->recno;
399 break;
400 }
401 /* FALLTHROUGH */
402 case DB_FIRST:
403 flags = DB_NEXT;
404 cp->recno = 1;
405 break;
406 case DB_PREV_NODUP:
407 /*
408 * Recno databases don't have duplicates, set flags to DB_PREV
409 * and keep going.
410 */
411 /* FALLTHROUGH */
412 case DB_PREV:
413 flags = DB_PREV;
414 if (cp->recno != RECNO_OOB) {
415 if (cp->recno == 1) {
416 ret = DB_NOTFOUND;
417 goto err;
418 }
419 --cp->recno;
420 break;
421 }
422 /* FALLTHROUGH */
423 case DB_LAST:
424 flags = DB_PREV;
425 if (((ret = __ram_update(dbc,
426 DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND)
427 goto err;
428 if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
429 goto err;
430 if (cp->recno == 0) {
431 ret = DB_NOTFOUND;
432 goto err;
433 }
434 break;
435 case DB_GET_BOTHC:
436 /*
437 * If we're doing a join and these are offpage dups,
438 * we want to keep searching forward from after the
439 * current cursor position. Increment the recno by 1,
440 * then proceed as for a DB_SET.
441 *
442 * Otherwise, we know there are no additional matching
443 * data, as recnos don't have dups. return DB_NOTFOUND.
444 */
445 if (F_ISSET(dbc, DBC_OPD)) {
446 cp->recno++;
447 break;
448 }
449 ret = DB_NOTFOUND;
450 goto err;
451 /* NOTREACHED */
452 case DB_GET_BOTH:
453 case DB_GET_BOTH_RANGE:
454 /*
455 * If we're searching a set of off-page dups, we start
456 * a new linear search from the first record. Otherwise,
457 * we compare the single data item associated with the
458 * requested record for a match.
459 */
460 if (F_ISSET(dbc, DBC_OPD)) {
461 cp->recno = 1;
462 break;
463 }
464 /* FALLTHROUGH */
465 case DB_SET:
466 case DB_SET_RANGE:
467 if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0)
468 goto err;
469 break;
470 default:
471 ret = __db_unknown_flag(dbp->dbenv, "__ram_c_get", flags);
472 goto err;
473 }
474
475 /*
476 * For DB_PREV, DB_LAST, DB_SET and DB_SET_RANGE, we have already
477 * called __ram_update() to make sure sufficient records have been
478 * read from the backing source file. Do it now for DB_CURRENT (if
479 * the current record was deleted we may need more records from the
480 * backing file for a DB_CURRENT operation), DB_FIRST and DB_NEXT.
481 * (We don't have to test for flags == DB_FIRST, because the switch
482 * statement above re-set flags to DB_NEXT in that case.)
483 */
484 if ((flags == DB_NEXT || flags == DB_CURRENT) && ((ret =
485 __ram_update(dbc, cp->recno, 0)) != 0) && ret != DB_NOTFOUND)
486 goto err;
487
488 for (;; ++cp->recno) {
489 /* Search the tree for the record. */
490 if ((ret = __bam_rsearch(dbc, &cp->recno,
491 F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND,
492 1, &exact)) != 0)
493 goto err;
494 if (!exact) {
495 ret = DB_NOTFOUND;
496 goto err;
497 }
498
499 /* Copy the page into the cursor. */
500 STACK_TO_CURSOR(cp, ret);
501 if (ret != 0)
502 goto err;
503
504 /*
505 * If re-numbering records, the on-page deleted flag means this
506 * record was implicitly created. If not re-numbering records,
507 * the on-page deleted flag means this record was implicitly
508 * created, or, it was deleted at some time. Regardless, we
509 * skip such records if doing cursor next/prev operations or
510 * walking through off-page duplicates, and fail if they were
511 * requested explicitly by the application.
512 */
513 if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type))
514 switch (flags) {
515 case DB_NEXT:
516 case DB_PREV:
517 (void)__bam_stkrel(dbc, STK_CLRDBC);
518 goto retry;
519 case DB_GET_BOTH:
520 case DB_GET_BOTH_RANGE:
521 /*
522 * If we're an OPD tree, we don't care about
523 * matching a record number on a DB_GET_BOTH
524 * -- everything belongs to the same tree. A
525 * normal recno should give up and return
526 * DB_NOTFOUND if the matching recno is deleted.
527 */
528 if (F_ISSET(dbc, DBC_OPD)) {
529 (void)__bam_stkrel(dbc, STK_CLRDBC);
530 continue;
531 }
532 ret = DB_NOTFOUND;
533 goto err;
534 default:
535 ret = DB_KEYEMPTY;
536 goto err;
537 }
538
539 if (flags == DB_GET_BOTH ||
540 flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
541 if ((ret = __bam_cmp(dbp, data,
542 cp->page, cp->indx, __bam_defcmp, &cmp)) != 0)
543 return (ret);
544 if (cmp == 0)
545 break;
546 if (!F_ISSET(dbc, DBC_OPD)) {
547 ret = DB_NOTFOUND;
548 goto err;
549 }
550 (void)__bam_stkrel(dbc, STK_CLRDBC);
551 } else
552 break;
553 }
554
555 /* Return the key if the user didn't give us one. */
556 if (!F_ISSET(dbc, DBC_OPD)) {
557 if (flags != DB_GET_BOTH && flags != DB_GET_BOTH_RANGE &&
558 flags != DB_SET && flags != DB_SET_RANGE)
559 ret = __db_retcopy(dbp->dbenv,
560 key, &cp->recno, sizeof(cp->recno),
561 &dbc->rkey->data, &dbc->rkey->ulen);
562 F_SET(key, DB_DBT_ISSET);
563 }
564
565 /* The cursor was reset, no further delete adjustment is necessary. */
566 err: CD_CLR(cp);
567
568 return (ret);
569 }
570
571 /*
572 * __ram_c_put --
573 * Recno cursor->c_put function.
574 *
575 * PUBLIC: int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
576 */
577 int
578 __ram_c_put(dbc, key, data, flags, pgnop)
579 DBC *dbc;
580 DBT *key, *data;
581 u_int32_t flags;
582 db_pgno_t *pgnop;
583 {
584 BTREE_CURSOR *cp;
585 DB *dbp;
586 DB_LSN lsn;
587 int exact, nc, ret, t_ret;
588 u_int32_t iiflags;
589 void *arg;
590
591 COMPQUIET(pgnop, NULL);
592
593 dbp = dbc->dbp;
594 cp = (BTREE_CURSOR *)dbc->internal;
595
596 /*
597 * DB_KEYFIRST and DB_KEYLAST mean different things if they're
598 * used in an off-page duplicate tree. If we're an off-page
599 * duplicate tree, they really mean "put at the beginning of the
600 * tree" and "put at the end of the tree" respectively, so translate
601 * them to something else.
602 */
603 if (F_ISSET(dbc, DBC_OPD))
604 switch (flags) {
605 case DB_KEYFIRST:
606 cp->recno = 1;
607 flags = DB_BEFORE;
608 break;
609 case DB_KEYLAST:
610 if ((ret = __ram_add(dbc,
611 &cp->recno, data, DB_APPEND, 0)) != 0)
612 return (ret);
613 if (CURADJ_LOG(dbc) &&
614 (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0,
615 CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0)
616 return (ret);
617 return (0);
618 default:
619 break;
620 }
621
622 /*
623 * Handle normal DB_KEYFIRST/DB_KEYLAST; for a recno, which has
624 * no duplicates, these are identical and mean "put the given
625 * datum at the given recno".
626 *
627 * Note that the code here used to be in __ram_put; now, we
628 * go through the access-method-common __db_put function, which
629 * handles DB_NOOVERWRITE, so we and __ram_add don't have to.
630 */
631 if (flags == DB_KEYFIRST || flags == DB_KEYLAST) {
632 ret = __ram_getno(dbc, key, &cp->recno, 1);
633 if (ret == 0 || ret == DB_NOTFOUND)
634 ret = __ram_add(dbc, &cp->recno, data, 0, 0);
635 return (ret);
636 }
637
638 /*
639 * If we're putting with a cursor that's marked C_DELETED, we need to
640 * take special care; the cursor doesn't "really" reference the item
641 * corresponding to its current recno, but instead is "between" that
642 * record and the current one. Translate the actual insert into
643 * DB_BEFORE, and let the __ram_ca work out the gory details of what
644 * should wind up pointing where.
645 */
646 if (CD_ISSET(cp))
647 iiflags = DB_BEFORE;
648 else
649 iiflags = flags;
650
651 split: if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0)
652 goto err;
653 /*
654 * An inexact match is okay; it just means we're one record past the
655 * end, which is reasonable if we're marked deleted.
656 */
657 DB_ASSERT(exact || CD_ISSET(cp));
658
659 /* Copy the page into the cursor. */
660 STACK_TO_CURSOR(cp, ret);
661 if (ret != 0)
662 goto err;
663
664 ret = __bam_iitem(dbc, key, data, iiflags, 0);
665 t_ret = __bam_stkrel(dbc, STK_CLRDBC);
666
667 if (t_ret != 0 && (ret == 0 || ret == DB_NEEDSPLIT))
668 ret = t_ret;
669 else if (ret == DB_NEEDSPLIT) {
670 arg = &cp->recno;
671 if ((ret = __bam_split(dbc, arg, NULL)) != 0)
672 goto err;
673 goto split;
674 }
675 if (ret != 0)
676 goto err;
677
678 switch (flags) { /* Adjust the cursors. */
679 case DB_AFTER:
680 nc = __ram_ca(dbc, CA_IAFTER);
681
682 /*
683 * We only need to adjust this cursor forward if we truly added
684 * the item after the current recno, rather than remapping it
685 * to DB_BEFORE.
686 */
687 if (iiflags == DB_AFTER)
688 ++cp->recno;
689
690 /* Only log if __ram_ca found any relevant cursors. */
691 if (nc > 0 && CURADJ_LOG(dbc) &&
692 (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IAFTER,
693 cp->root, cp->recno, cp->order)) != 0)
694 goto err;
695 break;
696 case DB_BEFORE:
697 nc = __ram_ca(dbc, CA_IBEFORE);
698 --cp->recno;
699
700 /* Only log if __ram_ca found any relevant cursors. */
701 if (nc > 0 && CURADJ_LOG(dbc) &&
702 (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IBEFORE,
703 cp->root, cp->recno, cp->order)) != 0)
704 goto err;
705 break;
706 case DB_CURRENT:
707 /*
708 * We only need to do an adjustment if we actually
709 * added an item, which we only would have done if the
710 * cursor was marked deleted.
711 *
712 * Only log if __ram_ca found any relevant cursors.
713 */
714 if (CD_ISSET(cp) && __ram_ca(dbc, CA_ICURRENT) > 0 &&
715 CURADJ_LOG(dbc) &&
716 (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0,
717 CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0)
718 goto err;
719 break;
720 default:
721 break;
722 }
723
724 /* Return the key if we've created a new record. */
725 if (!F_ISSET(dbc, DBC_OPD) && (flags == DB_AFTER || flags == DB_BEFORE))
726 ret = __db_retcopy(dbp->dbenv, key, &cp->recno,
727 sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
728
729 /* The cursor was reset, no further delete adjustment is necessary. */
730 err: CD_CLR(cp);
731
732 return (ret);
733 }
734
735 /*
736 * __ram_ca --
737 * Adjust cursors. Returns the number of relevant cursors.
738 *
739 * PUBLIC: int __ram_ca __P((DBC *, ca_recno_arg));
740 */
741 int
742 __ram_ca(dbc_arg, op)
743 DBC *dbc_arg;
744 ca_recno_arg op;
745 {
746 BTREE_CURSOR *cp, *cp_arg;
747 DB *dbp, *ldbp;
748 DB_ENV *dbenv;
749 DBC *dbc;
750 db_recno_t recno;
751 int adjusted, found;
752 u_int32_t order;
753
754 dbp = dbc_arg->dbp;
755 dbenv = dbp->dbenv;
756 cp_arg = (BTREE_CURSOR *)dbc_arg->internal;
757 recno = cp_arg->recno;
758
759 found = 0;
760
761 /*
762 * It only makes sense to adjust cursors if we're a renumbering
763 * recno; we should only be called if this is one.
764 */
765 DB_ASSERT(F_ISSET(cp_arg, C_RENUMBER));
766
767 MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
768 /*
769 * Adjust the cursors. See the comment in __bam_ca_delete().
770 */
771 /*
772 * If we're doing a delete, we need to find the highest
773 * order of any cursor currently pointing at this item,
774 * so we can assign a higher order to the newly deleted
775 * cursor. Unfortunately, this requires a second pass through
776 * the cursor list.
777 */
778 if (op == CA_DELETE) {
779 order = 1;
780 for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
781 ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
782 ldbp = LIST_NEXT(ldbp, dblistlinks)) {
783 MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
784 for (dbc = TAILQ_FIRST(&ldbp->active_queue);
785 dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
786 cp = (BTREE_CURSOR *)dbc->internal;
787 if (cp_arg->root == cp->root &&
788 recno == cp->recno && CD_ISSET(cp) &&
789 order <= cp->order)
790 order = cp->order + 1;
791 }
792 MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
793 }
794 } else
795 order = INVALID_ORDER;
796
797 /* Now go through and do the actual adjustments. */
798 for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
799 ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
800 ldbp = LIST_NEXT(ldbp, dblistlinks)) {
801 MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
802 for (dbc = TAILQ_FIRST(&ldbp->active_queue);
803 dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
804 cp = (BTREE_CURSOR *)dbc->internal;
805 if (cp_arg->root != cp->root)
806 continue;
807 ++found;
808 adjusted = 0;
809 switch (op) {
810 case CA_DELETE:
811 if (recno < cp->recno) {
812 --cp->recno;
813 /*
814 * If the adjustment made them equal,
815 * we have to merge the orders.
816 */
817 if (recno == cp->recno && CD_ISSET(cp))
818 cp->order += order;
819 } else if (recno == cp->recno &&
820 !CD_ISSET(cp)) {
821 CD_SET(cp);
822 cp->order = order;
823 }
824 break;
825 case CA_IBEFORE:
826 /*
827 * IBEFORE is just like IAFTER, except that we
828 * adjust cursors on the current record too.
829 */
830 if (C_EQUAL(cp_arg, cp)) {
831 ++cp->recno;
832 adjusted = 1;
833 }
834 goto iafter;
835 case CA_ICURRENT:
836
837 /*
838 * If the original cursor wasn't deleted, we
839 * just did a replacement and so there's no
840 * need to adjust anything--we shouldn't have
841 * gotten this far. Otherwise, we behave
842 * much like an IAFTER, except that all
843 * cursors pointing to the current item get
844 * marked undeleted and point to the new
845 * item.
846 */
847 DB_ASSERT(CD_ISSET(cp_arg));
848 if (C_EQUAL(cp_arg, cp)) {
849 CD_CLR(cp);
850 break;
851 }
852 /* FALLTHROUGH */
853 case CA_IAFTER:
854 iafter: if (!adjusted && C_LESSTHAN(cp_arg, cp)) {
855 ++cp->recno;
856 adjusted = 1;
857 }
858 if (recno == cp->recno && adjusted)
859 /*
860 * If we've moved this cursor's recno,
861 * split its order number--i.e.,
862 * decrement it by enough so that
863 * the lowest cursor moved has order 1.
864 * cp_arg->order is the split point,
865 * so decrement by one less than that.
866 */
867 cp->order -= (cp_arg->order - 1);
868 break;
869 }
870 }
871 MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);
872 }
873 MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
874
875 return (found);
876 }
877
878 /*
879 * __ram_getno --
880 * Check the user's record number, and make sure we've seen it.
881 *
882 * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
883 */
884 int
885 __ram_getno(dbc, key, rep, can_create)
886 DBC *dbc;
887 const DBT *key;
888 db_recno_t *rep;
889 int can_create;
890 {
891 DB *dbp;
892 db_recno_t recno;
893
894 dbp = dbc->dbp;
895
896 /* Check the user's record number. */
897 if ((recno = *(db_recno_t *)key->data) == 0) {
898 __db_err(dbp->dbenv, "illegal record number of 0");
899 return (EINVAL);
900 }
901 if (rep != NULL)
902 *rep = recno;
903
904 /*
905 * Btree can neither create records nor read them in. Recno can
906 * do both, see if we can find the record.
907 */
908 return (dbc->dbtype == DB_RECNO ?
909 __ram_update(dbc, recno, can_create) : 0);
910 }
911
912 /*
913 * __ram_update --
914 * Ensure the tree has records up to and including the specified one.
915 */
916 static int
917 __ram_update(dbc, recno, can_create)
918 DBC *dbc;
919 db_recno_t recno;
920 int can_create;
921 {
922 BTREE *t;
923 DB *dbp;
924 DBT *rdata;
925 db_recno_t nrecs;
926 int ret;
927
928 dbp = dbc->dbp;
929 t = dbp->bt_internal;
930
931 /*
932 * If we can't create records and we've read the entire backing input
933 * file, we're done.
934 */
935 if (!can_create && t->re_eof)
936 return (0);
937
938 /*
939 * If we haven't seen this record yet, try to get it from the original
940 * file.
941 */
942 if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
943 return (ret);
944 if (!t->re_eof && recno > nrecs) {
945 if ((ret = __ram_sread(dbc, recno)) != 0 && ret != DB_NOTFOUND)
946 return (ret);
947 if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
948 return (ret);
949 }
950
951 /*
952 * If we can create records, create empty ones up to the requested
953 * record.
954 */
955 if (!can_create || recno <= nrecs + 1)
956 return (0);
957
958 rdata = &dbc->my_rdata;
959 rdata->flags = 0;
960 rdata->size = 0;
961
962 while (recno > ++nrecs)
963 if ((ret = __ram_add(dbc,
964 &nrecs, rdata, 0, BI_DELETED)) != 0)
965 return (ret);
966 return (0);
967 }
968
969 /*
970 * __ram_source --
971 * Load information about the backing file.
972 */
973 static int
974 __ram_source(dbp)
975 DB *dbp;
976 {
977 BTREE *t;
978 char *source;
979 int ret;
980
981 t = dbp->bt_internal;
982
983 /* Find the real name, and swap out the one we had before. */
984 if ((ret = __db_appname(dbp->dbenv,
985 DB_APP_DATA, t->re_source, 0, NULL, &source)) != 0)
986 return (ret);
987 __os_free(dbp->dbenv, t->re_source);
988 t->re_source = source;
989
990 /*
991 * !!!
992 * It's possible that the backing source file is read-only. We don't
993 * much care other than we'll complain if there are any modifications
994 * when it comes time to write the database back to the source.
995 */
996 if ((t->re_fp = fopen(t->re_source, "r")) == NULL) {
997 ret = __os_get_errno();
998 __db_err(dbp->dbenv, "%s: %s", t->re_source, db_strerror(ret));
999 return (ret);
1000 }
1001
1002 t->re_eof = 0;
1003 return (0);
1004 }
1005
1006 /*
1007 * __ram_writeback --
1008 * Rewrite the backing file.
1009 *
1010 * PUBLIC: int __ram_writeback __P((DB *));
1011 */
1012 int
1013 __ram_writeback(dbp)
1014 DB *dbp;
1015 {
1016 BTREE *t;
1017 DB_ENV *dbenv;
1018 DBC *dbc;
1019 DBT key, data;
1020 FILE *fp;
1021 db_recno_t keyno;
1022 int ret, t_ret;
1023 u_int8_t delim, *pad;
1024
1025 t = dbp->bt_internal;
1026 dbenv = dbp->dbenv;
1027 fp = NULL;
1028 pad = NULL;
1029
1030 /* If the file wasn't modified, we're done. */
1031 if (!t->re_modified)
1032 return (0);
1033
1034 /* If there's no backing source file, we're done. */
1035 if (t->re_source == NULL) {
1036 t->re_modified = 0;
1037 return (0);
1038 }
1039
1040 /* Allocate a cursor. */
1041 if ((ret = __db_cursor(dbp, NULL, &dbc, 0)) != 0)
1042 return (ret);
1043
1044 /*
1045 * Read any remaining records into the tree.
1046 *
1047 * !!!
1048 * This is why we can't support transactions when applications specify
1049 * backing (re_source) files. At this point we have to read in the
1050 * rest of the records from the file so that we can write all of the
1051 * records back out again, which could modify a page for which we'd
1052 * have to log changes and which we don't have locked. This could be
1053 * partially fixed by taking a snapshot of the entire file during the
1054 * DB->open as DB->open is transaction protected. But, if a checkpoint
1055 * occurs then, the part of the log holding the copy of the file could
1056 * be discarded, and that would make it impossible to recover in the
1057 * face of disaster. This could all probably be fixed, but it would
1058 * require transaction protecting the backing source file.
1059 *
1060 * XXX
1061 * This could be made to work now that we have transactions protecting
1062 * file operations. Margo has specifically asked for the privilege of
1063 * doing this work.
1064 */
1065 if ((ret =
1066 __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND)
1067 return (ret);
1068
1069 /*
1070 * Close any existing file handle and re-open the file, truncating it.
1071 */
1072 if (t->re_fp != NULL) {
1073 if (fclose(t->re_fp) != 0) {
1074 ret = __os_get_errno();
1075 goto err;
1076 }
1077 t->re_fp = NULL;
1078 }
1079 if ((fp = fopen(t->re_source, "w")) == NULL) {
1080 ret = __os_get_errno();
1081 __db_err(dbenv, "%s: %s", t->re_source, db_strerror(ret));
1082 goto err;
1083 }
1084
1085 /*
1086 * We step through the records, writing each one out. Use the record
1087 * number and the dbp->get() function, instead of a cursor, so we find
1088 * and write out "deleted" or non-existent records. The DB handle may
1089 * be threaded, so allocate memory as we go.
1090 */
1091 memset(&key, 0, sizeof(key));
1092 key.size = sizeof(db_recno_t);
1093 key.data = &keyno;
1094 memset(&data, 0, sizeof(data));
1095 F_SET(&data, DB_DBT_REALLOC);
1096
1097 /*
1098 * We'll need the delimiter if we're doing variable-length records,
1099 * and the pad character if we're doing fixed-length records.
1100 */
1101 delim = t->re_delim;
1102 for (keyno = 1;; ++keyno) {
1103 switch (ret = __db_get(dbp, NULL, &key, &data, 0)) {
1104 case 0:
1105 if (data.size != 0 &&
1106 fwrite(data.data, 1, data.size, fp) != data.size)
1107 goto write_err;
1108 break;
1109 case DB_KEYEMPTY:
1110 if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
1111 if (pad == NULL) {
1112 if ((ret = __os_malloc(
1113 dbenv, t->re_len, &pad)) != 0)
1114 goto err;
1115 memset(pad, t->re_pad, t->re_len);
1116 }
1117 if (fwrite(pad, 1, t->re_len, fp) != t->re_len)
1118 goto write_err;
1119 }
1120 break;
1121 case DB_NOTFOUND:
1122 ret = 0;
1123 goto done;
1124 default:
1125 goto err;
1126 }
1127 if (!F_ISSET(dbp, DB_AM_FIXEDLEN) &&
1128 fwrite(&delim, 1, 1, fp) != 1) {
1129 write_err: ret = __os_get_errno();
1130 __db_err(dbenv,
1131 "%s: write failed to backing file: %s",
1132 t->re_source, strerror(ret));
1133 goto err;
1134 }
1135 }
1136
1137 err:
1138 done: /* Close the file descriptor. */
1139 if (fp != NULL && fclose(fp) != 0) {
1140 t_ret = __os_get_errno();
1141 if (ret == 0)
1142 ret = t_ret;
1143 __db_err(dbenv, "%s: %s", t->re_source, db_strerror(t_ret));
1144 }
1145
1146 /* Discard the cursor. */
1147 if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
1148 ret = t_ret;
1149
1150 /* Discard memory allocated to hold the data items. */
1151 if (data.data != NULL)
1152 __os_ufree(dbenv, data.data);
1153 if (pad != NULL)
1154 __os_free(dbenv, pad);
1155
1156 if (ret == 0)
1157 t->re_modified = 0;
1158
1159 return (ret);
1160 }
1161
1162 /*
1163 * __ram_sread --
1164 * Read records from a source file.
1165 */
1166 static int
1167 __ram_sread(dbc, top)
1168 DBC *dbc;
1169 db_recno_t top;
1170 {
1171 BTREE *t;
1172 DB *dbp;
1173 DBT data, *rdata;
1174 db_recno_t recno;
1175 size_t len;
1176 int ch, ret, was_modified;
1177
1178 t = dbc->dbp->bt_internal;
1179 dbp = dbc->dbp;
1180 was_modified = t->re_modified;
1181
1182 if ((ret = __bam_nrecs(dbc, &recno)) != 0)
1183 return (ret);
1184
1185 /*
1186 * Use the record key return memory, it's only a short-term use.
1187 * The record data return memory is used by __bam_iitem, which
1188 * we'll indirectly call, so use the key so as not to collide.
1189 */
1190 len = F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_len : 256;
1191 rdata = &dbc->my_rkey;
1192 if (rdata->ulen < len) {
1193 if ((ret = __os_realloc(
1194 dbp->dbenv, len, &rdata->data)) != 0) {
1195 rdata->ulen = 0;
1196 rdata->data = NULL;
1197 return (ret);
1198 }
1199 rdata->ulen = (u_int32_t)len;
1200 }
1201
1202 memset(&data, 0, sizeof(data));
1203 while (recno < top) {
1204 data.data = rdata->data;
1205 data.size = 0;
1206 if (F_ISSET(dbp, DB_AM_FIXEDLEN))
1207 for (len = t->re_len; len > 0; --len) {
1208 if ((ch = getc(t->re_fp)) == EOF) {
1209 if (data.size == 0)
1210 goto eof;
1211 break;
1212 }
1213 ((u_int8_t *)data.data)[data.size++] = ch;
1214 }
1215 else
1216 for (;;) {
1217 if ((ch = getc(t->re_fp)) == EOF) {
1218 if (data.size == 0)
1219 goto eof;
1220 break;
1221 }
1222 if (ch == t->re_delim)
1223 break;
1224
1225 ((u_int8_t *)data.data)[data.size++] = ch;
1226 if (data.size == rdata->ulen) {
1227 if ((ret = __os_realloc(dbp->dbenv,
1228 rdata->ulen *= 2,
1229 &rdata->data)) != 0) {
1230 rdata->ulen = 0;
1231 rdata->data = NULL;
1232 return (ret);
1233 } else
1234 data.data = rdata->data;
1235 }
1236 }
1237
1238 /*
1239 * Another process may have read this record from the input
1240 * file and stored it into the database already, in which
1241 * case we don't need to repeat that operation. We detect
1242 * this by checking if the last record we've read is greater
1243 * or equal to the number of records in the database.
1244 */
1245 if (t->re_last >= recno) {
1246 ++recno;
1247 if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
1248 goto err;
1249 }
1250 ++t->re_last;
1251 }
1252
1253 if (0) {
1254 eof: t->re_eof = 1;
1255 ret = DB_NOTFOUND;
1256 }
1257 err: if (!was_modified)
1258 t->re_modified = 0;
1259
1260 return (ret);
1261 }
1262
1263 /*
1264 * __ram_add --
1265 * Add records into the tree.
1266 */
1267 static int
1268 __ram_add(dbc, recnop, data, flags, bi_flags)
1269 DBC *dbc;
1270 db_recno_t *recnop;
1271 DBT *data;
1272 u_int32_t flags, bi_flags;
1273 {
1274 BTREE_CURSOR *cp;
1275 int exact, ret, stack, t_ret;
1276
1277 cp = (BTREE_CURSOR *)dbc->internal;
1278
1279 retry: /* Find the slot for insertion. */
1280 if ((ret = __bam_rsearch(dbc, recnop,
1281 S_INSERT | (flags == DB_APPEND ? S_APPEND : 0), 1, &exact)) != 0)
1282 return (ret);
1283 stack = 1;
1284
1285 /* Copy the page into the cursor. */
1286 STACK_TO_CURSOR(cp, ret);
1287 if (ret != 0)
1288 goto err;
1289
1290 /*
1291 * The application may modify the data based on the selected record
1292 * number.
1293 */
1294 if (flags == DB_APPEND && dbc->dbp->db_append_recno != NULL &&
1295 (ret = dbc->dbp->db_append_recno(dbc->dbp, data, *recnop)) != 0)
1296 goto err;
1297
1298 /*
1299 * Select the arguments for __bam_iitem() and do the insert. If the
1300 * key is an exact match, or we're replacing the data item with a
1301 * new data item, replace the current item. If the key isn't an exact
1302 * match, we're inserting a new key/data pair, before the search
1303 * location.
1304 */
1305 switch (ret = __bam_iitem(dbc,
1306 NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) {
1307 case 0:
1308 /*
1309 * Don't adjust anything.
1310 *
1311 * If we inserted a record, no cursors need adjusting because
1312 * the only new record it's possible to insert is at the very
1313 * end of the tree. The necessary adjustments to the internal
1314 * page counts were made by __bam_iitem().
1315 *
1316 * If we overwrote a record, no cursors need adjusting because
1317 * future DBcursor->get calls will simply return the underlying
1318 * record (there's no adjustment made for the DB_CURRENT flag
1319 * when a cursor get operation immediately follows a cursor
1320 * delete operation, and the normal adjustment for the DB_NEXT
1321 * flag is still correct).
1322 */
1323 break;
1324 case DB_NEEDSPLIT:
1325 /* Discard the stack of pages and split the page. */
1326 (void)__bam_stkrel(dbc, STK_CLRDBC);
1327 stack = 0;
1328
1329 if ((ret = __bam_split(dbc, recnop, NULL)) != 0)
1330 goto err;
1331
1332 goto retry;
1333 /* NOTREACHED */
1334 default:
1335 goto err;
1336 }
1337
1338 err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
1339 ret = t_ret;
1340
1341 return (ret);
1342 }

webmaster AT resiprocate DOT org
ViewVC Help
Powered by ViewVC 1.1.27