masstree_cursor.cpp
/*
 * Copyright (c) 2014-2015, Hewlett-Packard Development Company, LP.
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details. You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * HP designates this particular file as subject to the "Classpath" exception
 * as provided by HP in the LICENSE.txt file that accompanied this code.
 */
19 
20 #include <glog/logging.h>
21 
22 #include <algorithm>
23 #include <cstring>
24 #include <string>
25 
34 #include "foedus/xct/xct.hpp"
35 
36 
namespace foedus {
namespace storage {
namespace masstree {

MasstreeCursor::MasstreeCursor(MasstreeStorage storage, thread::Thread* context)
  : engine_(storage.get_engine()),
    storage_(storage.get_engine(), storage.get_control_block()),
    context_(context),
    current_xct_(&context->get_current_xct()) {
  for_writes_ = false;
  forward_cursor_ = true;
  reached_end_ = false;

  route_count_ = 0;
  routes_ = nullptr;
  should_skip_cur_route_ = false;

  end_inclusive_ = false;
  end_key_length_ = 0;
  end_key_ = nullptr;
  end_key_slices_ = nullptr;

  cur_route_prefix_slices_ = nullptr;
  cur_route_prefix_be_ = nullptr;

  cur_key_length_ = 0;
  cur_key_owner_id_address = nullptr;
  cur_key_suffix_ = nullptr;
  cur_key_in_layer_slice_ = 0;
  cur_key_in_layer_remainder_ = 0;
  cur_key_next_layer_ = false;
  cur_key_location_.clear();

  cur_payload_length_ = 0;

  search_key_length_ = 0;
  search_key_ = nullptr;
  search_key_slices_ = nullptr;
}

void MasstreeCursor::copy_combined_key(char* buffer) const {
  ASSERT_ND(is_valid_record());
  const Route* route = cur_route();
  const Layer layer = route->layer_;
  ASSERT_ND(layer == route->page_->get_layer());
  ASSERT_ND(cur_key_in_layer_remainder_ != kInitiallyNextLayer);
  ASSERT_ND(cur_key_length_ == cur_key_in_layer_remainder_ + layer * sizeof(KeySlice));

  std::memcpy(buffer, ASSUME_ALIGNED(cur_route_prefix_be_, 256), layer * sizeof(KeySlice));
  assorted::write_bigendian<KeySlice>(cur_key_in_layer_slice_, buffer + layer * sizeof(KeySlice));
  KeyLength suffix_length = calculate_suffix_length(cur_key_in_layer_remainder_);
  if (suffix_length > 0) {
    std::memcpy(
      buffer + (layer + 1U) * sizeof(KeySlice),
      ASSUME_ALIGNED(cur_key_suffix_, 8),
      suffix_length);
  }
}
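
// Illustrative layout of the combined key assembled above (not part of the
// original source). For a record in layer L, the buffer receives:
//
//   bytes [0, 8*L)        : big-endian slices of the parent layers (cur_route_prefix_be_)
//   bytes [8*L, 8*L + 8)  : this layer's slice, written big-endian (cur_key_in_layer_slice_)
//   bytes [8*L + 8, ...)  : remaining suffix bytes, if any (cur_key_suffix_)
//
// For example, a 20-byte key stored in layer 1 has cur_key_in_layer_remainder_ == 12:
// 8 prefix bytes are copied, the 8-byte in-layer slice is written, then the 4-byte
// suffix follows, reproducing all 20 bytes in big-endian order.
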
void MasstreeCursor::copy_combined_key_part(KeyLength offset, KeyLength len, char* buffer) const {
  ASSERT_ND(is_valid_record());
  ASSERT_ND(offset + len <= cur_key_length_);
  const Route* route = cur_route();
  const Layer layer = route->layer_;
  ASSERT_ND(layer == route->page_->get_layer());
  ASSERT_ND(cur_key_in_layer_remainder_ != kInitiallyNextLayer);
  ASSERT_ND(cur_key_length_ == cur_key_in_layer_remainder_ + layer * sizeof(KeySlice));

  const KeyLength prefix_len = layer * sizeof(KeySlice);
  KeyLength buffer_pos = 0;
  KeyLength len_remaining = len;
  if (offset < prefix_len) {
    KeyLength prefix_copy_len = std::min<KeyLength>(prefix_len - offset, len_remaining);
    std::memcpy(buffer, cur_route_prefix_be_ + offset, prefix_copy_len);
    buffer_pos += prefix_copy_len;
    len_remaining -= prefix_copy_len;
  }

  if (len_remaining > 0) {
    if (offset < prefix_len + sizeof(KeySlice)) {
      char cur_slice_be[sizeof(KeySlice)];
      assorted::write_bigendian<KeySlice>(cur_key_in_layer_slice_, cur_slice_be);

      // KeyLength is unsigned; clamp to 0 in case the copy started inside the prefix,
      // in which case the slice must be copied from its beginning.
      KeyLength cur_slice_offset = offset > prefix_len ? offset - prefix_len : 0;
      KeyLength cur_slice_copy_len
        = std::min<KeyLength>(sizeof(KeySlice) - cur_slice_offset, len_remaining);
      std::memcpy(buffer + buffer_pos, cur_slice_be + cur_slice_offset, cur_slice_copy_len);
      buffer_pos += cur_slice_copy_len;
      len_remaining -= cur_slice_copy_len;
    }

    if (len_remaining > 0) {
      // Same clamp here: the copy might start at or before the suffix's beginning.
      KeyLength suffix_offset = offset > prefix_len + sizeof(KeySlice)
        ? offset - prefix_len - sizeof(KeySlice)
        : 0;
      KeyLength suffix_length = calculate_suffix_length(cur_key_in_layer_remainder_);
      KeyLength suffix_copy_len = std::min<KeyLength>(suffix_length - suffix_offset, len_remaining);
      std::memcpy(
        buffer + buffer_pos,
        cur_key_suffix_ + suffix_offset,
        suffix_copy_len);
      buffer_pos += suffix_copy_len;
      len_remaining -= suffix_copy_len;
    }
  }

  ASSERT_ND(buffer_pos == len);
  ASSERT_ND(len_remaining == 0);
}
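
// Worked example for copy_combined_key_part() (illustrative, using the clamped
// offsets above): assume layer == 1 (prefix_len == 8) and a 20-byte key
// (in-layer remainder 12: 8-byte slice + 4-byte suffix). A call with
// offset == 4 and len == 12 proceeds as:
//   offset(4) < prefix_len(8)       -> copy prefix bytes [4, 8):  4 bytes
//   offset(4) < prefix_len + 8 (16) -> copy slice bytes  [0, 8):  8 bytes
//   len_remaining == 0              -> the suffix is not touched
// yielding bytes [4, 16) of the combined key.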

template <typename T>
inline ErrorCode MasstreeCursor::allocate_if_not_exist(T** pointer) {
  if (*pointer) {
    return kErrorCodeOk;
  }

  ASSERT_ND(*pointer == nullptr);
  void* out;
  CHECK_ERROR_CODE(current_xct_->acquire_local_work_memory(
    1U << 12,
    &out,
    1U << 12));
  *pointer = reinterpret_cast<T*>(out);
  return kErrorCodeOk;
}

MasstreePage* MasstreeCursor::resolve_volatile(VolatilePagePointer ptr) const {
  ASSERT_ND(!ptr.is_null());
  const auto& resolver = context_->get_global_volatile_page_resolver();
  MasstreePage* ret = reinterpret_cast<MasstreePage*>(resolver.resolve_offset(ptr));
  assert_within_valid_volatile_page(resolver, ret);
  return ret;
}

/////////////////////////////////////////////////////////////////////////////////
//
// next() and proceed_xxx methods
//
/////////////////////////////////////////////////////////////////////////////////

ErrorCode MasstreeCursor::next() {
  ASSERT_ND(!should_skip_cur_route_);
  if (!is_valid_record()) {
    return kErrorCodeOk;
  }

  assert_route();

  CHECK_ERROR_CODE(proceed_route());

  // After proceed_route(), it is still possible that we are at a deleted record or empty page.
  // Keep moving on in that case.
  while (should_skip_cur_route_
    || (is_valid_record() && cur_key_location_.observed_.is_deleted())) {
    DVLOG(0) << "Rare. next() needs to move on to find non-deleted records/pages";
    should_skip_cur_route_ = false;
    CHECK_ERROR_CODE(proceed_route());
    if (route_count_ == 0) {
      LOG(INFO) << "The empty page was the last page that might have had the record. We are done.";
      ASSERT_ND(!is_valid_record());
      return kErrorCodeOk;
    }
  }
  ASSERT_ND(!should_skip_cur_route_);
  check_end_key();
  if (is_valid_record()) {
    assert_route();
    ASSERT_ND(!cur_key_location_.observed_.is_deleted());
  }
  return kErrorCodeOk;
}

inline ErrorCode MasstreeCursor::proceed_route() {
  ASSERT_ND(!should_skip_cur_route_);  // must be controlled in the caller (open/next)
  if (cur_route()->page_->is_border()) {
    return proceed_route_border();
  } else {
    return proceed_route_intermediate();
  }
}
214 
215 ErrorCode MasstreeCursor::proceed_route_border() {
216  Route* route = cur_route();
217  ASSERT_ND(!route->was_stably_moved());
218  ASSERT_ND(route->page_->is_border());
219  // 20160330 Hideaki : No need to do the early abort here. Disabled.
220  // Serializability is anyway guaranteed by key-count check at commit time.
221  // Now that we have MOCC implemented, there is a benefit to not abort here.
222  // If we move on and reach the commit phase, we will know the full read/write sets
223  // and construct RLL for next run. We thus should move on here.
224  // if (UNLIKELY(route->page_->get_version().status_ != route->stable_)) {
225  // // something has changed in this page.
226  // // TASK(Hideaki) until we implement range lock, we have to roll back in this case.
227  // return kErrorCodeXctRaceAbort;
228  // }
229  // PageVersionStatus stable = route->stable_;
230  MasstreeBorderPage* page = reinterpret_cast<MasstreeBorderPage*>(route->page_);
231  while (true) {
232  if (forward_cursor_) {
233  ++route->index_;
234  } else {
235  --route->index_;
236  }
237  if (route->index_ < route->key_count_) {
238  CHECK_ERROR_CODE(fetch_cur_record_logical(page, route->get_cur_original_index()));
239  // If it points to next-layer, we ignore deleted bit. It has no meaning for next-layer rec.
240  if (is_cur_key_next_layer()) {
241  CHECK_ERROR_CODE(proceed_next_layer());
242  } else if (cur_key_location_.observed_.is_deleted()) {
243  continue;
244  }
245  break;
246  } else {
247  CHECK_ERROR_CODE(proceed_pop());
248  break;
249  }
250  }
251 
252  // 20160330 Hideaki same above
253  // assorted::memory_fence_consume();
254  // if (UNLIKELY(page->get_version().status_ != stable)) {
255  // return kErrorCodeXctRaceAbort; // same above
256  // }
257  return kErrorCodeOk;
258 }
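
// A note on the index arithmetic above (illustrative): route->index_ is an
// unsigned SlotIndex, so for a backward cursor "--route->index_" past 0 wraps
// around to a huge value, and the single test
// "route->index_ < route->key_count_" covers both the forward overrun and the
// backward underrun. proceed_route_intermediate() below calls the same trick
// a "'negative' check".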

void MasstreeCursor::proceed_route_intermediate_rebase_separator() {
  assorted::memory_fence_seq_cst();  // This method should be rarely called, so not too expensive.
  Route* route = cur_route();
  ASSERT_ND(!route->page_->is_border());
  MasstreeIntermediatePage* page = reinterpret_cast<MasstreeIntermediatePage*>(route->page_);
  // We do NOT update stable_ here.
  // Even if the page is moved now, we can keep using it because of the Master-Tree invariant.
  // Rather, if we updated stable_, proceed_pop would be confused by that.
  route->key_count_ = route->page_->get_key_count();

  assorted::memory_fence_consume();  // we must observe key_count_ first

  const KeySlice last = route->latest_separator_;
  ASSERT_ND(last <= page->get_high_fence() && last >= page->get_low_fence());
  for (route->index_ = 0; route->index_ < route->key_count_; ++route->index_) {
    if (last <= page->get_separator(route->index_)) {
      break;
    }
  }
  ASSERT_ND(route->index_ == route->key_count_ || last <= page->get_separator(route->index_));

  DVLOG(0) << "rebase. new index=" << route->index_ << "/" << route->key_count_;
  if (route->index_ < route->key_count_ && last == page->get_separator(route->index_)) {
    DVLOG(0) << "oh, matched with a first-level separator";
    // we are "done" up to the last separator. which pointer to follow next?
    if (forward_cursor_) {
      ++route->index_;
      const MasstreeIntermediatePage::MiniPage& minipage = page->get_minipage(route->index_);
      route->key_count_mini_ = minipage.key_count_;
      route->index_mini_ = 0;
    } else {
      const MasstreeIntermediatePage::MiniPage& minipage = page->get_minipage(route->index_);
      route->key_count_mini_ = minipage.key_count_;
      route->index_mini_ = minipage.key_count_;
    }
    return;
  }

  const MasstreeIntermediatePage::MiniPage& minipage = page->get_minipage(route->index_);
  do {
    route->key_count_mini_ = minipage.key_count_;
    DVLOG(0) << "checking second level... count=" << route->key_count_mini_;
    for (route->index_mini_ = 0;
        route->index_mini_ < route->key_count_mini_;
        ++route->index_mini_) {
      ASSERT_ND(last >= minipage.separators_[route->index_mini_]);
      if (last == minipage.separators_[route->index_mini_]) {
        break;
      }
    }
    ASSERT_ND(route->index_mini_ <= route->key_count_mini_);
  } while (route->key_count_mini_ != minipage.key_count_ ||
      route->index_mini_ == route->key_count_mini_);
  DVLOG(0) << "checked second level... index_mini=" << route->index_mini_;
  ASSERT_ND(route->index_mini_ < route->key_count_mini_);
  if (forward_cursor_) {
    // last == separators[n], so we are following pointers[n+1] next
    ++route->index_mini_;
  } else {
    // last == separators[n], so we are following pointers[n] next
  }
}

ErrorCode MasstreeCursor::proceed_route_intermediate() {
  while (true) {
    Route* route = cur_route();
    ASSERT_ND(!route->page_->is_border());
    ASSERT_ND(route->index_ <= route->key_count_);
    MasstreeIntermediatePage* page = reinterpret_cast<MasstreeIntermediatePage*>(route->page_);

    if (forward_cursor_) {
      ++route->index_mini_;
    } else {
      --route->index_mini_;
    }
    if (route->index_mini_ > route->key_count_mini_) {  // this is also a 'negative' check
      if (forward_cursor_) {
        ++route->index_;
      } else {
        --route->index_;
      }
      if (route->index_ > route->key_count_) {  // also a 'negative' check
        // seems like we reached the end of this page... did we?
        return proceed_pop();
      } else {
        route->key_count_mini_ = page->get_minipage(route->index_).key_count_;
        if (forward_cursor_) {
          route->index_mini_ = 0;
        } else {
          route->index_mini_ = route->key_count_mini_;
        }
      }
    }

    // Master-tree invariant:
    // verify that the next separator/page starts from the separator we followed previously.
    // As far as we check this, we don't need the hand-over-hand version verification.
    KeySlice new_separator;
    MasstreePage* next;
    while (true) {
      ASSERT_ND(route->latest_separator_ >= page->get_low_fence());
      ASSERT_ND(route->latest_separator_ <= page->get_high_fence());
      MasstreeIntermediatePage::MiniPage& minipage = page->get_minipage(route->index_);
      DualPagePointer& pointer = minipage.pointers_[route->index_mini_];
      ASSERT_ND(!pointer.is_both_null());
      CHECK_ERROR_CODE(
        MasstreeStoragePimpl(&storage_).follow_page(context_, for_writes_, &pointer, &next));

      if (forward_cursor_) {
        if (UNLIKELY(next->get_low_fence() != route->latest_separator_)) {
          VLOG(0) << "Interesting3A. separator doesn't match. concurrent adoption. local retry.";
          proceed_route_intermediate_rebase_separator();
          continue;
        }
        new_separator = next->get_high_fence();
      } else {
        if (UNLIKELY(next->get_high_fence() != route->latest_separator_)) {
          VLOG(0) << "Interesting3B. separator doesn't match. concurrent adoption. local retry.";
          proceed_route_intermediate_rebase_separator();
          continue;
        }
        new_separator = next->get_low_fence();
      }
      break;
    }

    route->latest_separator_ = new_separator;
    CHECK_ERROR_CODE(push_route(next));
    return proceed_deeper();
  }
  return kErrorCodeOk;
}

inline ErrorCode MasstreeCursor::proceed_pop() {
  while (true) {
    --route_count_;
    if (route_count_ == 0) {
      reached_end_ = true;
      return kErrorCodeOk;
    }
    Route& route = routes_[route_count_ - 1];
    if (route.page_->is_border() && route.was_stably_moved()) {
      // in case we were at a moved page, we either followed foster minor or foster major.
      ASSERT_ND(route.moved_page_search_status_ == Route::kMovedPageSearchedOne ||
        route.moved_page_search_status_ == Route::kMovedPageSearchedBoth);
      if (route.moved_page_search_status_ == Route::kMovedPageSearchedBoth) {
        // we checked both foster children. we are done with this page, so pop again.
        continue;
      }
      route.moved_page_search_status_ = Route::kMovedPageSearchedBoth;
      const auto left = route.page_->get_foster_minor();
      const auto right = route.page_->get_foster_major();
      MasstreePage* next = resolve_volatile(forward_cursor_ ? right : left);
      ASSERT_ND(next->is_border());

      // check the other foster child
      CHECK_ERROR_CODE(push_route(next));
      return proceed_deeper();
    } else {
      // otherwise, next record in this page.
      // we also ignore foster twins in intermediate pages.
      return proceed_route();
    }
  }
}
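
// Illustrative scenario for proceed_pop() (not from the original source):
// suppose a forward cursor entered border page P after P split into foster
// children L and R. proceed_deeper() pushed L and marked P's route entry as
// kMovedPageSearchedOne. When the cursor exhausts L, proceed_pop() pops back
// to P, flips its status to kMovedPageSearchedBoth, pushes R, and resumes
// there. Popping back to P a second time skips it entirely.
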
inline ErrorCode MasstreeCursor::proceed_next_layer() {
  Route* route = cur_route();
  ASSERT_ND(route->page_->is_border());
  MasstreeBorderPage* page = reinterpret_cast<MasstreeBorderPage*>(route->page_);
  Layer layer = page->get_layer();
  KeySlice record_slice = page->get_slice(route->get_cur_original_index());
  cur_route_prefix_slices_[layer] = record_slice;
  assorted::write_bigendian<KeySlice>(
    record_slice,
    cur_route_prefix_be_ + (layer * sizeof(KeySlice)));
  DualPagePointer* pointer = page->get_next_layer(route->get_cur_original_index());
  MasstreePage* next;
  CHECK_ERROR_CODE(
    MasstreeStoragePimpl(&storage_).follow_page(context_, for_writes_, pointer, &next));
  CHECK_ERROR_CODE(push_route(next));
  return proceed_deeper();
}

inline ErrorCode MasstreeCursor::proceed_deeper() {
  Route* cur = cur_route();
  if (cur->page_->is_border()) {
    // if we are hitting a moved page, go to the left or right foster child,
    // depending on whether this is a forward cursor or not.
    while (UNLIKELY(cur->was_stably_moved())) {
      const auto left = cur->page_->get_foster_minor();
      const auto right = cur->page_->get_foster_major();
      MasstreePage* next_page = resolve_volatile(forward_cursor_ ? left : right);
      ASSERT_ND(cur->moved_page_search_status_ == Route::kMovedPageSearchedNeither);
      cur->moved_page_search_status_ = Route::kMovedPageSearchedOne;
      CHECK_ERROR_CODE(push_route(next_page));
      cur = cur_route();
      ASSERT_ND(cur->page_->is_border());
    }
    ASSERT_ND(cur->page_->is_border());
    ASSERT_ND(!cur->was_stably_moved());
    return proceed_deeper_border();
  } else {
    // In an intermediate page, we don't mind reading moved pages.
    // Rather, following foster twins might complicate things when one of them is an empty range.
    // So, we deliberately follow only "real" children for intermediate pages.
    return proceed_deeper_intermediate();
  }
}

inline ErrorCode MasstreeCursor::proceed_deeper_border() {
  Route* route = cur_route();
  ASSERT_ND(route->page_->is_border());
  ASSERT_ND(!route->was_stably_moved());
  MasstreeBorderPage* page = reinterpret_cast<MasstreeBorderPage*>(cur_route()->page_);

  // We might have an empty border page in the route. We just skip over such a page.
  if (route->key_count_ == 0) {
    LOG(INFO) << "Huh, rare but possible. A cursor hit an empty border page. Just skip it.";
    set_should_skip_cur_route();
    return kErrorCodeOk;  // the next proceed_route will take care of it.
  }

  ASSERT_ND(route->key_count_ > 0);
  route->index_ = forward_cursor_ ? 0 : route->key_count_ - 1;
  SlotIndex record = route->get_original_index(route->index_);
  CHECK_ERROR_CODE(fetch_cur_record_logical(page, record));
  if (is_cur_key_next_layer()) {
    return proceed_next_layer();
  }
  return kErrorCodeOk;
}

inline ErrorCode MasstreeCursor::proceed_deeper_intermediate() {
  Route* route = cur_route();
  ASSERT_ND(!route->page_->is_border());
  MasstreeIntermediatePage* page = reinterpret_cast<MasstreeIntermediatePage*>(cur_route()->page_);
  route->index_ = forward_cursor_ ? 0 : route->key_count_;
  MasstreeIntermediatePage::MiniPage& minipage = page->get_minipage(route->index_);

  MasstreePage* next;
  KeySlice separator_low, separator_high;
  while (true) {
    route->key_count_mini_ = minipage.key_count_;
    assorted::memory_fence_consume();
    route->index_mini_ = forward_cursor_ ? 0 : route->key_count_mini_;

    extract_separators(&separator_low, &separator_high);
    DualPagePointer& pointer = minipage.pointers_[route->index_mini_];
    ASSERT_ND(!pointer.is_both_null());
    CHECK_ERROR_CODE(
      MasstreeStoragePimpl(&storage_).follow_page(context_, for_writes_, &pointer, &next));
    if (UNLIKELY(next->get_low_fence() != separator_low ||
      next->get_high_fence() != separator_high)) {
      VLOG(0) << "Interesting4. first sep doesn't match. concurrent adoption. local retry.";
      continue;
    } else {
      break;
    }
  }

  route->latest_separator_ = forward_cursor_ ? separator_high : separator_low;
  CHECK_ERROR_CODE(push_route(next));
  return proceed_deeper();
}

/////////////////////////////////////////////////////////////////////////////////
//
// common methods
//
/////////////////////////////////////////////////////////////////////////////////

ErrorCode MasstreeCursor::fetch_cur_record_logical(MasstreeBorderPage* page, SlotIndex record) {
  // fetch everything
  ASSERT_ND(record < page->get_key_count());
  cur_key_owner_id_address = page->get_owner_id(record);
#ifndef NDEBUG
  if (!page->header().snapshot_) {
    assert_within_valid_volatile_page(
      context_->get_global_volatile_page_resolver(),
      cur_key_owner_id_address);
  }
#endif  // NDEBUG
  KeyLength remainder = page->get_remainder_length(record);
  cur_key_in_layer_remainder_ = remainder;
  cur_key_in_layer_slice_ = page->get_slice(record);
  Layer layer = page->get_layer();
  cur_key_length_ = layer * sizeof(KeySlice) + remainder;

  CHECK_ERROR_CODE(cur_key_location_.populate_logical(
    current_xct_,
    page,
    record,
    for_writes_));
  if (!cur_key_location_.observed_.is_next_layer()) {
    cur_key_suffix_ = page->get_record(record);
    cur_payload_length_ = page->get_payload_length(record);
    cur_payload_ = page->get_record_payload(record);
  } else {
    cur_key_suffix_ = nullptr;
    cur_payload_length_ = 0;
    cur_payload_ = nullptr;
  }
  return kErrorCodeOk;
}

void MasstreeCursor::Route::setup_order() {
  ASSERT_ND(page_->is_border());
  // sort entries in this page.
  // we have already called prefetch_general(), which prefetches 4 cachelines (256 bytes).
  // as we are reading up to slices_[key_count - 1], we might want to prefetch more.
  MasstreeBorderPage* page = reinterpret_cast<MasstreeBorderPage*>(page_);
  for (SlotIndex i = 0; i < key_count_; ++i) {
    order_[i] = i;
  }

  if (page->is_consecutive_inserts()) {
    DVLOG(2) << "lucky, an already sorted border page.";
    return;
  }

  page->prefetch_additional_if_needed(key_count_);
  struct Sorter {
    explicit Sorter(const MasstreeBorderPage* target) : target_(target) {}
    bool operator()(SlotIndex left, SlotIndex right) const {
      KeySlice left_slice = target_->get_slice(left);
      KeySlice right_slice = target_->get_slice(right);
      if (left_slice < right_slice) {
        return true;
      } else if (left_slice == right_slice) {
        return target_->get_remainder_length(left) < target_->get_remainder_length(right);
      } else {
        return false;
      }
    }
    const MasstreeBorderPage* target_;
  };

  // this sort order within the page is correct even without evaluating the suffix.
  // however, to compare with our search key, we need to consider the suffix.
  std::sort(order_, order_ + key_count_, Sorter(page));
}
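
// Example of the resulting order_ (illustrative): if a border page physically
// holds records with slices {5, 3, 5} and remainder lengths {8, 8, 4} in
// insertion order, the sort above yields order_ == {1, 2, 0}: slice 3 first,
// then the two records with slice 5 ordered by remainder length (4 before 8).
// Ties on the slice are broken by remainder length only; suffix bytes are
// deliberately not compared here.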

ErrorCode MasstreeCursor::push_route(MasstreePage* page) {
  assert_aligned_page(page);
  if (route_count_ == kMaxRoutes) {
    return kErrorCodeStrMasstreeCursorTooDeep;
  }
#ifndef NDEBUG
  if (!page->header().snapshot_) {
    assert_within_valid_volatile_page(context_->get_global_volatile_page_resolver(), page);
  }
#endif  // NDEBUG
  page->prefetch_general();
  const bool is_border = page->is_border();
  Route& route = routes_[route_count_];
  while (true) {
    route.key_count_ = page->get_key_count();
    assorted::memory_fence_consume();
    route.stable_ = page->get_version().status_;
    route.page_ = page;
    if (is_border && route.was_stably_moved()) {
      route.moved_page_search_status_ = Route::kMovedPageSearchedNeither;
    } else {
      route.moved_page_search_status_ = Route::kNone;
    }
    route.latest_separator_ = kInfimumSlice;  // must be set shortly after this method
    route.index_ = kMaxRecords;  // must be set shortly after this method
    route.index_mini_ = kMaxRecords;  // must be set shortly after this method
    route.snapshot_ = page->header().snapshot_;
    route.layer_ = page->get_layer();
    if (is_border && !route.was_stably_moved()) {
      route.setup_order();
      assorted::memory_fence_consume();
      // the setup_order must not be confused by concurrent updates.
      // because we check the version after the consume fence, this also catches the case where
      // we have a new key, is_consecutive_inserts()==true no longer holds, etc.
      if (UNLIKELY(route.stable_ != page->get_version().status_)) {
        continue;
      }
    }
    break;
  }

  ++route_count_;
  // We don't need to take a page into the page version set unless we need to lock a range in it.
  // We thus need it only for border pages. Even if an interior page changes, splits, whatever,
  // the pre-existing border pages are already responsible for the searched key regions.
  // This is an outstanding difference from the original masstree/silo protocol.
  // We also don't have to consider moved pages. They are stable.
  if (!is_border || page->header().snapshot_ || route.was_stably_moved()) {
    return kErrorCodeOk;
  }
  return current_xct_->add_to_page_version_set(&page->header().page_version_, route.stable_);
}

inline ErrorCode MasstreeCursor::follow_foster_border(KeySlice slice) {
  // a bit more complicated than point queries because of the exclusive cases.
  while (true) {
    Route* route = cur_route();
    ASSERT_ND(route->page_->is_border());  // we follow foster twins only in border pages
    ASSERT_ND(search_type_ == kBackwardExclusive || route->page_->within_fences(slice));
    ASSERT_ND(search_type_ != kBackwardExclusive
      || route->page_->within_fences(slice)
      || route->page_->get_high_fence() == slice);
    if (LIKELY(!route->was_stably_moved())) {
      break;
    }

    ASSERT_ND(route->was_stably_moved());
    ASSERT_ND(route->moved_page_search_status_ == Route::kMovedPageSearchedNeither
      || route->moved_page_search_status_ == Route::kMovedPageSearchedOne);  // see locate_border
    KeySlice foster_fence = route->page_->get_foster_fence();
    MasstreePage* left = resolve_volatile(route->page_->get_foster_minor());
    MasstreePage* right = resolve_volatile(route->page_->get_foster_major());
    MasstreePage* page;
    if (slice < foster_fence) {
      page = left;
    } else if (slice > foster_fence) {
      page = right;
    } else {
      ASSERT_ND(slice == foster_fence);
      if (search_type_ == kBackwardExclusive) {
        page = left;
      } else {
        page = right;
      }
    }
    // if we are following the right foster: 1) if forward-cursor, we already skipped the left,
    // so we are "both" done. 2) otherwise, just one of them is done. Vice versa.
    if ((forward_cursor_ && page == left) || (!forward_cursor_ && page == right)) {
      route->moved_page_search_status_ = Route::kMovedPageSearchedOne;
    } else {
      route->moved_page_search_status_ = Route::kMovedPageSearchedBoth;
    }
    CHECK_ERROR_CODE(push_route(page));
  }
  return kErrorCodeOk;
}

inline void MasstreeCursor::extract_separators(
  KeySlice* separator_low,
  KeySlice* separator_high) const {
  // So far this method does not call MasstreeIntermediatePage::extract_separators() to avoid
  // relying on key_count in the page (this method uses Route's counts, which are taken with
  // fences). I suspect it's okay to call it (and to reduce code); let's revisit later.
  const Route* route = cur_route();
  ASSERT_ND(!route->page_->is_border());
  ASSERT_ND(route->index_ <= route->key_count_);
  ASSERT_ND(route->index_mini_ <= route->key_count_mini_);
  MasstreeIntermediatePage* cur = reinterpret_cast<MasstreeIntermediatePage*>(route->page_);
  const MasstreeIntermediatePage::MiniPage& minipage = cur->get_minipage(route->index_);
  if (route->index_mini_ == 0) {
    if (route->index_ == 0) {
      *separator_low = cur->get_low_fence();
    } else {
      *separator_low = cur->get_separator(route->index_ - 1U);
    }
  } else {
    *separator_low = minipage.separators_[route->index_mini_ - 1U];
  }
  if (route->index_mini_ == route->key_count_mini_) {
    if (route->index_ == route->key_count_) {
      *separator_high = cur->get_high_fence();
    } else {
      *separator_high = cur->get_separator(route->index_);
    }
  } else {
    *separator_high = minipage.separators_[route->index_mini_];
  }
}
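
// Worked example for extract_separators() (illustrative): consider an
// intermediate page with key_count_ == 2 (first-level separators S0, S1)
// whose minipage at index_ == 1 has key_count_mini_ == 2 (second-level
// separators M0, M1). The key range of pointers_[index_mini_] is then:
//   index_mini_ == 0 -> [S0, M0)
//   index_mini_ == 1 -> [M0, M1)
//   index_mini_ == 2 -> [M1, S1)
// At the page edges (index_ == 0 with index_mini_ == 0, or
// index_ == key_count_ with index_mini_ == key_count_mini_), the page's low
// or high fence is used instead of a separator.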

inline void MasstreeCursor::check_end_key() {
  if (is_valid_record()) {
    KeyCompareResult result = compare_cur_key_aginst_end_key();
    ASSERT_ND(result != kCurKeyContains && result != kCurKeyBeingsWith);
    if (result == kCurKeySmaller) {
      if (!forward_cursor_) {
        reached_end_ = true;
      }
    } else if (result == kCurKeyLarger) {
      if (forward_cursor_) {
        reached_end_ = true;
      }
    } else {
      ASSERT_ND(result == kCurKeyEquals);
      if (!end_inclusive_) {
        reached_end_ = true;
      }
    }
  }
}

inline MasstreeCursor::KeyCompareResult MasstreeCursor::compare_cur_key_aginst_search_key(
  KeySlice slice,
  uint8_t layer) const {
  if (is_search_key_extremum() || search_key_length_ <= layer * sizeof(KeySlice)) {
    if (forward_cursor_) {
      return kCurKeyLarger;
    } else {
      return kCurKeySmaller;
    }
  }

#ifndef NDEBUG
  // The fact that we are in this page means the page or its descendants can contain the
  // search key. Let's check.
  for (Layer i = 0; i < layer && sizeof(KeySlice) * i < search_key_length_; ++i) {
    if (is_forward_cursor()) {
      ASSERT_ND(search_key_slices_[i] <= cur_route_prefix_slices_[i]);
    } else {
      ASSERT_ND(search_key_slices_[i] >= cur_route_prefix_slices_[i]);
    }
  }
#endif  // NDEBUG
  return compare_cur_key(slice, layer, search_key_, search_key_length_);
}

inline MasstreeCursor::KeyCompareResult MasstreeCursor::compare_cur_key_aginst_end_key() const {
  if (is_end_key_supremum()) {
    return forward_cursor_ ? kCurKeySmaller : kCurKeyLarger;
  }

  ASSERT_ND(!is_cur_key_next_layer());
  const Layer layer = cur_route()->layer_;

  // TASK(Hideaki): We don't have to compare prefixes each time.
  // We can remember in the route whether the end_key is trivially satisfied.
  // So far we compare each time... let's see what the CPU profile says.
  for (Layer i = 0; i < layer && sizeof(KeySlice) * i < end_key_length_; ++i) {
    if (end_key_slices_[i] > cur_route_prefix_slices_[i]) {
      return kCurKeySmaller;
    } else if (end_key_slices_[i] < cur_route_prefix_slices_[i]) {
      return kCurKeyLarger;
    }
  }
  // Was the last slice a complete one?
  // For example, end-key="123456", layer=1: end_key_slices_[0] was incomplete.
  // We know cur_route_prefix_slices_[0] was "123456\0\0" (zero-padded), so it's actually
  // different.
  if (sizeof(KeySlice) * layer > end_key_length_) {
    ASSERT_ND(cur_key_length_ > end_key_length_);
    if (is_forward_cursor()) {
      return kCurKeyLarger;
    } else {
      return kCurKeySmaller;
    }
  }

  // okay, all prefix slices were exactly the same.
  // We have to compare the in-layer slice and the suffix.

  // 1. in-layer slice.
  KeySlice end_slice = end_key_slices_[layer];
  if (cur_key_in_layer_slice_ < end_slice) {
    return kCurKeySmaller;
  } else if (cur_key_in_layer_slice_ > end_slice) {
    return kCurKeyLarger;
  }

  KeyLength end_remainder = end_key_length_ - sizeof(KeySlice) * layer;
  if (cur_key_in_layer_remainder_ <= sizeof(KeySlice) || end_remainder <= sizeof(KeySlice)) {
    if (cur_key_in_layer_remainder_ == end_remainder) {
      return kCurKeyEquals;
    } else if (cur_key_in_layer_remainder_ >= end_remainder) {
      return kCurKeyLarger;
    } else {
      return kCurKeySmaller;
    }
  }

  // 2. suffix
  ASSERT_ND(cur_key_in_layer_remainder_ > sizeof(KeySlice));
  ASSERT_ND(end_remainder > sizeof(KeySlice));
  KeyLength cur_suffix_len = cur_key_in_layer_remainder_ - sizeof(KeySlice);
  KeyLength end_suffix_len = end_remainder - sizeof(KeySlice);
  KeyLength min_suffix_len = std::min(cur_suffix_len, end_suffix_len);

  int cmp = std::memcmp(
    cur_key_suffix_,
    end_key_ + (layer + 1U) * sizeof(KeySlice),
    min_suffix_len);
  if (cmp < 0) {
    return kCurKeySmaller;
  } else if (cmp > 0) {
    return kCurKeyLarger;
  } else {
    // All compared bytes are equal: the longer key is the lexicographically larger one.
    if (cur_key_length_ > end_key_length_) {
      return kCurKeyLarger;
    } else if (cur_key_length_ < end_key_length_) {
      return kCurKeySmaller;
    } else {
      return kCurKeyEquals;
    }
  }
}

inline MasstreeCursor::KeyCompareResult MasstreeCursor::compare_cur_key(
  KeySlice slice,
  uint8_t layer,
  const char* full_key,
  KeyLength full_length) const {
  ASSERT_ND(full_length >= layer * sizeof(KeySlice));
  KeyLength remainder = full_length - layer * sizeof(KeySlice);
  if (is_cur_key_next_layer()) {
    // treat this case separately
    if (cur_key_in_layer_slice_ < slice) {
      return kCurKeySmaller;
    } else if (cur_key_in_layer_slice_ > slice) {
      return kCurKeyLarger;
    } else if (remainder < sizeof(KeySlice)) {
      return kCurKeyLarger;
    } else if (remainder == sizeof(KeySlice)) {
      return kCurKeyBeingsWith;
    } else {
      return kCurKeyContains;
    }
  } else if (cur_key_in_layer_slice_ < slice) {
    return kCurKeySmaller;
  } else if (cur_key_in_layer_slice_ > slice) {
    return kCurKeyLarger;
  } else {
    if (cur_key_in_layer_remainder_ <= sizeof(KeySlice)) {
      // then simply the length determines the <, >, == relation
      if (cur_key_in_layer_remainder_ < remainder) {
        return kCurKeySmaller;
      } else if (cur_key_in_layer_remainder_ == remainder) {
        // exactly matches.
        return kCurKeyEquals;
      } else {
        return kCurKeyLarger;
      }
    } else if (remainder <= sizeof(KeySlice)) {
      return kCurKeyLarger;
    } else {
      // we have to compare the suffixes. which suffix is longer?
      KeyLength min_length = std::min<KeyLength>(remainder, cur_key_in_layer_remainder_);
      int cmp = std::memcmp(
        cur_key_suffix_,
        full_key + (layer + 1U) * sizeof(KeySlice),
        min_length - sizeof(KeySlice));
      if (cmp < 0) {
        return kCurKeySmaller;
      } else if (cmp > 0) {
        return kCurKeyLarger;
      } else {
        if (cur_key_in_layer_remainder_ < remainder) {
          return kCurKeySmaller;
        } else if (cur_key_in_layer_remainder_ == remainder) {
          return kCurKeyEquals;
        } else {
          return kCurKeyLarger;
        }
      }
    }
  }
}

/////////////////////////////////////////////////////////////////////////////////
//
// Initial open() and locate() methods
//
/////////////////////////////////////////////////////////////////////////////////

void copy_input_key(const char* input_key, KeyLength length, char* buffer, KeySlice* slice_buffer) {
  std::memcpy(ASSUME_ALIGNED(buffer, 256), input_key, length);
  for (Layer i = 0; i * sizeof(KeySlice) < length; ++i) {
    slice_buffer[i] = slice_key(input_key + i * sizeof(KeySlice), length - i * sizeof(KeySlice));
  }
}
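
// Example of the slicing above (illustrative): a 12-byte key fills two slices.
// Bytes [0, 8) become slice_buffer[0] as one big-endian 8-byte integer; the
// remaining 4 bytes become slice_buffer[1], with slice_key() padding the
// missing low-order bytes with zeros. Keys can then be compared slice by slice
// as unsigned integers, which matches the big-endian order of the byte string.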
931 
933  const char* begin_key,
934  KeyLength begin_key_length,
935  const char* end_key,
936  KeyLength end_key_length,
937  bool forward_cursor,
938  bool for_writes,
939  bool begin_inclusive,
940  bool end_inclusive) {
941  CHECK_ERROR_CODE(allocate_if_not_exist(&routes_));
942  CHECK_ERROR_CODE(allocate_if_not_exist(&search_key_));
943  CHECK_ERROR_CODE(allocate_if_not_exist(&search_key_slices_));
944  CHECK_ERROR_CODE(allocate_if_not_exist(&cur_route_prefix_slices_));
945  CHECK_ERROR_CODE(allocate_if_not_exist(&cur_route_prefix_be_));
946  CHECK_ERROR_CODE(allocate_if_not_exist(&end_key_));
947  CHECK_ERROR_CODE(allocate_if_not_exist(&end_key_slices_));
948 
949  if (!current_xct_->is_active()) {
950  return kErrorCodeXctNoXct;
951  }
952 
953  forward_cursor_ = forward_cursor;
954  reached_end_ = false;
955  for_writes_ = for_writes;
956  end_inclusive_ = end_inclusive;
957  end_key_length_ = end_key_length;
958  route_count_ = 0;
959  if (!is_end_key_supremum()) {
960  copy_input_key(end_key, end_key_length, end_key_, end_key_slices_);
961  }
962 
963  search_key_length_ = begin_key_length;
964  search_type_ = forward_cursor ? (begin_inclusive ? kForwardInclusive : kForwardExclusive)
965  : (begin_inclusive ? kBackwardInclusive : kBackwardExclusive);
966  if (!is_search_key_extremum()) {
967  copy_input_key(begin_key, begin_key_length, search_key_, search_key_slices_);
968  }
969 
971  MasstreeStoragePimpl pimpl(&storage_);
972  CHECK_ERROR_CODE(pimpl.get_first_root(context_, for_writes, &root));
973  CHECK_ERROR_CODE(push_route(root));
974  CHECK_ERROR_CODE(locate_layer(0));
975  ASSERT_ND(route_count_ != 0);
976  ASSERT_ND(cur_route()->page_->is_border());
977  ASSERT_ND(should_skip_cur_route_ || (!should_skip_cur_route_ && is_valid_record()));
978 
979  // In a few cases, open() must move on to next page/key.
980  // 1. locate_xxx doesn't take care of deleted record as it can't proceed to another page.
981  // so, if the initially located record is a deleted record, we need to move on to next key.
982  // 2. unluckily we hit an empty page or the page boundary in initial locate().
983  // Let's go on to next page by proceed_route() and find a next page.
984  // Note, it's a while, not if. It's very unlikely but possible that proceed_route again
985  // results in the same state.
986  while (should_skip_cur_route_
987  || (is_valid_record() && cur_key_location_.observed_.is_deleted())) {
988  DVLOG(0) << "Rare. open() needs to move on to find non-deleted records/pages";
989  should_skip_cur_route_ = false;
990  CHECK_ERROR_CODE(proceed_route());
991  if (route_count_ == 0) {
992  LOG(INFO) << "The empty page was the last page that might have had the record. we are done";
994  return kErrorCodeOk;
995  }
996  }
997  ASSERT_ND(!should_skip_cur_route_);
998  ASSERT_ND(!is_valid_record() || !cur_key_location_.observed_.is_deleted());
999  check_end_key();
1000  if (is_valid_record()) {
1001  assert_route();
1002  }
1003  return kErrorCodeOk;
1004 }
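
// Minimal usage sketch (illustrative; assumes an active transaction on "context"):
//
//   MasstreeCursor cursor(storage, context);
//   CHECK_ERROR_CODE(cursor.open());  // defaults: full-range forward scan, read-only
//   while (cursor.is_valid_record()) {
//     char key[kMaxKeyLength];
//     cursor.copy_combined_key(key);
//     // ... consume the key/payload of the current record here ...
//     CHECK_ERROR_CODE(cursor.next());
//   }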

inline ErrorCode MasstreeCursor::locate_layer(uint8_t layer) {
  MasstreePage* layer_root = cur_route()->page_;
  ASSERT_ND(layer_root->is_layer_root());
  ASSERT_ND(layer_root->get_layer() == layer);
  // set up the search in this layer. What's the slice we will look for in this layer?
  KeySlice slice;
  search_key_in_layer_extremum_ = false;
  if (is_search_key_extremum() || search_key_length_ <= layer * sizeof(KeySlice)) {
    slice = forward_cursor_ ? kInfimumSlice : kSupremumSlice;
    search_key_in_layer_extremum_ = true;
  } else if (search_key_length_ >= (layer + 1U) * sizeof(KeySlice)) {
    slice = search_key_slices_[layer];
  } else {
    // if we don't have a full slice for this layer, the cursor has to do a slightly special
    // thing. Remember that we might be doing a backward search.
    if (forward_cursor_) {
      // fill unused bytes with 0
      slice = kInfimumSlice;
    } else {
      // fill unused bytes with FF
      slice = kSupremumSlice;
    }
    std::memcpy(
      &slice,
      search_key_ + layer * sizeof(KeySlice),
      search_key_length_ - layer * sizeof(KeySlice));
    slice = assorted::read_bigendian<KeySlice>(&slice);
  }

  ASSERT_ND(layer != 0 || !layer_root->is_border());  // the layer-0 root is always intermediate
  if (!layer_root->is_border()) {
    CHECK_ERROR_CODE(locate_descend(slice));
  }
  ASSERT_ND(cur_route()->page_->is_border());
  CHECK_ERROR_CODE(locate_border(slice));

#ifndef NDEBUG
  if (is_valid_record()) {
    KeySlice cur_key_slice_this_layer;
    if (cur_route()->layer_ == layer) {
      cur_key_slice_this_layer = cur_key_in_layer_slice_;
    } else {
      ASSERT_ND(cur_route()->layer_ > layer);
      cur_key_slice_this_layer = cur_route_prefix_slices_[layer];
    }
    if (forward_cursor_) {
      ASSERT_ND(cur_key_slice_this_layer >= slice);
    } else {
      ASSERT_ND(cur_key_slice_this_layer <= slice);
    }
  }
#endif  // NDEBUG

  return kErrorCodeOk;
}
1061 
1062 ErrorCode MasstreeCursor::locate_border(KeySlice slice) {
1063  while (true) {
1064  CHECK_ERROR_CODE(follow_foster_border(slice));
1065  // Master-Tree invariant: we are in a page that contains this slice.
1066  // the only exception is that it's a backward-exclusive search and slice==high fence
1067  Route* route = cur_route();
1068  MasstreeBorderPage* border = reinterpret_cast<MasstreeBorderPage*>(route->page_);
1069  ASSERT_ND(border->get_low_fence() <= slice);
1070  ASSERT_ND(border->get_high_fence() >= slice);
1071  ASSERT_ND(search_type_ == kBackwardExclusive || border->within_fences(slice));
1072  ASSERT_ND(search_type_ != kBackwardExclusive
1073  || border->within_fences(slice)
1074  || border->get_high_fence() == slice);
1075 
1076  ASSERT_ND(!route->was_stably_moved());
1077 
1078  if (route->key_count_ == 0) {
1079  LOG(INFO) << "Huh, rare but possible. Cursor's Initial locate() hits an empty border page.";
1080  set_should_skip_cur_route();
1081  break;
1082  }
1083 
1084  uint8_t layer = border->get_layer();
1085  // find right record. be aware of backward-exclusive case!
1086  SlotIndex index;
1087 
1088  // no need for fast-path for supremum.
1089  // almost always supremum-search is for backward search, so anyway it finds it first.
1090  // if we have supremum-search for forward search, we miss opportunity, but who does it...
1091  // same for infimum-search for backward.
1092  if (search_type_ == kForwardExclusive || search_type_ == kForwardInclusive) {
1093  for (index = 0; index < route->key_count_; ++index) {
1094  SlotIndex record = route->get_original_index(index);
1095  if (border->get_slice(record) < slice) {
1096  // if slice is strictly smaller, we are sure it's not the record we want. skip without
1097  // reading. Because of key-immutability and purely-increasing key count, this doesn't
1098  // violate serializability
1099  continue;
1100  }
1101  CHECK_ERROR_CODE(fetch_cur_record_logical(border, record));
1102  ASSERT_ND(cur_key_in_layer_slice_ >= slice);
1103  KeyCompareResult result = compare_cur_key_aginst_search_key(slice, layer);
1104  if (result == kCurKeySmaller ||
1105  (result == kCurKeyEquals && search_type_ == kForwardExclusive)) {
1106  continue;
1107  }
1108 
1109  // okay, this seems to satisfy our search.
1110  break;
1111  }
1112  } else {
1113  for (index = route->key_count_ - 1; index < route->key_count_; --index) {
1114  SlotIndex record = route->get_original_index(index);
1115  if (border->get_slice(record) > slice) {
1116  continue;
1117  }
1118  CHECK_ERROR_CODE(fetch_cur_record_logical(border, record));
1119  ASSERT_ND(cur_key_in_layer_slice_ <= slice);
1120  KeyCompareResult result = compare_cur_key_aginst_search_key(slice, layer);
1121  if (result == kCurKeyLarger ||
1122  (result == kCurKeyEquals && search_type_ == kBackwardExclusive)) {
1123  continue;
1124  }
1125  break;
1126  }
1127  }
1128  route->index_ = index;
1129  // done about this page
1130  if (route->index_ >= route->key_count_) {
1131  DVLOG(2) << "Initial locate() hits page boundary.";
1132  set_should_skip_cur_route();
1133  }
1134  break;
1135  }
1136 
1137  if (is_valid_record() && is_cur_key_next_layer()) {
1138  return locate_next_layer();
1139  }
1140 
1141  return kErrorCodeOk;
1142 }

ErrorCode MasstreeCursor::locate_next_layer() {
  Route* route = cur_route();
  MasstreeBorderPage* border = reinterpret_cast<MasstreeBorderPage*>(route->page_);
  Layer layer = border->get_layer();
  KeySlice record_slice = border->get_slice(route->get_cur_original_index());
  cur_route_prefix_slices_[layer] = record_slice;
  assorted::write_bigendian<KeySlice>(
    record_slice,
    cur_route_prefix_be_ + (layer * sizeof(KeySlice)));
  DualPagePointer* pointer = border->get_next_layer(route->get_cur_original_index());
  MasstreePage* next;
  CHECK_ERROR_CODE(
    MasstreeStoragePimpl(&storage_).follow_page(context_, for_writes_, pointer, &next));
  CHECK_ERROR_CODE(push_route(next));
  return locate_layer(layer + 1U);
}

ErrorCode MasstreeCursor::locate_descend(KeySlice slice) {
  while (true) {
    // In intermediate pages, we do not care about foster twins.
    // Even without following them, the Master-Tree invariant guarantees that we will reach
    // the correct pages.
    Route* route = cur_route();
    ASSERT_ND(!route->page_->is_border());
    MasstreeIntermediatePage* cur = reinterpret_cast<MasstreeIntermediatePage*>(route->page_);
    ASSERT_ND(search_type_ == kBackwardExclusive || cur->within_fences(slice));
    ASSERT_ND(search_type_ != kBackwardExclusive
      || cur->within_fences(slice)
      || cur->get_high_fence() == slice);

    // find the right minipage. Be aware of the backward-exclusive case!
    SlotIndex index = 0;
    // fast path for supremum-search.
    if (search_key_in_layer_extremum_) {
      if (forward_cursor_) {
        index = 0;
      } else {
        index = route->key_count_;
      }
    } else if (search_type_ != kBackwardExclusive) {
      for (; index < route->key_count_; ++index) {
        if (slice < cur->get_separator(index)) {
          break;
        }
      }
    } else {
      for (; index < route->key_count_; ++index) {
        if (slice <= cur->get_separator(index)) {
          break;
        }
      }
    }
    route->index_ = index;
    MasstreeIntermediatePage::MiniPage& minipage = cur->get_minipage(route->index_);

    minipage.prefetch();
    route->key_count_mini_ = minipage.key_count_;

    SlotIndex index_mini = 0;
    if (search_key_in_layer_extremum_) {
      // fast path for supremum-search.
      if (forward_cursor_) {
        index_mini = 0;
      } else {
        index_mini = route->key_count_mini_;
      }
    } else if (search_type_ != kBackwardExclusive) {
      for (; index_mini < route->key_count_mini_; ++index_mini) {
        if (slice < minipage.separators_[index_mini]) {
          break;
        }
      }
    } else {
      for (; index_mini < route->key_count_mini_; ++index_mini) {
        if (slice <= minipage.separators_[index_mini]) {
          break;
        }
      }
    }
    route->index_mini_ = index_mini;

    KeySlice separator_low, separator_high;
    extract_separators(&separator_low, &separator_high);
    if (UNLIKELY(slice < separator_low || slice > separator_high)) {
      VLOG(0) << "Interesting5. separator doesn't match. concurrent adopt. local retry.";
      assorted::memory_fence_acquire();
      route->key_count_ = cur->get_key_count();
      continue;
    }
    ASSERT_ND(separator_low <= slice && slice <= separator_high);

    DualPagePointer& pointer = minipage.pointers_[route->index_mini_];
    ASSERT_ND(!pointer.is_both_null());
    MasstreePage* next;
    CHECK_ERROR_CODE(
      MasstreeStoragePimpl(&storage_).follow_page(context_, for_writes_, &pointer, &next));

    // Master-tree invariant:
    // verify that the followed page covers the key range we want.
    // As far as we check this, we don't need the hand-over-hand version verification.
    // What this page once covered will be forever reachable from this page.
    if (UNLIKELY(next->get_low_fence() != separator_low ||
      next->get_high_fence() != separator_high)) {
      VLOG(0) << "Interesting. separator doesn't match. concurrent adopt. local retry.";
      assorted::memory_fence_acquire();
      route->key_count_ = cur->get_key_count();
      continue;
    }

    route->latest_separator_ = forward_cursor_ ? separator_high : separator_low;
    ASSERT_ND(next->get_low_fence() <= slice);
    ASSERT_ND(next->get_high_fence() >= slice);
    CHECK_ERROR_CODE(push_route(next));
    if (next->is_border()) {
      return kErrorCodeOk;
    } else {
      continue;
    }
  }
}

/////////////////////////////////////////////////////////////////////////////////
//
// APIs to modify current record
//
/////////////////////////////////////////////////////////////////////////////////
ErrorCode MasstreeCursor::overwrite_record(
  const void* payload,
  PayloadLength payload_offset,
  PayloadLength payload_count) {
  assert_modify();
  char key[kMaxKeyLength];
  copy_combined_key(key);
  return MasstreeStoragePimpl(&storage_).overwrite_general(
    context_,
    cur_key_location_,
    key,
    cur_key_length_,
    payload,
    payload_offset,
    payload_count);
}

template <typename PAYLOAD>
ErrorCode MasstreeCursor::overwrite_record_primitive(
  PAYLOAD payload,
  PayloadLength payload_offset) {
  assert_modify();
  char key[kMaxKeyLength];
  copy_combined_key(key);
  return MasstreeStoragePimpl(&storage_).overwrite_general(
    context_,
    cur_key_location_,
    key,
    cur_key_length_,
    &payload,
    payload_offset,
    sizeof(PAYLOAD));
}

ErrorCode MasstreeCursor::delete_record() {
  assert_modify();
  char key[kMaxKeyLength];
  copy_combined_key(key);
  return MasstreeStoragePimpl(&storage_).delete_general(
    context_,
    cur_key_location_,
    key,
    cur_key_length_);
}

template <typename PAYLOAD>
ErrorCode MasstreeCursor::increment_record(PAYLOAD* value, PayloadLength payload_offset) {
  assert_modify();
  char key[kMaxKeyLength];
  copy_combined_key(key);
  return MasstreeStoragePimpl(&storage_).increment_general<PAYLOAD>(
    context_,
    cur_key_location_,
    key,
    cur_key_length_,
    value,
    payload_offset);
}

void MasstreeCursor::assert_route_impl() const {
  for (uint16_t i = 0; i + 1U < route_count_; ++i) {
    const Route* route = routes_ + i;
    ASSERT_ND(route->page_);
    ASSERT_ND(route->layer_ == route->page_->get_layer());
    if (route->was_stably_moved()) {
      // then we don't use any information in this path
    } else if (reinterpret_cast<Page*>(route->page_)->get_header().get_page_type()
      == kMasstreeBorderPageType) {
      ASSERT_ND(route->index_ < kMaxRecords);
      ASSERT_ND(route->index_ < route->key_count_);
    } else {
      ASSERT_ND(route->index_ <= route->key_count_);
      ASSERT_ND(route->index_ <= kMaxIntermediateSeparators);
      ASSERT_ND(route->index_mini_ <= route->key_count_mini_);
      ASSERT_ND(route->index_mini_ <= kMaxIntermediateMiniSeparators);
    }
  }
}

void MasstreeCursor::set_should_skip_cur_route() {
  Route* route = cur_route();
  ASSERT_ND(route->page_->is_border());
  should_skip_cur_route_ = true;
  // so that proceed_route will immediately pop this page.
  if (forward_cursor_) {
    route->index_ = route->key_count_;
  } else {
    route->index_ = 0;
  }
}

}  // namespace masstree
}  // namespace storage
}  // namespace foedus