masstree_page_impl.cpp
/*
 * Copyright (c) 2014-2015, Hewlett-Packard Development Company, LP.
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details. You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * HP designates this particular file as subject to the "Classpath" exception
 * as provided by HP in the LICENSE.txt file that accompanied this code.
 */
#include "foedus/storage/masstree/masstree_page_impl.hpp"

#include <glog/logging.h>

#include <algorithm>
#include <cstring>
#include <thread>
#include <vector>

#include "foedus/engine.hpp"
#include "foedus/assorted/assorted_func.hpp"
#include "foedus/assorted/atomic_fences.hpp"
#include "foedus/memory/engine_memory.hpp"
#include "foedus/memory/numa_node_memory.hpp"
#include "foedus/memory/page_pool.hpp"
#include "foedus/memory/page_resolver.hpp"
#include "foedus/storage/masstree/masstree_log_types.hpp"
#include "foedus/thread/thread.hpp"
#include "foedus/xct/xct.hpp"

namespace foedus {
namespace storage {
namespace masstree {

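// Common initialization shared by border and intermediate pages. We deliberately skip the
// full-page memset (see the comment below) and instead set every field that matters explicitly.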
void MasstreePage::initialize_volatile_common(
  StorageId storage_id,
  VolatilePagePointer page_id,
  PageType page_type,
  uint8_t layer,
  uint8_t level,
  KeySlice low_fence,
  KeySlice high_fence) {
  // std::memset(this, 0, kPageSize);  // expensive
  header_.init_volatile(page_id, storage_id, page_type);
  header_.masstree_layer_ = layer;
  header_.masstree_in_layer_level_ = level;
  ASSERT_ND((page_type == kMasstreeIntermediatePageType) == (level > 0));
  high_fence_ = high_fence;
  low_fence_ = low_fence;
  foster_fence_ = low_fence;
  foster_twin_[0].word = 0;
  foster_twin_[1].word = 0;
  ASSERT_ND(get_key_count() == 0);
}

void MasstreePage::initialize_snapshot_common(
  StorageId storage_id,
  SnapshotPagePointer page_id,
  PageType page_type,
  uint8_t layer,
  uint8_t level,
  KeySlice low_fence,
  KeySlice high_fence) {
  header_.init_snapshot(page_id, storage_id, page_type);
  header_.masstree_layer_ = layer;
  header_.masstree_in_layer_level_ = level;
  ASSERT_ND((page_type == kMasstreeIntermediatePageType) == (level > 0));
  high_fence_ = high_fence;
  low_fence_ = low_fence;
  foster_fence_ = low_fence;
  foster_twin_[0].word = 0;
  foster_twin_[1].word = 0;
  ASSERT_ND(get_key_count() == 0);
}

void MasstreeIntermediatePage::initialize_volatile_page(
  StorageId storage_id,
  VolatilePagePointer page_id,
  uint8_t layer,
  uint8_t level,
  KeySlice low_fence,
  KeySlice high_fence) {
  initialize_volatile_common(
    storage_id,
    page_id,
    kMasstreeIntermediatePageType,
    layer,
    level,
    low_fence,
    high_fence);
  for (uint16_t i = 0; i <= kMaxIntermediateSeparators; ++i) {
    get_minipage(i).key_count_ = 0;
  }
}

void MasstreeIntermediatePage::initialize_snapshot_page(
  StorageId storage_id,
  SnapshotPagePointer page_id,
  uint8_t layer,
  uint8_t level,
  KeySlice low_fence,
  KeySlice high_fence) {
  initialize_snapshot_common(
    storage_id,
    page_id,
    kMasstreeIntermediatePageType,
    layer,
    level,
    low_fence,
    high_fence);
  for (uint16_t i = 0; i <= kMaxIntermediateSeparators; ++i) {
    get_minipage(i).key_count_ = 0;
  }
}

void MasstreeBorderPage::initialize_volatile_page(
  StorageId storage_id,
  VolatilePagePointer page_id,
  uint8_t layer,
  KeySlice low_fence,
  KeySlice high_fence) {
  initialize_volatile_common(
    storage_id,
    page_id,
    kMasstreeBorderPageType,
    layer,
    0,
    low_fence,
    high_fence);
  consecutive_inserts_ = true;  // initially key_count = 0, so of course sorted
  next_offset_ = 0;  // well, already implicitly zero-ed, but to be clear
}

void MasstreeBorderPage::initialize_snapshot_page(
  StorageId storage_id,
  SnapshotPagePointer page_id,
  uint8_t layer,
  KeySlice low_fence,
  KeySlice high_fence) {
  initialize_snapshot_common(
    storage_id,
    page_id,
    kMasstreeBorderPageType,
    layer,
    0,
    low_fence,
    high_fence);
  consecutive_inserts_ = true;  // snapshot pages are always completely sorted
  next_offset_ = 0;  // well, already implicitly zero-ed, but to be clear
}

void MasstreePage::release_pages_recursive_common(
  const memory::GlobalVolatilePageResolver& page_resolver,
  memory::PageReleaseBatch* batch) {
  if (header_.get_page_type() == kMasstreeBorderPageType) {
    MasstreeBorderPage* casted = reinterpret_cast<MasstreeBorderPage*>(this);
    casted->release_pages_recursive(page_resolver, batch);
  } else {
    ASSERT_ND(header_.get_page_type() == kMasstreeIntermediatePageType);
    MasstreeIntermediatePage* casted = reinterpret_cast<MasstreeIntermediatePage*>(this);
    casted->release_pages_recursive(page_resolver, batch);
  }
}

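// Helper used by release_pages_recursive_parallel() below. Defined as a free function so it
// can be handed to std::thread; it recursively releases every volatile page reachable from
// the given pointer, batching the freed pages and returning them to their pools at the end.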
void release_parallel(Engine* engine, VolatilePagePointer pointer) {
  const memory::GlobalVolatilePageResolver& page_resolver
    = engine->get_memory_manager()->get_global_volatile_page_resolver();
  MasstreePage* p = reinterpret_cast<MasstreePage*>(page_resolver.resolve_offset(pointer));
  memory::PageReleaseBatch release_batch(engine);
  p->release_pages_recursive_common(page_resolver, &release_batch);
  release_batch.release_all();
}

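// This method is used when we release a large number of volatile pages, most likely when we
// drop a storage. A hypothetical call site (sketch only; the actual drop-storage path may differ):
//   MasstreeIntermediatePage* root = ...;  // volatile root page of the storage being dropped
//   root->release_pages_recursive_parallel(engine);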
void MasstreeIntermediatePage::release_pages_recursive_parallel(Engine* engine) {
  // so far, we spawn a thread for every single pointer.
  // it might be an oversubscription, but not a big issue.
  std::vector<std::thread> threads;
  if (is_moved()) {
    for (int i = 0; i < 2; ++i) {
      ASSERT_ND(!foster_twin_[i].is_null());
      threads.emplace_back(release_parallel, engine, foster_twin_[i]);
    }
    // if this page is moved, following pointers in this page results in double-release.
    // we just follow pointers in non-moved page.
  } else {
    uint16_t key_count = get_key_count();
    ASSERT_ND(key_count <= kMaxIntermediateSeparators);
    for (uint8_t i = 0; i < key_count + 1; ++i) {
      MiniPage& minipage = get_minipage(i);
      uint16_t mini_count = minipage.key_count_;
      ASSERT_ND(mini_count <= kMaxIntermediateMiniSeparators);
      for (uint8_t j = 0; j < mini_count + 1; ++j) {
        VolatilePagePointer pointer = minipage.pointers_[j].volatile_pointer_;
        if (!pointer.is_null()) {
          threads.emplace_back(release_parallel, engine, pointer);
        }
      }
    }
  }

  for (auto& t : threads) {
    t.join();
  }

  VolatilePagePointer volatile_id;
  volatile_id.word = header().page_id_;
  memory::PagePool* pool = engine->get_memory_manager()->get_node_memory(
    volatile_id.get_numa_node())->get_volatile_pool();
  pool->release_one(volatile_id.get_offset());
}

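// Serial release used via PageReleaseBatch: if this page is moved we release its foster twins,
// otherwise the child pages of every mini-page, and finally this page itself.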
void MasstreeIntermediatePage::release_pages_recursive(
  const memory::GlobalVolatilePageResolver& page_resolver,
  memory::PageReleaseBatch* batch) {
  if (!is_empty_range()) {
    if (is_moved()) {
      for (int i = 0; i < 2; ++i) {
        ASSERT_ND(!foster_twin_[i].is_null());
        MasstreeIntermediatePage* p =
          reinterpret_cast<MasstreeIntermediatePage*>(
            page_resolver.resolve_offset(foster_twin_[i]));
        p->release_pages_recursive(page_resolver, batch);
        foster_twin_[i].word = 0;
      }
    } else {
      uint16_t key_count = get_key_count();
      ASSERT_ND(key_count <= kMaxIntermediateSeparators);
      for (uint8_t i = 0; i < key_count + 1; ++i) {
        MiniPage& minipage = get_minipage(i);
        uint16_t mini_count = minipage.key_count_;
        ASSERT_ND(mini_count <= kMaxIntermediateMiniSeparators);
        for (uint8_t j = 0; j < mini_count + 1; ++j) {
          VolatilePagePointer pointer = minipage.pointers_[j].volatile_pointer_;
          if (!pointer.is_null()) {
            MasstreePage* child = reinterpret_cast<MasstreePage*>(
              page_resolver.resolve_offset(pointer));
            child->release_pages_recursive_common(page_resolver, batch);
          }
        }
      }
    }
  } else {
    ASSERT_ND(!is_moved());
    ASSERT_ND(get_key_count() == 0);
  }

  VolatilePagePointer volatile_id;
  volatile_id.word = header().page_id_;
  batch->release(volatile_id);
}

void MasstreeBorderPage::release_pages_recursive(
  const memory::GlobalVolatilePageResolver& page_resolver,
  memory::PageReleaseBatch* batch) {
  if (is_moved()) {
    for (int i = 0; i < 2; ++i) {
      ASSERT_ND(!foster_twin_[i].is_null());
      MasstreeBorderPage* p
        = reinterpret_cast<MasstreeBorderPage*>(page_resolver.resolve_offset(foster_twin_[i]));
      p->release_pages_recursive(page_resolver, batch);
      foster_twin_[i].word = 0;
    }
  } else {
    SlotIndex key_count = get_key_count();
    ASSERT_ND(key_count <= kBorderPageMaxSlots);
    for (SlotIndex i = 0; i < key_count; ++i) {
      if (does_point_to_layer(i)) {
        DualPagePointer* pointer = get_next_layer(i);
        if (!pointer->volatile_pointer_.is_null()) {
          MasstreePage* child = reinterpret_cast<MasstreePage*>(
            page_resolver.resolve_offset(pointer->volatile_pointer_));
          child->release_pages_recursive_common(page_resolver, batch);
        }
      }
    }
  }

  VolatilePagePointer volatile_id;
  volatile_id.word = header().page_id_;
  batch->release(volatile_id);
}

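// Copies the initial record that will be the only record of a brand-new layer-root page.
// The caller must hold the key-lock on the source record; this page itself is new and unlocked.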
void MasstreeBorderPage::initialize_layer_root(
  const MasstreeBorderPage* copy_from,
  SlotIndex copy_index) {
  ASSERT_ND(get_key_count() == 0);
  ASSERT_ND(!is_locked());  // we don't lock a new page
  ASSERT_ND(copy_from->get_owner_id(copy_index)->is_keylocked());

  // This is a new page, so no worry on race.
  const SlotIndex new_index = 0;
  Slot* slot = get_new_slot(new_index);
  const Slot* parent_slot = copy_from->get_slot(copy_index);
  const SlotLengthPart parent_lengthes = parent_slot->lengthes_.components;
  const char* parent_record = copy_from->get_record_from_offset(parent_lengthes.offset_);

  KeyLength parent_remainder = parent_slot->remainder_length_;
  ASSERT_ND(parent_remainder != kInitiallyNextLayer);
  ASSERT_ND(parent_remainder > sizeof(KeySlice));
  KeyLength remainder = parent_remainder - sizeof(KeySlice);

  // retrieve the first 8 byte (or less) as the new slice.
  const KeySlice new_slice = slice_key(parent_record, parent_remainder);
  const PayloadLength payload_length = parent_lengthes.payload_length_;
  const KeyLength suffix_length_aligned = calculate_suffix_length_aligned(remainder);
  const DataOffset record_size = to_record_length(remainder, payload_length);
  const DataOffset new_offset = next_offset_;

  set_slice(new_index, new_slice);

  ASSERT_ND(next_offset_ == 0);
  slot->lengthes_.components.offset_ = new_offset;
  slot->lengthes_.components.unused_ = 0;
  slot->lengthes_.components.physical_record_length_ = record_size;
  slot->lengthes_.components.payload_length_ = payload_length;
  slot->original_physical_record_length_ = record_size;
  slot->remainder_length_ = remainder;
  slot->original_offset_ = new_offset;
  next_offset_ += record_size;
  consecutive_inserts_ = true;

  // use the same xct ID. This means we also inherit deleted flag.
  slot->tid_.xct_id_ = parent_slot->tid_.xct_id_;
  // but we don't want to inherit locks
  slot->tid_.lock_.reset();
  if (suffix_length_aligned > 0) {
    // because suffix parts are 8-byte aligned with zero padding, we can memcpy in 8-bytes unit
    std::memcpy(
      get_record_from_offset(new_offset),
      parent_record + sizeof(KeySlice),
      suffix_length_aligned);
  }
  if (payload_length > 0) {
    std::memcpy(
      get_record_payload_from_offsets(new_offset, remainder),
      copy_from->get_record_payload(copy_index),
      assorted::align8(payload_length));  // payload is zero-padded to 8 bytes, too.
  }

  // as we don't lock this page, we directly increment it to avoid is_locked assertion
  header_.increment_key_count();
}

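// A physical-only method to expand a record within this page without any logical change.
// Returns true when the record now has enough space, false when the page must be split first.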
bool MasstreeBorderPage::try_expand_record_in_page_physical(
  PayloadLength payload_count,
  SlotIndex record_index) {
  ASSERT_ND(is_locked());
  ASSERT_ND(!is_moved());
  ASSERT_ND(record_index < get_key_count());
  DVLOG(2) << "Expanding record.. current max=" << get_max_payload_length(record_index)
    << ", which must become " << payload_count;

  ASSERT_ND(verify_slot_lengthes(record_index));
  Slot* old_slot = get_slot(record_index);
  ASSERT_ND(!old_slot->tid_.is_moved());
  ASSERT_ND(old_slot->tid_.is_keylocked());
  ASSERT_ND(!old_slot->does_point_to_layer());
  const SlotLengthPart lengthes = old_slot->read_lengthes_oneshot();
  const KeyLength remainder_length = old_slot->remainder_length_;
  const DataOffset record_length = to_record_length(remainder_length, payload_count);
  const DataOffset available = available_space();

  // 1. Trivial expansion if the record is placed at last. Fastest.
  if (get_next_offset() == lengthes.offset_ + lengthes.physical_record_length_) {
    const DataOffset diff = record_length - lengthes.physical_record_length_;
    DVLOG(1) << "Lucky, expanding a record at last record region. diff=" << diff;
    if (available >= diff) {
      DVLOG(2) << "woo. yes, we can just increase the length";
      old_slot->lengthes_.components.physical_record_length_ = record_length;
      assorted::memory_fence_release();
      increase_next_offset(diff);
      assorted::memory_fence_release();
      return true;
    }
  }

  // 2. In-page expansion. Fast.
  if (available >= record_length) {
    DVLOG(2) << "Okay, in-page record expansion.";
    // We have to make sure all threads see a valid state, either new or old.
    SlotLengthPart new_lengthes = lengthes;
    new_lengthes.offset_ = get_next_offset();
    new_lengthes.physical_record_length_ = record_length;
    const char* old_record = get_record_from_offset(lengthes.offset_);
    char* new_record = get_record_from_offset(new_lengthes.offset_);

    // 2-a. Create the new record region.
    if (lengthes.physical_record_length_ > 0) {
      std::memcpy(new_record, old_record, lengthes.physical_record_length_);
      assorted::memory_fence_release();
    }

    // 2-b. announce the new location in one-shot.
    old_slot->write_lengthes_oneshot(new_lengthes);
    assorted::memory_fence_release();
    // We don't have to change TID here because we did nothing logically.
    // Reading transactions are safe to read either old or new record regions.
    // See comments in MasstreeCommonLogType::apply_record_prepare() for how we make it safe
    // for writing transactions.

    increase_next_offset(record_length);
    assorted::memory_fence_release();
    return true;
  }

  // 3. ouch. we need to split the page to complete it. beyond this method.
  DVLOG(1) << "Umm, we need to split this page for record expansion. available="
    << available << ", record_length=" << record_length
    << ", record_index=" << record_index
    << ", key_count=" << get_key_count();
  return false;
}

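// A physical-only method to initialize this (new) page as a volatile page of a layer-root
// pointed from the given parent record. The caller must have locked the parent record and
// reserved enough payload space (sizeof(DualPagePointer)) in it.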
void MasstreeBorderPage::initialize_as_layer_root_physical(
  VolatilePagePointer page_id,
  MasstreeBorderPage* parent,
  SlotIndex parent_index) {
  // This method assumes that the record's payload space is spacious enough.
  // The caller must make it sure as pre-condition.
  ASSERT_ND(parent->is_locked());
  ASSERT_ND(!parent->is_moved());
  Slot* parent_slot = parent->get_slot(parent_index);
  ASSERT_ND(parent_slot->tid_.is_keylocked());
  ASSERT_ND(!parent_slot->tid_.is_moved());
  ASSERT_ND(!parent_slot->does_point_to_layer());
  ASSERT_ND(parent->get_max_payload_length(parent_index) >= sizeof(DualPagePointer));
  DualPagePointer pointer;
  pointer.snapshot_pointer_ = 0;
  pointer.volatile_pointer_ = page_id;

  // initialize the root page by copying the record
  initialize_volatile_page(
    parent->header_.storage_id_,
    page_id,
    parent->get_layer() + 1,
    kInfimumSlice,    // infimum slice
    kSupremumSlice);  // high-fence is supremum
  ASSERT_ND(!is_locked());
  initialize_layer_root(parent, parent_index);
  ASSERT_ND(!is_moved());
  ASSERT_ND(!is_retired());

  SlotLengthPart new_lengthes = parent_slot->read_lengthes_oneshot();
  new_lengthes.payload_length_ = sizeof(DualPagePointer);
  char* parent_payload = parent->get_record_payload(parent_index);

  // point to the new page. Be careful on ordering.
  std::memcpy(parent_payload, &pointer, sizeof(pointer));
  assorted::memory_fence_release();
  parent_slot->write_lengthes_oneshot(new_lengthes);
  assorted::memory_fence_release();
  parent_slot->tid_.xct_id_.set_next_layer();  // which also turns off delete-bit

  ASSERT_ND(parent->get_next_layer(parent_index)->volatile_pointer_ == page_id);
}

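// Debug-only sanity check: every separator in this intermediate page, and in each of its
// mini-pages, must fall strictly between the enclosing fences. Compiled out in release builds.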
void MasstreeIntermediatePage::verify_separators() const {
#ifndef NDEBUG
  if (is_empty_range()) {
    ASSERT_ND(get_key_count() == 0);
    ASSERT_ND(!is_moved());
    return;
  }
  for (uint8_t i = 0; i <= get_key_count(); ++i) {
    KeySlice low, high;
    if (i < get_key_count()) {
      if (i > 0) {
        low = separators_[i - 1];
      } else {
        low = low_fence_;
      }
      high = separators_[i];
      ASSERT_ND(separators_[i] > low);
    } else {
      ASSERT_ND(i == get_key_count());
      if (i == 0) {
        low = low_fence_;
      } else {
        low = separators_[i - 1];
      }
      high = high_fence_;
    }
    const MiniPage& minipage = get_minipage(i);
    for (uint8_t j = 0; j <= minipage.key_count_; ++j) {
      ASSERT_ND(!minipage.pointers_[j].is_both_null());
      if (j < minipage.key_count_) {
        ASSERT_ND(minipage.separators_[j] > low);
        ASSERT_ND(minipage.separators_[j] < high);
      }
    }
  }
#endif  // NDEBUG
}

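// Follows the foster-twin chain from this page until it reaches the non-moved border page
// whose fence range covers the given slice.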
MasstreeBorderPage* MasstreeBorderPage::track_foster_child(
  KeySlice slice,
  const memory::GlobalVolatilePageResolver& resolver) {
  MasstreeBorderPage* cur_page = this;
  while (cur_page->is_moved()) {
    ASSERT_ND(cur_page->has_foster_child());
    ASSERT_ND(!cur_page->is_empty_range());
    if (cur_page->within_foster_minor(slice)) {
      ASSERT_ND(!cur_page->within_foster_major(slice));
      cur_page = reinterpret_cast<MasstreeBorderPage*>(
        resolver.resolve_offset(cur_page->get_foster_minor()));
    } else {
      ASSERT_ND(cur_page->within_foster_major(slice));
      cur_page = reinterpret_cast<MasstreeBorderPage*>(
        resolver.resolve_offset(cur_page->get_foster_major()));
    }
    ASSERT_ND(!cur_page->is_empty_range());
  }
  return cur_page;
}

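// Tracks a record that was physically moved by a page split or a next-layer migration so that
// the transaction can re-point its access set to the new location. Returns an empty result
// when concurrent updates make the record impossible to find; the transaction then just aborts.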
xct::TrackMovedRecordResult MasstreeBorderPage::track_moved_record(
  Engine* engine,
  xct::RwLockableXctId* owner_address,
  xct::WriteXctAccess* /*write_set*/) {
  ASSERT_ND(owner_address->is_moved() || owner_address->is_next_layer());
  ASSERT_ND(!header().snapshot_);
  ASSERT_ND(header().get_page_type() == kMasstreeBorderPageType);

  // Slot begins with TID, so it's also the slot address
  Slot* slot = reinterpret_cast<Slot*>(owner_address);
  ASSERT_ND(&slot->tid_ == owner_address);
  const SlotIndex original_index = to_slot_index(slot);
  ASSERT_ND(original_index < kBorderPageMaxSlots);
  ASSERT_ND(original_index < get_key_count());

  const KeyLength remainder = slot->remainder_length_;
  // If it's originally a next-layer record, why we took it as a record? This shouldn't happen.
  ASSERT_ND(remainder != kInitiallyNextLayer);

  if (does_point_to_layer(original_index)) {
    return track_moved_record_next_layer(engine, owner_address);
  }

  // otherwise, we can track without key information within this layer.
  // the slice and key length is enough to identify the record.
  ASSERT_ND(is_moved());
  ASSERT_ND(has_foster_child());
  ASSERT_ND(!is_foster_minor_null());
  ASSERT_ND(!is_foster_major_null());
  const char* suffix = get_record(original_index);
  KeySlice slice = get_slice(original_index);

  // if remainder <= 8 : this layer can have only one record that has this slice and this length.
  // if remainder > 8 : this layer has either the exact record, or it's now a next layer pointer.

  // recursively track. although probably it's only one level
  MasstreeBorderPage* cur_page = this;
  const memory::GlobalVolatilePageResolver& resolver
    = engine->get_memory_manager()->get_global_volatile_page_resolver();
  while (true) {
    cur_page = cur_page->track_foster_child(slice, resolver);

    // now cur_page must be the page that contains the record.
    // the only exception is
    // 1) again the record is being moved concurrently
    // 2) the record was moved to another layer
    SlotIndex index = cur_page->find_key(slice, suffix, remainder);
    if (index == kBorderPageMaxSlots) {
      // this can happen rarely because we are not doing the stable version trick here.
      // this is rare, so we just abort. no safety violation.
      VLOG(0) << "Very interesting. moved record not found due to concurrent updates";
      return xct::TrackMovedRecordResult();
    }

    Slot* cur_slot = cur_page->get_slot(index);
    xct::RwLockableXctId* new_owner_address = &cur_slot->tid_;
    char* new_record_address = cur_page->get_record(index);
    if (cur_page->does_point_to_layer(index)) {
      // another rare case. the record has been now moved to another layer.
      if (remainder <= sizeof(KeySlice)) {
        // the record we are looking for can't be stored in next layer..
        VLOG(0) << "Wtf 2. moved record in next layer not found. Probably due to concurrent thread";
        return xct::TrackMovedRecordResult();
      }

      VLOG(0) << "Interesting. moved record are now in another layer. further track.";
      return cur_page->track_moved_record_next_layer(engine, new_owner_address);
    }

    // Otherwise, this is it!
    // be careful, we give get_record() as "payload". the word "payload" is a bit overused here.
    return xct::TrackMovedRecordResult(new_owner_address, new_record_address);
  }
}

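// This one further tracks the moved record into the next layer: it re-slices the original key,
// descends the next-layer tree from its root, and keeps following foster twins and deeper
// layers until the record is found or a concurrent update makes it untrackable.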
xct::TrackMovedRecordResult MasstreeBorderPage::track_moved_record_next_layer(
  Engine* engine,
  xct::RwLockableXctId* owner_address) {
  ASSERT_ND(!header().snapshot_);
  ASSERT_ND(header().get_page_type() == kMasstreeBorderPageType);
  ASSERT_ND(owner_address->xct_id_.is_next_layer());

  Slot* slot = reinterpret_cast<Slot*>(owner_address);
  ASSERT_ND(&slot->tid_ == owner_address);
  const SlotIndex original_index = to_slot_index(slot);
  ASSERT_ND(original_index < kBorderPageMaxSlots);
  ASSERT_ND(original_index < get_key_count());
  ASSERT_ND(does_point_to_layer(original_index));

  // The new masstree page layout leaves the original suffix untouched.
  // Thus, we can always retrieve the original key from it no matter whether
  // the record moved offset to expand. In case the record was expanded in-page,
  // we use original_xxx below.
  const KeyLength remainder = slot->remainder_length_;
  ASSERT_ND(remainder != kInitiallyNextLayer);
  if (UNLIKELY(remainder <= sizeof(KeySlice))) {
    LOG(ERROR) << "WTF. The original record was too short to get moved to next-layer, but it did?";
    return xct::TrackMovedRecordResult();
  }

  const char* original_suffix = get_record(original_index);
  const uint8_t cur_layer = get_layer();
  const uint8_t next_layer = cur_layer + 1U;
  const KeySlice next_slice = slice_key(original_suffix, remainder - sizeof(KeySlice));
  const KeyLength next_remainder = remainder - sizeof(KeySlice);
  const char* next_suffix = original_suffix + sizeof(KeySlice);

  VolatilePagePointer root_pointer = get_next_layer(original_index)->volatile_pointer_;
  if (root_pointer.is_null()) {
    VLOG(0) << "Wtf. moved record in next layer not found. Probably due to concurrent thread";
    return xct::TrackMovedRecordResult();
  }

  const memory::GlobalVolatilePageResolver& resolver
    = engine->get_memory_manager()->get_global_volatile_page_resolver();
  MasstreePage* cur_page
    = reinterpret_cast<MasstreeBorderPage*>(resolver.resolve_offset(root_pointer));
  ASSERT_ND(cur_page->get_low_fence() == kInfimumSlice && cur_page->is_high_fence_supremum());
  ASSERT_ND(cur_page->get_layer() == next_layer);

  while (true) {
    ASSERT_ND(cur_page->get_layer() == next_layer);
    ASSERT_ND(cur_page->within_fences(next_slice));

    if (!cur_page->is_border()) {
      MasstreeIntermediatePage* casted = reinterpret_cast<MasstreeIntermediatePage*>(cur_page);
      uint8_t index = casted->find_minipage(next_slice);
      MasstreeIntermediatePage::MiniPage& minipage = casted->get_minipage(index);
      uint8_t index_mini = minipage.find_pointer(next_slice);
      VolatilePagePointer pointer = minipage.pointers_[index_mini].volatile_pointer_;
      if (pointer.is_null()) {
        VLOG(0) << "Wtf 1. moved record in next layer not found. Probably due to concurrent thread";
        return xct::TrackMovedRecordResult();
      }
      cur_page = reinterpret_cast<MasstreePage*>(resolver.resolve_offset(pointer));
      ASSERT_ND(cur_page->get_layer() == next_layer);
      ASSERT_ND(cur_page->within_fences(next_slice));
      continue;
    }

    // now cur_page must be the page that contains the record.
    // the only exception is
    // 1) again the record is being moved concurrently
    // 2) the record was moved to another layer
    MasstreeBorderPage* casted = reinterpret_cast<MasstreeBorderPage*>(cur_page);
    // we track foster child in border pages only
    casted = casted->track_foster_child(next_slice, resolver);
    ASSERT_ND(casted != this);
    SlotIndex index = casted->find_key(next_slice, next_suffix, next_remainder);
    if (index == kBorderPageMaxSlots) {
      VLOG(0) << "Very interesting. moved record not found due to concurrent updates";
      return xct::TrackMovedRecordResult();
    }

    xct::RwLockableXctId* new_owner_address = casted->get_owner_id(index);
    ASSERT_ND(new_owner_address != owner_address);
    char* new_record_address = casted->get_record(index);
    if (casted->does_point_to_layer(index)) {
      // the record has been now moved to yet another layer.
      if (next_remainder <= sizeof(KeySlice)) {
        // the record we are looking for can't be stored in next layer..
        VLOG(0) << "Wtf 2. moved record in next layer not found. Probably due to concurrent thread";
        return xct::TrackMovedRecordResult();
      }

      VLOG(0) << "Interesting. moved record are now in another layer. further track.";
      return casted->track_moved_record_next_layer(engine, new_owner_address);
    }

    // be careful, we give get_record() as "payload". the word "payload" is a bit overused here.
    return xct::TrackMovedRecordResult(new_owner_address, new_record_address);
  }
}

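// Debug-time consistency check of one slot: offsets and physical lengths must be 8-byte
// aligned and within the data region, a record may move in-page only to a larger region, and
// next-layer slots must carry exactly a DualPagePointer as payload. Returns false on violation.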
bool MasstreeBorderPage::verify_slot_lengthes(SlotIndex index) const {
  const Slot* slot = get_slot(index);
  SlotLengthPart lengthes = slot->lengthes_.components;
  if (lengthes.offset_ % 8 != 0) {
    ASSERT_ND(false);
    return false;
  }
  if (lengthes.physical_record_length_ % 8 != 0) {
    ASSERT_ND(false);
    return false;
  }
  if (lengthes.offset_ + lengthes.physical_record_length_ > sizeof(data_)) {
    ASSERT_ND(false);
    return false;
  }

  if (slot->remainder_length_ == kInitiallyNextLayer) {
    if (!slot->does_point_to_layer()) {
      ASSERT_ND(false);
      return false;
    }
  }
  // the only case we move the record in-page is to get a larger record size.
  if (lengthes.offset_ != slot->original_offset_) {
    if (lengthes.offset_ < slot->original_offset_) {
      ASSERT_ND(false);
      return false;
    }
    if (lengthes.physical_record_length_ <= slot->original_physical_record_length_) {
      ASSERT_ND(false);
      return false;
    }
  }

  if (slot->does_point_to_layer()) {
    if (slot->remainder_length_ <= sizeof(KeySlice)) {
      ASSERT_ND(false);
      return false;
    }
    if (lengthes.payload_length_ != sizeof(DualPagePointer)) {
      ASSERT_ND(false);
      return false;
    }
  } else {
    if (lengthes.payload_length_ > slot->get_max_payload_peek()) {
      ASSERT_ND(false);
      return false;
    }
  }
  return true;
}

}  // namespace masstree
}  // namespace storage
}  // namespace foedus