libfoedus-core
FOEDUS Core Library
masstree_split_impl.cpp
/*
 * Copyright (c) 2014-2015, Hewlett-Packard Development Company, LP.
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details. You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * HP designates this particular file as subject to the "Classpath" exception
 * as provided by HP in the LICENSE.txt file that accompanied this code.
 */
#include "foedus/storage/masstree/masstree_split_impl.hpp"

#include <glog/logging.h>

#include <algorithm>
#include <cstring>

#include "foedus/assert_nd.hpp"
#include "foedus/assorted/atomic_fences.hpp"
#include "foedus/assorted/uniform_random.hpp"
#include "foedus/debugging/rdtsc_watch.hpp"
#include "foedus/memory/page_resolver.hpp"
#include "foedus/storage/masstree/masstree_page_impl.hpp"
#include "foedus/thread/thread.hpp"

namespace foedus {
namespace storage {
namespace masstree {

ErrorCode SplitBorder::run(xct::SysxctWorkspace* sysxct_workspace) {
  debugging::RdtscWatch watch;
  DVLOG(1) << "Splitting a page... ";

  // First, lock the page. The page's lock state comes before all the records in the page,
  // so we can simply lock it first.
  CHECK_ERROR_CODE(context_->sysxct_page_lock(sysxct_workspace, reinterpret_cast<Page*>(target_)));

  // The lock involves an atomic operation, so everything we now see is finalized.
  if (target_->has_foster_child()) {
    DVLOG(0) << "Interesting. the page has been already split";
    return kErrorCodeOk;
  }
  if (target_->get_key_count() <= 1U) {
    DVLOG(0) << "This page has too few records. Can't split it";
    return kErrorCodeOk;
  }

  const SlotIndex key_count = target_->get_key_count();

  // 2 free volatile pages needed.
  // foster-minor/major (will be placed in successful case)
  memory::PagePoolOffset offsets[2];
  thread::GrabFreeVolatilePagesScope free_pages_scope(context_, offsets);
  CHECK_ERROR_CODE(free_pages_scope.grab(2));
  const auto& resolver = context_->get_local_volatile_page_resolver();

  SplitStrategy strategy;  // small. just place it on stack
  decide_strategy(&strategy);
  ASSERT_ND(target_->get_low_fence() <= strategy.mid_slice_);
  MasstreeBorderPage* twin[2];
  VolatilePagePointer new_page_ids[2];
  for (int i = 0; i < 2; ++i) {
    twin[i] = reinterpret_cast<MasstreeBorderPage*>(resolver.resolve_offset_newpage(offsets[i]));
    new_page_ids[i].set(context_->get_numa_node(), offsets[i]);
    twin[i]->initialize_volatile_page(
      target_->header().storage_id_,
      new_page_ids[i],
      target_->get_layer(),
      i == 0 ? target_->get_low_fence() : strategy.mid_slice_,    // low-fence
      i == 0 ? strategy.mid_slice_ : target_->get_high_fence());  // high-fence
  }

  // lock all records
  CHECK_ERROR_CODE(lock_existing_records(sysxct_workspace));

  if (strategy.no_record_split_) {
    // in this case, we can move all records in one memcpy.
    // well, actually two: one for slices and another for data.
    std::memcpy(twin[0]->slices_, target_->slices_, sizeof(KeySlice) * key_count);
    std::memcpy(twin[0]->data_, target_->data_, sizeof(target_->data_));
    twin[0]->set_key_count(key_count);
    twin[1]->set_key_count(0);
    twin[0]->consecutive_inserts_ = target_->is_consecutive_inserts();
    twin[1]->consecutive_inserts_ = true;
    twin[0]->next_offset_ = target_->get_next_offset();
    twin[1]->next_offset_ = 0;
    for (SlotIndex i = 0; i < key_count; ++i) {
      xct::RwLockableXctId* owner_id = twin[0]->get_owner_id(i);
      ASSERT_ND(owner_id->is_keylocked());
      owner_id->get_key_lock()->reset();  // no race
    }
  } else {
    migrate_records(
      strategy.smallest_slice_,
      strategy.mid_slice_ - 1,  // to make it inclusive
      twin[0]);
    migrate_records(
      strategy.mid_slice_,
      strategy.largest_slice_,  // this is inclusive (to avoid supremum hassles)
      twin[1]);
  }

  // Now we will install the new pages. **From now on no error-return allowed**
  // We install pointers to the pages AFTER we initialize the pages.
  target_->install_foster_twin(new_page_ids[0], new_page_ids[1], strategy.mid_slice_);
  free_pages_scope.dispatch(0);
  free_pages_scope.dispatch(1);

  // invoking set_moved is the point we announce all of these changes. take fence to make it right
  assorted::memory_fence_release();
  target_->set_moved();

  // set the "moved" bit so that concurrent transactions
  // check foster-twin for read-set/write-set checks.
  for (SlotIndex i = 0; i < key_count; ++i) {
    xct::RwLockableXctId* owner_id = target_->get_owner_id(i);
    owner_id->xct_id_.set_moved();
  }

  watch.stop();
  DVLOG(1) << "Costed " << watch.elapsed() << " cycles to split a page. original page physical"
    << " record count: " << static_cast<int>(key_count)
    << "->" << static_cast<int>(twin[0]->get_key_count())
    << " + " << static_cast<int>(twin[1]->get_key_count());
  return kErrorCodeOk;
}
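
// Illustrative sketch by the editor, not part of FOEDUS: the fences chosen above mean that
// any record whose slice is below strategy.mid_slice_ lands in foster-minor and the rest in
// foster-major. A hypothetical helper showing how a reader would route a key slice once it
// observes the "moved" bit (plain uint64_t stands in for KeySlice; <cstdint> assumed):
namespace editor_example {
inline int route_after_split(uint64_t slice, uint64_t foster_fence) {
  return slice < foster_fence ? 0 /* foster-minor */ : 1 /* foster-major */;
}
}  // namespace editor_example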

void SplitBorder::decide_strategy(SplitStrategy* out) const {
  ASSERT_ND(target_->is_locked());
  const SlotIndex key_count = target_->get_key_count();
  ASSERT_ND(key_count > 0);
  out->original_key_count_ = key_count;
  out->no_record_split_ = false;
  out->smallest_slice_ = target_->get_slice(0);
  out->largest_slice_ = target_->get_slice(0);

  // if consecutive_inserts_, we are already sure about the key distributions, so easy.
  if (target_->is_consecutive_inserts()) {
    out->largest_slice_ = target_->get_slice(key_count - 1);
    if (!disable_no_record_split_ && trigger_ > out->largest_slice_) {
      out->no_record_split_ = true;
      DVLOG(1) << "Obviously no record split. key_count=" << static_cast<int>(key_count);
      out->mid_slice_ = out->largest_slice_ + 1;
    } else {
      if (disable_no_record_split_ && trigger_ > out->largest_slice_) {
        DVLOG(1) << "No-record split was possible, but disable_no_record_split specified."
          << " simply splitting in half...";
      }
      DVLOG(1) << "Breaks a sequential page. key_count=" << static_cast<int>(key_count);
      out->mid_slice_ = target_->get_slice(key_count / 2);
    }
    return;
  }

  for (SlotIndex i = 1; i < key_count; ++i) {
    const KeySlice this_slice = target_->get_slice(i);
    out->smallest_slice_ = std::min<KeySlice>(this_slice, out->smallest_slice_);
    out->largest_slice_ = std::max<KeySlice>(this_slice, out->largest_slice_);
  }

  ASSERT_ND(key_count >= 2U);  // because it's not consecutive, there must be at least 2 records.

  {
    // even if not, there is another easy case where two "tides" mix in this page;
    // one tide from the left sequentially inserts keys while another tide from the right also
    // sequentially inserts keys that are larger than the left tide's. This usually happens at
    // the boundary of two largely independent partitions (eg multiple threads inserting keys of
    // their partition). In that case, we should cleanly separate the two tides by picking the
    // smallest key from the right tide as the separator.
    KeySlice tides_max[2];
    KeySlice second_tide_min = kInfimumSlice;
    bool first_tide_broken = false;
    bool both_tides_broken = false;
    tides_max[0] = target_->get_slice(0);
    // for example, consider the following case:
    //   1 2 32 33 3 4 34 x
    // There are two tides 1- and 32-. We detect them as follows.
    // We initially consider 1,2,32,33 as the first tide because they are sequential.
    // Then, "3" breaks the first tide. We then consider 1- and 32- as the two tides.
    // If x breaks the tide again, we give up.
    for (SlotIndex i = 1; i < key_count; ++i) {
      // look for a "tide breaker" that is smaller than the max of the tide.
      // as soon as we find two of them (meaning 3 tides or more), we give up.
      KeySlice slice = target_->get_slice(i);
      if (!first_tide_broken) {
        if (slice >= tides_max[0]) {
          tides_max[0] = slice;
          continue;  // ok!
        } else {
          // let's find where a second tide starts.
          first_tide_broken = true;
          SlotIndex first_breaker;
          for (first_breaker = 0; first_breaker < i; ++first_breaker) {
            const KeySlice breaker_slice = target_->get_slice(first_breaker);
            if (breaker_slice > slice) {
              break;
            }
          }
          ASSERT_ND(first_breaker < i);
          tides_max[0] = slice;
          ASSERT_ND(second_tide_min == kInfimumSlice);
          second_tide_min = target_->get_slice(first_breaker);
          tides_max[1] = target_->get_slice(i - 1);
          ASSERT_ND(tides_max[0] < tides_max[1]);
          ASSERT_ND(tides_max[0] < second_tide_min);
          ASSERT_ND(second_tide_min <= tides_max[1]);
        }
      } else {
        if (slice < second_tide_min && slice >= tides_max[0]) {
          tides_max[0] = slice;
          continue;  // fine, in the first tide
        } else if (slice >= tides_max[1]) {
          tides_max[1] = slice;  // okay, in the second tide
        } else {
          DVLOG(2) << "Oops, third tide. not the easy case";
          both_tides_broken = true;
          break;
        }
      }
    }

    // Already sorted? (seems consecutive_inserts_ has some false positives)
    if (!first_tide_broken) {
      if (!disable_no_record_split_ && trigger_ > out->largest_slice_) {
        out->no_record_split_ = true;
        DVLOG(1) << "Obviously no record split. key_count=" << static_cast<int>(key_count);
        out->mid_slice_ = out->largest_slice_ + 1;
      } else {
        if (disable_no_record_split_ && trigger_ > out->largest_slice_) {
          DVLOG(1) << "No-record split was possible, but disable_no_record_split specified."
            << " simply splitting in half...";
        }
        DVLOG(1) << "Breaks a sequential page. key_count=" << static_cast<int>(key_count);
        out->mid_slice_ = target_->get_slice(key_count / 2);
      }
      return;
    }

    ASSERT_ND(first_tide_broken);
    if (!both_tides_broken) {
      DVLOG(0) << "Yay, figured out two-tides meeting in a page.";
      out->mid_slice_ = second_tide_min;
      return;
    }
  }

  // now we have to pick a separator. as we don't sort in-page, this is approximate median
  // selection. there are a few smart algorithms out there, but we don't need that much accuracy.
  // just randomly pick a few. good enough.
  assorted::UniformRandom uniform_random(12345);
  const SlotIndex kSamples = 7;
  KeySlice choices[kSamples];
  for (uint8_t i = 0; i < kSamples; ++i) {
    choices[i] = target_->get_slice(uniform_random.uniform_within(0, key_count - 1));
  }
  std::sort(choices, choices + kSamples);
  out->mid_slice_ = choices[kSamples / 2];

  // scan through again to make sure the new separator is not used multiple times as key.
  // this is required for the invariant "same slices must be in same page"
  while (true) {
    bool observed = false;
    bool retry = false;
    for (SlotIndex i = 0; i < key_count; ++i) {
      const KeySlice this_slice = target_->get_slice(i);
      if (this_slice == out->mid_slice_) {
        if (observed) {
          // the key appeared twice! let's try another slice.
          ++out->mid_slice_;
          retry = true;
          break;
        } else {
          observed = true;
        }
      }
    }
    if (retry) {
      continue;
    } else {
      break;
    }
  }
}
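
// Illustrative sketch by the editor, not FOEDUS code: the two-tide detection above,
// re-expressed over a plain array of slices (uint64_t; <cstdint>/<cstddef> assumed). It
// returns true and sets *separator to the smallest slice of the right tide when the input
// consists of exactly two interleaved ascending runs, e.g. {1, 2, 32, 33, 3, 4, 34}.
namespace editor_example {
inline bool find_two_tides(const uint64_t* slices, size_t count, uint64_t* separator) {
  if (count < 2U) {
    return false;
  }
  uint64_t first_max = slices[0];
  uint64_t second_min = 0;
  uint64_t second_max = 0;
  bool first_broken = false;
  for (size_t i = 1; i < count; ++i) {
    const uint64_t slice = slices[i];
    if (!first_broken) {
      if (slice >= first_max) {
        first_max = slice;  // still one ascending run
        continue;
      }
      // The run broke: the slices larger than this breaker form the second tide.
      first_broken = true;
      size_t breaker = 0;
      while (slices[breaker] <= slice) {
        ++breaker;  // terminates because slices[i - 1] > slice
      }
      first_max = slice;
      second_min = slices[breaker];
      second_max = slices[i - 1];
    } else if (slice < second_min && slice >= first_max) {
      first_max = slice;   // continues the first tide
    } else if (slice >= second_max) {
      second_max = slice;  // continues the second tide
    } else {
      return false;        // a third tide; fall back to the sampling in decide_strategy()
    }
  }
  if (!first_broken) {
    return false;  // already sorted; no tide boundary to report
  }
  *separator = second_min;
  return true;
}
}  // namespace editor_example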

ErrorCode SplitBorder::lock_existing_records(xct::SysxctWorkspace* sysxct_workspace) {
  debugging::RdtscWatch watch;  // check how expensive this is
  const VolatilePagePointer page_id = target_->get_volatile_page_id();
  const SlotIndex key_count = target_->get_key_count();
  ASSERT_ND(key_count > 0);

  // We use the batched interface. It internally sorts, but has better performance if
  // we provide an already-sorted input. Remember that slots grow backwards,
  // so larger indexes have smaller lock IDs.
  xct::RwLockableXctId* record_locks[kBorderPageMaxSlots];
  for (SlotIndex i = 0; i < key_count; ++i) {
    record_locks[i] = target_->get_owner_id(key_count - 1U - i);  // larger indexes first
  }

  CHECK_ERROR_CODE(context_->sysxct_batch_record_locks(
    sysxct_workspace,
    page_id,
    key_count,
    record_locks));

#ifndef NDEBUG
  for (SlotIndex i = 0; i < key_count; ++i) {
    xct::RwLockableXctId* owner_id = target_->get_owner_id(i);
    ASSERT_ND(owner_id->is_keylocked());
  }
#endif  // NDEBUG
  watch.stop();
  DVLOG(1) << "Costed " << watch.elapsed() << " cycles to lock all of "
    << static_cast<int>(key_count) << " records while splitting";
  if (watch.elapsed() > (1ULL << 26)) {
    // if we see this often, we have to optimize this somehow.
    LOG(WARNING) << "wait, wait, it costed " << watch.elapsed() << " cycles to lock all of "
      << static_cast<int>(key_count) << " records while splitting!! that's a lot! storage="
      << target_->header().storage_id_
      << ", thread ID=" << context_->get_thread_id();
  }

  return kErrorCodeOk;
}
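
// Illustrative sketch by the editor, not FOEDUS code: the index reversal above exists because
// record slots grow backwards from the end of the page, so the highest slot index sits at the
// lowest address. Presenting the locks already sorted (ascending lock ID, assumed here to
// follow the in-page address) is what lets the batched interface avoid extra sorting work.
// A hypothetical sanity check over generic pointers (<cstdint>/<cstddef> assumed):
namespace editor_example {
inline bool is_ascending_by_address(void* const* locks, size_t count) {
  for (size_t i = 1; i < count; ++i) {
    if (reinterpret_cast<uintptr_t>(locks[i - 1]) >= reinterpret_cast<uintptr_t>(locks[i])) {
      return false;  // out of order; a batched locker would have to sort these first
    }
  }
  return true;
}
}  // namespace editor_example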

void SplitBorder::migrate_records(
  KeySlice inclusive_from,
  KeySlice inclusive_to,
  MasstreeBorderPage* dest) const {
  const auto& copy_from = *target_;
  const SlotIndex key_count = target_->get_key_count();
  ASSERT_ND(dest->get_key_count() == 0);
  dest->next_offset_ = 0;
  SlotIndex migrated_count = 0;
  DataOffset unused_space = sizeof(dest->data_);
  bool sofar_consecutive = true;
  KeySlice prev_slice = kSupremumSlice;
  KeyLength prev_remainder = kMaxKeyLength;

  // Simply iterate over and memcpy one-by-one.
  // We previously did a more complex thing to copy as many records as
  // possible in one memcpy, but it is not worth it with the new page layout.
  // We will keep an eye on the cost of this method, and optimize when it becomes a bottleneck.
  for (SlotIndex i = 0; i < key_count; ++i) {
    const KeySlice from_slice = copy_from.get_slice(i);
    if (from_slice >= inclusive_from && from_slice <= inclusive_to) {
      // move this record.
      auto* to_slot = dest->get_new_slot(migrated_count);
      const auto* from_slot = copy_from.get_slot(i);
      ASSERT_ND(from_slot->tid_.is_keylocked());
      const KeyLength from_remainder = from_slot->remainder_length_;
      const KeyLength from_suffix = calculate_suffix_length(from_remainder);
      const PayloadLength payload = from_slot->lengthes_.components.payload_length_;
      const KeyLength to_remainder
        = from_slot->tid_.xct_id_.is_next_layer() ? kInitiallyNextLayer : from_remainder;
      const KeyLength to_suffix = calculate_suffix_length(to_remainder);
      if (to_remainder != from_remainder) {
        ASSERT_ND(to_remainder == kInitiallyNextLayer);
        ASSERT_ND(from_remainder != kInitiallyNextLayer && from_remainder <= kMaxKeyLength);
        DVLOG(2) << "the old record is now a next-layer record, this new record can be initially"
          " a next-layer, saving space for suffixes. from_remainder=" << from_remainder;
      }

      dest->set_slice(migrated_count, from_slice);
      to_slot->tid_.xct_id_ = from_slot->tid_.xct_id_;
      to_slot->tid_.lock_.reset();
      to_slot->remainder_length_ = to_remainder;
      to_slot->lengthes_.components.payload_length_ = payload;
      // offset/physical_length set later

      if (sofar_consecutive && migrated_count > 0) {
        if (prev_slice > from_slice
          || (prev_slice == from_slice && prev_remainder > from_remainder)) {
          sofar_consecutive = false;
        }
      }
      prev_slice = from_slice;
      prev_remainder = to_remainder;

      // we might shrink the physical record size.
      const DataOffset record_length = MasstreeBorderPage::to_record_length(to_remainder, payload);
      ASSERT_ND(record_length % 8 == 0);
      ASSERT_ND(record_length <= from_slot->lengthes_.components.physical_record_length_);
      to_slot->lengthes_.components.physical_record_length_ = record_length;
      to_slot->lengthes_.components.offset_ = dest->next_offset_;
      to_slot->original_physical_record_length_ = record_length;
      to_slot->original_offset_ = dest->next_offset_;
      dest->next_offset_ += record_length;
      unused_space -= record_length - sizeof(*to_slot);

      // Copy the record. We want to do it in one memcpy if possible.
      // Be careful on the case where suffix length has changed (kInitiallyNextLayer case)
      if (record_length > 0) {
        char* to_record = dest->get_record_from_offset(to_slot->lengthes_.components.offset_);
        if (from_suffix != to_suffix) {
          ASSERT_ND(to_remainder == kInitiallyNextLayer);
          ASSERT_ND(from_remainder != kInitiallyNextLayer && from_remainder <= kMaxKeyLength);
          ASSERT_ND(to_suffix == 0);
          // Skip suffix part and copy only the payload.
          std::memcpy(
            to_record,
            copy_from.get_record_payload(i),
            assorted::align8(payload));
        } else {
          // Copy suffix (if exists) and payload together.
          std::memcpy(to_record, copy_from.get_record(i), record_length);
        }
      }

      ++migrated_count;
      dest->set_key_count(migrated_count);
    }
  }

  dest->consecutive_inserts_ = sofar_consecutive;
}
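
// Illustrative sketch by the editor, not FOEDUS code: the "sofar_consecutive" bookkeeping
// above considers the destination still sorted while every migrated record is non-decreasing
// in the (slice, remainder length) pair it tracks. A hypothetical predicate equivalent to that
// test (uint64_t/uint16_t stand in for KeySlice/KeyLength; <cstdint> assumed):
namespace editor_example {
inline bool keeps_consecutive_hint(
  uint64_t prev_slice, uint16_t prev_remainder,
  uint64_t next_slice, uint16_t next_remainder) {
  return next_slice > prev_slice
    || (next_slice == prev_slice && next_remainder >= prev_remainder);
}
}  // namespace editor_example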

ErrorCode SplitIntermediate::run(xct::SysxctWorkspace* sysxct_workspace) {
  DVLOG(1) << "Preparing to split an intermediate page... ";

  // First, lock the page(s)
  Page* target_page = reinterpret_cast<Page*>(target_);
  if (piggyback_adopt_child_) {
    Page* pages[2];
    pages[0] = target_page;
    pages[1] = reinterpret_cast<Page*>(piggyback_adopt_child_);
    // lock target_ and piggyback_adopt_child_ in batched mode to avoid deadlock.
    CHECK_ERROR_CODE(context_->sysxct_batch_page_locks(sysxct_workspace, 2, pages));
  } else {
    CHECK_ERROR_CODE(context_->sysxct_page_lock(sysxct_workspace, target_page));
  }

  // The lock involves an atomic operation, so everything we now see is finalized.
  if (target_->has_foster_child()) {
    DVLOG(0) << "Interesting. the page has been already split";
    return kErrorCodeOk;
  }
  if (piggyback_adopt_child_ && piggyback_adopt_child_->is_retired()) {
    VLOG(0) << "Interesting. this child is now retired, so someone else has already adopted.";
    return kErrorCodeOk;  // fine. the goal is already achieved
  }

  // 2 free volatile pages needed.
  // foster-minor/major (will be placed in successful case)
  memory::PagePoolOffset offsets[2];
  thread::GrabFreeVolatilePagesScope free_pages_scope(context_, offsets);
  CHECK_ERROR_CODE(free_pages_scope.grab(2));
  split_impl_no_error(&free_pages_scope);
  return kErrorCodeOk;
}

// similar to border page's split, but simpler in a few places because
// 1) intermediate page doesn't have owner_id for each pointer (no lock concerns).
// 2) intermediate page is already completely sorted.
// thus, this is just a physical operation without any transactional behavior.
// it is not even a system transaction.
void SplitIntermediate::split_impl_no_error(thread::GrabFreeVolatilePagesScope* free_pages) {
  debugging::RdtscWatch watch;
  DVLOG(1) << "Splitting an intermediate page... ";

  const uint8_t key_count = target_->get_key_count();

  // 2 free volatile pages needed.
  // foster-minor/major (will be placed in successful case)
  const auto& resolver = context_->get_local_volatile_page_resolver();

  // SplitStrategy is about 4kb, but that is not too large to place on the stack.
  SplitStrategy strategy;
  decide_strategy(&strategy);
  const KeySlice new_foster_fence = strategy.mid_separator_;

  MasstreeIntermediatePage* twin[2];
  VolatilePagePointer new_pointers[2];
  for (int i = 0; i < 2; ++i) {
    twin[i]
      = reinterpret_cast<MasstreeIntermediatePage*>(
        resolver.resolve_offset_newpage(free_pages->get(i)));
    new_pointers[i].set(context_->get_numa_node(), free_pages->get(i));

    twin[i]->initialize_volatile_page(
      target_->header().storage_id_,
      new_pointers[i],
      target_->get_layer(),
      target_->get_btree_level(),  // foster child has the same level as foster-parent
      i == 0 ? target_->low_fence_ : new_foster_fence,
      i == 0 ? new_foster_fence : target_->high_fence_);
  }

  migrate_pointers(strategy, 0, strategy.mid_index_ + 1, new_foster_fence, twin[0]);
  if (strategy.compact_adopt_) {
    twin[1]->set_key_count(0);  // right is empty-range
  } else {
    migrate_pointers(
      strategy,
      strategy.mid_index_ + 1,
      strategy.total_separator_count_,
      target_->high_fence_,
      twin[1]);
  }

  // Now we will install the new pages. **From now on no error-return allowed**
  // We install pointers to the pages AFTER we initialize the pages.
  target_->install_foster_twin(new_pointers[0], new_pointers[1], new_foster_fence);
  free_pages->dispatch(0);
  free_pages->dispatch(1);
  assorted::memory_fence_release();
  // invoking set_moved is the point we announce all of these changes. take fence to make it right
  target_->set_moved();

  if (piggyback_adopt_child_) {
    // piggyback_adopt_child_ is adopted and retired.
    piggyback_adopt_child_->set_retired();
    context_->collect_retired_volatile_page(piggyback_adopt_child_->get_volatile_page_id());
  }

  watch.stop();
  DVLOG(1) << "Costed " << watch.elapsed() << " cycles to split a node. original node"
    << " key count: " << static_cast<int>(key_count);
}

void SplitIntermediate::decide_strategy(SplitStrategy* out) const {
  out->compact_adopt_ = false;
  out->total_separator_count_ = 0;

  // While collecting pointers from the old page, we look for the piggyback_adopt_child_
  // and replace it with new pointers (so, this increases the total # of separators).
  KeySlice old_separator = 0;
  KeySlice new_separator = 0;
  VolatilePagePointer old_pointer;
  if (piggyback_adopt_child_) {
    old_separator = piggyback_adopt_child_->get_low_fence();
    new_separator = piggyback_adopt_child_->get_foster_fence();
    old_pointer = piggyback_adopt_child_->get_volatile_page_id();
  }

  bool found_old = false;
  for (MasstreeIntermediatePointerIterator iter(target_); iter.is_valid(); iter.next()) {
    const KeySlice low = iter.get_low_key();
    const KeySlice high = iter.get_high_key();
    const DualPagePointer& pointer = iter.get_pointer();
    if (piggyback_adopt_child_ && low == old_separator) {
      // Found the existing pointer to replace with foster-minor
      ASSERT_ND(pointer.volatile_pointer_.is_equivalent(old_pointer));
      out->separators_[out->total_separator_count_] = new_separator;
      out->pointers_[out->total_separator_count_].snapshot_pointer_ = 0;
      out->pointers_[out->total_separator_count_].volatile_pointer_
        = piggyback_adopt_child_->get_foster_minor();
      ++(out->total_separator_count_);

      // Also add foster-major as a new entry
      out->separators_[out->total_separator_count_] = high;
      out->pointers_[out->total_separator_count_].snapshot_pointer_ = 0;
      out->pointers_[out->total_separator_count_].volatile_pointer_
        = piggyback_adopt_child_->get_foster_major();
      ++(out->total_separator_count_);
      found_old = true;
    } else {
      out->separators_[out->total_separator_count_] = high;
      out->pointers_[out->total_separator_count_] = pointer;
      ++(out->total_separator_count_);
    }
    ASSERT_ND(piggyback_adopt_child_ == nullptr || low < old_separator || found_old);
  }

  ASSERT_ND(piggyback_adopt_child_ == nullptr || found_old);
  ASSERT_ND(out->total_separator_count_ >= 2U);

  if (out->total_separator_count_ <= kMaxIntermediatePointers) {
    // Then, compact_adopt:
    // we create a dummy foster with empty
    // range on right side, and just re-structures target page onto left foster twin.
    // In other words, this is compaction/restructure that is done in another page in RCU fashion.
    out->compact_adopt_ = true;
    out->mid_index_ = out->total_separator_count_ - 1U;
    out->mid_separator_ = target_->get_high_fence();
  } else if (piggyback_adopt_child_
    && piggyback_adopt_child_->get_high_fence() == target_->get_high_fence()) {
    DVLOG(0) << "Seems like a sequential insert. let's do no-record split";
    out->mid_index_ = out->total_separator_count_ - 2U;
    out->mid_separator_ = out->separators_[out->mid_index_];
  } else {
    // Usual: otherwise, we split them into 50-50 distribution.
    out->mid_index_ = (out->total_separator_count_ - 1U) / 2;
    out->mid_separator_ = out->separators_[out->mid_index_];
  }
}
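
// Illustrative worked example by the editor, not FOEDUS code: in the usual 50-50 branch
// above, 9 collected entries give mid_index_ = (9 - 1) / 2 = 4, so the left foster twin
// receives entries [0, 4] and the right twin receives entries [5, 8], with separators_[4]
// becoming the new foster fence. A hypothetical helper mirroring that arithmetic:
namespace editor_example {
inline void fifty_fifty_counts(uint16_t total_separators, uint16_t* left, uint16_t* right) {
  const uint16_t mid_index = (total_separators - 1U) / 2U;
  *left = mid_index + 1U;               // entries [0, mid_index]
  *right = total_separators - (*left);  // the remaining entries
}
}  // namespace editor_example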

void SplitIntermediate::migrate_pointers(
  const SplitIntermediate::SplitStrategy& strategy,
  uint16_t from,
  uint16_t to,
  KeySlice expected_last_separator,
  MasstreeIntermediatePage* dest) const {
  // construct dest page. copy the separators and pointers.
  // we distribute them as much as possible in the first level. if mini pages have few
  // entries to start with, the following adoption would be only local.
  float entries_per_mini = static_cast<float>(to - from) / (kMaxIntermediateSeparators + 1);
  ASSERT_ND(to > from);
  const uint16_t move_count = to - from;

  // it looks a bit complicated because each separator is "one-off" due to the first-level
  // separator. so we buffer one separator.
  float next_mini_threshold = entries_per_mini;
  uint8_t cur_mini = 0;
  uint8_t cur_mini_separators = 0;
  auto* cur_mini_page = &dest->get_minipage(0);
  cur_mini_page->pointers_[0] = strategy.pointers_[from];
  ASSERT_ND(!strategy.pointers_[from].is_both_null());
  KeySlice next_separator = strategy.separators_[from];

  for (uint16_t i = 1; i < move_count; ++i) {
    uint16_t original_index = i + from;
    ASSERT_ND(!strategy.pointers_[original_index].is_both_null());
    if (i >= next_mini_threshold && cur_mini < kMaxIntermediateSeparators) {
      // switch to next mini page. so, the separator goes to the first level
      assorted::memory_fence_release();  // set key count after all
      cur_mini_page->key_count_ = cur_mini_separators;  // close the current
      ASSERT_ND(cur_mini_page->key_count_ <= kMaxIntermediateMiniSeparators);

      dest->separators_[cur_mini] = next_separator;

      next_mini_threshold += entries_per_mini;
      cur_mini_separators = 0;
      ++cur_mini;
      cur_mini_page = &dest->get_minipage(cur_mini);
      cur_mini_page->pointers_[0] = strategy.pointers_[original_index];
    } else {
      // still the same mini page. so, the separator goes to the second level
      cur_mini_page->separators_[cur_mini_separators] = next_separator;
      ++cur_mini_separators;
      ASSERT_ND(cur_mini_separators <= kMaxIntermediateMiniSeparators);

      cur_mini_page->pointers_[cur_mini_separators] = strategy.pointers_[original_index];
    }
    next_separator = strategy.separators_[original_index];
  }
  cur_mini_page->key_count_ = cur_mini_separators;  // close the last one
  ASSERT_ND(cur_mini_page->key_count_ <= kMaxIntermediateMiniSeparators);
  assorted::memory_fence_release();
  dest->header_.set_key_count(cur_mini);  // set key count after all

  // the last separator is ignored because it's foster-fence/high-fence.
  ASSERT_ND(next_separator == expected_last_separator);

  dest->verify_separators();
}
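
// Illustrative sketch by the editor, not FOEDUS code: the float threshold above spreads the
// move_count pointers as evenly as possible over the first-level slots. A hypothetical helper
// reproducing just that bucketing (first_level_slots stands in for
// kMaxIntermediateSeparators + 1):
namespace editor_example {
inline int mini_page_of(uint16_t i, uint16_t move_count, int first_level_slots) {
  const float entries_per_mini = static_cast<float>(move_count) / first_level_slots;
  float next_threshold = entries_per_mini;
  int mini = 0;
  for (uint16_t j = 1; j <= i; ++j) {
    if (j >= next_threshold && mini < first_level_slots - 1) {
      next_threshold += entries_per_mini;
      ++mini;  // pointer j opens a new mini page, exactly as in the loop above
    }
  }
  return mini;  // index of the mini page that receives the i-th migrated pointer
}
}  // namespace editor_example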

}  // namespace masstree
}  // namespace storage
}  // namespace foedus