#include <glog/logging.h>
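// Excerpted from the FOEDUS Masstree split implementation.
// SplitBorder::run(): a border page's split, executed as a system transaction (sysxct).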
DVLOG(1) << "Splitting a page... ";
DVLOG(0) << "Interesting. the page has been already split";

DVLOG(0) << "This page has too few records. Can't split it";
for (int i = 0; i < 2; ++i) {
  twin[i] = reinterpret_cast<MasstreeBorderPage*>(resolver.resolve_offset_newpage(offsets[i]));
  twin[i]->initialize_volatile_page(
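// No-record-split path: everything is bulk-copied into twin[0], while twin[1] starts
// out as an empty-range page with no records.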
std::memcpy(twin[0]->slices_, target_->slices_, sizeof(KeySlice) * key_count);
std::memcpy(twin[0]->data_, target_->data_, sizeof(target_->data_));
twin[0]->set_key_count(key_count);
twin[1]->set_key_count(0);
twin[1]->consecutive_inserts_ = true;
twin[1]->next_offset_ = 0;
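// Presumably this pass resets each copied record's lock state, since the bulk memcpy
// above also copied the (still-locked) TIDs.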
for (SlotIndex i = 0; i < key_count; ++i) {
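// Likely the post-copy pass that marks each record in the original page as moved
// (set_moved()), so concurrent transactions chase the foster twins instead.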
for (SlotIndex i = 0; i < key_count; ++i) {
DVLOG(1) << "Costed " << watch.elapsed() << " cycles to split a page. original page physical"
  << " record count: " << static_cast<int>(key_count)
  << "->" << static_cast<int>(twin[0]->get_key_count())
  << " + " << static_cast<int>(twin[1]->get_key_count());
DVLOG(1) << "Obviously no record split. key_count=" << static_cast<int>(key_count);

DVLOG(1) << "No-record split was possible, but disable_no_record_split specified."
  << " simply splitting in half...";

DVLOG(1) << "Breaks a sequential page. key_count=" << static_cast<int>(key_count);
for (SlotIndex i = 1; i < key_count; ++i) {
bool first_tide_broken = false;
bool both_tides_broken = false;
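// "Two tides": records often form two sorted runs (tides). tides_max[0]/tides_max[1]
// track the running maximum of each run; if no third run appears, the point where the
// two tides meet makes a natural split point.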
for (SlotIndex i = 1; i < key_count; ++i) {
if (!first_tide_broken) {
  if (slice >= tides_max[0]) {
    tides_max[0] = slice;

  first_tide_broken = true;
  for (first_breaker = 0; first_breaker < i; ++first_breaker) {

    if (breaker_slice > slice) {

  tides_max[0] = slice;

  ASSERT_ND(tides_max[0] < second_tide_min);
  ASSERT_ND(second_tide_min <= tides_max[1]);
  if (slice < second_tide_min && slice >= tides_max[0]) {
    tides_max[0] = slice;
  } else if (slice >= tides_max[1]) {
    tides_max[1] = slice;
    DVLOG(2) << "Oops, third tide. not the easy case";
    both_tides_broken = true;
if (!first_tide_broken) {
DVLOG(1) << "Obviously no record split. key_count=" << static_cast<int>(key_count);

DVLOG(1) << "No-record split was possible, but disable_no_record_split specified."
  << " simply splitting in half...";

DVLOG(1) << "Breaks a sequential page. key_count=" << static_cast<int>(key_count);
if (!both_tides_broken) {
  DVLOG(0) << "Yay, figured out two-tides meeting in a page.";
for (uint8_t i = 0; i < kSamples; ++i) {

std::sort(choices, choices + kSamples);
bool observed = false;
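// SplitBorder::lock_existing_records(): take the key lock of every existing record in
// target_ in one batch (sysxct_batch_record_locks) before any of them is moved.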
for (SlotIndex i = 0; i < key_count; ++i) {

for (SlotIndex i = 0; i < key_count; ++i) {

for (SlotIndex i = 0; i < key_count; ++i) {
DVLOG(1) << "Costed " << watch.elapsed() << " cycles to lock all of "
  << static_cast<int>(key_count) << " records while splitting";
if (watch.elapsed() > (1ULL << 26)) {
  LOG(WARNING) << "wait, wait, it costed " << watch.elapsed() << " cycles to lock all of "
    << static_cast<int>(key_count) << " records while splitting!! that's a lot! storage="
350 const auto& copy_from = *
target_;
353 dest->next_offset_ = 0;
355 DataOffset unused_space =
sizeof(dest->data_);
356 bool sofar_consecutive =
true;
for (SlotIndex i = 0; i < key_count; ++i) {
  const KeySlice from_slice = copy_from.get_slice(i);
  if (from_slice >= inclusive_from && from_slice <= inclusive_to) {
    const auto* from_slot = copy_from.get_slot(i);
    ASSERT_ND(from_slot->tid_.is_keylocked());
    const KeyLength from_remainder = from_slot->remainder_length_;
    const PayloadLength payload = from_slot->lengthes_.components.payload_length_;
    if (to_remainder != from_remainder) {
      DVLOG(2) << "the old record is now a next-layer record, this new record can be initially"
        " a next-layer, saving space for suffixes. from_remainder=" << from_remainder;
    dest->set_slice(migrated_count, from_slice);
    to_slot->tid_.xct_id_ = from_slot->tid_.xct_id_;
    to_slot->tid_.lock_.reset();
    to_slot->remainder_length_ = to_remainder;
    to_slot->lengthes_.components.payload_length_ = payload;
    if (sofar_consecutive && migrated_count > 0) {
      if (prev_slice > from_slice
        || (prev_slice == from_slice && prev_remainder > from_remainder)) {
        sofar_consecutive = false;

    prev_slice = from_slice;
    prev_remainder = to_remainder;
    ASSERT_ND(record_length <= from_slot->lengthes_.components.physical_record_length_);
    to_slot->lengthes_.components.physical_record_length_ = record_length;
    to_slot->lengthes_.components.offset_ = dest->next_offset_;
    to_slot->original_physical_record_length_ = record_length;
    to_slot->original_offset_ = dest->next_offset_;
    dest->next_offset_ += record_length;
    unused_space -= record_length - sizeof(*to_slot);
    if (record_length > 0) {
      if (from_suffix != to_suffix) {

      copy_from.get_record_payload(i),

      std::memcpy(to_record, copy_from.get_record(i), record_length);
dest->consecutive_inserts_ = sofar_consecutive;
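// SplitIntermediate: the same foster-twin protocol applied to an intermediate page.
// A "retired" child below means another thread already adopted its foster twins.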
DVLOG(1) << "Preparing to split an intermediate page... ";

pages[0] = target_page;
DVLOG(0) << "Interesting. the page has been already split";

VLOG(0) << "Interesting. this child is now retired, so someone else has already adopted.";
DVLOG(1) << "Splitting an intermediate page... ";

for (int i = 0; i < 2; ++i) {

  resolver.resolve_offset_newpage(free_pages->get(i)));
DVLOG(1) << "Costed " << watch.elapsed() << " cycles to split a node. original node"
  << " key count: " << static_cast<int>(key_count);
bool found_old = false;

const KeySlice low = iter.get_low_key();
const KeySlice high = iter.get_high_key();
DVLOG(0) << "Seems like a sequential insert. let's do no-record split";
const uint16_t move_count = to - from;

float next_mini_threshold = entries_per_mini;
uint8_t cur_mini = 0;
uint8_t cur_mini_separators = 0;
for (uint16_t i = 1; i < move_count; ++i) {
  uint16_t original_index = i + from;

  cur_mini_page->key_count_ = cur_mini_separators;

  dest->separators_[cur_mini] = next_separator;

  next_mini_threshold += entries_per_mini;
  cur_mini_separators = 0;

  cur_mini_page->separators_[cur_mini_separators] = next_separator;
  ++cur_mini_separators;
  cur_mini_page->pointers_[cur_mini_separators] = strategy.pointers_[original_index];

  next_separator = strategy.separators_[original_index];
cur_mini_page->key_count_ = cur_mini_separators;

ASSERT_ND(next_separator == expected_last_separator);
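// Sanity check: pointers_[n] covers separators_[n - 1] <= key < separators_[n], so the
// last separator produced must equal the expected upper bound of the moved range.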