20 #include <glog/logging.h>
54 : engine_(parent->get_engine()),
55 id_(parent->get_storage_id()),
80 if (snapshot_page_id != 0) {
87 LOG(FATAL) <<
"Masstree-" << id_ <<
" partition already designed??:" << *
this;
95 data_->partition_count_ = 1;
97 data_->partitions_[0] = 0;
102 data_->partition_count_ = 0;
103 if (snapshot_page_id == 0) {
109 data_->low_keys_[data_->partition_count_] = it.get_low_key();
112 data_->partitions_[data_->partition_count_] = assignment;
113 ++data_->partition_count_;
119 LOG(INFO) <<
"Masstree-" << id_ << std::endl <<
" partitions:" << *
this;
135 const uint32_t kSamplingWidth = 3;
136 for (uint32_t rep = 0; rep < kSamplingWidth; ++rep) {
137 uint32_t index = unirand->
next_uint32() % (casted->get_key_count() + 1U);
162 ASSERT_ND(subtree_id < result->subtrees_);
168 VLOG(0) <<
"Subtree-" << subtree_id <<
" done in " << watch.
elapsed_us() <<
"us";
172 ErrorStack MasstreePartitioner::design_partition_first(
const MasstreeIntermediatePage* root) {
173 LOG(INFO) <<
"Initial partition design for Masstree-" << id_;
177 std::vector<VolatilePagePointer> pointers;
179 for (MasstreeIntermediatePointerIterator it(root); it.is_valid(); it.next()) {
189 LOG(INFO) <<
"Launching " << data_->
partition_count_ <<
" threads to take random samples..";
191 std::vector< std::thread > threads;
193 for (uint32_t subtree_id = 0; subtree_id < data_->
partition_count_; ++subtree_id) {
194 threads.emplace_back(
197 pointers[subtree_id],
202 LOG(INFO) <<
"Launched. Joining..";
203 for (
auto& t : threads) {
207 samples.assign_owners();
208 LOG(INFO) <<
"Joined. Results:" << samples;
209 for (uint32_t subtree_id = 0; subtree_id < data_->
partition_count_; ++subtree_id) {
210 data_->
partitions_[subtree_id] = samples.get_assignment(subtree_id);
218 for (uint32_t subtree_id = 0; subtree_id <
subtrees_; ++subtree_id) {
219 uint32_t max_node = 0;
220 uint32_t max_count =
at(0, subtree_id);
221 for (uint32_t node = 1; node <
nodes_; ++node) {
222 if (
at(node, subtree_id) > max_count) {
224 max_count =
at(node, subtree_id);
232 o <<
"<OwnerSamples nodes=\"" << v.
nodes_
233 <<
"\" subtrees=\"" << v.
subtrees_ <<
"\">" << std::endl;
234 o <<
"<!-- legend id=\"x\" assignment=\"x\">";
235 for (uint32_t node = 0; node < v.
nodes_; ++node) {
236 o <<
"[Node-" << node <<
"] ";
238 o <<
" -->" << std::endl;
239 for (uint32_t subtree_id = 0; subtree_id < v.
subtrees_; ++subtree_id) {
240 o <<
" <subtree id=\"" << subtree_id
242 for (uint32_t node = 0; node < v.
nodes_; ++node) {
245 o <<
"</subtree>" << std::endl;
247 o << std::endl <<
"</OwnerSamples>";
251 ErrorStack MasstreePartitioner::read_page_safe(MasstreePage* src, MasstreePage* out) {
255 uint32_t before = src->header().page_version_.get_version_counter();
257 bool locked_before = src->header().page_version_.is_locked();
261 uint32_t after = src->header().page_version_.get_version_counter();
263 bool locked_after = src->header().page_version_.is_locked();
265 uint32_t again = src->header().page_version_.get_version_counter();
266 if (locked_before || locked_after) {
267 VLOG(0) <<
"Interesting, observed locked page during OCC-read in partition designer. retry";
270 }
else if (before == after && after == again) {
275 VLOG(0) <<
"Interesting, version conflict during OCC-read in partition designer. retry";
294 const char* key = rec->
get_key();
298 VLOG(0) <<
"Masstree-:" << id_ <<
" took " << stop_watch.
elapsed() <<
"cycles"
313 inline bool operator() (
317 const MasstreeCommonLogType* left_rec =
resolve_log(log_buffer_, left);
318 const MasstreeCommonLogType* right_rec =
resolve_log(log_buffer_, right);
320 return (cmp < 0 || (cmp == 0 && left < right));
334 stop_watch_entire.
stop();
335 VLOG(0) <<
"Masstree-" << id_ <<
" sort_batch_general() done in "
405 uint16_t compressed_epoch,
406 uint32_t in_epoch_ordinal,
409 combined_epoch_ = (
static_cast<uint64_t
>(compressed_epoch) << 32) | in_epoch_ordinal;
437 for (uint32_t i = 0; i < logs_count; ++i) {
457 uint16_t compressed_epoch = epoch.
subtract(base_epoch);
481 stop_watch_entire.
stop();
482 VLOG(0) <<
"Masstree-" << id_ <<
" sort_batch_8bytes() done in "
497 sort_batch_8bytes(args);
499 sort_batch_general(args);
504 o <<
"<MasstreePartitioner>";
506 o <<
"<partition_count_>" << v.data_->
partition_count_ <<
"</partition_count_>"
509 o << std::endl <<
" <partition node=\"" << v.data_->
partitions_[i] <<
"\">"
513 o <<
"</partitions>";
515 o <<
"Not yet designed";
517 o <<
"</MasstreePartitioner>";
528 if (key_length == 0) {
Packages handling of 4-bytes representation of position in log buffers.
uint32_t * written_count_
[OUT] how many logs written to output_buffer.
const KeySlice kInfimumSlice
uint32_t shortest_key_length_
[masstree/hash] shortest key length in the log entries.
0x0033 : foedus::storage::masstree::MasstreeInsertLogType .
DualPagePointer root_page_pointer_
Points to the root page (or something equivalent).
PartitionId * results_
[OUT] this method will set the partition of logs[i] to results[i].
const snapshot::LogBuffer & log_buffer_
Converts from positions to physical pointers.
thread::ThreadGroupId PartitionId
As partition=NUMA node, this is just a synonym of foedus::thread::ThreadGroupId.
uint16_t partitions_[kMaxIntermediatePointers]
Represents a pointer to another page (usually a child page).
uint32_t logs_count_
number of entries to process.
uint16_t find_partition(const char *key, uint16_t key_length) const
Returns the partition (node ID) that should contain the key.
memory::AlignedMemory * work_memory_
Working memory to be used in this method.
Epoch base_epoch_
All log entries in this inputs are assured to be after this epoch.
ErrorCode read_page(storage::SnapshotPagePointer page_id, void *out)
uint16_t partition_count_
std::ostream & operator<<(std::ostream &o, const MasstreeComposeContext::PathLevel &v)
uint32_t subtract(const Epoch &other) const
Returns the number epochs from the given epoch to this epoch accounting for wrap-around.
Root package of FOEDUS (Fast Optimistic Engine for Data Unification Services).
uint32_t at(uint32_t node, uint32_t subtree_id) const
void prepare_sort_entries(const Partitioner::SortBatchArguments &args, SortEntry *entries)
subroutine of sort_batch_8bytes
bool is_border() const __attribute__((always_inline))
uint32_t logs_count_
number of entries to process.
uint32_t BufferPosition
Represents a position in some buffer.
Declares all log types used in this storage type.
const GlobalVolatilePageResolver & get_global_volatile_page_resolver() const
Returns the page resolver to convert volatile page ID to page pointer.
double elapsed_ms() const
uint32_t * assignments_
node_id to be the owner of the subtree
Represents a pointer to a volatile page with modification count for preventing ABA.
Brings error stacktrace information as return value of functions.
ErrorCode assure_capacity(uint64_t required_size, double expand_margin=2.0, bool retain_content=false) noexcept
If the current size is smaller than the given size, automatically expands.
uint32_t get_ordinal() const __attribute__((always_inline))
uint64_t KeySlice
Each key slice is an 8-byte integer.
A base class for MasstreeInsertLogType/MasstreeDeleteLogType/MasstreeOverwriteLogType.
MasstreePartitioner(Partitioner *parent)
MasstreePartitioner methods.
A few macros and helper methods related to byte endian-ness.
Common base of MasstreeIntermediatePage and MasstreeBorderPage.
const uint32_t nodes_
number of nodes
Number of vol pages in each node sampled per pointer in the root page.
KeySlice low_keys_[kMaxIntermediatePointers]
Unlike array's sort entry, we don't always use this because keys are arbitrary lengthes.
snapshot::BufferPosition * output_buffer_
sorted results are written to this variable.
soc::SocId get_soc_count() const
Shorthand for get_options().thread_.group_count_.
0x0032 : foedus::storage::masstree::MasstreeOverwriteLogType .
DualPagePointer pointers_[kMaxIntermediateMiniSeparators+1]
VolatilePagePointer volatile_pointer_
uint64_t elapsed() const __attribute__((always_inline))
static int compare_logs(const MasstreeCommonLogType *left, const MasstreeCommonLogType *right) __attribute__((always_inline))
Returns -1, 0, 1 when left is less than, same, larger than right in terms of key and xct_id...
void retrieve_positions(uint32_t logs_count, const SortEntry *entries, snapshot::BufferPosition *out)
subroutine of sort_batch_8bytes
uint64_t SnapshotPagePointer
Page ID of a snapshot page.
Dynamic information of one partitioner.
void sort_batch(const Partitioner::SortBatchArguments &args) const
#define SPINLOCK_WHILE(x)
A macro to busy-wait (spinlock) with occasional pause.
Database engine object that holds all resources and provides APIs.
uint64_t stop()
Take another current time tick.
SnapshotPagePointer snapshot_pointer_
Epoch get_epoch() const __attribute__((always_inline))
const snapshot::BufferPosition * log_positions_
positions of log records.
void assign_owners()
Determine assignments based on the samples.
Auto-lock scope object for SharedMutex.
Just a marker to denote that the memory region represents a data page.
uint32_t get_assignment(uint32_t subtree_id) const
const MasstreeCommonLogType * resolve_log(const snapshot::LogBuffer &log_buffer, snapshot::BufferPosition pos)
uint32_t longest_key_length_
[masstree/hash] longest key length in the log entries.
void * get_block() const
Returns the memory block.
uint8_t extract_numa_node_from_snapshot_pointer(SnapshotPagePointer pointer)
Represents a Masstree storage.
void spinlock_yield()
Invoke _mm_pause(), x86 PAUSE instruction, or something equivalent in the env.
double elapsed_us() const
snapshot::BufferPosition position_
Shared data of this storage type.
0x0034 : foedus::storage::masstree::MasstreeDeleteLogType .
void increment(uint32_t node, uint32_t subtree_id)
#define CHECK_ERROR(x)
This macro calls x and checks its returned value.
void set(KeySlice first_slice, uint16_t compressed_epoch, uint32_t in_epoch_ordinal, snapshot::BufferPosition position) __attribute__((always_inline))
const ErrorStack kRetOk
Normal return value for no-error case.
0x0035 : foedus::storage::masstree::MasstreeUpdateLogType .
void design_partition_first_parallel(Engine *engine, VolatilePagePointer subtree, uint32_t subtree_id, OwnerSamples *result)
A RDTSC-based low-overhead stop watch.
memory::AlignedMemory * work_memory_
Working memory to be used in this method.
bool is_key_aligned_and_zero_padded(const char *key, KeyLength key_length)
Returns if the given key is 8-bytes aligned and also zero-padded to 8-bytes for easier slicing (which...
Partitioner for a masstree storage.
Partitioning and sorting logic for one storage.
Convenient way of writing hex integers to stream.
const uint32_t kMaxIntermediatePointers
Max number of pointers (if completely filled) stored in an intermediate pages.
Arguments for sort_batch()
uint64_t stop() __attribute__((always_inline))
Take another current time tick.
void memory_fence_acquire()
Equivalent to std::atomic_thread_fence(std::memory_order_acquire).
const snapshot::BufferPosition * log_positions_
positions of log records.
const uint32_t subtrees_
number of pointers to children in the root page
Represents one intermediate page in Masstree Storage.
#define ASSERT_ND(x)
A warning-free wrapper macro of assert() that has no performance effect in release mode even when 'x'...
A high-resolution stop watch.
#define WRAP_ERROR_CODE(x)
Same as CHECK_ERROR(x) except it receives only an error code, thus more efficient.
#define ALWAYS_INLINE
A function suffix to hint that the function should always be inlined.
bool is_partitionable() const
CONTROL_BLOCK * get_control_block() const
const snapshot::LogBuffer & log_buffer_
Converts from positions to physical pointers.
ErrorStack design_partition(const Partitioner::DesignPartitionArguments &args)
memory::EngineMemory * get_memory_manager() const
See Memory Manager.
const uint16_t kPageSize
A constant defining the page size (in bytes) of both snapshot pages and volatile pages.
log::RecordLogType * resolve(BufferPosition position) const
void design_partition_first_parallel_recurse(const memory::GlobalVolatilePageResolver &resolver, const MasstreePage *page, uint32_t subtree_id, OwnerSamples *result, assorted::UniformRandom *unirand)
cache::SnapshotFileSet * snapshot_files_
Arguments for design_partition()
bool operator<(const SortEntry &rhs) const __attribute__((always_inline))
void partition_batch(const Partitioner::PartitionBatchArguments &args) const
KeySlice normalize_be_bytes_full_aligned(const void *be_bytes)
Convert an aligned big-endian byte array of at least 8-bytes-length to KeySlice.
Arguments for partition_batch()