masstree_partitioner_impl.cpp (libfoedus-core, FOEDUS Core Library)
/*
 * Copyright (c) 2014-2015, Hewlett-Packard Development Company, LP.
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details. You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * HP designates this particular file as subject to the "Classpath" exception
 * as provided by HP in the LICENSE.txt file that accompanied this code.
 */
#include "foedus/storage/masstree/masstree_partitioner_impl.hpp"

#include <glog/logging.h>

#include <algorithm>
#include <cstring>
#include <map>
#include <ostream>
#include <string>
#include <thread>
#include <utility>
#include <vector>

#include "foedus/engine.hpp"
#include "foedus/epoch.hpp"
#include "foedus/assorted/endianness.hpp"
#include "foedus/assorted/uniform_random.hpp"
#include "foedus/debugging/rdtsc_watch.hpp"
#include "foedus/debugging/stop_watch.hpp"
#include "foedus/memory/engine_memory.hpp"
#include "foedus/memory/page_resolver.hpp"
#include "foedus/storage/masstree/masstree_log_types.hpp"
#include "foedus/storage/masstree/masstree_page_impl.hpp"
#include "foedus/storage/masstree/masstree_storage.hpp"
#include "foedus/storage/masstree/masstree_storage_pimpl.hpp"

namespace foedus {
namespace storage {
namespace masstree {


///////////////////////////////////////////////////////////////////////
///
///  MasstreePartitioner methods
///
///////////////////////////////////////////////////////////////////////
MasstreePartitioner::MasstreePartitioner(Partitioner* parent)
  : engine_(parent->get_engine()),
    id_(parent->get_storage_id()),
    metadata_(PartitionerMetadata::get_metadata(engine_, id_)) {
  if (metadata_->valid_) {
    data_ = reinterpret_cast<MasstreePartitionerData*>(metadata_->locate_data(engine_));
  } else {
    data_ = nullptr;
  }
}

ErrorStack MasstreePartitioner::design_partition(
  const Partitioner::DesignPartitionArguments& args) {
  MasstreeStorage storage(engine_, id_);
  MasstreeStorageControlBlock* control_block = storage.get_control_block();
  const memory::GlobalVolatilePageResolver& resolver
    = engine_->get_memory_manager()->get_global_volatile_page_resolver();

  // Read the current volatile version and the previous snapshot version of the root page
  WRAP_ERROR_CODE(args.work_memory_->assure_capacity(kPageSize * 2U));
  Page* buffers = reinterpret_cast<Page*>(args.work_memory_->get_block());
  MasstreeIntermediatePage* vol = reinterpret_cast<MasstreeIntermediatePage*>(buffers);
  MasstreeIntermediatePage* snp = reinterpret_cast<MasstreeIntermediatePage*>(buffers + 1);
  SnapshotPagePointer snapshot_page_id = control_block->root_page_pointer_.snapshot_pointer_;
  MasstreeIntermediatePage* root_volatile = reinterpret_cast<MasstreeIntermediatePage*>(
    resolver.resolve_offset(control_block->root_page_pointer_.volatile_pointer_));
  CHECK_ERROR(read_page_safe(root_volatile, vol));
  if (snapshot_page_id != 0) {
    WRAP_ERROR_CODE(args.snapshot_files_->read_page(snapshot_page_id, snp));
  }

  soc::SharedMutexScope scope(&metadata_->mutex_);  // protect this metadata
  if (metadata_->valid_) {
    // someone has already initialized this??
    LOG(FATAL) << "Masstree-" << id_ << " partition already designed??:" << *this;
  }
  WRAP_ERROR_CODE(metadata_->allocate_data(engine_, &scope, sizeof(MasstreePartitionerData)));
  data_ = reinterpret_cast<MasstreePartitionerData*>(metadata_->locate_data(engine_));

  ASSERT_ND(!vol->is_border());
  if (engine_->get_soc_count() == 1U) {
    // no partitioning needed
    data_->partition_count_ = 1;
    data_->low_keys_[0] = kInfimumSlice;
    data_->partitions_[0] = 0;
  } else {
    // The separators in the root page simply become the partition keys.
    // If we already have a snapshot page, we reuse the same partition keys, though the
    // assigned nodes might change.
    data_->partition_count_ = 0;
    if (snapshot_page_id == 0) {
      CHECK_ERROR(design_partition_first(vol));
    } else {
      // So far, the same assignment as the previous snapshot.
      // TODO(Hideaki) check current volatile pages to change the assignments.
      for (MasstreeIntermediatePointerIterator it(snp); it.is_valid(); it.next()) {
        data_->low_keys_[data_->partition_count_] = it.get_low_key();
        SnapshotPagePointer pointer = it.get_pointer().snapshot_pointer_;
        uint16_t assignment = extract_numa_node_from_snapshot_pointer(pointer);
        data_->partitions_[data_->partition_count_] = assignment;
        ++data_->partition_count_;
      }
    }
  }

  metadata_->valid_ = true;
  LOG(INFO) << "Masstree-" << id_ << std::endl << " partitions:" << *this;
  return kRetOk;
}
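
// Example (hypothetical values, not from the original source): with three SOC nodes and a
// root page whose ranges start at kInfimumSlice, 0x4000000000000000, 0x8000000000000000,
// design_partition() above ends up with something like
//   data_->partition_count_ = 3;
//   data_->low_keys_[]      = { kInfimumSlice, 0x4000000000000000ULL, 0x8000000000000000ULL };
//   data_->partitions_[]    = { 0, 1, 2 };   // NUMA node that owns each key range
// find_partition() below then maps each log's key to the last range whose low_key <= its slice.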


void design_partition_first_parallel_recurse(
  const memory::GlobalVolatilePageResolver& resolver,
  const MasstreePage* page,
  uint32_t subtree_id,
  OwnerSamples* result,
  assorted::UniformRandom* unirand) {
  uint32_t node = page->header().stat_last_updater_node_;
  result->increment(node, subtree_id);
  // we don't care about foster twins. this is just for sampling.
  if (!page->is_border()) {
    const auto* casted = reinterpret_cast<const MasstreeIntermediatePage*>(page);
    const uint32_t kSamplingWidth = 3;  // follow this many pointers per page.
    for (uint32_t rep = 0; rep < kSamplingWidth; ++rep) {
      uint32_t index = unirand->next_uint32() % (casted->get_key_count() + 1U);
      const MasstreeIntermediatePage::MiniPage& minipage = casted->get_minipage(index);
      uint32_t index_mini = unirand->next_uint32() % (minipage.key_count_ + 1U);
      VolatilePagePointer pointer = minipage.pointers_[index_mini].volatile_pointer_;
      if (pointer.is_null()) {
        // because this page might be now changing, this is possible
        continue;
      }
      MasstreePage* child = reinterpret_cast<MasstreePage*>(resolver.resolve_offset(pointer));
      design_partition_first_parallel_recurse(resolver, child, subtree_id, result, unirand);
    }
  } else {
    // So far we do not bother going down to the next layer. The border page's owner probably
    // represents it well. It might not, but a worst case always exists (such as just 1 page
    // in the first layer.. in that case anyway no luck).
  }
}

void design_partition_first_parallel(
  Engine* engine,
  VolatilePagePointer subtree,
  uint32_t subtree_id,
  OwnerSamples* result) {
  const memory::GlobalVolatilePageResolver& resolver
    = engine->get_memory_manager()->get_global_volatile_page_resolver();
  ASSERT_ND(subtree_id < result->subtrees_);
  debugging::StopWatch watch;
  MasstreePage* page = reinterpret_cast<MasstreePage*>(resolver.resolve_offset(subtree));
  assorted::UniformRandom unirand(subtree_id);
  design_partition_first_parallel_recurse(resolver, page, subtree_id, result, &unirand);
  watch.stop();
  VLOG(0) << "Subtree-" << subtree_id << " done in " << watch.elapsed_us() << "us";
}
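
// Note (descriptive, not from the original source): each sampler seeds UniformRandom with its
// own subtree_id, so the random walk is repeatable for a given tree shape, and each worker
// writes only its own subtree_id column of the shared OwnerSamples matrix.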


ErrorStack MasstreePartitioner::design_partition_first(const MasstreeIntermediatePage* root) {
  LOG(INFO) << "Initial partition design for Masstree-" << id_;
  // Randomly sample descendant pages to determine the assignment.
  // We parallelize this step for each pointer in the root page.
  data_->partition_count_ = 0;
  std::vector<VolatilePagePointer> pointers;
  pointers.reserve(kMaxIntermediatePointers);
  for (MasstreeIntermediatePointerIterator it(root); it.is_valid(); it.next()) {
    data_->low_keys_[data_->partition_count_] = it.get_low_key();
    const DualPagePointer& pointer = it.get_pointer();
    ASSERT_ND(!pointer.volatile_pointer_.is_null());  // because this is the first snapshot.
    pointers.push_back(pointer.volatile_pointer_);
    data_->partitions_[data_->partition_count_] = 0;
    ++data_->partition_count_;
  }

  ASSERT_ND(data_->partition_count_ == pointers.size());
  LOG(INFO) << "Launching " << data_->partition_count_ << " threads to take random samples..";
  OwnerSamples samples(engine_->get_soc_count(), data_->partition_count_);
  std::vector< std::thread > threads;
  threads.reserve(data_->partition_count_);
  for (uint32_t subtree_id = 0; subtree_id < data_->partition_count_; ++subtree_id) {
    threads.emplace_back(
      design_partition_first_parallel,
      engine_,
      pointers[subtree_id],
      subtree_id,
      &samples);
  }

  LOG(INFO) << "Launched. Joining..";
  for (auto& t : threads) {
    t.join();
  }

  samples.assign_owners();
  LOG(INFO) << "Joined. Results:" << samples;
  for (uint32_t subtree_id = 0; subtree_id < data_->partition_count_; ++subtree_id) {
    data_->partitions_[subtree_id] = samples.get_assignment(subtree_id);
  }

  return kRetOk;
}

void OwnerSamples::assign_owners() {
  // So far, simply the node that has the majority. But we might want to balance out later.
  for (uint32_t subtree_id = 0; subtree_id < subtrees_; ++subtree_id) {
    uint32_t max_node = 0;
    uint32_t max_count = at(0, subtree_id);
    for (uint32_t node = 1; node < nodes_; ++node) {
      if (at(node, subtree_id) > max_count) {
        max_node = node;
        max_count = at(node, subtree_id);
      }
    }
    assignments_[subtree_id] = max_node;
  }
}
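
// Example (hypothetical counts): with nodes_ = 2 and subtrees_ = 3, a sample matrix of
//   subtree:   0   1   2
//   node-0:   12   3   7
//   node-1:    4   9   7
// yields assignments_ = {0, 1, 0}; ties keep the lower node because of the strict '>' above.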

std::ostream& operator<<(std::ostream& o, const OwnerSamples& v) {
  o << "<OwnerSamples nodes=\"" << v.nodes_
    << "\" subtrees=\"" << v.subtrees_ << "\">" << std::endl;
  o << "<!-- legend id=\"x\" assignment=\"x\">";
  for (uint32_t node = 0; node < v.nodes_; ++node) {
    o << "[Node-" << node << "] ";
  }
  o << " -->" << std::endl;
  for (uint32_t subtree_id = 0; subtree_id < v.subtrees_; ++subtree_id) {
    o << "  <subtree id=\"" << subtree_id
      << "\" assignment=\"" << v.get_assignment(subtree_id) << "\">";
    for (uint32_t node = 0; node < v.nodes_; ++node) {
      o << assorted::Hex(v.at(node, subtree_id), 6) << " ";
    }
    o << "</subtree>" << std::endl;
  }
  o << std::endl << "</OwnerSamples>";
  return o;
}

ErrorStack MasstreePartitioner::read_page_safe(MasstreePage* src, MasstreePage* out) {
  SPINLOCK_WHILE(true) {
    // a modifying user transaction increments the version counter before it unlocks the page,
    // so the following protocol assures that nothing happened in between.
    uint32_t before = src->header().page_version_.get_version_counter();
    assorted::memory_fence_acquire();
    bool locked_before = src->header().page_version_.is_locked();
    assorted::memory_fence_acquire();
    std::memcpy(out, src, kPageSize);
    assorted::memory_fence_acquire();
    uint32_t after = src->header().page_version_.get_version_counter();
    assorted::memory_fence_acquire();
    bool locked_after = src->header().page_version_.is_locked();
    assorted::memory_fence_acquire();
    uint32_t again = src->header().page_version_.get_version_counter();
    if (locked_before || locked_after) {
      VLOG(0) << "Interesting, observed locked page during OCC-read in partition designer. retry";
      assorted::spinlock_yield();  // snapshot is not in a rush. let other threads move on.
      continue;
    } else if (before == after && after == again) {
      // as far as it's a consistent read, the current status of the page doesn't matter.
      // pointers/records were valid at least until recently, so it's safe to follow them.
      break;
    }
    VLOG(0) << "Interesting, version conflict during OCC-read in partition designer. retry";
  }
  return kRetOk;
}
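
// The loop above is a plain optimistic copy: if the version counter reads the same value
// before, after, and again after the copy, and the page was never observed locked, then no
// writer completed a modification while memcpy ran, so 'out' is a consistent snapshot of 'src'.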

bool MasstreePartitioner::is_partitionable() const {
  return data_->partition_count_ > 1U;
}

void MasstreePartitioner::partition_batch(const Partitioner::PartitionBatchArguments& args) const {
  debugging::RdtscWatch stop_watch;
  if (!is_partitionable()) {
    std::memset(args.results_, 0, sizeof(PartitionId) * args.logs_count_);
    return;
  }

  for (uint32_t i = 0; i < args.logs_count_; ++i) {
    const MasstreeCommonLogType* rec = resolve_log(args.log_buffer_, args.log_positions_[i]);
    uint16_t key_length = rec->key_length_;
    const char* key = rec->get_key();
    args.results_[i] = data_->find_partition(key, key_length);
  }
  stop_watch.stop();
  VLOG(0) << "Masstree-" << id_ << " took " << stop_watch.elapsed() << " cycles"
    << " to partition " << args.logs_count_ << " entries. #partitions=" << data_->partition_count_;
  // if these searches are too costly, let's optimize them.
}

void MasstreePartitioner::sort_batch_general(const Partitioner::SortBatchArguments& args) const {
  debugging::StopWatch stop_watch_entire;

  // Unlike array's sort_batch, we don't do any advanced optimization here.
  // Keys have arbitrary lengths, so we have to follow pointers anyway.
  // Thus, we do a super-simple sort by following BufferPosition every time.
  // If this causes too much overhead, let's store a fixed number of slices as sort entries.
  struct Comparator {
    explicit Comparator(const snapshot::LogBuffer& log_buffer) : log_buffer_(log_buffer) {}
    inline bool operator() (
      snapshot::BufferPosition left,
      snapshot::BufferPosition right) const ALWAYS_INLINE {
      ASSERT_ND(left != right);
      const MasstreeCommonLogType* left_rec = resolve_log(log_buffer_, left);
      const MasstreeCommonLogType* right_rec = resolve_log(log_buffer_, right);
      int cmp = MasstreeCommonLogType::compare_logs(left_rec, right_rec);
      return (cmp < 0 || (cmp == 0 && left < right));
    }
    const snapshot::LogBuffer& log_buffer_;
  };

  std::memcpy(
    args.output_buffer_,
    args.log_positions_,
    args.logs_count_ * sizeof(snapshot::BufferPosition));
  Comparator comparator(args.log_buffer_);
  std::sort(args.output_buffer_, args.output_buffer_ + args.logs_count_, comparator);

  // No compaction for masstree yet. Anyway this method is not optimized.
  *args.written_count_ = args.logs_count_;
  stop_watch_entire.stop();
  VLOG(0) << "Masstree-" << id_ << " sort_batch_general() done in "
    << stop_watch_entire.elapsed_ms() << "ms for " << args.logs_count_ << " log entries. "
    << " shortest_key=" << args.shortest_key_length_
    << " longest_key=" << args.longest_key_length_;
}

/* Left for future reference (yes, yes, this should be moved to documents rather than code)
This one was way slower than the uint128_t + std::sort below. 8-10 Mlogs/sec/core -> 2.5M
as far as we cause L1 miss for each comparison, it's basically same as the general func above.

void MasstreePartitioner::sort_batch_8bytes(const Partitioner::SortBatchArguments& args) const {
  ASSERT_ND(args.shortest_key_length_ == sizeof(KeySlice));
  ASSERT_ND(args.longest_key_length_ == sizeof(KeySlice));

  debugging::StopWatch stop_watch_entire;

  // Only a slightly different comparator that exploits the fact that all keys are slices
  struct SliceComparator {
    explicit SliceComparator(const snapshot::LogBuffer& log_buffer) : log_buffer_(log_buffer) {}
    bool operator() (snapshot::BufferPosition left, snapshot::BufferPosition right) const {
      ASSERT_ND(left != right);
      const MasstreeCommonLogType* left_rec = resolve_log(log_buffer_, left);
      const MasstreeCommonLogType* right_rec = resolve_log(log_buffer_, right);

      KeySlice left_slice = normalize_be_bytes_full_aligned(left_rec->get_key());
      KeySlice right_slice = normalize_be_bytes_full_aligned(right_rec->get_key());
      if (left_slice != right_slice) {
        return left_slice < right_slice;
      }
      int cmp = left_rec->header_.xct_id_.compare_epoch_and_orginal(right_rec->header_.xct_id_);
      if (cmp != 0) {
        return cmp < 0;
      }
      return left < right;
    }
    const snapshot::LogBuffer& log_buffer_;
  };

  std::memcpy(
    args.output_buffer_,
    args.log_positions_,
    args.logs_count_ * sizeof(snapshot::BufferPosition));
  SliceComparator comparator(args.log_buffer_);
  std::sort(args.output_buffer_, args.output_buffer_ + args.logs_count_, comparator);

  *args.written_count_ = args.logs_count_;
  stop_watch_entire.stop();
  VLOG(0) << "Masstree-" << id_ << " sort_batch_8bytes() done in "
    << stop_watch_entire.elapsed_ms() << "ms for " << args.logs_count_ << " log entries. "
    << " shortest_key=" << args.shortest_key_length_
    << " longest_key=" << args.longest_key_length_;
}
*/

// typedef uint32_t LogIndex;

/**
 * Unlike array's sort entry, we don't always use this because keys are arbitrary lengths.
 * This is used only for the all-8-byte-keys fast path (sort_batch_8bytes below).
 */
struct SortEntry {
  inline void set(
    KeySlice first_slice,
    uint16_t compressed_epoch,
    uint32_t in_epoch_ordinal,
    snapshot::BufferPosition position) ALWAYS_INLINE {
    first_slice_ = first_slice;
    combined_epoch_ = (static_cast<uint64_t>(compressed_epoch) << 32) | in_epoch_ordinal;
    position_ = position;
  }
  inline bool operator<(const SortEntry& rhs) const ALWAYS_INLINE {
    if (first_slice_ != rhs.first_slice_) {
      return first_slice_ < rhs.first_slice_;
    }
    if (combined_epoch_ != rhs.combined_epoch_) {
      return combined_epoch_ < rhs.combined_epoch_;
    }
    return position_ < rhs.position_;
  }

  KeySlice first_slice_;
  uint64_t combined_epoch_;  // compressed_epoch_ << 32 | in_epoch_ordinal_
  snapshot::BufferPosition position_;
  uint32_t dummy2_;
  // so unfortunate that this doesn't fit in 16 bytes.
  // because it's now 24 bytes and not 16b aligned, we can't use uint128_t either.
};
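
// Example (hypothetical values): with base_epoch = 100, a log whose epoch is 103 and whose
// in-epoch ordinal is 7 gets compressed_epoch = 3, so
//   combined_epoch_ = (3ULL << 32) | 7 = 0x0000000300000007,
// and operator< orders entries by slice, then epoch, then ordinal, then buffer position.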

/** subroutine of sort_batch_8bytes */
// __attribute__ ((noinline))  // was useful to forcibly show it on cpu profile. nothing more.
void retrieve_positions(
  uint32_t logs_count,
  const SortEntry* entries,
  snapshot::BufferPosition* out) {
  // CPU profile of partition_masstree_perf: 2-3%. (10-15% if the "index" idea is used)
  for (uint32_t i = 0; i < logs_count; ++i) {
    out[i] = entries[i].position_;
  }
}

/** subroutine of sort_batch_8bytes */
// __attribute__ ((noinline))  // was useful to forcibly show it on cpu profile. nothing more.
void prepare_sort_entries(const Partitioner::SortBatchArguments& args, SortEntry* entries) {
  const Epoch base_epoch = args.base_epoch_;
  // CPU profile of partition_masstree_perf: 9-10%.
  for (uint32_t i = 0; i < args.logs_count_; ++i) {
    const MasstreeCommonLogType* log_entry = reinterpret_cast<const MasstreeCommonLogType*>(
      args.log_buffer_.resolve(args.log_positions_[i]));
    ASSERT_ND(log_entry->header_.log_type_code_ == log::kLogCodeMasstreeInsert
      || log_entry->header_.log_type_code_ == log::kLogCodeMasstreeDelete
      || log_entry->header_.log_type_code_ == log::kLogCodeMasstreeUpdate
      || log_entry->header_.log_type_code_ == log::kLogCodeMasstreeOverwrite);
    ASSERT_ND(log_entry->key_length_ == sizeof(KeySlice));
    Epoch epoch = log_entry->header_.xct_id_.get_epoch();
    ASSERT_ND(epoch.subtract(base_epoch) < (1U << 16));
    uint16_t compressed_epoch = epoch.subtract(base_epoch);
    entries[i].set(
      normalize_be_bytes_full_aligned(log_entry->get_key()),
      compressed_epoch,
      log_entry->header_.xct_id_.get_ordinal(),
      args.log_positions_[i]);
  }
}

void MasstreePartitioner::sort_batch_8bytes(const Partitioner::SortBatchArguments& args) const {
  ASSERT_ND(args.shortest_key_length_ == sizeof(KeySlice));
  ASSERT_ND(args.longest_key_length_ == sizeof(KeySlice));
  args.work_memory_->assure_capacity(sizeof(SortEntry) * args.logs_count_);

  debugging::StopWatch stop_watch_entire;
  ASSERT_ND(sizeof(SortEntry) == 24U);
  SortEntry* entries = reinterpret_cast<SortEntry*>(args.work_memory_->get_block());
  prepare_sort_entries(args, entries);

  // CPU profile of partition_masstree_perf: 80% (introsort_loop) + 9% (other inlined parts).
  std::sort(entries, entries + args.logs_count_);

  retrieve_positions(args.logs_count_, entries, args.output_buffer_);
  *args.written_count_ = args.logs_count_;
  stop_watch_entire.stop();
  VLOG(0) << "Masstree-" << id_ << " sort_batch_8bytes() done in "
    << stop_watch_entire.elapsed_ms() << "ms for " << args.logs_count_ << " log entries. "
    << " shortest_key=" << args.shortest_key_length_
    << " longest_key=" << args.longest_key_length_;
}


void MasstreePartitioner::sort_batch(const Partitioner::SortBatchArguments& args) const {
  if (args.logs_count_ == 0) {
    *args.written_count_ = 0;
    return;
  }

  if (args.longest_key_length_ == sizeof(KeySlice)
    && args.shortest_key_length_ == sizeof(KeySlice)) {
    sort_batch_8bytes(args);
  } else {
    sort_batch_general(args);
  }
}

std::ostream& operator<<(std::ostream& o, const MasstreePartitioner& v) {
  o << "<MasstreePartitioner>";
  if (v.data_) {
    o << "<partition_count_>" << v.data_->partition_count_ << "</partition_count_>"
      << "<partitions>";
    for (uint16_t i = 0; i < v.data_->partition_count_; ++i) {
      o << std::endl << "  <partition node=\"" << v.data_->partitions_[i] << "\">"
        << assorted::Hex(v.data_->low_keys_[i], 16)
        << "</partition>";
    }
    o << "</partitions>";
  } else {
    o << "Not yet designed";
  }
  o << "</MasstreePartitioner>";
  return o;
}


///////////////////////////////////////////////////////////////////////
///
///  MasstreePartitionerData methods
///
///////////////////////////////////////////////////////////////////////
uint16_t MasstreePartitionerData::find_partition(const char* key, uint16_t key_length) const {
  if (key_length == 0) {
    return partitions_[0];
  }

  // So far we do a simple sequential search here. We might want to
  // 1) binary search until the candidate count becomes less than 8, then 2) sequential search.
  ASSERT_ND(is_key_aligned_and_zero_padded(key, key_length));
  KeySlice slice = normalize_be_bytes_full_aligned(key);
  uint16_t i;
  for (i = 1; i < partition_count_; ++i) {
    if (low_keys_[i] > slice) {
      break;
    }
  }
  // now, i points to the first partition whose low_key is strictly larger than the key.
  // thus, the one before it is the right partition.
  return partitions_[i - 1];
}
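
// Example (hypothetical data): with low_keys_ = {kInfimumSlice, 0x5000000000000000,
// 0xA000000000000000} and partitions_ = {0, 1, 2}, a key whose first 8 bytes normalize to
// 0x7300000000000000 stops the loop at i = 2 (the first low_key strictly larger than the
// slice), so the function returns partitions_[1] = 1.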


}  // namespace masstree
}  // namespace storage
}  // namespace foedus