#include "foedus/snapshot/merge_sort.hpp"

#include <glog/logging.h>

#include <algorithm>
#include <cstring>

namespace foedus {
namespace snapshot {
// In extract_shortest_key_length(): fold in each remaining input's shortest key length.
for (uint16_t i = 1; i < inputs_count; ++i) {
// In extract_longest_key_length(): fold in each remaining input's longest key length.
for (uint16_t i = 1; i < inputs_count; ++i) {
MergeSort::MergeSort(
  storage::StorageId id,
  storage::StorageType type,
  Epoch base_epoch,
  SortedBuffer* const* inputs,
  uint16_t inputs_count,
  uint16_t max_original_pages,
  memory::AlignedMemory* const work_memory,
  uint16_t chunk_batch_size)
  : id_(id),
    type_(type),
    base_epoch_(base_epoch),
    shortest_key_length_(extract_shortest_key_length(inputs, inputs_count)),
    longest_key_length_(extract_longest_key_length(inputs, inputs_count)),
    inputs_(inputs),
    inputs_count_(inputs_count),
    max_original_pages_(max_original_pages),
    chunk_batch_size_(chunk_batch_size),
    work_memory_(work_memory) {
  ASSERT_ND(shortest_key_length_ <= longest_key_length_);
  // Pointers into work_memory_ are assigned later, in initialize_once().
  sort_entries_ = nullptr;
  position_entries_ = nullptr;
  original_pages_ = nullptr;
  inputs_status_ = nullptr;
}
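// Usage sketch (not from the original file): how a caller could drive this class, assuming
// the public API declared in merge_sort.hpp (initialize()/uninitialize() come from
// DefaultInitializable). "storage_id", "base_epoch", "buffers", "buffers_count",
// "max_original_pages", and "memory" are hypothetical variables the caller prepares.
//
//   MergeSort merger(
//     storage_id,                    // all input logs must belong to this storage
//     storage::kMasstreeStorage,     // and share one storage type
//     base_epoch,                    // epochs are compressed relative to this
//     buffers,                       // SortedBuffer* const*: the sorted input runs
//     buffers_count,
//     max_original_pages,
//     &memory,                       // work memory, carved up in initialize_once()
//     MergeSort::kDefaultChunkBatch);
//   COERCE_ERROR(merger.initialize());
//   while (!merger.is_ended_all()) {
//     COERCE_ERROR(merger.next_batch());
//     // ... consume the sorted batch via fetch_logs(); see the sketch further below ...
//   }
//   COERCE_ERROR(merger.uninitialize());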
ErrorStack MergeSort::initialize_once() {
  // Each batch picks at most chunk_batch_size_ chunks, plus a partial chunk per input,
  // so this capacity is a conservative upper bound on entries per batch.
  uint32_t buffer_capacity = kLogChunk * (chunk_batch_size_ + inputs_count_);
  buffer_capacity_ = assorted::align<uint32_t, 512U>(buffer_capacity);
  char* block = reinterpret_cast<char*>(work_memory_->get_block());
  // Fill the whole work memory with garbage so that reads of uninitialized
  // regions are easy to spot while debugging.
  std::memset(block, 0xDA, work_memory_->get_size());
  uint64_t offset = 0;
  sort_entries_ = reinterpret_cast<SortEntry*>(block + offset);
  offset += sizeof(SortEntry) * buffer_capacity_;
  position_entries_ = reinterpret_cast<PositionEntry*>(block + offset);
  offset += sizeof(PositionEntry) * buffer_capacity_;
  original_pages_ = reinterpret_cast<storage::Page*>(block + offset);
  offset += sizeof(storage::Page) * (max_original_pages_ + 1U);
  inputs_status_ = reinterpret_cast<InputStatus*>(block + offset);
  // Set up the initial window state for each input stream.
  for (InputIndex i = 0; i < inputs_count_; ++i) {
// In next_batch(): with a single input there is nothing to merge,
    next_batch_one_input();
// while the general case advances windows, picks chunks, and batch-sorts up to the
// threshold chunk:
    batch_sort(min_input);
void MergeSort::next_batch_one_input() {
  // Single-input fast path: nothing to merge, just convert logs to sort entries.
  InputStatus* status = inputs_status_;
  uint64_t relative_pos = status->cur_relative_pos_;
  uint64_t end_pos = status->end_absolute_pos_ - status->window_offset_;
  const uint32_t kLongestLog = 1U << 16;
  if (dynamic_cast<InMemorySortedBuffer*>(inputs_[0])) {
    VLOG(0) << "1-input in-memory case.";
    // The in-memory buffer maps the whole input, so end_pos is within the window.
    ASSERT_ND(end_pos <= status->window_size_);
  } else {
    VLOG(0) << "1-input dump-file case.";
    ASSERT_ND(dynamic_cast<DumpFileSortedBuffer*>(inputs_[0]));
    // A file window might not cover the whole input. Clamp end_pos so that even the
    // longest possible log starting before it still fits inside the window.
    if (end_pos + kLongestLog > status->window_size_) {
      end_pos = status->window_size_ - kLongestLog;
    }
  }
  ASSERT_ND(relative_pos <= end_pos + kLongestLog);
  uint64_t processed = 0;
  debugging::StopWatch watch;
  if (type_ == storage::kArrayStorage) {
    for (; LIKELY(relative_pos < end_pos && processed < buffer_capacity_); ++processed) {
      relative_pos += populate_entry_array(0, relative_pos);
    }
  } else if (type_ == storage::kMasstreeStorage) {
    for (; LIKELY(relative_pos < end_pos && processed < buffer_capacity_); ++processed) {
      relative_pos += populate_entry_masstree(0, relative_pos);
    }
  } else {
    for (; LIKELY(relative_pos < end_pos && processed < buffer_capacity_); ++processed) {
      relative_pos += populate_entry_hash(0, relative_pos);
    }
  }
  ASSERT_ND(relative_pos <= end_pos + kLongestLog);
  ASSERT_ND(processed <= buffer_capacity_);
  watch.stop();
  VLOG(0) << "1-input case. from=" << status->cur_relative_pos_ << "b. processed " << processed
    << " logs in " << watch.elapsed_ms() << "ms";
  status->cur_relative_pos_ = relative_pos;
  status->chunk_relative_pos_ = relative_pos;
  status->previous_chunk_relative_pos_ = relative_pos;
  status->assert_consistent();
}
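// Worked example of the end_pos clamp above (illustrative numbers, not from the source):
// with a 64MB window (window_size_ = 1 << 26) over a dump file that extends past the
// window, end_pos becomes (1 << 26) - (1 << 16). Any log starting below that clamped
// end_pos fits entirely inside the window even at the maximum log length, so the
// populate loops never read past the mapped region.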
ErrorStack MergeSort::advance_window() {
  // Slide each input's window forward, which is safe only while no sort entry
  // still points into the current window.
  for (InputIndex i = 0; i < inputs_count_; ++i) {
    InputStatus* status = inputs_status_ + i;
    if (status->is_ended() || status->is_last_window()) {
      continue;
    }
    ASSERT_ND(status->cur_relative_pos_ <= status->chunk_relative_pos_);
    ASSERT_ND(status->cur_relative_pos_ == status->previous_chunk_relative_pos_);
    if (status->cur_relative_pos_
      >= static_cast<uint64_t>(status->window_size_ * kWindowMoveThreshold)) {
      uint64_t cur_abs_pos = status->to_absolute_pos(status->cur_relative_pos_);
      SortedBuffer* input = inputs_[i];
      // (the call that winds the underlying buffer forward is elided in this fragment)
      status->window_offset_ = input->get_offset();
      ASSERT_ND(status->window_size_ == input->get_buffer_size());
      ASSERT_ND(status->window_ == input->get_buffer());
      ASSERT_ND(cur_abs_pos >= status->window_offset_);
      status->cur_relative_pos_ = cur_abs_pos - status->window_offset_;
      status->chunk_relative_pos_ = status->cur_relative_pos_;
      status->previous_chunk_relative_pos_ = status->cur_relative_pos_;
      status->assert_consistent();
      ASSERT_ND(status->chunk_relative_pos_ + status->get_chunk_log()->header_.log_length_
        <= status->window_size_);
    }
  }
  // After moving, every live input must still have room for a chunk in its window.
  for (InputIndex i = 0; i < inputs_count_; ++i) {
    InputStatus* status = inputs_status_ + i;
    if (status->is_ended() || status->is_last_window()) {
      continue;
    }
    ASSERT_ND(status->cur_relative_pos_ + kWindowChunkReserveBytes <= status->window_size_);
    ASSERT_ND(status->chunk_relative_pos_ + status->get_chunk_log()->header_.log_length_
      <= status->window_size_);
  }
  return kRetOk;
}
uint32_t MergeSort::fetch_logs(
  uint32_t sort_pos, uint32_t count, log::RecordLogType const** out) const {
  uint32_t fetched_count = count;
  if (sort_pos + count > current_count_) {
    fetched_count = current_count_ - sort_pos;
  }
  if (is_no_merging()) {
    // Single input: entries were never reordered, so positions are contiguous.
    for (uint32_t i = 0; i < fetched_count; ++i) {
      ASSERT_ND(sort_entries_[sort_pos + i].get_position() == sort_pos + i);
    }
    for (uint32_t i = 0; i < fetched_count; ++i) {
      MergedPosition pos = sort_pos + i;
      ASSERT_ND(position_entries_[pos].input_index_ == 0);
      out[i] = inputs_status_[0].from_compact_pos(position_entries_[pos].input_position_);
    }
    return fetched_count;
  }
  // Prefetch position entries to hide L1 cache-miss stalls, then resolve log pointers.
  for (uint32_t i = 0; i < fetched_count; ++i) {
    assorted::prefetch_cacheline(position_entries_ + sort_entries_[sort_pos + i].get_position());
  }
  for (uint32_t i = 0; i < fetched_count; ++i) {
    MergedPosition pos = sort_entries_[sort_pos + i].get_position();
    InputIndex input = position_entries_[pos].input_index_;
    out[i] = inputs_status_[input].from_compact_pos(position_entries_[pos].input_position_);
  }
  return fetched_count;
}
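// Consumer sketch (not from the original file): draining one merged batch with the
// fetch_logs() API above. kFetchBatch, process(), and the merger handle are hypothetical.
//
//   const uint32_t kFetchBatch = 64;
//   const log::RecordLogType* out[kFetchBatch];
//   for (uint32_t pos = 0; pos < merger.get_current_count();) {
//     uint32_t fetched = merger.fetch_logs(pos, kFetchBatch, out);
//     for (uint32_t i = 0; i < fetched; ++i) {
//       process(out[i]);  // logs arrive in fully sorted order within the batch
//     }
//     pos += fetched;
//   }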
void MergeSort::next_chunk(InputIndex input_index) {
  InputStatus* status = inputs_status_ + input_index;
  ASSERT_ND(!status->is_ended());
  ASSERT_ND(!status->is_last_chunk_in_window());
  status->assert_consistent();
  uint64_t pos = status->chunk_relative_pos_;
  uint64_t relative_end = status->end_absolute_pos_ - status->window_offset_;
  if (relative_end >= status->window_size_) {
    relative_end = status->window_size_;
  }
  ASSERT_ND(pos + status->from_byte_pos(pos)->header_.log_length_ <= status->window_size_);
  if (status->is_last_window()) {
    // In the last window we can safely read up to relative_end.
    for (uint32_t i = 0; i < kLogChunk; ++i) {
      uint16_t log_length = status->from_byte_pos(pos)->header_.log_length_;
      ASSERT_ND(pos + log_length <= relative_end);
      if (pos + log_length >= relative_end) {
        break;  // this is the last log of the input
      }
      pos += log_length;
    }
  } else {
    // Otherwise advance carefully; reading past the window would be a SIGSEGV.
    uint64_t next_pos = pos;
    for (uint32_t i = 0; i < kLogChunk; ++i) {
      ASSERT_ND(next_pos < status->window_size_);
      const log::RecordLogType* the_log = status->from_byte_pos(next_pos);
      uint16_t log_length = the_log->header_.log_length_;
      if (next_pos + log_length >= relative_end) {
        break;
      }
      pos = next_pos;
      next_pos += log_length;
      // (an additional check against the window-reserved bytes is elided in this fragment)
    }
  }
  ASSERT_ND(pos + status->from_byte_pos(pos)->header_.log_length_ <= status->window_size_);
  status->previous_chunk_relative_pos_ = status->chunk_relative_pos_;
  status->chunk_relative_pos_ = pos;
  status->assert_consistent();
  ASSERT_ND(status->chunk_relative_pos_ + status->get_chunk_log()->header_.log_length_
    <= status->window_size_);
}
MergeSort::InputIndex MergeSort::determine_min_input() const {
  InputIndex min_input = kInvalidInput;
  for (InputIndex i = 0; i < inputs_count_; ++i) {
    InputStatus* status = inputs_status_ + i;
    if (status->is_ended() || status->is_last_chunk_overall()) {
      continue;
    }
    if (min_input == kInvalidInput) {
      min_input = i;
    } else {
      ASSERT_ND(!inputs_status_[min_input].is_ended());
      ASSERT_ND(!inputs_status_[min_input].is_last_chunk_overall());
      if (compare_logs(status->get_chunk_log(), inputs_status_[min_input].get_chunk_log()) < 0) {
        min_input = i;
      }
    }
  }
  return min_input;
}
MergeSort::InputIndex MergeSort::pick_chunks() {
  uint32_t chunks;
  for (chunks = 0; chunks < chunk_batch_size_; ++chunks) {
    InputIndex min_input = determine_min_input();
    if (min_input == kInvalidInput) {
      break;  // all inputs are already at their last chunk
    }
    if (inputs_status_[min_input].is_last_chunk_in_window()) {
      VLOG(1) << "Min Input-" << min_input << " needs to shift window. chunks=" << chunks;
      break;
    }
    next_chunk(min_input);
  }
  VLOG(1) << "Now determining batch-threshold... chunks=" << chunks;
  return determine_min_input();
}
void MergeSort::batch_sort(MergeSort::InputIndex min_input) {
  batch_sort_prepare(min_input);
  ASSERT_ND(current_count_ <= buffer_capacity_);
  // SortEntry packs key, compressed epoch, ordinal, and position into one 128-bit
  // integer, so sorting the raw data_ fields gives the right order without a comparator.
  debugging::StopWatch sort_watch;
  std::sort(&(sort_entries_->data_), &(sort_entries_[current_count_].data_));
  sort_watch.stop();
  VLOG(1) << "Storage-" << id_ << ", merge sort (main) of " << current_count_ << " logs in "
    << sort_watch.elapsed_ms() << "ms";
  // Variable-length keys might tie on the 8-byte prefix; adjust those runs.
  if (type_ == storage::kMasstreeStorage
    && (shortest_key_length_ != 8U || longest_key_length_ != 8U)) {
    batch_sort_adjust_sort();
  }
}
void MergeSort::batch_sort_prepare(MergeSort::InputIndex min_input) {
  current_count_ = 0;
  if (min_input == kInvalidInput) {
    // Last iteration: every input is at its last chunk, so take all remaining logs.
    for (InputIndex i = 0; i < inputs_count_; ++i) {
      InputStatus* status = inputs_status_ + i;
      ASSERT_ND(status->is_last_chunk_overall());
      if (status->is_ended()) {
        continue;
      }
      append_logs(i, status->chunk_relative_pos_ + status->get_chunk_log()->header_.log_length_);
      status->assert_consistent();
    }
  } else {
    // Otherwise only logs below the threshold (the minimum chunk-last log) are safe to take.
    const log::RecordLogType* threshold = inputs_status_[min_input].get_chunk_log();
    for (InputIndex i = 0; i < inputs_count_; ++i) {
      InputStatus* status = inputs_status_ + i;
      if (status->is_ended()) {
        continue;
      }
      if (i == min_input) {
        // The threshold input itself: everything before its chunk log is below the threshold.
        append_logs(i, status->chunk_relative_pos_);
        ASSERT_ND(status->chunk_relative_pos_ == status->cur_relative_pos_);
      } else {
        // Logs up to the previous chunk are known to be below the threshold.
        if (status->previous_chunk_relative_pos_ != status->chunk_relative_pos_) {
          append_logs(i, status->previous_chunk_relative_pos_);
          ASSERT_ND(status->previous_chunk_relative_pos_ == status->cur_relative_pos_);
        }
        // The rest of the chunk must be compared against the threshold one by one.
        uint64_t cur = status->cur_relative_pos_;
        uint64_t end = status->chunk_relative_pos_ + status->get_chunk_log()->header_.log_length_;
        while (cur < end) {
          const log::RecordLogType* the_log = status->from_byte_pos(cur);
          if (compare_logs(the_log, threshold) >= 0) {
            break;  // this and all following logs are at or above the threshold
          }
          cur += the_log->header_.log_length_;
        }
        append_logs(i, cur);
        if (cur == end) {
          ASSERT_ND(status->is_last_chunk_overall());
        }
        status->assert_consistent();
      }
    }
  }
}
void MergeSort::batch_sort_adjust_sort() {
  debugging::StopWatch sort_watch;
  MergedPosition cur = 0;
  uint32_t debug_stat_run_count = 0;
  uint32_t debug_stat_longest_run = 0;
  uint32_t debug_stat_runs_total = 0;
  while (LIKELY(cur + 1U < current_count_)) {
    // If the 8-byte key is strictly smaller than the next, the order is already final.
    uint64_t short_key = sort_entries_[cur].get_key();
    ASSERT_ND(short_key <= sort_entries_[cur + 1U].get_key());
    if (LIKELY(short_key < sort_entries_[cur + 1U].get_key())) {
      ++cur;
      continue;
    }
    // Figure out how long the run of the same 8-byte key is, and whether any entry in
    // the run has a key that is not fully contained in the 8-byte prefix.
    uint32_t next = cur + 2U;
    bool needs_to_check = sort_entries_[cur].needs_additional_check()
      || sort_entries_[cur + 1U].needs_additional_check();
    for (next = cur + 2U;
        next < current_count_ && short_key == sort_entries_[next].get_key();
        ++next) {
      ASSERT_ND(short_key <= sort_entries_[next].get_key());
      needs_to_check = needs_to_check || sort_entries_[next].needs_additional_check();
    }
    uint32_t run_length = next - cur;
    debug_stat_runs_total += run_length;
    debug_stat_longest_run = std::max<uint32_t>(debug_stat_longest_run, run_length);
    ++debug_stat_run_count;
    if (needs_to_check) {
      // Re-sort just this run with the full-key comparator.
      AdjustComparatorMasstree comparator(position_entries_, inputs_status_);
      std::sort(sort_entries_ + cur, sort_entries_ + next, comparator);
    }
    cur = next;
  }
  sort_watch.stop();
  VLOG(1) << "Storage-" << id_ << ", merge sort (adjust) of " << current_count_ << " logs in "
    << sort_watch.elapsed_ms() << "ms. run_count=" << debug_stat_run_count << ", "
    << "longest_run=" << debug_stat_longest_run << ", total_runs=" << debug_stat_runs_total;
}
template <typename T>
int compare_logs_as(const log::RecordLogType* lhs, const log::RecordLogType* rhs) {
  const T* lhs_log = reinterpret_cast<const T*>(lhs);
  const T* rhs_log = reinterpret_cast<const T*>(rhs);
  return T::compare_logs(lhs_log, rhs_log);
}

int MergeSort::compare_logs(const log::RecordLogType* lhs, const log::RecordLogType* rhs) const {
  if (type_ == storage::kArrayStorage) {
    return compare_logs_as< storage::array::ArrayCommonUpdateLogType >(lhs, rhs);
  } else if (type_ == storage::kMasstreeStorage) {
    return compare_logs_as< storage::masstree::MasstreeCommonLogType >(lhs, rhs);
  } else {
    return compare_logs_as< storage::hash::HashCommonLogType >(lhs, rhs);
  }
}
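// Note: compare_logs_as<T>() is compile-time dispatch. Each storage's common log type
// supplies a static T::compare_logs(), so the merge inner loops avoid virtual calls;
// the three returns above are the per-storage-type branches of MergeSort::compare_logs().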
void MergeSort::append_logs(MergeSort::InputIndex input_index, uint64_t upto_relative_pos) {
  InputStatus* status = inputs_status_ + input_index;
  ASSERT_ND(status->to_absolute_pos(upto_relative_pos) <= status->end_absolute_pos_);
  uint64_t relative_pos = status->cur_relative_pos_;
  if (type_ == storage::kArrayStorage) {
    while (LIKELY(relative_pos < upto_relative_pos)) {
      relative_pos += populate_entry_array(input_index, relative_pos);
    }
  } else if (type_ == storage::kMasstreeStorage) {
    while (LIKELY(relative_pos < upto_relative_pos)) {
      relative_pos += populate_entry_masstree(input_index, relative_pos);
    }
  } else {
    while (LIKELY(relative_pos < upto_relative_pos)) {
      relative_pos += populate_entry_hash(input_index, relative_pos);
    }
  }
  ASSERT_ND(relative_pos == upto_relative_pos);
  if (upto_relative_pos > status->chunk_relative_pos_) {
    // We went past the current chunk, which can happen only in the last chunk.
    ASSERT_ND(status->is_last_chunk_overall());
    status->chunk_relative_pos_ = upto_relative_pos;
  }
  status->cur_relative_pos_ = upto_relative_pos;
  status->previous_chunk_relative_pos_ = upto_relative_pos;
  status->assert_consistent();
}
inline uint16_t MergeSort::populate_entry_array(InputIndex input_index, uint64_t relative_pos) {
  InputStatus* status = inputs_status_ + input_index;
  ASSERT_ND(current_count_ < buffer_capacity_);
  ASSERT_ND(relative_pos < status->window_size_);
  const storage::array::ArrayCommonUpdateLogType* the_log
    = reinterpret_cast<const storage::array::ArrayCommonUpdateLogType*>(
      status->from_byte_pos(relative_pos));
  the_log->assert_valid_generic();
  Epoch epoch = the_log->header_.xct_id_.get_epoch();
  ASSERT_ND(epoch.subtract(base_epoch_) < (1U << 16));
  uint16_t compressed_epoch = epoch.subtract(base_epoch_);
  sort_entries_[current_count_].set(
    the_log->offset_,  // the array offset is the entire 8-byte key
    compressed_epoch,
    the_log->header_.xct_id_.get_ordinal(),
    false,  // an 8-byte key never needs the additional check
    current_count_);
  position_entries_[current_count_].input_index_ = input_index;
  position_entries_[current_count_].log_type_ = the_log->header_.log_type_code_;
  position_entries_[current_count_].input_position_ = to_buffer_position(relative_pos);
  ++current_count_;
  return the_log->header_.log_length_;
}
inline uint16_t MergeSort::populate_entry_hash(InputIndex input_index, uint64_t relative_pos) {
  InputStatus* status = inputs_status_ + input_index;
  ASSERT_ND(current_count_ < buffer_capacity_);
  ASSERT_ND(relative_pos < status->window_size_);
  const storage::hash::HashCommonLogType* the_log
    = reinterpret_cast<const storage::hash::HashCommonLogType*>(
      status->from_byte_pos(relative_pos));
  the_log->assert_type();
  Epoch epoch = the_log->header_.xct_id_.get_epoch();
  ASSERT_ND(epoch.subtract(base_epoch_) < (1U << 16));
  uint16_t compressed_epoch = epoch.subtract(base_epoch_);
  uint16_t key_length = the_log->key_length_;
  ASSERT_ND(key_length >= shortest_key_length_);
  ASSERT_ND(key_length <= longest_key_length_);
  sort_entries_[current_count_].set(
    // (key argument: the log's hash bin; exact expression elided in this fragment)
    compressed_epoch,
    the_log->header_.xct_id_.get_ordinal(),
    true,  // same-bin logs always need the full-key comparison
    current_count_);
  position_entries_[current_count_].input_index_ = input_index;
  position_entries_[current_count_].log_type_ = the_log->header_.log_type_code_;
  position_entries_[current_count_].input_position_ = to_buffer_position(relative_pos);
  ++current_count_;
  return the_log->header_.log_length_;
}
inline uint16_t MergeSort::populate_entry_masstree(InputIndex input_index, uint64_t relative_pos) {
  InputStatus* status = inputs_status_ + input_index;
  ASSERT_ND(current_count_ < buffer_capacity_);
  ASSERT_ND(relative_pos < status->window_size_);
  const storage::masstree::MasstreeCommonLogType* the_log
    = reinterpret_cast<const storage::masstree::MasstreeCommonLogType*>(
      status->from_byte_pos(relative_pos));
  the_log->assert_valid_generic();
  Epoch epoch = the_log->header_.xct_id_.get_epoch();
  ASSERT_ND(epoch.subtract(base_epoch_) < (1U << 16));
  uint16_t compressed_epoch = epoch.subtract(base_epoch_);
  uint16_t key_length = the_log->key_length_;
  ASSERT_ND(key_length >= shortest_key_length_);
  ASSERT_ND(key_length <= longest_key_length_);
  sort_entries_[current_count_].set(
    the_log->get_first_slice(),  // first 8-byte slice of the key
    compressed_epoch,
    the_log->header_.xct_id_.get_ordinal(),
    key_length != 8U,  // longer or shorter keys tie on the slice and need the full check
    current_count_);
  position_entries_[current_count_].input_index_ = input_index;
  position_entries_[current_count_].log_type_ = the_log->header_.log_type_code_;
  position_entries_[current_count_].input_position_ = to_buffer_position(relative_pos);
  ++current_count_;
  return the_log->header_.log_length_;
}
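// Worked example of the epoch compression shared by the three populate_entry_*() paths
// (illustrative numbers, not from the source): if base_epoch_ is 1000 and a log's epoch
// is 1234, epoch.subtract(base_epoch_) = 234, which fits in the 16-bit compressed_epoch
// because the asserts guarantee every input epoch is within 2^16 of base_epoch_. Entries
// with equal keys then order by (compressed_epoch, in-epoch ordinal), i.e. serialization
// order.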
// From MergeSort::assert_sorted() (debug/test only; most of the function is elided in this
// fragment): it re-derives each entry's key from the log it points to and verifies that
// adjacent entries compare in non-decreasing order.
  const log::RecordLogType* cur = inputs_status_[position_entries_[cur_pos].input_index_]
    .from_compact_pos(position_entries_[cur_pos].input_position_);
  uint16_t compressed_epoch = epoch.subtract(base_epoch_);
    casted->get_first_slice(),
  const log::RecordLogType* prev = inputs_status_[position_entries_[prev_pos].input_index_]
    .from_compact_pos(position_entries_[prev_pos].input_position_);
  int cmp = compare_logs(prev, cur);