libfoedus-core
FOEDUS Core Library
hash_page_impl.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014-2015, Hewlett-Packard Development Company, LP.
3  * This program is free software; you can redistribute it and/or modify it
4  * under the terms of the GNU General Public License as published by the Free
5  * Software Foundation; either version 2 of the License, or (at your option)
6  * any later version.
7  *
8  * This program is distributed in the hope that it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11  * more details. You should have received a copy of the GNU General Public
12  * License along with this program; if not, write to the Free Software
13  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14  *
15  * HP designates this particular file as subject to the "Classpath" exception
16  * as provided by HP in the LICENSE.txt file that accompanied this code.
17  */
19 
20 #include <glog/logging.h>
21 
22 #include <cstring>
23 #include <string>
24 #include <thread>
25 #include <vector>
26 
27 #include "foedus/assert_nd.hpp"
28 #include "foedus/engine.hpp"
35 #include "foedus/thread/thread.hpp"
36 
37 namespace foedus {
38 namespace storage {
39 namespace hash {
41  StorageId storage_id,
42  VolatilePagePointer page_id,
43  const HashIntermediatePage* parent,
44  uint8_t level,
45  HashBin start_bin) {
46  std::memset(this, 0, kPageSize);
47  header_.init_volatile(page_id, storage_id, kHashIntermediatePageType);
48  bin_range_.begin_ = start_bin;
49  bin_range_.end_ = bin_range_.begin_ + kHashMaxBins[level + 1U];
50  header_.set_in_layer_level(level);
51  if (parent) {
52  ASSERT_ND(parent->get_level() > 0);
53  ASSERT_ND(level + 1U == parent->get_level());
54  ASSERT_ND(parent->bin_range_.contains(bin_range_));
55  }
56 }
57 
59  StorageId storage_id,
60  SnapshotPagePointer page_id,
61  uint8_t level,
62  HashBin start_bin) {
63  std::memset(this, 0, kPageSize);
64  header_.init_snapshot(page_id, storage_id, kHashIntermediatePageType);
65  bin_range_.begin_ = start_bin;
66  bin_range_.end_ = bin_range_.begin_ + kHashMaxBins[level + 1U];
67  header_.set_in_layer_level(level);
68 }
69 
71  StorageId storage_id,
72  VolatilePagePointer page_id,
73  const Page* parent,
74  HashBin bin,
75  uint8_t bin_shifts) {
76  std::memset(this, 0, kPageSize);
77  header_.init_volatile(page_id, storage_id, kHashDataPageType);
78  bin_ = bin;
79  protected_set_bin_shifts(bin_shifts);
80  ASSERT_ND(parent);
82  const HashIntermediatePage* parent_casted
83  = reinterpret_cast<const HashIntermediatePage*>(parent);
84  ASSERT_ND(parent_casted->get_level() == 0);
85  ASSERT_ND(parent_casted->get_bin_range().contains(bin));
86  } else {
87  const HashDataPage* parent_casted = reinterpret_cast<const HashDataPage*>(parent);
88  ASSERT_ND(parent_casted->get_bin() == bin);
89  ASSERT_ND(parent_casted->get_bin_shifts() == bin_shifts);
90  }
91 }
92 
94  StorageId storage_id,
95  SnapshotPagePointer page_id,
96  HashBin bin,
97  uint8_t bin_bits,
98  uint8_t bin_shifts) {
99  ASSERT_ND(bin_bits + bin_shifts == 64U);
100  std::memset(this, 0, kPageSize);
101  header_.init_snapshot(page_id, storage_id, kHashDataPageType);
102  bin_ = bin;
103  protected_set_bin_shifts(bin_shifts);
104 }
105 
107  HashValue hash,
108  const BloomFilterFingerprint& fingerprint,
109  const void* key,
110  KeyLength key_length,
111  DataPageSlotIndex record_count,
112  DataPageSlotIndex check_from) const {
113  // invariant checks
114  ASSERT_ND(hash == hashinate(key, key_length));
116  ASSERT_ND(record_count <= get_record_count()); // it must be increasing.
117 
118 #ifndef NDEBUG
119  // Check the invariant on check_from
120  for (uint16_t i = 0; i < check_from; ++i) {
121  const Slot& s = get_slot(i);
122  if (LIKELY(s.hash_ != hash) || s.key_length_ != key_length) {
123  continue;
124  } else if (s.tid_.xct_id_.is_moved()) {
125  continue;
126  }
127 
128  const char* data = record_from_offset(s.offset_);
129  ASSERT_ND(s.key_length_ != key_length || std::memcmp(data, key, key_length) != 0);
130  }
131 #endif // NDEBUG
132 
133  // check bloom filter first.
134  if (!bloom_filter_.contains(fingerprint)) {
135  return kSlotNotFound;
136  }
137 
138  // then most likely this page contains it. let's check one by one.
139  for (uint16_t i = check_from; i < record_count; ++i) {
140  const Slot& s = get_slot(i);
141  if (LIKELY(s.hash_ != hash) || s.key_length_ != key_length) {
142  continue;
143  }
144  // At this point, we don't take read-set (this is a physical search).
145  // We thus mind seeing being-written. The logical check will follow.
146  // We can also simply check is_moved because the flag is guaranteed to remain once set.
147  // But, remember, it might be NOW being moved, so a logical read-set must follow.
148  if (s.tid_.xct_id_.is_moved()) {
149  // not so rare. this happens.
150  DVLOG(1) << "Hash matched, but the record was moved";
151  continue;
152  }
153 
154  const char* data = record_from_offset(s.offset_);
155  if (s.key_length_ == key_length && std::memcmp(data, key, key_length) == 0) {
156  return i;
157  }
158  // hash matched, but key didn't match? wow, that's rare
159  DLOG(INFO) << "Hash matched, but key didn't match. interesting. hash="
160  << assorted::Hex(hash, 16) << ", key="
161  << assorted::HexString(std::string(reinterpret_cast<const char*>(key), key_length))
162  << ", key_slot=" << assorted::HexString(std::string(data, s.key_length_));
163  }
164 
165  // should be 1~2%
166  DVLOG(0) << "Nope, bloom filter contained it, but key not found in this page. false positive";
167  return kSlotNotFound;
168 }
169 
171  HashValue hash,
172  const BloomFilterFingerprint& fingerprint,
173  const void* key,
174  uint16_t key_length,
175  uint16_t payload_length) {
176  ASSERT_ND(header_.page_version_.is_locked());
177  ASSERT_ND(available_space() >= HashDataPage::required_space(key_length, payload_length));
179  Slot& slot = get_slot(index);
180  slot.offset_ = next_offset();
181  slot.hash_ = hash;
182  slot.key_length_ = key_length;
183  slot.physical_record_length_ = assorted::align8(key_length) + assorted::align8(payload_length);
184  slot.payload_length_ = 0;
185  char* record = record_from_offset(slot.offset_);
186  std::memcpy(record, key, key_length);
187  if (key_length % 8 != 0) {
188  std::memset(record + key_length, 0, 8 - (key_length % 8));
189  }
190  xct::XctId initial_id;
191  initial_id.set(
192  Epoch::kEpochInitialCurrent, // TODO(Hideaki) this should be something else
193  0);
194  initial_id.set_deleted();
195  // FIXME(tzwang): do this in a more meaningful place,
196  // e.g., have something like get_new_slot.
197  slot.tid_.reset();
198  slot.tid_.xct_id_ = initial_id;
199 
200  // we install the fingerprint to bloom filter BEFORE we increment key count.
201  // it's okay for concurrent reads to see false positives, but false negatives are wrong!
202  bloom_filter_.add(fingerprint);
203 
204  // we increment key count AFTER installing the key because otherwise the optimistic read
205  // might see the record but find that the key doesn't match. we need a fence to prevent it.
207  header_.increment_key_count();
208 
209  return index;
210 }
211 
212 
214  ASSERT_ND(args.parent_); // because this is always called for non-root pages.
215  ASSERT_ND(args.page_);
217  StorageId storage_id = args.parent_->get_header().storage_id_;
218  HashIntermediatePage* page = reinterpret_cast<HashIntermediatePage*>(args.page_);
219 
221  const HashIntermediatePage* parent = reinterpret_cast<const HashIntermediatePage*>(args.parent_);
222 
223  ASSERT_ND(parent->get_level() > 0);
224  parent->assert_range();
225  HashBin interval = kHashMaxBins[parent->get_level()];
226  HashBin parent_begin = parent->get_bin_range().begin_;
227  HashBin begin = parent_begin + args.index_in_parent_ * interval;
228  uint8_t level = parent->get_level() - 1U;
229  page->initialize_volatile_page(storage_id, args.page_id, parent, level, begin);
230 }
231 
233  ASSERT_ND(args.parent_);
234  ASSERT_ND(args.page_);
235  StorageId storage_id = args.parent_->get_header().storage_id_;
236  HashDataPage* page = reinterpret_cast<HashDataPage*>(args.page_);
237  PageType parent_type = args.parent_->get_header().get_page_type();
238  uint64_t bin;
239  if (parent_type == kHashIntermediatePageType) {
240  const HashIntermediatePage* parent
241  = reinterpret_cast<const HashIntermediatePage*>(args.parent_);
242 
244  ASSERT_ND(parent->get_level() == 0);
246  bin = parent->get_bin_range().begin_ + args.index_in_parent_;
247  } else {
248  ASSERT_ND(parent_type == kHashDataPageType);
249  ASSERT_ND(args.index_in_parent_ == 0);
250  const HashDataPage* parent = reinterpret_cast<const HashDataPage*>(args.parent_);
251  bin = parent->get_bin();
252  }
253  HashStorage storage(args.context_->get_engine(), storage_id);
254  uint8_t bin_shifts = storage.get_bin_shifts();
255  page->initialize_volatile_page(storage_id, args.page_id, args.parent_, bin, bin_shifts);
256 }
257 
258 // Parallel page release for shutdown/drop. simpler than masstree package
259 
261  const memory::GlobalVolatilePageResolver& page_resolver
264  = reinterpret_cast<HashIntermediatePage*>(page_resolver.resolve_offset(pointer));
266  memory::PageReleaseBatch release_batch(engine);
267  p->release_pages_recursive(page_resolver, &release_batch);
268  release_batch.release_all();
269 }
270 
272  if (get_level() == 0) {
273  // root page is a leaf page.. don't bother parallelize.
274  const memory::GlobalVolatilePageResolver& page_resolver
276  memory::PageReleaseBatch release_batch(engine);
277  release_pages_recursive(page_resolver, &release_batch);
278  release_batch.release_all();
279  } else {
280  // so far, we spawn a thread for every single pointer.
281  // it might be an oversubscription, but not a big issue.
282  std::vector<std::thread> threads;
283  for (uint8_t i = 0; i < kHashIntermediatePageFanout; ++i) {
284  VolatilePagePointer pointer = pointers_[i].volatile_pointer_;
285  if (!pointer.is_null()) {
286  threads.emplace_back(release_parallel, engine, pointer);
287  }
288  }
289 
290  for (auto& t : threads) {
291  t.join();
292  }
293 
294  VolatilePagePointer volatile_id;
295  volatile_id.word = header().page_id_;
297  volatile_id.get_numa_node())->get_volatile_pool();
298  pool->release_one(volatile_id.get_offset());
299  }
300 }
301 
303  const memory::GlobalVolatilePageResolver& page_resolver,
304  memory::PageReleaseBatch* batch) {
305  for (uint8_t i = 0; i < kHashIntermediatePageFanout; ++i) {
306  VolatilePagePointer pointer = pointers_[i].volatile_pointer_;
307  if (!pointer.is_null()) {
308  Page* page = page_resolver.resolve_offset(pointer);
309  if (get_level() == 0) {
310  HashDataPage* child = reinterpret_cast<HashDataPage*>(page);
312  ASSERT_ND(child->header().get_in_layer_level() == 0);
313  child->release_pages_recursive(page_resolver, batch);
314  } else {
315  HashIntermediatePage* child = reinterpret_cast<HashIntermediatePage*>(page);
317  ASSERT_ND(child->get_level() + 1U == get_level());
318  child->release_pages_recursive(page_resolver, batch);
319  }
320  pointer.clear();
321  }
322  }
323 
324  VolatilePagePointer volatile_id;
325  volatile_id.word = header().page_id_;
326  batch->release(volatile_id);
327 }
328 
330  const memory::GlobalVolatilePageResolver& page_resolver,
331  memory::PageReleaseBatch* batch) {
332  if (!next_page_.volatile_pointer_.is_null()) {
333  HashDataPage* next = reinterpret_cast<HashDataPage*>(
334  page_resolver.resolve_offset(next_page_.volatile_pointer_));
335  ASSERT_ND(next->header().get_in_layer_level() == 0);
336  ASSERT_ND(next->get_bin() == get_bin());
337  next->release_pages_recursive(page_resolver, batch);
338  next_page_.volatile_pointer_.clear();
339  }
340 
341  VolatilePagePointer volatile_id;
342  volatile_id.word = header().page_id_;
343  batch->release(volatile_id);
344 }
345 
346 } // namespace hash
347 } // namespace storage
348 } // namespace foedus
storage::Page * resolve_offset(uint8_t numa_node, PagePoolOffset offset) const __attribute__((always_inline))
Resolves offset plus NUMA node ID to storage::Page*.
T align8(T value)
8-alignment.
The first epoch (before wrap-around) that might have transactions is ep-3.
Definition: epoch.hpp:77
void release_all()
Called at the end to return all batched pages to their pools.
Definition: page_pool.cpp:189
uint16_t next_offset() const
Returns offset of a next record.
void set_deleted() __attribute__((always_inline))
Definition: xct_id.hpp:1027
Page pool for volatile read/write store (VolatilePage) and the read-only bufferpool (SnapshotPage)...
Definition: page_pool.hpp:173
uint32_t StorageId
Unique ID for storage.
Definition: storage_id.hpp:55
const HashBinRange & get_bin_range() const
xct::RwLockableXctId tid_
TID of the record.
void init_volatile(VolatilePagePointer page_id, StorageId storage_id, PageType page_type) __attribute__((always_inline))
Definition: page.hpp:284
Root package of FOEDUS (Fast Optimistic Engine for Data Unification Services).
Definition: assert_nd.hpp:44
uint16_t payload_length_
Byte length of the payload.
void increment_key_count() __attribute__((always_inline))
Definition: page.hpp:318
StorageId storage_id_
ID of the storage this page belongs to.
Definition: page.hpp:196
const GlobalVolatilePageResolver & get_global_volatile_page_resolver() const
Returns the page resolver to convert volatile page ID to page pointer.
PageType
The following 1-byte value is stored in the common page header.
Definition: page.hpp:46
const DataPageSlotIndex kSlotNotFound
Definition: hash_id.hpp:197
void release_one(PagePoolOffset offset)
Returns only one page.
Definition: page_pool.cpp:144
Represents a pointer to a volatile page with modification count for preventing ABA.
Definition: storage_id.hpp:194
bool contains(const BloomFilterFingerprint &fingerprint) const __attribute__((always_inline))
Persistent status part of Transaction ID.
Definition: xct_id.hpp:955
XctId xct_id_
the second 64bit: Persistent status part of TID.
Definition: xct_id.hpp:1137
DataPageSlotIndex reserve_record(HashValue hash, const BloomFilterFingerprint &fingerprint, const void *key, uint16_t key_length, uint16_t payload_length)
A system transaction that creates a logically deleted record in this page for the given key...
void initialize_snapshot_page(StorageId storage_id, SnapshotPagePointer page_id, HashBin bin, uint8_t bin_bits, uint8_t bin_shifts)
#define LIKELY(x)
Hints that x is highly likely true.
Definition: compiler.hpp:103
Engine * get_engine() const
Definition: thread.cpp:52
const Slot & get_slot(DataPageSlotIndex record) const __attribute__((always_inline))
uint16_t key_length_
Byte length of key of the record.
uint16_t index_in_parent_
[IN] Some index (meaning depends on page type) of pointer in parent page to the new page...
Definition: page.hpp:375
bool contains(HashBin hash) const
Definition: hash_id.hpp:176
HashValue hash_
Full hash of the key of the record.
void set(Epoch::EpochInteger epoch_int, uint32_t ordinal)
Definition: xct_id.hpp:958
Page * page_
[IN, OUT] The new page to initialize.
Definition: page.hpp:371
VolatilePagePointer volatile_pointer_
Definition: storage_id.hpp:308
thread::Thread * context_
[IN] Thread on which the procedure is running
Definition: page.hpp:367
HashValue hashinate(const void *key, uint16_t key_length)
Calculates hash value for general input.
void protected_set_bin_shifts(uint8_t bin_shifts)
this method should be called only in page creation.
memory::PagePoolOffset get_offset() const
Definition: storage_id.hpp:202
A helper class to return a bunch of pages to individual nodes.
Definition: page_pool.hpp:276
uint64_t SnapshotPagePointer
Page ID of a snapshot page.
Definition: storage_id.hpp:79
PageType get_page_type() const
Definition: page.hpp:280
Database engine object that holds all resources and provides APIs.
Definition: engine.hpp:109
void reset() __attribute__((always_inline))
used only while page initialization
Definition: xct_id.hpp:1150
NumaNodeMemoryRef * get_node_memory(foedus::thread::ThreadGroupId group) const
void release_parallel(Engine *engine, VolatilePagePointer pointer)
const Page * parent_
[IN] Parent of the new page.
Definition: page.hpp:373
Fix-sized slot for each record, which is placed at the end of data region.
Set of arguments, both inputs and outputs, given to each volatile page initializer.
Definition: page.hpp:365
Just a marker to denote that the memory region represents a data page.
Definition: page.hpp:334
PageVersion page_version_
Used in several storage types as concurrency control mechanism for the page.
Definition: page.hpp:272
HashBin begin_
Inclusive beginning of the range.
Definition: hash_id.hpp:191
uint16_t DataPageSlotIndex
Definition: hash_id.hpp:196
Equivalent to std::hex in case the stream doesn't support it.
HashBin end_
Exclusive end of the range.
Definition: hash_id.hpp:193
uint16_t physical_record_length_
Byte count this record occupies.
void release_pages_recursive(const memory::GlobalVolatilePageResolver &page_resolver, memory::PageReleaseBatch *batch)
void initialize_volatile_page(StorageId storage_id, VolatilePagePointer page_id, const HashIntermediatePage *parent, uint8_t level, HashBin start_bin)
Called only when this page is initialized.
Represents a key-value store based on a dense and regular hash.
Represents an intermediate page in Hashtable Storage.
void initialize_snapshot_page(StorageId storage_id, SnapshotPagePointer page_id, uint8_t level, HashBin start_bin)
static uint16_t required_space(uint16_t key_length, uint16_t payload_length)
returns physical_record_length_ for a new record
static BloomFilterFingerprint extract_fingerprint(HashValue fullhash)
void hash_intermediate_volatile_page_init(const VolatilePageInitArguments &args)
volatile page initialize callback for HashIntermediatePage.
VolatilePagePointer page_id
[IN] New page ID
Definition: page.hpp:369
char * record_from_offset(uint16_t offset)
uint64_t HashBin
Represents a bin of a hash value.
Definition: hash_id.hpp:142
void add(const BloomFilterFingerprint &fingerprint) __attribute__((always_inline))
Adds the fingerprint to this bloom filter.
Represents an individual data page in Hashtable Storage.
uint16_t available_space() const
Returns usable space in bytes.
void hash_data_volatile_page_init(const VolatilePageInitArguments &args)
volatile page initialize callback for HashDataPage.
void release(storage::VolatilePagePointer page_id)
Returns the given in-memory volatile page to appropriate NUMA node.
Definition: page_pool.hpp:292
void init_snapshot(SnapshotPagePointer page_id, StorageId storage_id, PageType page_type) __attribute__((always_inline))
Definition: page.hpp:301
A fingerprint for bloom filter in each HashDataPage.
void initialize_volatile_page(StorageId storage_id, VolatilePagePointer page_id, const Page *parent, HashBin bin, uint8_t bin_shifts)
uint16_t KeyLength
Represents a byte-length of a key in this package.
Definition: hash_id.hpp:203
Convenient way of writing hex integers to stream.
Resolves an offset in a volatile page pool to an actual pointer and vice versa.
PageHeader & get_header()
At least the basic header exists in all pages.
Definition: page.hpp:336
const uint8_t kHashIntermediatePageFanout
Number of pointers in an intermediate page of hash storage.
Definition: hash_id.hpp:49
const uint64_t kHashMaxBins[]
kHashTotalBins[n] gives the maximum number of hash bins n-level hash can hold.
Definition: hash_id.hpp:74
uint16_t get_record_count() const __attribute__((always_inline))
uint8_t get_in_layer_level() const
Definition: page.hpp:281
bool is_moved() const __attribute__((always_inline))
Definition: xct_id.hpp:1041
#define ASSERT_ND(x)
A warning-free wrapper macro of assert() that has no performance effect in release mode even when 'x'...
Definition: assert_nd.hpp:72
bool is_locked() const __attribute__((always_inline))
Definition: page.hpp:138
void memory_fence_release()
Equivalent to std::atomic_thread_fence(std::memory_order_release).
void release_pages_recursive(const memory::GlobalVolatilePageResolver &page_resolver, memory::PageReleaseBatch *batch)
memory::EngineMemory * get_memory_manager() const
See Memory Manager.
Definition: engine.cpp:50
const uint16_t kPageSize
A constant defining the page size (in bytes) of both snapshot pages and volatile pages.
Definition: storage_id.hpp:45
DataPageSlotIndex search_key_physical(HashValue hash, const BloomFilterFingerprint &fingerprint, const void *key, KeyLength key_length, DataPageSlotIndex record_count, DataPageSlotIndex check_from=0) const
Search for a physical slot that exactly contains the given key.
uint16_t offset_
Byte offset in data_ where this record starts.
uint64_t HashValue
Represents a full 64-bit hash value calculated from a key.
Definition: hash_id.hpp:129
uint64_t page_id_
Page ID of this page.
Definition: page.hpp:191
void set_in_layer_level(uint8_t level)
Definition: page.hpp:282