libfoedus-core
FOEDUS Core Library
cache_hashtable.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014-2015, Hewlett-Packard Development Company, LP.
3  * This program is free software; you can redistribute it and/or modify it
4  * under the terms of the GNU General Public License as published by the Free
5  * Software Foundation; either version 2 of the License, or (at your option)
6  * any later version.
7  *
8  * This program is distributed in the hope that it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11  * more details. You should have received a copy of the GNU General Public
12  * License along with this program; if not, write to the Free Software
13  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14  *
15  * HP designates this particular file as subject to the "Classpath" exception
16  * as provided by HP in the LICENSE.txt file that accompanied this code.
17  */
18 
20 
21 #include <glog/logging.h>
22 
23 #include <ostream>
24 
30 #include "foedus/thread/thread.hpp"
31 
32 namespace foedus {
33 namespace cache {
34 
36  ASSERT_ND(physical_buckets >= 1024U);
37  // to speed up, leave space in neighbors of the last bucket.
38  // Instead, we do not wrap-around.
39  // Also, leave space for at least one cacheline so that we never overrun when we prefetch.
40  BucketId buckets = physical_buckets - kHopNeighbors - 64ULL;
41 
42  // to make the division-hashing more effective, make it prime-like.
43  BucketId logical_buckets = assorted::generate_almost_prime_below(buckets);
44  ASSERT_ND(logical_buckets <= physical_buckets);
45  ASSERT_ND((logical_buckets & (logical_buckets - 1U)) != 0); // at least not power of 2
46  return logical_buckets;
47 }
48 
49 uint32_t determine_overflow_list_size(BucketId physical_buckets) {
50  // there should be very few overflow entries. This can be super small.
51  const uint32_t kOverflowFraction = 128U;
52  const uint32_t kOverflowMinSize = 256U;
53  uint32_t overflow_size = physical_buckets / kOverflowFraction;
54  if (overflow_size <= kOverflowMinSize) {
55  overflow_size = kOverflowMinSize;
56  }
57  return overflow_size;
58 }
59 
/**
 * @brief Sets up the hash function for the given physical bucket count.
 * @param[in] physical_buckets buckets physically allocated; the logical count is
 * derived via determine_logical_buckets().
 * NOTE(review): bucket_div_(logical_buckets_) reads another member in the init
 * list, so logical_buckets_ must be declared before bucket_div_ in the class
 * (member init follows declaration order) — appears to be the case; verify in header.
 */
HashFunc::HashFunc(BucketId physical_buckets)
  : logical_buckets_(determine_logical_buckets(physical_buckets)),
  physical_buckets_(physical_buckets),
  bucket_div_(logical_buckets_) {
}
65 
66 CacheHashtable::CacheHashtable(BucketId physical_buckets, uint16_t numa_node)
67  : numa_node_(numa_node),
68  overflow_buckets_count_(determine_overflow_list_size(physical_buckets)),
69  hash_func_(physical_buckets),
70  clockhand_(0) {
72  sizeof(CacheBucket) * physical_buckets,
73  1U << 21,
75  numa_node);
77  sizeof(CacheRefCount) * physical_buckets,
78  1U << 21,
80  numa_node);
81  buckets_ = reinterpret_cast<CacheBucket*>(buckets_memory_.get_block());
82  refcounts_ = reinterpret_cast<CacheRefCount*>(refcounts_memory_.get_block());
83 
84  // index-0 should be never used. 0 means null.
85  buckets_[0].reset();
86  refcounts_[0].count_ = 0;
87 
88  // for overflow list
91  1U << 21,
93  numa_node);
96  // index-0 should be never used. 0 means null.
98  overflow_buckets_[0].next_ = 0;
101 
102  // initially all entries are in free list.
104  for (OverflowPointer i = 1U; i < overflow_buckets_count_; ++i) {
105  if (i < overflow_buckets_count_ - 1U) {
106  overflow_buckets_[i].next_ = i + 1U;
107  } else {
108  overflow_buckets_[i].next_ = 0;
109  }
110  }
111 }
112 
113 
115  ASSERT_ND(content != 0);
116 
117  // Grab a bucket to install a new page.
118  // The bucket does not have to be the only bucket to serve the page, so
119  // the logic below is much simpler than typical bufferpool.
120  BucketId ideal_bucket = get_bucket_number(page_id);
121  PageIdTag tag = HashFunc::get_tag(page_id);
122  ASSERT_ND(tag != 0);
123 
124  CacheBucket new_bucket;
125  new_bucket.reset(content, tag);
126 
127  // An opportunistic optimization. if the exact bucket already has the same page_id,
128  // most likely someone else is trying to install it at the same time. let's wait.
129  for (BucketId bucket = ideal_bucket; bucket < ideal_bucket + kHopNeighbors; ++bucket) {
130  if (!buckets_[bucket].is_content_set()) {
131  // looks like this is empty!
132  buckets_[bucket] = new_bucket; // 8-byte implicitly-atomic write
133  refcounts_[bucket].count_ = 1;
134  // this might be immediately overwritten by someone else, but that's fine.
135  // that only causes a future cache miss. no correctness issue.
136  return kErrorCodeOk;
137  }
138  }
139 
140  // unlucky, no empty slot. If this happens often, we are seriously troubled.
141  DVLOG(0) << "Ohhh, we have to add this to overflow list! This should be really rare."
142  << " page_id=" << assorted::Hex(page_id)
143  << ", content=" << assorted::Hex(content)
144  << ", ideal_bucket=" << assorted::Hex(ideal_bucket)
145  << ", tag=" << assorted::Hex(tag)
146  << ", page_id=" << assorted::Hex(page_id);
147 
148  // we come here anyway very occasionally, so taking mutex here wouldn't cause performance issue.
149  // note that this mutex just protects the free-list, which is rarely used.
151  OverflowPointer new_overflow_entry = overflow_free_buckets_head_;
152  if (new_overflow_entry == 0) {
153  LOG(ERROR) << "Oh my god. we consumed all overflow entries, which means we have too many"
154  << " hash collisions. page_id=" << assorted::Hex(page_id)
155  << ", content=" << assorted::Hex(content)
156  << ", ideal_bucket=" << assorted::Hex(ideal_bucket)
157  << ", tag=" << assorted::Hex(tag)
158  << ", page_id=" << assorted::Hex(page_id);
160  }
161  ASSERT_ND(new_overflow_entry < overflow_buckets_count_);
163  overflow_buckets_[new_overflow_entry].next_ = overflow_buckets_head_;
164  overflow_buckets_[new_overflow_entry].refcount_.count_ = 1;
165  overflow_buckets_[new_overflow_entry].bucket_ = new_bucket;
167  overflow_buckets_head_ = new_overflow_entry;
168  return kErrorCodeOk;
169 }
170 
172  LOG(INFO) << "Snapshot-Cache eviction starts at node-" << numa_node_
173  << ", clockhand_=" << clockhand_ << ", #target=" << args->target_count_;
174  const BucketId end = get_physical_buckets();
175  BucketId cur = clockhand_;
176 
177  // we check each entry in refcounts_, which are 2 bytes each.
178  // for quicker checks, we want 8-byte aligned access.
179  // also, we should anyway do prefetch, so make it 64-byte aligned
180  cur = (cur >> 5) << 5;
181  ASSERT_ND(cur % (1U << 5) == 0);
182  if (cur >= end) {
183  cur = 0;
184  }
185 
186  // evict on the normal buckets first.
187  args->evicted_count_ = 0;
188  uint16_t loops;
189  const uint16_t kMaxLoops = 16; // if we need more loops than this, something is wrong...
190  for (loops = 0; loops < kMaxLoops; ++loops) {
191  cur = evict_main_loop(args, cur, loops);
192  if (cur >= get_physical_buckets()) {
193  cur = 0;
194  // we went over all buckets in usual entries. now check the overflow linked list.
196  evict_overflow_loop(args, loops);
197  }
198  }
199  if (args->evicted_count_ >= args->target_count_) {
200  break;
201  } else {
202  ASSERT_ND(cur == 0); // we checked all buckets and wrapped around, right? go on to next loop
203  }
204  }
205 
206  clockhand_ = cur;
207  LOG(INFO) << "Snapshot-Cache eviction completed at node-" << numa_node_
208  << ", clockhand_=" << clockhand_ << ", #evicted=" << args->evicted_count_
209  << ", looped-over the whole hashtable for " << loops << " times";
210 }
211 
214  BucketId cur,
215  uint16_t loop) {
216  ASSERT_ND(cur % (1U << 5) == 0);
218  const uint16_t decrements = 1U << loop;
219  debugging::StopWatch watch;
220 
221  // the main idea is as follows.
222  // whenever the bucket has never been used or been used but released without unlucky races,
223  // the corresponding refcount is zero. hence, we just check for non-zeros in refcounts_.
224  // this is trivially vectorized and the only observable cost is L1 cache miss.
225  // we reduce L1 cache miss cost by prefetching a lot.
226  uint32_t cur_cacheline = cur >> 5;
227  const uint32_t end_cacheline = (get_physical_buckets() >> 5) + 1ULL;
228  // for example, we prefetch cacheline 16-23 while reading cacheline 0-7.
229  const uint16_t kL1PrefetchBatch = 8;
230  const uint16_t kL1PrefetchAhead = 16;
231  for (; cur_cacheline < end_cacheline; ++cur_cacheline) {
232  if (cur_cacheline / kL1PrefetchBatch == 0) {
234  refcounts_ + ((cur_cacheline + kL1PrefetchAhead) << 5),
235  kL1PrefetchBatch);
236  }
237 
238  BucketId bucket = cur_cacheline << 5;
239  // gcc, you should be smart enough to optimize this. at least with O3.
240  uint64_t* ints = reinterpret_cast<uint64_t*>(ASSUME_ALIGNED(refcounts_ + bucket, 64));
241  bool all_zeros = true;
242  for (uint16_t i = 0; i < 8U; ++i) {
243  if (ints[i] != 0) {
244  all_zeros = false;
245  break;
246  }
247  }
248 
249  if (LIKELY(all_zeros)) {
250  continue;
251  } else {
252  // this should be a rare case as far as we keep the hashtable sparse.
253  CacheRefCount* base = reinterpret_cast<CacheRefCount*>(refcounts_ + bucket);
254  for (uint16_t i = 0; i < 32U; ++i) {
255  if (base[i].count_ > 0) {
256  bool still_non_zero = base[i].decrement(decrements);
257  if (!still_non_zero) {
258  args->add_evicted(buckets_[bucket + i].get_content_id());
259  buckets_[bucket + i].data_ = 0;
260  }
261  }
262  }
263  }
264 
265  if (args->evicted_count_ >= args->target_count_) {
266  break;
267  }
268  }
269 
270  watch.stop();
271  LOG(INFO) << "Snapshot-Cache eviction main_loop at node-" << numa_node_ << ", checked "
272  << ((cur_cacheline << 5) - clockhand_) << " buckets in " << watch.elapsed_us() << "us";
273 
274  return cur_cacheline << 5;
275 }
276 
278  const uint16_t decrements = 1U << loop;
279  uint32_t checked_count = 0;
280 
281  // store evicted entries into
282  OverflowPointer evicted_head = 0; // evicted
283  debugging::StopWatch watch;
284  {
285  // We block this method entirely with the free buckets mutex.
286  // This does NOT block usual transactions unless they actually have to newly add to overflow,
287  // which should be very rare. This is cheap yet enough to make the free-list safe.
289 
290  // no interesting optimization. overflow list should be empty or almost empty.
292  if (head != 0) {
293  // skip the head. we handle it at the last.
294  OverflowPointer prev = head;
295  for (OverflowPointer cur = overflow_buckets_[prev].next_; cur != 0;) {
296  CacheOverflowEntry* cur_entry = overflow_buckets_ + cur;
297  OverflowPointer next = cur_entry->next_;
298  bool still_non_zero = cur_entry->refcount_.decrement(decrements);
299  if (!still_non_zero) {
300  args->add_evicted(cur_entry->bucket_.get_content_id());
301  CacheOverflowEntry* prev_entry = overflow_buckets_ + prev;
302  prev_entry->next_ = next;
303  cur_entry->bucket_.data_ = 0;
304  cur_entry->next_ = evicted_head;
305  evicted_head = cur;
306  }
307 
308  prev = cur;
309  cur = next;
310  ++checked_count;
311  }
312 
313  // finally check the head
314  CacheOverflowEntry* cur_entry = overflow_buckets_ + head;
315  bool still_non_zero = cur_entry->refcount_.decrement(decrements);
316  if (!still_non_zero) {
317  args->add_evicted(cur_entry->bucket_.get_content_id());
318  overflow_buckets_head_ = cur_entry->next_;
319  cur_entry->bucket_.data_ = 0;
320  cur_entry->next_ = evicted_head;
321  evicted_head = head;
322  }
323  ++checked_count;
324  }
325  }
326  watch.stop();
327  LOG(INFO) << "Snapshot-Cache eviction overflow_loop at node-" << numa_node_ << ", checked "
328  << (checked_count) << " buckets in " << watch.elapsed_us() << "us";
329 }
330 
331 
332 std::ostream& operator<<(std::ostream& o, const HashFunc& v) {
333  o << "<HashFunc>"
334  << "<logical_buckets_>" << v.logical_buckets_ << "<logical_buckets_>"
335  << "<physical_buckets_>" << v.physical_buckets_ << "<physical_buckets_>"
336  << "</HashFunc>";
337  return o;
338 }
339 
341  for (BucketId i = 0; i < get_physical_buckets(); ++i) {
342  if (buckets_[i].is_content_set()) {
343  ASSERT_ND(buckets_[i].get_tag() != 0);
344  } else {
345  ASSERT_ND(buckets_[i].get_tag() == 0);
346  }
347  }
348  return kRetOk;
349 }
350 
352  Stat result;
353  result.normal_entries_ = 0;
354  result.overflow_entries_ = 0;
355 
357  for (BucketId i = 0; i < end; ++i) {
358  if (buckets_[i].is_content_set()) {
359  ++result.normal_entries_;
360  }
361  }
362 
364  for (OverflowPointer i = overflow_buckets_head_; i != 0;) {
365  if (overflow_buckets_[i].bucket_.is_content_set()) {
366  ++result.overflow_entries_;
367  }
368  i = overflow_buckets_[i].next_;
369  }
370  }
371 
372  return result;
373 }
374 
375 
377  uint16_t batch_size,
378  const storage::SnapshotPagePointer* page_ids,
379  ContentId* out) const {
380  if (batch_size == 0) {
381  return kErrorCodeOk;
382  }
383  ASSERT_ND(batch_size <= kMaxFindBatchSize);
384  if (UNLIKELY(batch_size > kMaxFindBatchSize)) {
386  }
387 
388  // first, calculate hash values and prefetch
389  BucketId bucket_numbers[kMaxFindBatchSize];
390  for (uint16_t b = 0; b < batch_size; ++b) {
391  if (page_ids[b] == 0) {
392  continue;
393  } else if (b > 0 && page_ids[b - 1] == page_ids[b]) {
394  continue;
395  }
396 
397  bucket_numbers[b] = get_bucket_number(page_ids[b]);
398  ASSERT_ND(bucket_numbers[b] < get_logical_buckets());
399  // we prefetch up to 128 bytes (16 entries).
400  assorted::prefetch_cachelines(buckets_ + bucket_numbers[b], 2);
401  }
402 
403  for (uint16_t b = 0; b < batch_size; ++b) {
404  out[b] = 0;
405  if (page_ids[b] == 0) {
406  continue;
407  } else if (b > 0 && page_ids[b - 1] == page_ids[b]) {
408  out[b] = out[b - 1];
409  continue;
410  }
411 
412  PageIdTag tag = HashFunc::get_tag(page_ids[b]);
413  ASSERT_ND(tag != 0);
414  BucketId bucket_number = bucket_numbers[b];
415  for (uint16_t i = 0; i < kHopNeighbors; ++i) {
416  const CacheBucket& bucket = buckets_[bucket_number + i];
417  if (bucket.get_tag() == tag) {
418  // found (probably)!
419  refcounts_[bucket_number + i].increment();
420  out[b] = bucket.get_content_id();
421  break;
422  }
423  }
424 
425  // Not found. let's check overflow list
426  if (out[b] == 0 && overflow_buckets_head_) {
427  for (OverflowPointer i = overflow_buckets_head_; i != 0;) {
428  if (overflow_buckets_[i].bucket_.get_tag() == tag) {
431  break;
432  }
433  i = overflow_buckets_[i].next_;
434  }
435  }
436  }
437 
438  return kErrorCodeOk;
439 }
440 
441 } // namespace cache
442 } // namespace foedus
void evict_overflow_loop(EvictArgs *args, uint16_t loop)
0x0002 : "GENERAL: Invalid parameter given" .
Definition: error_code.hpp:106
numa_alloc_onnode() and numa_free().
BucketId clockhand_
We previously stopped eviction here for usual buckets.
soc::SharedMutex overflow_free_buckets_mutex_
The mutex to protect free overflow entries.
CacheOverflowEntry * overflow_buckets_
An entry in the overflow linked list for snapshot cache.
uint32_t BucketId
Offset in hashtable bucket.
memory::AlignedMemory buckets_memory_
ErrorCode install(storage::SnapshotPagePointer page_id, ContentId content)
Called when a cached page is not found.
memory::AlignedMemory refcounts_memory_
Root package of FOEDUS (Fast Optimistic Engine for Data Unification Services).
Definition: assert_nd.hpp:44
static PageIdTag get_tag(storage::SnapshotPagePointer page_id) __attribute__((always_inline))
Returns another hash value used to differentiate IDs with similar hash values.
const BucketId logical_buckets_
Number of logical (actually used) buckets in this table.
BucketId get_physical_buckets() const __attribute__((always_inline))
std::ostream & operator<<(std::ostream &o, const HashFunc &v)
uint32_t OverflowPointer
Position in the overflow buckets.
BucketId determine_logical_buckets(BucketId physical_buckets)
Brings error stacktrace information as return value of functions.
Definition: error_stack.hpp:81
void alloc(uint64_t size, uint64_t alignment, AllocType alloc_type, int numa_node) noexcept
Allocate a memory, releasing the current memory if exists.
HashFunc(BucketId physical_buckets)
void prefetch_cachelines(const void *address, int cacheline_count)
Prefetch multiple contiguous cachelines to L1 cache.
Definition: cacheline.hpp:66
A simple hash function logic used in snapshot cache.
#define ASSUME_ALIGNED(x, y)
Wraps GCC's __builtin_assume_aligned.
Definition: compiler.hpp:111
uint32_t determine_overflow_list_size(BucketId physical_buckets)
void increment() __attribute__((always_inline))
#define LIKELY(x)
Hints that x is highly likely true.
Definition: compiler.hpp:103
OverflowPointer overflow_free_buckets_head_
This forms another singly-linked list of free overflow entries.
uint64_t evicted_count_
[Out] Number of entries that were actually evicted
const uint16_t kCachelineSize
Byte count of one cache line.
Definition: cacheline.hpp:42
CacheHashtable(BucketId physical_buckets, uint16_t numa_node)
0 means no-error.
Definition: error_code.hpp:87
void evict(EvictArgs *args)
Evict some entries from the hashtable.
ErrorCode find_batch(uint16_t batch_size, const storage::SnapshotPagePointer *page_ids, ContentId *out) const
Batched version of find().
bool decrement(uint16_t subtract) __attribute__((always_inline))
returns whether the counter is still non-zero
BucketId evict_main_loop(EvictArgs *args, BucketId cur, uint16_t loop)
uint32_t PageIdTag
This is a lossy-compressed representation of SnapshotPagePointer used to quickly identify whether the...
ErrorStack verify_single_thread() const
only for debugging.
uint64_t SnapshotPagePointer
Page ID of a snapshot page.
Definition: storage_id.hpp:79
memory::PagePoolOffset ContentId
Offset of the actual page image in PagePool for snapshot pages.
uint64_t stop()
Take another current time tick.
Definition: stop_watch.cpp:35
Auto-lock scope object for SharedMutex.
memory::AlignedMemory overflow_buckets_memory_
void * get_block() const
Returns the memory block.
const uint16_t kHopNeighbors
Starting from the given position, we consider this many buckets for the entry.
uint64_t target_count_
[In] Evicts entries up to about target_count (maybe a bit more or less)
ContentId get_content_id() const
double elapsed_us() const
Definition: stop_watch.hpp:45
Hash bucket in cache table.
BucketId get_logical_buckets() const __attribute__((always_inline))
uint64_t generate_almost_prime_below(uint64_t threshold)
Generate a prime or some number that is almost prime less than the given number.
const ErrorStack kRetOk
Normal return value for no-error case.
Convenient way of writing hex integers to stream.
Stat get_stat_single_thread() const
only for debugging.
void reset(ContentId id=0, PageIdTag tag=0)
#define UNLIKELY(x)
Hints that x is highly likely false.
Definition: compiler.hpp:104
#define ASSERT_ND(x)
A warning-free wrapper macro of assert() that has no performance effect in release mode even when 'x'...
Definition: assert_nd.hpp:72
A high-resolution stop watch.
Definition: stop_watch.hpp:30
const BucketId physical_buckets_
Number of buckets that are physically allocated, at least kHopNeighbors more than logical...
BucketId get_bucket_number(storage::SnapshotPagePointer page_id) const __attribute__((always_inline))
Returns a bucket number the given page ID should belong to.
void memory_fence_release()
Equivalent to std::atomic_thread_fence(std::memory_order_release).
OverflowPointer next_
Note that we don't have to atomically maintain/follow this pointer thanks to the loose requirements...
OverflowPointer overflow_buckets_head_
This forms a singly-linked list of active overflow entries.
0x0903 : "SPCACHE: Hashtable for snapshot cache got too many overflow entries" .
Definition: error_code.hpp:194
ErrorCode
Enum of error codes defined in error_code.xmacro.
Definition: error_code.hpp:85
A loosely maintained reference count for CLOCK algorithm.