libfoedus-core: FOEDUS Core Library
numa_node_memory.cpp
/*
 * Copyright (c) 2014-2015, Hewlett-Packard Development Company, LP.
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details. You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * HP designates this particular file as subject to the "Classpath" exception
 * as provided by HP in the LICENSE.txt file that accompanied this code.
 */
#include "foedus/memory/numa_node_memory.hpp"

#include <numa.h>
#include <glog/logging.h>

#include <iostream>
#include <sstream>
#include <string>

#include "foedus/assert_nd.hpp"
#include "foedus/engine.hpp"
#include "foedus/engine_options.hpp"
#include "foedus/error_stack_batch.hpp"
#include "foedus/cache/cache_hashtable.hpp"
#include "foedus/memory/memory_id.hpp"
#include "foedus/memory/numa_core_memory.hpp"
#include "foedus/memory/page_pool.hpp"
#include "foedus/soc/shared_memory_repo.hpp"
#include "foedus/soc/soc_manager.hpp"
#include "foedus/thread/thread_id.hpp"

namespace foedus {
namespace memory {
NumaNodeMemory::NumaNodeMemory(Engine* engine, thread::ThreadGroupId numa_node)
  : engine_(engine),
    numa_node_(numa_node),
    cores_(engine_->get_options().thread_.thread_count_per_group_),
    loggers_(engine_->get_options().log_.loggers_per_node_),
    snapshot_cache_table_(nullptr) {
}

int64_t get_numa_node_size(int node) {
  if (::numa_available() < 0) {
    return 0;
  } else {
    return ::numa_node_size(node, nullptr);
  }
}

ErrorStack NumaNodeMemory::initialize_once() {
  LOG(INFO) << "Initializing NumaNodeMemory for node " << static_cast<int>(numa_node_) << "."
    << " BEFORE: numa_node_size=" << get_numa_node_size(numa_node_);

  // volatile pool is placed on the shared memory
  soc::SharedMemoryRepo* memory_repo = engine_->get_soc_manager()->get_shared_memory_repo();
  uint64_t volatile_size =
    static_cast<uint64_t>(engine_->get_options().memory_.page_pool_size_mb_per_node_) << 20;
  volatile_pool_.attach(
    memory_repo->get_node_memory_anchors(numa_node_)->volatile_pool_status_,
    memory_repo->get_volatile_pool(numa_node_),
    volatile_size,
    true,
    engine_->get_options().memory_.rigorous_page_boundary_check_);
  volatile_pool_.set_debug_pool_name(
    std::string("VolatilePool-")
    + std::to_string(static_cast<int>(numa_node_)));

  // snapshot pool is SOC-local
  uint64_t snapshot_pool_bytes
    = static_cast<uint64_t>(engine_->get_options().cache_.snapshot_cache_size_mb_per_node_) << 20;
  if (engine_->get_options().memory_.rigorous_page_boundary_check_) {
    // mprotect raises EINVAL if the underlying pages are hugepages.
    LOG(INFO) << "rigorous_page_boundary_check_ is specified, so hugepages are disabled.";
    CHECK_ERROR(allocate_numa_memory(snapshot_pool_bytes, &snapshot_pool_memory_));
  } else {
    CHECK_ERROR(allocate_huge_numa_memory(snapshot_pool_bytes, &snapshot_pool_memory_));
  }
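  // (rigorous_page_boundary_check_ mprotect()s page boundaries to detect bogus memory
  //  accesses; mprotect() works only at normal-page granularity, hence no hugepages here.)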
  snapshot_pool_control_block_.alloc(1 << 12, 1 << 12, AlignedMemory::kNumaAllocOnnode, numa_node_);
  snapshot_pool_.attach(
    reinterpret_cast<PagePoolControlBlock*>(snapshot_pool_control_block_.get_block()),
    snapshot_pool_memory_.get_block(),
    snapshot_pool_memory_.get_size(),
    true,
    engine_->get_options().memory_.rigorous_page_boundary_check_);
  snapshot_pool_.set_debug_pool_name(
    std::string("SnapshotPool-")
    + std::to_string(static_cast<int>(numa_node_)));

  CHECK_ERROR(volatile_pool_.initialize());
  CHECK_ERROR(snapshot_pool_.initialize());

  // snapshot_pool_ consumes #pages * 4kb bytes of memory.
  // CacheBucket is 16 bytes, so even with 32-fold buckets (a ~3%-full hashtable), we spend only
  // #pages * 0.5kb for hash buckets. This is a negligible overhead.
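  // (e.g., a 1GiB snapshot pool = 256K pages -> 8M buckets -> 128MiB of bucket memory)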
  uint64_t cache_hashtable_buckets = (snapshot_pool_.get_memory_size() / storage::kPageSize) * 32;
  snapshot_cache_table_ = new cache::CacheHashtable(cache_hashtable_buckets, numa_node_);
  CHECK_ERROR(initialize_page_offset_chunk_memory());
  CHECK_ERROR(initialize_log_buffers_memory());
  for (auto ordinal = 0; ordinal < cores_; ++ordinal) {
    CHECK_ERROR(initialize_core_memory(ordinal));
  }
  ASSERT_ND(volatile_pool_.is_initialized());
  ASSERT_ND(snapshot_pool_.is_initialized());
  ASSERT_ND(core_memories_.size() == cores_);
  ASSERT_ND(volatile_offset_chunk_memory_pieces_.size() == cores_);
  ASSERT_ND(snapshot_offset_chunk_memory_pieces_.size() == cores_);
  ASSERT_ND(log_buffer_memory_pieces_.size() == cores_);

  LOG(INFO) << "Initialized NumaNodeMemory for node " << static_cast<int>(numa_node_) << "."
    << " AFTER: numa_node_size=" << get_numa_node_size(numa_node_);
  return kRetOk;
}
ErrorStack NumaNodeMemory::initialize_page_offset_chunk_memory() {
  size_t size_per_core = sizeof(PagePoolOffsetChunk) * 2;
  size_t total_size = size_per_core * cores_;
  LOG(INFO) << "Initializing page_offset_chunk_memory_. total_size=" << total_size << " bytes";
  if (total_size < kHugepageSize) {
    // Just one per NUMA node. Not a significant waste.
    total_size = kHugepageSize;
    LOG(INFO) << "Allocating extra space to utilize hugepage.";
  }
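  // kHugepageSize is 2MB, the transparent-hugepage page size; rounding up lets the
  // allocations below be hugepage-backed.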
  CHECK_ERROR(allocate_huge_numa_memory(total_size, &volatile_offset_chunk_memory_));
  CHECK_ERROR(allocate_huge_numa_memory(total_size, &snapshot_offset_chunk_memory_));
  for (auto ordinal = 0; ordinal < cores_; ++ordinal) {
    {
      PagePoolOffsetChunk* chunk = reinterpret_cast<PagePoolOffsetChunk*>(
        volatile_offset_chunk_memory_.get_block()) + ordinal;
      chunk->clear();
      volatile_offset_chunk_memory_pieces_.push_back(chunk);
    }
    {
      PagePoolOffsetChunk* chunk = reinterpret_cast<PagePoolOffsetChunk*>(
        snapshot_offset_chunk_memory_.get_block()) + ordinal;
      chunk->clear();
      snapshot_offset_chunk_memory_pieces_.push_back(chunk);
    }
  }

  return kRetOk;
}

ErrorStack NumaNodeMemory::initialize_log_buffers_memory() {
  uint64_t size_per_core_ = static_cast<uint64_t>(engine_->get_options().log_.log_buffer_kb_) << 10;
  uint64_t private_total = (cores_ * size_per_core_);
  LOG(INFO) << "Initializing log_buffer_memory_. total_size=" << private_total;
  CHECK_ERROR(allocate_huge_numa_memory(private_total, &log_buffer_memory_));
  LOG(INFO) << "log_buffer_memory_ allocated. addr=" << log_buffer_memory_.get_block();
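  // Slice the single contiguous block into per-core buffers. For example (hypothetical
  // numbers), 8 cores with log_buffer_kb_ = 1024 yield one 8MB block cut into eight 1MB pieces.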
  for (auto ordinal = 0; ordinal < cores_; ++ordinal) {
    AlignedMemorySlice piece(&log_buffer_memory_, size_per_core_ * ordinal, size_per_core_);
    LOG(INFO) << "log_buffer_piece[" << ordinal << "] addr=" << piece.get_block();
    log_buffer_memory_pieces_.push_back(piece);
  }

  return kRetOk;
}

ErrorStack NumaNodeMemory::initialize_core_memory(thread::ThreadLocalOrdinal ordinal) {
  auto core_id = thread::compose_thread_id(numa_node_, ordinal);
  NumaCoreMemory* core_memory = new NumaCoreMemory(engine_, this, core_id);
  core_memories_.push_back(core_memory);
  CHECK_ERROR(core_memory->initialize());
  return kRetOk;
}

ErrorStack NumaNodeMemory::uninitialize_once() {
  LOG(INFO) << "Uninitializing NumaNodeMemory for node " << static_cast<int>(numa_node_) << "."
    << " BEFORE: numa_node_size=" << get_numa_node_size(numa_node_);

  ErrorStackBatch batch;
  batch.uninitialize_and_delete_all(&core_memories_);
  volatile_offset_chunk_memory_pieces_.clear();
  volatile_offset_chunk_memory_.release_block();
  snapshot_offset_chunk_memory_pieces_.clear();
  snapshot_offset_chunk_memory_.release_block();
  log_buffer_memory_pieces_.clear();
  log_buffer_memory_.release_block();
  if (snapshot_cache_table_) {
    delete snapshot_cache_table_;
    snapshot_cache_table_ = nullptr;
  }
  batch.emprace_back(volatile_pool_.uninitialize());
  batch.emprace_back(snapshot_pool_.uninitialize());
  snapshot_pool_memory_.release_block();
  snapshot_pool_control_block_.release_block();

  LOG(INFO) << "Uninitialized NumaNodeMemory for node " << static_cast<int>(numa_node_) << "."
    << " AFTER: numa_node_size=" << get_numa_node_size(numa_node_);
  return SUMMARIZE_ERROR_BATCH(batch);
}

ErrorStack NumaNodeMemory::allocate_numa_memory_general(
  uint64_t size,
  uint64_t alignment,
  AlignedMemory *out) const {
  ASSERT_ND(out);
  if (engine_->get_options().memory_.use_mmap_hugepages_ &&
    alignment >= kHugepageSize
    && size >= (1ULL << 30) * 8 / 10) {
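    // (1ULL << 30) * 8 / 10 is ~819MiB, i.e. 80% of a 1GB page: only allocations that
    // would fill most of a 1GB hugepage are rounded up to kNumaMmapOneGbPages.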
    LOG(INFO) << "This is a big memory allocation. Let's use the mmap hugepage (1GB pages)";
    out->alloc(size, 1ULL << 30, AlignedMemory::kNumaMmapOneGbPages, numa_node_);
  } else {
    out->alloc(size, alignment, AlignedMemory::kNumaAllocOnnode, numa_node_);
  }
  if (out->is_null()) {
    return ERROR_STACK(kErrorCodeOutofmemory);
  }
  return kRetOk;
}

std::string NumaNodeMemory::dump_free_memory_stat() const {
  std::stringstream ret;
  PagePool::Stat volatile_stat = volatile_pool_.get_stat();
  ret << " Volatile-Pool: " << volatile_stat.allocated_pages_ << " allocated pages, "
    << volatile_stat.total_pages_ << " total pages, "
    << (volatile_stat.total_pages_ - volatile_stat.allocated_pages_) << " free pages"
    << std::endl;
  PagePool::Stat snapshot_stat = snapshot_pool_.get_stat();
  ret << " Snapshot-Pool: " << snapshot_stat.allocated_pages_ << " allocated pages, "
    << snapshot_stat.total_pages_ << " total pages, "
    << (snapshot_stat.total_pages_ - snapshot_stat.allocated_pages_) << " free pages"
    << std::endl;
  return ret.str();
}

NumaNodeMemoryRef::NumaNodeMemoryRef(Engine* engine, thread::ThreadGroupId numa_node)
  : engine_(engine), numa_node_(numa_node) {
  soc::SharedMemoryRepo* memory_repo = engine->get_soc_manager()->get_shared_memory_repo();
  volatile_pool_.attach(
    memory_repo->get_node_memory_anchors(numa_node)->volatile_pool_status_,
    memory_repo->get_volatile_pool(numa_node),
    static_cast<uint64_t>(engine->get_options().memory_.page_pool_size_mb_per_node_) << 20,
    false,
    false);
}

std::string NumaNodeMemoryRef::dump_free_memory_stat() const {
  std::stringstream ret;
  PagePool::Stat volatile_stat = volatile_pool_.get_stat();
  ret << " Volatile-Pool: " << volatile_stat.allocated_pages_ << " allocated pages, "
    << volatile_stat.total_pages_ << " total pages, "
    << (volatile_stat.total_pages_ - volatile_stat.allocated_pages_) << " free pages"
    << std::endl;
  return ret.str();
}

}  // namespace memory
}  // namespace foedus
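
Two OS facilities carry most of the work in this file: libnuma node-local allocation (allocate_numa_memory() wraps numa_alloc_onnode()/numa_free(), which is also what AlignedMemory::kNumaAllocOnnode boils down to) and mprotect()-based page-boundary checking (MemoryOptions::rigorous_page_boundary_check_). The standalone sketches below illustrate both; they are not part of the FOEDUS sources, and the file names, sizes, and node IDs are made-up assumptions.

// numa_alloc_sketch.cpp (hypothetical): node-local allocation with libnuma.
// Build with: g++ numa_alloc_sketch.cpp -lnuma
#include <numa.h>
#include <iostream>

int main() {
  if (::numa_available() < 0) {
    std::cerr << "NUMA is not available on this machine" << std::endl;
    return 1;
  }
  const int node = 0;  // assumption: node 0 exists
  // Same call the get_numa_node_size() helper above wraps; also reports free bytes.
  long free_bytes = 0;
  long total_bytes = ::numa_node_size(node, &free_bytes);
  std::cout << "node " << node << ": total=" << total_bytes
    << " free=" << free_bytes << std::endl;

  // Allocate 16MB physically placed on node 0.
  const size_t bytes = 16ULL << 20;
  void* block = ::numa_alloc_onnode(bytes, node);
  if (block == nullptr) {
    return 1;  // out of memory, analogous to the ERROR_STACK path above
  }
  // ... use the memory ...
  ::numa_free(block, bytes);  // numa_alloc_* memory must be released via numa_free()
  return 0;
}

The second sketch shows the guard-page idea behind rigorous_page_boundary_check_ and why it needs plain 4KB pages: mprotect() changes protection in normal-page units, and (per the comment in initialize_once() above) it fails with EINVAL on hugepage-backed memory.

// guard_page_sketch.cpp (hypothetical): detect out-of-bounds access with mprotect().
#include <sys/mman.h>
#include <cstdio>

int main() {
  const size_t kPage = 4096;
  // Two usable pages followed by one guard page. mmap() returns page-aligned,
  // normal (non-huge) pages, which mprotect() can re-protect individually.
  void* mem = ::mmap(
    nullptr, 3 * kPage, PROT_READ | PROT_WRITE,
    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (mem == MAP_FAILED) {
    return 1;
  }
  char* region = static_cast<char*>(mem);
  if (::mprotect(region + 2 * kPage, kPage, PROT_NONE) != 0) {
    std::perror("mprotect");  // e.g., EINVAL if the region were hugepage-backed
    return 1;
  }
  region[2 * kPage - 1] = 42;   // last valid byte: fine
  // region[2 * kPage] = 42;    // would SIGSEGV immediately: the bogus access is caught
  ::munmap(region, 3 * kPage);
  return 0;
}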