libfoedus-core
FOEDUS Core Library
aligned_memory.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014-2015, Hewlett-Packard Development Company, LP.
3  * This program is free software; you can redistribute it and/or modify it
4  * under the terms of the GNU General Public License as published by the Free
5  * Software Foundation; either version 2 of the License, or (at your option)
6  * any later version.
7  *
8  * This program is distributed in the hope that it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11  * more details. You should have received a copy of the GNU General Public
12  * License along with this program; if not, write to the Free Software
13  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14  *
15  * HP designates this particular file as subject to the "Classpath" exception
16  * as provided by HP in the LICENSE.txt file that accompanied this code.
17  */
19 
20 #include <numa.h>
21 #include <numaif.h>
22 #include <valgrind.h>
23 #include <glog/logging.h>
24 #include <sys/mman.h>
25 
26 #include <algorithm>
27 #include <cstdlib>
28 #include <cstring>
29 #include <fstream>
30 #include <iostream>
31 #include <string>
32 
33 #include "foedus/assert_nd.hpp"
37 
38 
39 // The MAP_HUGE_* flags are fairly new and missing from many environments, so define them here when absent.
40 #ifndef MAP_HUGE_SHIFT
41 #define MAP_HUGE_SHIFT 26
42 #endif // MAP_HUGE_SHIFT
43 #ifndef MAP_HUGE_2MB
44 #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
45 #endif // MAP_HUGE_2MB
46 #ifndef MAP_HUGE_1GB
47 #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
48 #endif // MAP_HUGE_1GB
49 
50 namespace foedus {
51 namespace memory {
52 AlignedMemory::AlignedMemory(uint64_t size, uint64_t alignment,
53  AllocType alloc_type, int numa_node) noexcept
54  : size_(0), alignment_(0), alloc_type_(kPosixMemalign), numa_node_(0),
55  block_(nullptr) {
56  alloc(size, alignment, alloc_type, numa_node);
57 }
58 
59 // std::mutex mmap_allocate_mutex;
60 // No, this doesn't matter. Rather, turns out that the issue is in linux kernel:
61 // https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=8382d914ebf72092aa15cdc2a5dcedb2daa0209d
62 // In linux 3.15 and later, this problem gets resolved and highly parallelizable.
63 
64 char* alloc_mmap(uint64_t size, uint64_t alignment) {
65  // std::lock_guard<std::mutex> guard(mmap_allocate_mutex);
66  // we don't use MAP_POPULATE because it will block here and also serialize hugepage allocation!
67  // even if we run mmap in parallel, linux serializes the looooong population in all numa nodes.
68  // lame. we will memset right after this.
69  int pagesize;
70  if (alignment >= (1ULL << 30)) {
72  pagesize = MAP_HUGE_1GB | MAP_HUGETLB;
73  } else {
74  pagesize = MAP_HUGE_2MB | MAP_HUGETLB;
75  }
76  } else if (alignment >= (1ULL << 21)) {
77  pagesize = MAP_HUGE_2MB | MAP_HUGETLB;
78  } else {
79  pagesize = 0;
80  }
81  bool running_on_valgrind = RUNNING_ON_VALGRIND;
82  if (running_on_valgrind) {
83  // if this is running under valgrind, we have to avoid using hugepages due to a bug in valgrind.
84  // When we are running on valgrind, we don't care performance anyway. So shouldn't matter.
85  pagesize = 0;
86  }
87  char* ret = reinterpret_cast<char*>(::mmap(
88  nullptr,
89  size,
90  PROT_READ | PROT_WRITE,
91  MAP_ANONYMOUS | MAP_PRIVATE | pagesize, // | MAP_NORESERVE
92  -1,
93  0));
94  // Note: We previously used MAP_NORESERVE to explicitly say we don't want swapping,
95  // but mmap with this flag causes SIGSEGV when there aren't enough hugepages.
96  // In that case mmap doesn't return -1 because it just checks if VA space is mappable.
97  // We still don't need swapping, but it won't hurt. sorta. Debuggability matters more.
98 
99  // when mmap() fails, it returns -1 (MAP_FAILED)
100  if (ret == nullptr || ret == MAP_FAILED) {
101  LOG(FATAL) << "mmap() failed. size=" << size << ", error=" << assorted::os_error()
102  << ". This error usually means you don't have enough hugepages allocated."
103  << " eg) sudo sh -c 'echo 196608 > /proc/sys/vm/nr_hugepages'";
104  }
105  return ret;
106 }
107 
108 void* alloc_mmap_1gb_pages(uint64_t size) {
109  ASSERT_ND(size % (1ULL << 30) == 0);
110  return alloc_mmap(size, 1ULL << 30);
111 }
112 
114  uint64_t size,
115  uint64_t alignment,
116  AllocType alloc_type,
117  int numa_node) noexcept {
118  release_block();
119  ASSERT_ND(block_ == nullptr);
120  size_ = size;
121  alignment_ = alignment;
122  alloc_type_ = alloc_type;
123  numa_node_ = numa_node;
124  ASSERT_ND((alignment & (alignment - 1)) == 0); // alignment is power of two
125  if (alloc_type_ == kNumaMmapOneGbPages) {
126  alignment = 1ULL << 30;
127  }
128  if (size_ == 0 || size_ % alignment != 0) {
129  size_ = ((size_ / alignment) + 1) * alignment;
130  }
131 
132  // Use libnuma's numa_set_preferred to initialize the NUMA node of the memory.
133  // We can later do the equivalent with mbind IF the memory is not shared.
134  // mbind does nothing for shared memory. So, this is the only way
135  int original_node = 0;
136  if (::numa_available() >= 0) {
137  original_node = ::numa_preferred();
139  }
140 
141  debugging::StopWatch watch;
142  int posix_memalign_ret;
143  switch (alloc_type_) {
144  case kPosixMemalign:
145  // https://bugzilla.mozilla.org/show_bug.cgi?id=606270
146  // kind of pathetic, but to make sure.
147  posix_memalign_ret = ::posix_memalign(&block_, alignment, size_);
148  if (posix_memalign_ret != 0) {
149  block_ = nullptr;
150  }
151  break;
152  case kNumaAllocInterleaved: // actually we no longer support this.. no reason to use this.
153  case kNumaAllocOnnode:
154  block_ = alloc_mmap(size_, alignment);
155  break;
156  case kNumaMmapOneGbPages:
157  block_ = alloc_mmap_1gb_pages(size_);
158  break;
159  default:
160  ASSERT_ND(false);
161  }
162  watch.stop();
163 
164  if (block_ == nullptr) {
165  LOG(ERROR) << "Aligned memory allocation failed. OS error=" << assorted::os_error() << *this;
166  // also reset the numa_preferred
167  if (::numa_available() >= 0) {
168  ::numa_set_preferred(original_node);
169  }
170  return;
171  }
172 
173  debugging::StopWatch watch2;
174  std::memset(block_, 0, size_); // see class comment for why we do this immediately
175  watch2.stop();
176  if (::numa_available() >= 0) {
177  ::numa_set_preferred(original_node);
178  }
179  LOG(INFO) << "Allocated memory in " << watch.elapsed_ns() << "+"
180  << watch2.elapsed_ns() << " ns (alloc+memset)." << *this;
181 }
183  uint64_t required_size,
184  double expand_margin,
185  bool retain_content) noexcept {
186  if (is_null()) {
187  LOG(FATAL) << "Misuse of assure_capacity. Can't extend a null buffer";
189  }
190  if (size_ >= required_size) {
191  return kErrorCodeOk;
192  }
193  if (expand_margin < 1) {
194  expand_margin = 1;
195  }
196  uint64_t expanded = required_size * expand_margin;
197  VLOG(0) << "Expanding work memory from " << size_ << " to " << expanded;
198 
199  // save the current memory
200  AlignedMemory old(std::move(*this));
201  ASSERT_ND(!old.is_null());
202  ASSERT_ND(is_null());
203 
204  alloc(expanded, alignment_, alloc_type_, numa_node_);
205  if (is_null()) {
206  LOG(ERROR) << "Out of memory error while expanding work memory from "
207  << size_ << " to " << expanded;
208  *this = std::move(old); // recover the old one
209  return kErrorCodeOutofmemory;
210  }
211 
212  // copies the old content if specified
213  if (retain_content) {
214  ASSERT_ND(size_ >= old.size_);
215  std::memcpy(block_, old.block_, old.size_);
216  }
217 
218  old.release_block();
219  return kErrorCodeOk;
220 }
221 
222 AlignedMemory::AlignedMemory(AlignedMemory &&other) noexcept : block_(nullptr) {
223  *this = std::move(other);
224 }
226  release_block();
227  size_ = other.size_;
228  alignment_ = other.alignment_;
229  alloc_type_ = other.alloc_type_;
230  block_ = other.block_;
231  other.block_ = nullptr;
232  return *this;
233 }
234 
236  if (block_ != nullptr) {
237  switch (alloc_type_) {
238  case kPosixMemalign:
239  ::free(block_);
240  break;
242  case kNumaAllocOnnode:
243  case kNumaMmapOneGbPages:
244  ::munmap(block_, size_);
245  break;
246  default:
247  ASSERT_ND(false);
248  }
249  block_ = nullptr;
250  }
251 }
252 
253 std::ostream& operator<<(std::ostream& o, const AlignedMemory& v) {
254  o << "<AlignedMemory>";
255  o << "<is_null>" << v.is_null() << "</is_null>";
256  o << "<size>" << v.get_size() << "</size>";
257  o << "<alignment>" << v.get_alignment() << "</alignment>";
258  o << "<alloc_type>" << v.get_alloc_type() << " (";
259  switch (v.get_alloc_type()) {
261  o << "kPosixMemalign";
262  break;
264  o << "kNumaAllocInterleaved";
265  break;
267  o << "kNumaAllocOnnode";
268  break;
270  o << "kNumaMmapOneGbPages";
271  break;
272  default:
273  o << "Unknown";
274  }
275  o << ")</alloc_type>";
276  o << "<numa_node>" << static_cast<int>(v.get_numa_node()) << "</numa_node>";
277  o << "<address>" << v.get_block() << "</address>";
278  o << "</AlignedMemory>";
279  return o;
280 }
281 
282 std::ostream& operator<<(std::ostream& o, const AlignedMemorySlice& v) {
283  o << "<AlignedMemorySlice>";
284  o << "<offset>" << v.offset_ << "</offset>";
285  o << "<count>" << v.count_ << "</count>";
286  if (v.memory_) {
287  o << *v.memory_;
288  }
289  o << "</AlignedMemorySlice>";
290  return o;
291 }
292 
/**
 * Returns if 1GB hugepages were enabled.
 * @return true when /proc/meminfo has a "Hugepagesize:" line that reports 1048576 kB;
 * false when the file cannot be opened, has no such line, or reports another size.
 */
bool is_1gb_hugepage_enabled() {
  // /proc/meminfo should have "Hugepagesize:    1048576 kB"
  // Unfortunately, sysinfo() doesn't provide this information. So, just read the whole file.
  // Alternatively, we can use gethugepagesizes(3) in libhugetlbs, but I don't want to add
  // a dependency just for that...
  std::ifstream file("/proc/meminfo");
  if (!file.is_open()) {
    return false;
  }

  std::string line;
  while (std::getline(file, line)) {
    if (line.find("Hugepagesize:") != std::string::npos) {
      // Decide on the matching line only, never on whatever the last read left in the buffer.
      return line.find("1048576 kB") != std::string::npos;
    }
  }
  return false;  // no Hugepagesize entry at all
}
315 } // namespace memory
316 } // namespace foedus
317 
0x0001 : "GENERAL: Out of memory" .
Definition: error_code.hpp:105
#define MAP_HUGE_1GB
0x0002 : "GENERAL: Invalid parameter given" .
Definition: error_code.hpp:106
numa_alloc_onnode() and numa_free().
uint64_t get_alignment() const
Returns the alignment of the memory block.
uint64_t count_
Byte count of this slice in memory_.
void release_block()
Releases the memory block.
Root package of FOEDUS (Fast Optimistic Engine for Data Unification Services).
Definition: assert_nd.hpp:44
AlignedMemory & operator=(const AlignedMemory &other)=delete
AlignedMemory * memory_
The wrapped memory.
AllocType
Type of new/delete operation for the block.
ErrorCode assure_capacity(uint64_t required_size, double expand_margin=2.0, bool retain_content=false) noexcept
If the current size is smaller than the given size, automatically expands.
void alloc(uint64_t size, uint64_t alignment, AllocType alloc_type, int numa_node) noexcept
Allocate a memory, releasing the current memory if exists.
void * alloc_mmap_1gb_pages(uint64_t size)
AllocType get_alloc_type() const
Returns type of new/delete operation for the block.
#define MAP_HUGE_2MB
0 means no-error.
Definition: error_code.hpp:87
int numa_available(void)
uint64_t stop()
Take another current time tick.
Definition: stop_watch.cpp:35
int numa_preferred(void)
A slice of foedus::memory::AlignedMemory.
std::ostream & operator<<(std::ostream &o, const AlignedMemory &v)
void * get_block() const
Returns the memory block.
uint64_t get_size() const
Returns the byte size of the memory block.
std::string os_error()
Thread-safe strerror(errno).
uint64_t offset_
Byte offset of this slice in memory_.
Represents one memory block aligned to actual OS/hardware pages.
int mod_numa_node(int numa_node)
In order to run even on a non-numa machine or a machine with fewer sockets, we allow specifying arbit...
char * alloc_mmap(uint64_t size, uint64_t alignment)
void numa_set_preferred(int node)
#define ASSERT_ND(x)
A warning-free wrapper macro of assert() that has no performance effect in release mode even when 'x'...
Definition: assert_nd.hpp:72
numa_alloc_interleaved() and numa_free().
A high-resolution stop watch.
Definition: stop_watch.hpp:30
bool is_1gb_hugepage_enabled()
Returns if 1GB hugepages were enabled.
ErrorCode
Enum of error codes defined in error_code.xmacro.
Definition: error_code.hpp:85
int get_numa_node() const
If alloc_type_ is kNumaAllocOnnode, returns the NUMA node this memory was allocated at...
AlignedMemory() noexcept
Empty constructor which allocates nothing.
uint64_t elapsed_ns() const
Definition: stop_watch.hpp:42
bool is_null() const
Returns if this object doesn't hold a valid memory block.