23 #include <glog/logging.h>
24 #include <sys/prctl.h>
25 #include <sys/types.h>
69 std::cerr <<
"[FOEDUS] Shared memory not initialized yet. Can't report errors" << std::endl;
109 for (uint16_t node = 0; node < soc_count; ++node) {
132 pid_t pid = ::getpid();
139 std::cerr <<
"[FOEDUS-Child] MasterUpid=" << master_upid <<
", ChildPid=" << pid
140 <<
", Node=" << soc_id <<
". Failed to attach shared memory. error=" << attach_error
141 <<
" This is an unrecoverable error. This process quits shortly" << std::endl;
162 const uint32_t kIntervalMillisecond = 20;
163 const uint32_t kTimeoutMillisecond = 10000;
168 std::this_thread::sleep_for(std::chrono::milliseconds(kIntervalMillisecond));
169 bool error_happened =
false;
170 bool remaining =
false;
171 for (uint16_t node = 0; node < soc_count; ++node) {
173 switch (child_status) {
181 error_happened =
true;
184 if (child_as_process) {
189 pid_t wait_ret = ::waitpid(
child_upids_[node], &status, WNOHANG | __WALL);
190 if (wait_ret == -1) {
191 std::cerr <<
"[FOEDUS] FATAL! waitpid() for child-process " <<
child_upids_[node]
193 error_happened =
true;
195 }
else if (wait_ret != 0) {
196 std::cerr <<
"[FOEDUS] FATAL! child-process " <<
child_upids_[node] <<
" has exit"
197 <<
" unexpectedly. status=" << status << std::endl;
198 error_happened =
true;
205 if (error_happened) {
206 std::cerr <<
"[FOEDUS] FATAL! Some child failed to attach shared memory." << std::endl;
210 }
else if (!remaining) {
212 }
else if ((++trials) * kIntervalMillisecond > kTimeoutMillisecond) {
213 std::cerr <<
"[FOEDUS] FATAL! Timeout happend while waiting for child SOCs to start up."
214 " Probably child SOC(s) hanged or did not trap SOC execution (if spawned)." << std::endl;
230 const uint32_t kIntervalMillisecond = 20;
231 const uint32_t kTimeoutMillisecond = 10000;
236 std::this_thread::sleep_for(std::chrono::milliseconds(kIntervalMillisecond));
237 bool remaining =
false;
238 for (uint16_t node = 0; node < soc_count; ++node) {
240 if (!child_as_process) {
257 pid_t wait_ret = ::waitpid(
child_upids_[node], &status, WNOHANG | __WALL);
258 if (wait_ret == -1) {
260 }
else if (wait_ret == 0) {
262 }
else if (WIFSIGNALED(status)) {
263 std::cerr <<
"[FOEDUS] ERROR! child-process " <<
child_upids_[node] <<
" has been"
264 <<
" terminated by signal. status=" << status << std::endl;
274 }
else if ((++trials) * kIntervalMillisecond > kTimeoutMillisecond) {
275 std::cerr <<
"[FOEDUS] ERROR! Timeout happend while waiting for child SOCs to terminate."
276 " Probably child SOC(s) hanged or did not trap SOC execution (if spawned)." << std::endl;
288 const uint32_t kIntervalMillisecond = 10;
292 std::this_thread::sleep_for(std::chrono::milliseconds(kIntervalMillisecond));
294 if (master_status == target_status) {
298 }
else if (static_cast<int>(master_status) > static_cast<int>(target_status)) {
328 const uint32_t kWarnSleeps = 400;
329 for (uint32_t count = 0;; ++count) {
330 if (count > 0 && count % kWarnSleeps == 0) {
331 LOG(WARNING) <<
"Suspiciously long wait for master " << (init ?
"" :
"un") <<
"initializing"
332 <<
" module-" << desired <<
". count=" << count;
338 LOG(ERROR) <<
"Master apparently died while wait_for_master_module";
353 if (cur == desired) {
356 std::this_thread::sleep_for(std::chrono::milliseconds(5));
367 const uint32_t kWarnSleeps = 400;
368 for (uint32_t count = 0;; ++count) {
369 if (count > 0 && count % kWarnSleeps == 0) {
370 LOG(WARNING) <<
"Suspiciously long wait for child " << (init ?
"" :
"un") <<
"initializing"
371 <<
" module-" << desired <<
". count=" << count;
374 std::this_thread::sleep_for(std::chrono::milliseconds(5));
376 bool error_happened =
false;
377 bool remaining =
false;
378 for (uint16_t node = 0; node < soc_count; ++node) {
382 error_happened =
true;
389 > static_cast<int>(desired)) {
390 LOG(ERROR) <<
"[FOEDUS] child init went too far??";
391 error_happened =
true;
398 < static_cast<int>(desired)) {
399 LOG(ERROR) <<
"[FOEDUS] ERROR! child uninit went too far??";
400 error_happened =
true;
405 if (child_as_process) {
409 pid_t wait_ret = ::waitpid(
child_upids_[node], &status, WNOHANG | __WALL);
410 if (wait_ret == -1) {
412 error_happened =
true;
413 LOG(ERROR) <<
"waitpid() while waiting for child module status failed";
415 }
else if (wait_ret != 0) {
417 error_happened =
true;
418 LOG(ERROR) <<
"child process has already exit while waiting for child module status";
425 if (error_happened) {
426 LOG(ERROR) <<
"Error encountered in wait_for_children_module";
432 }
else if (!remaining) {
447 for (uint16_t node = 0; node < soc_count; ++node) {
451 for (uint16_t node = 0; node < soc_count; ++node) {
469 std::cerr <<
"[FOEDUS-Child] Emulated SOC-" << node
470 <<
" exits with an error: " << ret << std::endl;
481 for (uint16_t node = 0; node < soc_count; ++node) {
482 pid_t pid = ::fork();
487 std::cerr <<
"[FOEDUS] Failed to fork child SOC. error=" <<
assorted::os_error() << std::endl;
490 }
else if (pid == 0) {
510 std::cerr <<
"[FOEDUS-Child] Forked SOC-" << node
511 <<
" exits with an error: " << ret << std::endl;
527 for (uint16_t node = 0; node < soc_count; ++node) {
528 posix_spawn_file_actions_t file_actions;
529 posix_spawnattr_t attr;
530 ::posix_spawn_file_actions_init(&file_actions);
531 ::posix_spawnattr_init(&attr);
534 std::string ld_env(
"LD_LIBRARY_PATH=");
536 std::string pid_env(
"FOEDUS_MASTER_UPID=");
537 pid_env += std::to_string(master_upid);
538 std::string eid_env(
"FOEDUS_MASTER_EID=");
539 eid_env += std::to_string(master_eid);
540 std::string soc_id_env(
"FOEDUS_SOC_ID=");
541 soc_id_env += std::to_string(node);
543 char*
const argv[] = {
const_cast<char*
>(executable.c_str()),
nullptr};
544 char*
const envp[] = {
545 const_cast<char*
>(ld_env.c_str()),
546 const_cast<char*>(pid_env.c_str()),
547 const_cast<char*>(eid_env.c_str()),
548 const_cast<char*>(soc_id_env.c_str()),
552 int ret = ::posix_spawn(&child_pid, executable.c_str(), &file_actions, &attr, argv, envp);
557 std::cerr <<
"[FOEDUS] Failed to spawn child SOC. error="
569 const char* master_upid_str = std::getenv(
"FOEDUS_MASTER_UPID");
570 const char* master_eid_str = std::getenv(
"FOEDUS_MASTER_EID");
571 const char* soc_id_str = std::getenv(
"FOEDUS_SOC_ID");
572 if (master_upid_str ==
nullptr || master_eid_str ==
nullptr || soc_id_str ==
nullptr) {
576 Upid master_upid = std::atoll(master_upid_str);
577 Eid master_eid = std::atoll(master_eid_str);
// The SOC ID is passed via FOEDUS_SOC_ID (soc_id_str); parsing master_upid_str here
// would make every spawned child use the master's process ID as its node number.
578 SocId node = std::atol(soc_id_str);
581 std::cerr <<
"[FOEDUS-Child] Spawned SOC-" << node
582 <<
" exits with an error: " << ret << std::endl;
583 ::_exit(EXIT_FAILURE);
585 ::_exit(EXIT_SUCCESS);
599 const std::vector< proc::ProcAndName >& procedures) {
605 ::prctl(PR_SET_PDEATHSIG, SIGHUP);
608 Engine soc_engine(engine_type, master_upid, master_eid, node);
611 std::cerr <<
"[FOEDUS-Child] Failed to initialize child SOC-" << node
612 <<
". error=" << init_error
613 <<
" This is an unrecoverable error. This process quits shortly" << std::endl;
622 LOG(INFO) <<
"The SOC engine-" << node <<
" was initialized.";
631 <<
". Waiting for master engine's initialization...";
634 LOG(INFO) <<
"The SOC engine-" << node <<
" detected that master engine has started"
639 LOG(INFO) <<
"Stopping the SOC engine-" << node;
643 LOG(ERROR) <<
"Error while uninitializing SOC engine-" << node <<
": " << uninit_error;
ErrorStack uninitialize_once() override
Upid child_upids_[kMaxSocs]
Process IDs of child SOCs.
ErrorStack allocate_shared_memories(uint64_t upid, Eid eid, const EngineOptions &options)
Master process creates shared memories by calling this method.
0x0C04 : "SOC : Failed to spawn child SOCs." .
void emulated_child_main(SocId node)
Main routine of emulated SOCs.
0x0C02 : "SOC : Failed to attach a shared memory." .
void * get_global_memory()
ErrorStack initialize() override
Starts up the database engine.
The master engine has normally terminated.
ModuleType initialized_modules_
The module that has been most recently initialized in this node.
static ErrorStack child_main_common(EngineType engine_type, Upid master_upid, Eid master_eid, SocId node, const std::vector< proc::ProcAndName > &procedures)
void change_master_status(MasterEngineStatus::StatusCode new_status)
#define ERROR_STACK(e)
Instantiates ErrorStack with the given foedus::error_code, creating an error stack with the current f...
ErrorStack launch_forked_children()
Launch children via fork.
Root package of FOEDUS (Fast Optimistic Engine for Data Unification Services).
The master is waiting for child engines to terminate.
GlobalMemoryAnchors * get_global_memory_anchors()
StatusCode
These statuses represent each step described in SocManager comment.
Eid get_master_eid() const
Returns Engine ID of the master engine.
std::vector< std::thread > child_emulated_threads_
Threads that emulate child SOCs.
void deallocate_shared_memories()
Detaches and releases the shared memories.
void change_status_atomic(StatusCode new_status)
Update the value of status_code_ with fence.
void change_status_atomic(StatusCode new_status)
Update the value of status_code_ with fence.
static void spawned_child_main(const std::vector< proc::ProcAndName > &procedures)
Main routine of spawned SOCs.
Whenever a child observes this, it will call _exit() as soon as possible.
Brings error stacktrace information as return value of functions.
EngineType soc_type_
How to launch SOC engine instances.
0x0C0A : "SOC : Failed to normally terminate some SOC(s).." .
FileStatus status(const Path &p)
Returns the status of the file.
void mark_for_release()
Marks shared memories as being removed so that it will be reclaimed when all processes detach it...
A child SOC instance launched in other machines.
NodeMemoryAnchors * get_node_memory_anchors(SocId node)
Pin the current thread to the given NUMA node in this object's scope.
Procedure manager, which maintains the list of system/user procedures.
0x0C0B : "SOC : Child SOC failed to initialize a module." .
const EngineOptions & get_options() const
0x0C03 : "SOC : Failed to fork child SOCs." .
ModuleType initialized_modules_
The module that has been most recently initialized in master.
ErrorStack initialize_master()
Called as part of initialize_once() if this is a master engine.
#define COERCE_ERROR(x)
This macro calls x and aborts if encounters an error.
bool is_master() const
Returns if this engine object is a master instance.
Child engine successfully attached shared memory and waiting for master's kWaitingForChildInitializat...
ModuleType
Enumerates modules in FOEDUS engine.
The child engine has normally terminated.
Batches zero or more ErrorStack objects to represent in one ErrorStack.
ErrorStack wait_for_child_attach()
Wait for child SOCs to start up and at least finish attaching shared memory.
std::string convert_spawn_executable_pattern(int node) const
converts spawn_executable_pattern_ into a string with the given node ID.
soc::Upid get_master_upid() const
Returns Universal (or Unique) ID of the master process.
ErrorStack wait_for_children_module(bool init, ModuleType module)
0x0C09 : "SOC : Timeout happend while waiting for child SOCs to terminate. Probably child SOC(s) h...
std::string describe_registered_procs() const
For debug uses only.
ErrorStack launch_emulated_children()
Launch emulated children as threads.
Current status of master engine.
ErrorStack attach_shared_memories(uint64_t master_upid, Eid master_eid, SocId my_soc_id, EngineOptions *options)
Child processes (emulated or not) set a reference to shared memory and receive the EngineOptions value...
int forked_child_main(SocId node)
Main routine of forked SOCs.
ErrorStack wait_for_master_module(bool init, ModuleType module)
A child SOC instance launched via spawn().
Database engine object that holds all resources and provides APIs.
Done all initialization and running transactions.
std::vector< Engine * > child_emulated_engines_
And their engines.
ErrorStack initialize_once() override
std::pair< ProcName, Proc > ProcAndName
Child engine has successfully initialized all modules and is now waiting for master's kRunning status...
EngineType
Type of an engine instance of how to launch it.
ErrorStack launch_spawned_children()
Launch children via spawn.
ErrorStack wait_for_master_status(MasterEngineStatus::StatusCode target_status)
Wait for master engine to finish up to the specified status.
Current status of a child SOC engine.
const std::vector< ProcAndName > & get_pre_registered_procedures() const
Returns procedures given to pre_register()
0x0C07 : "SOC : Master engine died unexpectedly. This child engine will follow." .
uint64_t Upid
Universal (or Unique) ID of a process.
A child SOC instance launched via fork().
proc::ProcManager * get_proc_manager() const
See System and User Procedures.
uint16_t group_count_
Number of ThreadGroup in the engine.
0x0C06 : "SOC : Timeout happend while waiting for child SOCs to start up. Probably child SOC(s) ha...
EngineOptions * get_nonconst_options()
Returns an updatable reference to options.
void spinlock_yield()
Invoke _mm_pause(), x86 PAUSE instruction, or something equivalent in the env.
Repository of all shared memory in one FOEDUS instance.
void report_engine_fatal_error()
SharedMemoryRepo memory_repo_
#define SUMMARIZE_ERROR_BATCH(x)
This macro calls ErrorStackBatch::summarize() with automatically provided parameters.
ErrorStack wait_for_child_terminate()
Wait for child SOCs to terminate.
uint16_t SocId
Represents an ID of an SOC, or NUMA node.
std::string os_error()
Thread-safe strerror(errno).
thread::ThreadOptions thread_
Child engine has just started.
#define CHECK_ERROR(x)
This macro calls x and checks its returned value.
const ErrorStack kRetOk
Normal return value for no-error case.
MasterEngineStatus * master_status_memory_
This tiny piece of memory contains the current status of the master engine and its synchronization me...
soc::SocManager * get_soc_manager() const
See SOC and IPC.
Pimpl object of SocManager.
ErrorStack local_register(const ProcAndName &proc_and_name)
Register a function pointer as a user procedure in the current SOC.
ChildEngineStatus * child_status_memory_
This tiny piece of memory contains the current status of the child engine on this node...
0x0C08 : "SOC : The status of master engine is unexpected." .
soc::SocId get_soc_id() const
If this is a child instance, returns its SOC ID (NUMA node).
uint64_t Eid
An Engine ID to differentiate two Engine objects instantiated in the same process.
Done all initialization and running transactions.
void change_child_status(SocId node, ChildEngineStatus::StatusCode new_status)
0x0C0C : "SOC : Child SOC failed to uninitialize a module." .
#define ASSERT_ND(x)
A warning-free wrapper macro of assert() that has no performance effect in release mode even when 'x'...
ModuleType uninitialized_modules_
The module that has been most recently closed in master.
ModuleType uninitialized_modules_
The module that has been most recently closed in this node.
ErrorStack initialize_child()
Called as part of initialize_once() if this is a child SOC engine.
bool is_initialized() const override final
Returns whether the object has been already initialized or not.
ChildEngineStatus::StatusCode get_child_status(SocId node) const
Master engine successfully allocated shared memory and is waiting for child's attach.
std::string convert_spawn_ld_library_path_pattern(int node) const
converts spawn_ld_library_path_pattern_ into a string with the given node ID.
Master engine successfully confirmed child's attach and reserved the reclamation of the shared memori...
StatusCode
These statuses represent each step described in SocManager comment.
bool is_error() const
Returns if this return code is not kErrorCodeOk.
The child engine observed some unrecoverable error and has exited.
MasterEngineStatus::StatusCode get_master_status() const
void memory_fence_acq_rel()
Equivalent to std::atomic_thread_fence(std::memory_order_acq_rel).
A child SOC instance launched just as a thread in the same process as master.
ErrorStack uninitialize() override
Terminates the database engine.