=== Applying patches on top of PostgreSQL commit ID 972c14fb9134fdfd76ea6ebcf98a55a945bbc988 === /etc/rc.d/jail: WARNING: Per-jail configuration via jail_* variables is obsolete. Please consider migrating to /etc/jail.conf. Wed Apr 15 08:30:29 UTC 2026 On branch cf/4884 nothing to commit, working tree clean === using 'git am' to apply patch ./0001-Fix-rare-recovery-shutdown-hang-due-to-checkpointer.patch === Applying: Fix rare recovery shutdown hang due to checkpointer. Using index info to reconstruct a base tree... M src/backend/postmaster/postmaster.c Falling back to patching base and 3-way merge... Auto-merging src/backend/postmaster/postmaster.c CONFLICT (content): Merge conflict in src/backend/postmaster/postmaster.c error: Failed to merge in the changes. hint: Use 'git am --show-current-patch=diff' to see the failed patch Patch failed at 0001 Fix rare recovery shutdown hang due to checkpointer. When you have resolved this problem, run "git am --continue". If you prefer to skip this patch, run "git am --skip" instead. To restore the original branch and stop patching, run "git am --abort". === using patch(1) to apply patch ./0001-Fix-rare-recovery-shutdown-hang-due-to-checkpointer.patch === patching file src/backend/postmaster/postmaster.c Hunk #1 FAILED at 3748. Hunk #2 succeeded at 3072 with fuzz 1 (offset -695 lines). Hunk #3 succeeded at 3111 with fuzz 1 (offset -694 lines). 1 out of 3 hunks FAILED -- saving rejects to file src/backend/postmaster/postmaster.c.rej Unstaged changes after reset: M src/backend/postmaster/postmaster.c Removing src/backend/postmaster/postmaster.c.rej === using 'git apply' to apply patch ./0001-Fix-rare-recovery-shutdown-hang-due-to-checkpointer.patch === Applied patch to 'src/backend/postmaster/postmaster.c' with conflicts. U src/backend/postmaster/postmaster.c diff --cc src/backend/postmaster/postmaster.c index 6e0f41d2661,62db752228a..00000000000 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@@ -2594,91 -3599,272 +2594,170 @@@ process_pm_child_exit(void } /* - * Log the death of a child process. + * CleanupBackend -- cleanup after terminated backend or background worker. + * + * Remove all local state associated with the child process and release its + * PMChild slot. */ static void -LogChildExit(int lev, const char *procname, int pid, int exitstatus) +CleanupBackend(PMChild *bp, + int exitstatus) /* child's exit status. */ { - /* - * size of activity_buffer is arbitrary, but set equal to default - * track_activity_query_size - */ - char activity_buffer[1024]; - const char *activity = NULL; - - if (!EXIT_STATUS_0(exitstatus)) - activity = pgstat_get_crashed_backend_activity(pid, - activity_buffer, - sizeof(activity_buffer)); - - if (WIFEXITED(exitstatus)) - ereport(lev, + char namebuf[MAXPGPATH]; + const char *procname; + bool crashed = false; + bool logged = false; + pid_t bp_pid; + bool bp_bgworker_notify; + BackendType bp_bkend_type; + RegisteredBgWorker *rw; - /*------ - translator: %s is a noun phrase describing a child process, such as - "server process" */ - (errmsg("%s (PID %d) exited with exit code %d", - procname, pid, WEXITSTATUS(exitstatus)), - activity ? errdetail("Failed process was running: %s", activity) : 0)); - else if (WIFSIGNALED(exitstatus)) + /* Construct a process name for the log message */ + if (bp->bkend_type == B_BG_WORKER) { -#if defined(WIN32) - ereport(lev, - - /*------ - translator: %s is a noun phrase describing a child process, such as - "server process" */ - (errmsg("%s (PID %d) was terminated by exception 0x%X", - procname, pid, WTERMSIG(exitstatus)), - errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."), - activity ? errdetail("Failed process was running: %s", activity) : 0)); -#else - ereport(lev, - - /*------ - translator: %s is a noun phrase describing a child process, such as - "server process" */ - (errmsg("%s (PID %d) was terminated by signal %d: %s", - procname, pid, WTERMSIG(exitstatus), - pg_strsignal(WTERMSIG(exitstatus))), - activity ? errdetail("Failed process was running: %s", activity) : 0)); -#endif + snprintf(namebuf, MAXPGPATH, _("background worker \"%s\""), + bp->rw->rw_worker.bgw_type); + procname = namebuf; } else - ereport(lev, + procname = _(GetBackendTypeDesc(bp->bkend_type)); - /*------ - translator: %s is a noun phrase describing a child process, such as - "server process" */ - (errmsg("%s (PID %d) exited with unrecognized status %d", - procname, pid, exitstatus), - activity ? errdetail("Failed process was running: %s", activity) : 0)); -} + /* + * If a backend dies in an ugly way then we must signal all other backends + * to quickdie. If exit status is zero (normal) or one (FATAL exit), we + * assume everything is all right and proceed to remove the backend from + * the active child list. + */ + if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus)) + crashed = true; -/* - * Advance the postmaster's state machine and take actions as appropriate - * - * This is common code for process_pm_shutdown_request(), - * process_pm_child_exit() and process_pm_pmsignal(), which process the signals - * that might mean we need to change state. - */ -static void -PostmasterStateMachine(void) -{ - /* If we're doing a smart shutdown, try to advance that state. */ - if (pmState == PM_RUN || pmState == PM_HOT_STANDBY) - { - if (!connsAllowed) - { - /* - * This state ends when we have no normal client backends running. - * Then we're ready to stop other children. - */ - if (CountChildren(BACKEND_TYPE_NORMAL) == 0) - pmState = PM_STOP_BACKENDS; - } - } +#ifdef WIN32 /* - * If we're ready to do so, signal child processes to shut down. (This - * isn't a persistent state, but treating it as a distinct pmState allows - * us to share this code across multiple shutdown code paths.) + * On win32, also treat ERROR_WAIT_NO_CHILDREN (128) as nonfatal case, + * since that sometimes happens under load when the process fails to start + * properly (long before it starts using shared memory). Microsoft reports + * it is related to mutex failure: + * http://archives.postgresql.org/pgsql-hackers/2010-09/msg00790.php */ - if (pmState == PM_STOP_BACKENDS) + if (exitstatus == ERROR_WAIT_NO_CHILDREN) { - /* - * Forget any pending requests for background workers, since we're no - * longer willing to launch any new workers. (If additional requests - * arrive, BackgroundWorkerStateChange will reject them.) - */ - ForgetUnstartedBackgroundWorkers(); - - /* Signal all backend children except walsenders */ - SignalSomeChildren(SIGTERM, - BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND); - /* and the autovac launcher too */ - if (AutoVacPID != 0) - signal_child(AutoVacPID, SIGTERM); - /* and the bgwriter too */ - if (BgWriterPID != 0) - signal_child(BgWriterPID, SIGTERM); - /* and the walwriter too */ - if (WalWriterPID != 0) - signal_child(WalWriterPID, SIGTERM); - /* If we're in recovery, also stop startup and walreceiver procs */ - if (StartupPID != 0) - signal_child(StartupPID, SIGTERM); - if (WalReceiverPID != 0) - signal_child(WalReceiverPID, SIGTERM); - if (WalSummarizerPID != 0) - signal_child(WalSummarizerPID, SIGTERM); - if (SlotSyncWorkerPID != 0) - signal_child(SlotSyncWorkerPID, SIGTERM); - /* checkpointer, archiver, stats, and syslogger may continue for now */ - - /* Now transition to PM_WAIT_BACKENDS state to wait for them to die */ - pmState = PM_WAIT_BACKENDS; + LogChildExit(LOG, procname, bp->pid, exitstatus); + logged = true; + crashed = false; } +#endif /* - * If we are in a state-machine state that implies waiting for backends to - * exit, see if they're all gone, and change state if so. + * Release the PMChild entry. + * + * If the process attached to shared memory, this also checks that it + * detached cleanly. */ - if (pmState == PM_WAIT_BACKENDS) + bp_pid = bp->pid; + bp_bgworker_notify = bp->bgworker_notify; + bp_bkend_type = bp->bkend_type; + rw = bp->rw; + if (!ReleasePostmasterChildSlot(bp)) { /* - * PM_WAIT_BACKENDS state ends when we have no regular backends - * (including autovac workers), no bgworkers (including unconnected - * ones), and no walwriter, autovac launcher, bgwriter or slot sync - * worker. If we are doing crash recovery or an immediate shutdown - * then we expect the checkpointer to exit as well, otherwise not. The - * stats and syslogger processes are disregarded since they are not - * connected to shared memory; we also disregard dead_end children - * here. Walsenders and archiver are also disregarded, they will be - * terminated later after writing the checkpoint record. + * Uh-oh, the child failed to clean itself up. Treat as a crash after + * all. */ ++<<<<<<< ours + crashed = true; ++======= + if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND) == 0 && + StartupPID == 0 && + WalReceiverPID == 0 && + WalSummarizerPID == 0 && + BgWriterPID == 0 && + (CheckpointerPID == 0 || + (!FatalError && Shutdown < ImmediateShutdown) || + (FatalError && CheckpointerPID != 0)) && + WalWriterPID == 0 && + AutoVacPID == 0 && + SlotSyncWorkerPID == 0) + { + if (CheckpointerPID == 0 && + (Shutdown >= ImmediateShutdown || FatalError)) + { + /* + * Start waiting for dead_end children to die. This state + * change causes ServerLoop to stop creating new ones. + */ + pmState = PM_WAIT_DEAD_END; + + /* + * We already SIGQUIT'd the archiver and stats processes, if + * any, when we started immediate shutdown or entered + * FatalError state. + */ + } + else if (Shutdown > NoShutdown && Shutdown < ImmediateShutdown) + { + /* + * If we get here, we are proceeding with normal shutdown. All + * the regular children are gone, and it's time to tell the + * checkpointer to do a shutdown checkpoint. + */ + Assert(Shutdown > NoShutdown); + /* Start the checkpointer if not running */ + if (CheckpointerPID == 0) + CheckpointerPID = StartChildProcess(CheckpointerProcess); + /* And tell it to shut down */ + if (CheckpointerPID != 0) + { + signal_child(CheckpointerPID, SIGUSR2); + pmState = PM_SHUTDOWN; + } + else + { + /* + * If we failed to fork a checkpointer, just shut down. + * Any required cleanup will happen at next restart. We + * set FatalError so that an "abnormal shutdown" message + * gets logged when we exit. + * + * We don't consult send_abort_for_crash here, as it's + * unlikely that dumping cores would illuminate the reason + * for checkpointer fork failure. + */ + FatalError = true; + pmState = PM_WAIT_DEAD_END; + + /* Kill the walsenders and archiver too */ + SignalChildren(SIGQUIT); + if (PgArchPID != 0) + signal_child(PgArchPID, SIGQUIT); + } + } + else + { + /* + * Either it's an immediate shutdown or a child crashed, and + * we're still waiting for all the children to quit. The + * checkpointer was already told to quit. + */ + Assert(Shutdown == ImmediateShutdown || + (Shutdown == NoShutdown && FatalError)); + } + } ++>>>>>>> theirs } + bp = NULL; - if (pmState == PM_SHUTDOWN_2) - { - /* - * PM_SHUTDOWN_2 state ends when there's no other children than - * dead_end children left. There shouldn't be any regular backends - * left by now anyway; what we're really waiting for is walsenders and - * archiver. - */ - if (PgArchPID == 0 && CountChildren(BACKEND_TYPE_ALL) == 0) - { - pmState = PM_WAIT_DEAD_END; - } - } - - if (pmState == PM_WAIT_DEAD_END) + /* + * In a crash case, exit immediately without resetting background worker + * state. However, if restart_after_crash is enabled, the background + * worker state (e.g., rw_pid) still needs be reset so the worker can + * restart after crash recovery. This reset is handled in + * ResetBackgroundWorkerCrashTimes(), not here. + */ + if (crashed) { - /* Don't allow any new socket connection events. */ - ConfigurePostmasterWaitSet(false); - - /* - * PM_WAIT_DEAD_END state ends when the BackendList is entirely empty - * (ie, no dead_end children remain), and the archiver is gone too. - * - * The reason we wait for those two is to protect them against a new - * postmaster starting conflicting subprocesses; this isn't an - * ironclad protection, but it at least helps in the - * shutdown-and-immediately-restart scenario. Note that they have - * already been sent appropriate shutdown signals, either during a - * normal state transition leading up to PM_WAIT_DEAD_END, or during - * FatalError processing. - */ - if (dlist_is_empty(&BackendList) && PgArchPID == 0) - { - /* These other guys should be dead already */ - Assert(StartupPID == 0); - Assert(WalReceiverPID == 0); - Assert(WalSummarizerPID == 0); - Assert(BgWriterPID == 0); - Assert(CheckpointerPID == 0); - Assert(WalWriterPID == 0); - Assert(AutoVacPID == 0); - Assert(SlotSyncWorkerPID == 0); - /* syslogger is not considered here */ - pmState = PM_NO_CHILDREN; - } + HandleChildCrash(bp_pid, exitstatus, procname); + return; } /*