diff --git a/tests/Makefile.am b/tests/Makefile.am index de83c0e06..fe6292979 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -220,7 +220,7 @@ TESTS_DEFAULT = \ da-queue-persist.sh \ daqueue-persist.sh \ daqueue-invld-qi.sh \ - daqueue-dirty-shutdown.sh \ + daqueue-dirty-shutdown-recovery.sh \ diskq-rfc5424.sh \ diskqueue.sh \ diskqueue-fsync.sh \ diff --git a/tests/daqueue-dirty-shutdown-recovery.sh b/tests/daqueue-dirty-shutdown-recovery.sh new file mode 100755 index 000000000..446ee596d --- /dev/null +++ b/tests/daqueue-dirty-shutdown-recovery.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# Validate dirty-shutdown handling for a disk-assisted main queue. +# +# The historical version of this test assumed that a forced shutdown always +# left behind a fully recoverable queue that would later drain to an empty +# spool directory. Modern queue handling can also quarantine damaged queue +# state into mainq.bad.* when queue.onCorruption="safe" detects an +# inconsistent on-disk state. Both outcomes are valid as long as restart +# succeeds and no live mainq.* state is left behind. +# added 2016-12-01 by Rainer Gerhards +# rewritten 2026-03-30 to match current queue corruption semantics +# released under ASL 2.0 + +. ${srcdir:=.}/diag.sh init + +SPOOL_DIR="${RSYSLOG_DYNNAME}.spool" +RSYSLOGD_LOG="${RSYSLOG_DYNNAME}.rsyslogd.log" +STARTED_LOG="${RSYSLOG_DYNNAME}.started" + +write_phase1_conf() { + generate_conf + add_conf ' +module(load="../plugins/omtesting/.libs/omtesting") + +global(workDirectory="'"$SPOOL_DIR"'") +main_queue( + queue.filename="mainq" + queue.saveOnShutdown="on" + queue.timeoutShutdown="1" + queue.maxFileSize="1m" + queue.timeoutWorkerThreadShutdown="500" + queue.size="200000" +) + +:msg, contains, "msgnum:" :omtesting:sleep 10 0 +' +} + +write_phase2_conf() { + generate_conf + add_conf ' +module(load="../plugins/omtesting/.libs/omtesting") + +global(workDirectory="'"$SPOOL_DIR"'") +main_queue( + queue.filename="mainq" + queue.saveOnShutdown="on" + queue.maxFileSize="1m" + queue.timeoutWorkerThreadShutdown="500" + queue.size="200000" + queue.onCorruption="safe" +) + +template(name="outfmt" type="string" string="%msg:F,58:2%\\n") +action(type="omfile" file="'"$RSYSLOG_OUT_LOG"'" template="outfmt") +' +} + +list_spool_top() { + find "$SPOOL_DIR" -maxdepth 1 -mindepth 1 -printf '%f\n' | sort +} + +count_bad_dirs() { + find "$SPOOL_DIR" -maxdepth 1 -mindepth 1 -type d -name 'mainq.bad.*' | wc -l +} + +check_no_live_mainq_files() { + local live_files + + live_files="$(find "$SPOOL_DIR" -maxdepth 1 -mindepth 1 \ + \( -type f -o -type l \) -name 'mainq*' -printf '%f\n')" + if [ -n "$live_files" ]; then + echo "FAIL: live mainq files remain after restart recovery" + printf '%s\n' "$live_files" + ls -l "$SPOOL_DIR" + error_exit 1 + fi +} + +write_phase1_conf +startup +injectmsg 0 210000 + +echo "spool files immediately before shutdown:" +list_spool_top + +shutdown_immediate +./msleep 750 + +echo "spool files immediately after shutdown (before kill):" +list_spool_top + +. "$srcdir/diag.sh" kill-immediate +wait_shutdown +rm -f "$RSYSLOG_PIDBASE.pid" + +echo "spool files after kill:" +list_spool_top + +if [ ! -f "$SPOOL_DIR/mainq.qi" ]; then + echo "FAIL: .qi file does not exist after interrupted shutdown" + error_exit 1 +fi + +echo ".qi file contents:" +cat "$SPOOL_DIR/mainq.qi" + +printf 'RSYSLOG RESTART\n\n' +write_phase2_conf + +: > "$RSYSLOGD_LOG" +: > "$STARTED_LOG" +export RS_REDIR=">>\"$RSYSLOGD_LOG\" 2>&1" + +startup +shutdown_when_empty +wait_shutdown + +unset RS_REDIR + +echo "spool files after restart recovery:" +list_spool_top + +check_no_live_mainq_files + +bad_dirs="$(count_bad_dirs)" +if [ "$bad_dirs" -gt 0 ]; then + echo "dirty shutdown caused queue quarantine into mainq.bad.*" + if [ -f "$RSYSLOG_OUT_LOG" ]; then + echo "recovered output was also written before quarantine cleanup" + fi + exit_test +fi + +if [ ! -s "$RSYSLOG_OUT_LOG" ]; then + echo "FAIL: no recovered output data gathered and no quarantine directory created" + ls -l "$SPOOL_DIR" + error_exit 1 +fi + +exit_test diff --git a/tests/daqueue-dirty-shutdown.sh b/tests/daqueue-dirty-shutdown.sh deleted file mode 100755 index c8de84b81..000000000 --- a/tests/daqueue-dirty-shutdown.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash -# This test simulates the case where the OS force-terminates rsyslog -# before it completely finishes persisting the queue to disk. Obviously, -# there is some data loss involved, but rsyslog should try to limit it. -# Most importantly, a .qi file needs to be written at "save" places, so that -# at least the queue is kind of readable. -# To simulate the error condition, we create a DA queue with a large memory -# part and fill it via injectmsg (do NOT use tcpflood, as this would add -# complexity of TCP window etc to the reception of messages - injectmsg is -# synchronous, so we do not have anything in flight after it terminates). -# We have a blocking action which prevents actual processing of any of the -# injected messages. We then inject a large number of messages, but only -# few above the number the memory part of the disk can hold. So the disk queue -# begins to get used. Once injection is done, we terminate rsyslog in the -# regular way, which will cause the memory part of the queue to be written -# out. After a relatively short period, we kill -9 rsyslogd, so that it -# does not have any chance to fully persists its state (this actually is -# what happens when force-terminated by the OS). -# Then, we check that at a minimum the .qi file exists. -# Copyright (C) 2016 by Rainer Gerhards -# Released under ASL 2.0 - -#uncomment the following if you want a log for step 1 of this test -#export RSYSLOG_DEBUG="debug nologfuncflow noprintmutexaction nostdout" -#export RSYSLOG_DEBUGLOG="log" -. ${srcdir:=.}/diag.sh init -generate_conf -add_conf ' -module(load="../plugins/omtesting/.libs/omtesting") - -# set spool locations and switch queue to disk-only mode -global(workDirectory="'${RSYSLOG_DYNNAME}'.spool") -main_queue(queue.filename="mainq" queue.saveonshutdown="on" - queue.timeoutshutdown="1" queue.maxfilesize="1m" - queue.timeoutworkerthreadshutdown="500" queue.size="200000" - ) - -:msg, contains, "msgnum:" :omtesting:sleep 10 0 -' -startup -injectmsg 0 210000 -echo spool files immediately before shutdown: -ls ${RSYSLOG_DYNNAME}.spool -shutdown_immediate # shut down without the ability to fully persist state -./msleep 750 # simulate an os timeout (let it run a *very short* bit, else it's done ;)) -echo spool files immediately after shutdown \(but before kill\): -ls -l ${RSYSLOG_DYNNAME}.spool - - -. $srcdir/diag.sh kill-immediate # do not give it sufficient time to shutdown -wait_shutdown -rm -f $RSYSLOG_PIDBASE.pid # as we kill, rsyslog does not itself cleanup the pid file - -echo spool files after kill: -ls -l ${RSYSLOG_DYNNAME}.spool - -if [ ! -f ${RSYSLOG_DYNNAME}.spool/mainq.qi ]; then - echo "FAIL: .qi file does not exist!" - error_exit 1 -fi - -echo .qi file contents: -cat ${RSYSLOG_DYNNAME}.spool/mainq.qi - - -# We now restart rsyslog and make sure it'll clean up the disk queue. -# So far, we cannot reliably detect if the data is properly shuffled -# over, but that's a moot point anyhow because we expect to loss -# (large) amounts of the data. In later stages, we however may verify - -#uncomment the following if you want a log for step 2 of this test -#export RSYSLOG_DEBUG="debug nologfuncflow noprintmutexaction nostdout" -#export RSYSLOG_DEBUGLOG="log2" - -printf 'RSYSLOG RESTART\n\n' -# special case: we need to preserve our dynamic settings, as generate_conf -# overwrites them. TODO: handle this in diag.sh -RSYSLOG_DYNNAME_SAVE="$RSYSLOG_DYNNAME" -RSYSLOG_OUT_LOG_SAVE="$RSYSLOG_OUT_LOG" -generate_conf -RSYSLOG_OUT_LOG="$RSYSLOG_OUT_LOG_SAVE" -RSYSLOG_DYNNAME="$RSYSLOG_DYNNAME_SAVE" -add_conf ' -module(load="../plugins/omtesting/.libs/omtesting") - -global(workDirectory="'${RSYSLOG_DYNNAME}'.spool") -main_queue(queue.filename="mainq" queue.saveonshutdown="on" - queue.maxfilesize="1m" # note: now regular shutdown timeout! - queue.timeoutworkerthreadshutdown="500" queue.size="200000" - ) - -$template outfmt,"%msg:F,58:2%\n" -action(type="omfile" file="'$RSYSLOG_OUT_LOG'" template="outfmt") -' -startup -#wait_queueempty -#echo existing queue empty, injecting new data -#injectmsg 1000000 1000 -shutdown_when_empty -wait_shutdown - -# now the spool directory must be empty -spoolFiles=$(ls ${RSYSLOG_DYNNAME}.spool/) - -if [[ ! -z $spoolFiles ]]; then - echo "FAIL: spool directory is not empty!" - ls -l ${RSYSLOG_DYNNAME}.spool - error_exit 1 -fi - -# check if we got at least some data -if [ ! -f $RSYSLOG_OUT_LOG ]; then - echo "FAIL: no output data gathered (no ${RSYSLOG_OUT_LOG})!" - error_exit 1 -fi -exit_test