Merge pull request #6675 from rgerhards/codex-rewrite-daqueue-dirty-shutdown

tests: rewrite dirty shutdown recovery test
This commit is contained in:
Rainer Gerhards 2026-04-04 17:03:05 +02:00 committed by GitHub
commit fff107c0c0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 142 additions and 117 deletions

View File

@ -220,7 +220,7 @@ TESTS_DEFAULT = \
da-queue-persist.sh \
daqueue-persist.sh \
daqueue-invld-qi.sh \
daqueue-dirty-shutdown.sh \
daqueue-dirty-shutdown-recovery.sh \
diskq-rfc5424.sh \
diskqueue.sh \
diskqueue-fsync.sh \

View File

@ -0,0 +1,141 @@
#!/bin/bash
# Validate dirty-shutdown handling for a disk-assisted main queue.
#
# The historical version of this test assumed that a forced shutdown always
# left behind a fully recoverable queue that would later drain to an empty
# spool directory. Modern queue handling can also quarantine damaged queue
# state into mainq.bad.* when queue.onCorruption="safe" detects an
# inconsistent on-disk state. Both outcomes are valid as long as restart
# succeeds and no live mainq.* state is left behind.
# added 2016-12-01 by Rainer Gerhards
# rewritten 2026-03-30 to match current queue corruption semantics
# released under ASL 2.0
. ${srcdir:=.}/diag.sh init
SPOOL_DIR="${RSYSLOG_DYNNAME}.spool"
RSYSLOGD_LOG="${RSYSLOG_DYNNAME}.rsyslogd.log"
STARTED_LOG="${RSYSLOG_DYNNAME}.started"
write_phase1_conf() {
generate_conf
add_conf '
module(load="../plugins/omtesting/.libs/omtesting")
global(workDirectory="'"$SPOOL_DIR"'")
main_queue(
queue.filename="mainq"
queue.saveOnShutdown="on"
queue.timeoutShutdown="1"
queue.maxFileSize="1m"
queue.timeoutWorkerThreadShutdown="500"
queue.size="200000"
)
:msg, contains, "msgnum:" :omtesting:sleep 10 0
'
}
write_phase2_conf() {
generate_conf
add_conf '
module(load="../plugins/omtesting/.libs/omtesting")
global(workDirectory="'"$SPOOL_DIR"'")
main_queue(
queue.filename="mainq"
queue.saveOnShutdown="on"
queue.maxFileSize="1m"
queue.timeoutWorkerThreadShutdown="500"
queue.size="200000"
queue.onCorruption="safe"
)
template(name="outfmt" type="string" string="%msg:F,58:2%\\n")
action(type="omfile" file="'"$RSYSLOG_OUT_LOG"'" template="outfmt")
'
}
list_spool_top() {
find "$SPOOL_DIR" -maxdepth 1 -mindepth 1 -printf '%f\n' | sort
}
count_bad_dirs() {
find "$SPOOL_DIR" -maxdepth 1 -mindepth 1 -type d -name 'mainq.bad.*' | wc -l
}
check_no_live_mainq_files() {
local live_files
live_files="$(find "$SPOOL_DIR" -maxdepth 1 -mindepth 1 \
\( -type f -o -type l \) -name 'mainq*' -printf '%f\n')"
if [ -n "$live_files" ]; then
echo "FAIL: live mainq files remain after restart recovery"
printf '%s\n' "$live_files"
ls -l "$SPOOL_DIR"
error_exit 1
fi
}
write_phase1_conf
startup
injectmsg 0 210000
echo "spool files immediately before shutdown:"
list_spool_top
shutdown_immediate
./msleep 750
echo "spool files immediately after shutdown (before kill):"
list_spool_top
. "$srcdir/diag.sh" kill-immediate
wait_shutdown
rm -f "$RSYSLOG_PIDBASE.pid"
echo "spool files after kill:"
list_spool_top
if [ ! -f "$SPOOL_DIR/mainq.qi" ]; then
echo "FAIL: .qi file does not exist after interrupted shutdown"
error_exit 1
fi
echo ".qi file contents:"
cat "$SPOOL_DIR/mainq.qi"
printf 'RSYSLOG RESTART\n\n'
write_phase2_conf
: > "$RSYSLOGD_LOG"
: > "$STARTED_LOG"
export RS_REDIR=">>\"$RSYSLOGD_LOG\" 2>&1"
startup
shutdown_when_empty
wait_shutdown
unset RS_REDIR
echo "spool files after restart recovery:"
list_spool_top
check_no_live_mainq_files
bad_dirs="$(count_bad_dirs)"
if [ "$bad_dirs" -gt 0 ]; then
echo "dirty shutdown caused queue quarantine into mainq.bad.*"
if [ -f "$RSYSLOG_OUT_LOG" ]; then
echo "recovered output was also written before quarantine cleanup"
fi
exit_test
fi
if [ ! -s "$RSYSLOG_OUT_LOG" ]; then
echo "FAIL: no recovered output data gathered and no quarantine directory created"
ls -l "$SPOOL_DIR"
error_exit 1
fi
exit_test

View File

@ -1,116 +0,0 @@
#!/bin/bash
# This test simulates the case where the OS force-terminates rsyslog
# before it completely finishes persisting the queue to disk. Obviously,
# there is some data loss involved, but rsyslog should try to limit it.
# Most importantly, a .qi file needs to be written at "save" places, so that
# at least the queue is kind of readable.
# To simulate the error condition, we create a DA queue with a large memory
# part and fill it via injectmsg (do NOT use tcpflood, as this would add
# complexity of TCP window etc to the reception of messages - injectmsg is
# synchronous, so we do not have anything in flight after it terminates).
# We have a blocking action which prevents actual processing of any of the
# injected messages. We then inject a large number of messages, but only
# few above the number the memory part of the disk can hold. So the disk queue
# begins to get used. Once injection is done, we terminate rsyslog in the
# regular way, which will cause the memory part of the queue to be written
# out. After a relatively short period, we kill -9 rsyslogd, so that it
# does not have any chance to fully persists its state (this actually is
# what happens when force-terminated by the OS).
# Then, we check that at a minimum the .qi file exists.
# Copyright (C) 2016 by Rainer Gerhards
# Released under ASL 2.0
#uncomment the following if you want a log for step 1 of this test
#export RSYSLOG_DEBUG="debug nologfuncflow noprintmutexaction nostdout"
#export RSYSLOG_DEBUGLOG="log"
. ${srcdir:=.}/diag.sh init
generate_conf
add_conf '
module(load="../plugins/omtesting/.libs/omtesting")
# set spool locations and switch queue to disk-only mode
global(workDirectory="'${RSYSLOG_DYNNAME}'.spool")
main_queue(queue.filename="mainq" queue.saveonshutdown="on"
queue.timeoutshutdown="1" queue.maxfilesize="1m"
queue.timeoutworkerthreadshutdown="500" queue.size="200000"
)
:msg, contains, "msgnum:" :omtesting:sleep 10 0
'
startup
injectmsg 0 210000
echo spool files immediately before shutdown:
ls ${RSYSLOG_DYNNAME}.spool
shutdown_immediate # shut down without the ability to fully persist state
./msleep 750 # simulate an os timeout (let it run a *very short* bit, else it's done ;))
echo spool files immediately after shutdown \(but before kill\):
ls -l ${RSYSLOG_DYNNAME}.spool
. $srcdir/diag.sh kill-immediate # do not give it sufficient time to shutdown
wait_shutdown
rm -f $RSYSLOG_PIDBASE.pid # as we kill, rsyslog does not itself cleanup the pid file
echo spool files after kill:
ls -l ${RSYSLOG_DYNNAME}.spool
if [ ! -f ${RSYSLOG_DYNNAME}.spool/mainq.qi ]; then
echo "FAIL: .qi file does not exist!"
error_exit 1
fi
echo .qi file contents:
cat ${RSYSLOG_DYNNAME}.spool/mainq.qi
# We now restart rsyslog and make sure it'll clean up the disk queue.
# So far, we cannot reliably detect if the data is properly shuffled
# over, but that's a moot point anyhow because we expect to loss
# (large) amounts of the data. In later stages, we however may verify
#uncomment the following if you want a log for step 2 of this test
#export RSYSLOG_DEBUG="debug nologfuncflow noprintmutexaction nostdout"
#export RSYSLOG_DEBUGLOG="log2"
printf 'RSYSLOG RESTART\n\n'
# special case: we need to preserve our dynamic settings, as generate_conf
# overwrites them. TODO: handle this in diag.sh
RSYSLOG_DYNNAME_SAVE="$RSYSLOG_DYNNAME"
RSYSLOG_OUT_LOG_SAVE="$RSYSLOG_OUT_LOG"
generate_conf
RSYSLOG_OUT_LOG="$RSYSLOG_OUT_LOG_SAVE"
RSYSLOG_DYNNAME="$RSYSLOG_DYNNAME_SAVE"
add_conf '
module(load="../plugins/omtesting/.libs/omtesting")
global(workDirectory="'${RSYSLOG_DYNNAME}'.spool")
main_queue(queue.filename="mainq" queue.saveonshutdown="on"
queue.maxfilesize="1m" # note: now regular shutdown timeout!
queue.timeoutworkerthreadshutdown="500" queue.size="200000"
)
$template outfmt,"%msg:F,58:2%\n"
action(type="omfile" file="'$RSYSLOG_OUT_LOG'" template="outfmt")
'
startup
#wait_queueempty
#echo existing queue empty, injecting new data
#injectmsg 1000000 1000
shutdown_when_empty
wait_shutdown
# now the spool directory must be empty
spoolFiles=$(ls ${RSYSLOG_DYNNAME}.spool/)
if [[ ! -z $spoolFiles ]]; then
echo "FAIL: spool directory is not empty!"
ls -l ${RSYSLOG_DYNNAME}.spool
error_exit 1
fi
# check if we got at least some data
if [ ! -f $RSYSLOG_OUT_LOG ]; then
echo "FAIL: no output data gathered (no ${RSYSLOG_OUT_LOG})!"
error_exit 1
fi
exit_test