mirror of
https://github.com/rsyslog/rsyslog.git
synced 2026-04-17 06:50:48 +02:00
Merge pull request #6675 from rgerhards/codex-rewrite-daqueue-dirty-shutdown
tests: rewrite dirty shutdown recovery test
This commit is contained in:
commit
fff107c0c0
@ -220,7 +220,7 @@ TESTS_DEFAULT = \
|
||||
da-queue-persist.sh \
|
||||
daqueue-persist.sh \
|
||||
daqueue-invld-qi.sh \
|
||||
daqueue-dirty-shutdown.sh \
|
||||
daqueue-dirty-shutdown-recovery.sh \
|
||||
diskq-rfc5424.sh \
|
||||
diskqueue.sh \
|
||||
diskqueue-fsync.sh \
|
||||
|
||||
141
tests/daqueue-dirty-shutdown-recovery.sh
Executable file
141
tests/daqueue-dirty-shutdown-recovery.sh
Executable file
@ -0,0 +1,141 @@
|
||||
#!/bin/bash
|
||||
# Validate dirty-shutdown handling for a disk-assisted main queue.
|
||||
#
|
||||
# The historical version of this test assumed that a forced shutdown always
|
||||
# left behind a fully recoverable queue that would later drain to an empty
|
||||
# spool directory. Modern queue handling can also quarantine damaged queue
|
||||
# state into mainq.bad.* when queue.onCorruption="safe" detects an
|
||||
# inconsistent on-disk state. Both outcomes are valid as long as restart
|
||||
# succeeds and no live mainq.* state is left behind.
|
||||
# added 2016-12-01 by Rainer Gerhards
|
||||
# rewritten 2026-03-30 to match current queue corruption semantics
|
||||
# released under ASL 2.0
|
||||
|
||||
. ${srcdir:=.}/diag.sh init
|
||||
|
||||
SPOOL_DIR="${RSYSLOG_DYNNAME}.spool"
|
||||
RSYSLOGD_LOG="${RSYSLOG_DYNNAME}.rsyslogd.log"
|
||||
STARTED_LOG="${RSYSLOG_DYNNAME}.started"
|
||||
|
||||
write_phase1_conf() {
|
||||
generate_conf
|
||||
add_conf '
|
||||
module(load="../plugins/omtesting/.libs/omtesting")
|
||||
|
||||
global(workDirectory="'"$SPOOL_DIR"'")
|
||||
main_queue(
|
||||
queue.filename="mainq"
|
||||
queue.saveOnShutdown="on"
|
||||
queue.timeoutShutdown="1"
|
||||
queue.maxFileSize="1m"
|
||||
queue.timeoutWorkerThreadShutdown="500"
|
||||
queue.size="200000"
|
||||
)
|
||||
|
||||
:msg, contains, "msgnum:" :omtesting:sleep 10 0
|
||||
'
|
||||
}
|
||||
|
||||
write_phase2_conf() {
|
||||
generate_conf
|
||||
add_conf '
|
||||
module(load="../plugins/omtesting/.libs/omtesting")
|
||||
|
||||
global(workDirectory="'"$SPOOL_DIR"'")
|
||||
main_queue(
|
||||
queue.filename="mainq"
|
||||
queue.saveOnShutdown="on"
|
||||
queue.maxFileSize="1m"
|
||||
queue.timeoutWorkerThreadShutdown="500"
|
||||
queue.size="200000"
|
||||
queue.onCorruption="safe"
|
||||
)
|
||||
|
||||
template(name="outfmt" type="string" string="%msg:F,58:2%\\n")
|
||||
action(type="omfile" file="'"$RSYSLOG_OUT_LOG"'" template="outfmt")
|
||||
'
|
||||
}
|
||||
|
||||
list_spool_top() {
|
||||
find "$SPOOL_DIR" -maxdepth 1 -mindepth 1 -printf '%f\n' | sort
|
||||
}
|
||||
|
||||
count_bad_dirs() {
|
||||
find "$SPOOL_DIR" -maxdepth 1 -mindepth 1 -type d -name 'mainq.bad.*' | wc -l
|
||||
}
|
||||
|
||||
check_no_live_mainq_files() {
|
||||
local live_files
|
||||
|
||||
live_files="$(find "$SPOOL_DIR" -maxdepth 1 -mindepth 1 \
|
||||
\( -type f -o -type l \) -name 'mainq*' -printf '%f\n')"
|
||||
if [ -n "$live_files" ]; then
|
||||
echo "FAIL: live mainq files remain after restart recovery"
|
||||
printf '%s\n' "$live_files"
|
||||
ls -l "$SPOOL_DIR"
|
||||
error_exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
write_phase1_conf
|
||||
startup
|
||||
injectmsg 0 210000
|
||||
|
||||
echo "spool files immediately before shutdown:"
|
||||
list_spool_top
|
||||
|
||||
shutdown_immediate
|
||||
./msleep 750
|
||||
|
||||
echo "spool files immediately after shutdown (before kill):"
|
||||
list_spool_top
|
||||
|
||||
. "$srcdir/diag.sh" kill-immediate
|
||||
wait_shutdown
|
||||
rm -f "$RSYSLOG_PIDBASE.pid"
|
||||
|
||||
echo "spool files after kill:"
|
||||
list_spool_top
|
||||
|
||||
if [ ! -f "$SPOOL_DIR/mainq.qi" ]; then
|
||||
echo "FAIL: .qi file does not exist after interrupted shutdown"
|
||||
error_exit 1
|
||||
fi
|
||||
|
||||
echo ".qi file contents:"
|
||||
cat "$SPOOL_DIR/mainq.qi"
|
||||
|
||||
printf 'RSYSLOG RESTART\n\n'
|
||||
write_phase2_conf
|
||||
|
||||
: > "$RSYSLOGD_LOG"
|
||||
: > "$STARTED_LOG"
|
||||
export RS_REDIR=">>\"$RSYSLOGD_LOG\" 2>&1"
|
||||
|
||||
startup
|
||||
shutdown_when_empty
|
||||
wait_shutdown
|
||||
|
||||
unset RS_REDIR
|
||||
|
||||
echo "spool files after restart recovery:"
|
||||
list_spool_top
|
||||
|
||||
check_no_live_mainq_files
|
||||
|
||||
bad_dirs="$(count_bad_dirs)"
|
||||
if [ "$bad_dirs" -gt 0 ]; then
|
||||
echo "dirty shutdown caused queue quarantine into mainq.bad.*"
|
||||
if [ -f "$RSYSLOG_OUT_LOG" ]; then
|
||||
echo "recovered output was also written before quarantine cleanup"
|
||||
fi
|
||||
exit_test
|
||||
fi
|
||||
|
||||
if [ ! -s "$RSYSLOG_OUT_LOG" ]; then
|
||||
echo "FAIL: no recovered output data gathered and no quarantine directory created"
|
||||
ls -l "$SPOOL_DIR"
|
||||
error_exit 1
|
||||
fi
|
||||
|
||||
exit_test
|
||||
@ -1,116 +0,0 @@
|
||||
#!/bin/bash
|
||||
# This test simulates the case where the OS force-terminates rsyslog
|
||||
# before it completely finishes persisting the queue to disk. Obviously,
|
||||
# there is some data loss involved, but rsyslog should try to limit it.
|
||||
# Most importantly, a .qi file needs to be written at "save" places, so that
|
||||
# at least the queue is kind of readable.
|
||||
# To simulate the error condition, we create a DA queue with a large memory
|
||||
# part and fill it via injectmsg (do NOT use tcpflood, as this would add
|
||||
# complexity of TCP window etc to the reception of messages - injectmsg is
|
||||
# synchronous, so we do not have anything in flight after it terminates).
|
||||
# We have a blocking action which prevents actual processing of any of the
|
||||
# injected messages. We then inject a large number of messages, but only
|
||||
# few above the number the memory part of the disk can hold. So the disk queue
|
||||
# begins to get used. Once injection is done, we terminate rsyslog in the
|
||||
# regular way, which will cause the memory part of the queue to be written
|
||||
# out. After a relatively short period, we kill -9 rsyslogd, so that it
|
||||
# does not have any chance to fully persists its state (this actually is
|
||||
# what happens when force-terminated by the OS).
|
||||
# Then, we check that at a minimum the .qi file exists.
|
||||
# Copyright (C) 2016 by Rainer Gerhards
|
||||
# Released under ASL 2.0
|
||||
|
||||
#uncomment the following if you want a log for step 1 of this test
|
||||
#export RSYSLOG_DEBUG="debug nologfuncflow noprintmutexaction nostdout"
|
||||
#export RSYSLOG_DEBUGLOG="log"
|
||||
. ${srcdir:=.}/diag.sh init
|
||||
generate_conf
|
||||
add_conf '
|
||||
module(load="../plugins/omtesting/.libs/omtesting")
|
||||
|
||||
# set spool locations and switch queue to disk-only mode
|
||||
global(workDirectory="'${RSYSLOG_DYNNAME}'.spool")
|
||||
main_queue(queue.filename="mainq" queue.saveonshutdown="on"
|
||||
queue.timeoutshutdown="1" queue.maxfilesize="1m"
|
||||
queue.timeoutworkerthreadshutdown="500" queue.size="200000"
|
||||
)
|
||||
|
||||
:msg, contains, "msgnum:" :omtesting:sleep 10 0
|
||||
'
|
||||
startup
|
||||
injectmsg 0 210000
|
||||
echo spool files immediately before shutdown:
|
||||
ls ${RSYSLOG_DYNNAME}.spool
|
||||
shutdown_immediate # shut down without the ability to fully persist state
|
||||
./msleep 750 # simulate an os timeout (let it run a *very short* bit, else it's done ;))
|
||||
echo spool files immediately after shutdown \(but before kill\):
|
||||
ls -l ${RSYSLOG_DYNNAME}.spool
|
||||
|
||||
|
||||
. $srcdir/diag.sh kill-immediate # do not give it sufficient time to shutdown
|
||||
wait_shutdown
|
||||
rm -f $RSYSLOG_PIDBASE.pid # as we kill, rsyslog does not itself cleanup the pid file
|
||||
|
||||
echo spool files after kill:
|
||||
ls -l ${RSYSLOG_DYNNAME}.spool
|
||||
|
||||
if [ ! -f ${RSYSLOG_DYNNAME}.spool/mainq.qi ]; then
|
||||
echo "FAIL: .qi file does not exist!"
|
||||
error_exit 1
|
||||
fi
|
||||
|
||||
echo .qi file contents:
|
||||
cat ${RSYSLOG_DYNNAME}.spool/mainq.qi
|
||||
|
||||
|
||||
# We now restart rsyslog and make sure it'll clean up the disk queue.
|
||||
# So far, we cannot reliably detect if the data is properly shuffled
|
||||
# over, but that's a moot point anyhow because we expect to loss
|
||||
# (large) amounts of the data. In later stages, we however may verify
|
||||
|
||||
#uncomment the following if you want a log for step 2 of this test
|
||||
#export RSYSLOG_DEBUG="debug nologfuncflow noprintmutexaction nostdout"
|
||||
#export RSYSLOG_DEBUGLOG="log2"
|
||||
|
||||
printf 'RSYSLOG RESTART\n\n'
|
||||
# special case: we need to preserve our dynamic settings, as generate_conf
|
||||
# overwrites them. TODO: handle this in diag.sh
|
||||
RSYSLOG_DYNNAME_SAVE="$RSYSLOG_DYNNAME"
|
||||
RSYSLOG_OUT_LOG_SAVE="$RSYSLOG_OUT_LOG"
|
||||
generate_conf
|
||||
RSYSLOG_OUT_LOG="$RSYSLOG_OUT_LOG_SAVE"
|
||||
RSYSLOG_DYNNAME="$RSYSLOG_DYNNAME_SAVE"
|
||||
add_conf '
|
||||
module(load="../plugins/omtesting/.libs/omtesting")
|
||||
|
||||
global(workDirectory="'${RSYSLOG_DYNNAME}'.spool")
|
||||
main_queue(queue.filename="mainq" queue.saveonshutdown="on"
|
||||
queue.maxfilesize="1m" # note: now regular shutdown timeout!
|
||||
queue.timeoutworkerthreadshutdown="500" queue.size="200000"
|
||||
)
|
||||
|
||||
$template outfmt,"%msg:F,58:2%\n"
|
||||
action(type="omfile" file="'$RSYSLOG_OUT_LOG'" template="outfmt")
|
||||
'
|
||||
startup
|
||||
#wait_queueempty
|
||||
#echo existing queue empty, injecting new data
|
||||
#injectmsg 1000000 1000
|
||||
shutdown_when_empty
|
||||
wait_shutdown
|
||||
|
||||
# now the spool directory must be empty
|
||||
spoolFiles=$(ls ${RSYSLOG_DYNNAME}.spool/)
|
||||
|
||||
if [[ ! -z $spoolFiles ]]; then
|
||||
echo "FAIL: spool directory is not empty!"
|
||||
ls -l ${RSYSLOG_DYNNAME}.spool
|
||||
error_exit 1
|
||||
fi
|
||||
|
||||
# check if we got at least some data
|
||||
if [ ! -f $RSYSLOG_OUT_LOG ]; then
|
||||
echo "FAIL: no output data gathered (no ${RSYSLOG_OUT_LOG})!"
|
||||
error_exit 1
|
||||
fi
|
||||
exit_test
|
||||
Loading…
x
Reference in New Issue
Block a user