mirror of
https://github.com/rsyslog/rsyslog.git
synced 2026-04-17 08:00:48 +02:00
Merge pull request #6675 from rgerhards/codex-rewrite-daqueue-dirty-shutdown
tests: rewrite dirty shutdown recovery test
This commit is contained in:
commit
fff107c0c0
@ -220,7 +220,7 @@ TESTS_DEFAULT = \
|
|||||||
da-queue-persist.sh \
|
da-queue-persist.sh \
|
||||||
daqueue-persist.sh \
|
daqueue-persist.sh \
|
||||||
daqueue-invld-qi.sh \
|
daqueue-invld-qi.sh \
|
||||||
daqueue-dirty-shutdown.sh \
|
daqueue-dirty-shutdown-recovery.sh \
|
||||||
diskq-rfc5424.sh \
|
diskq-rfc5424.sh \
|
||||||
diskqueue.sh \
|
diskqueue.sh \
|
||||||
diskqueue-fsync.sh \
|
diskqueue-fsync.sh \
|
||||||
|
|||||||
141
tests/daqueue-dirty-shutdown-recovery.sh
Executable file
141
tests/daqueue-dirty-shutdown-recovery.sh
Executable file
@ -0,0 +1,141 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Validate dirty-shutdown handling for a disk-assisted main queue.
|
||||||
|
#
|
||||||
|
# The historical version of this test assumed that a forced shutdown always
|
||||||
|
# left behind a fully recoverable queue that would later drain to an empty
|
||||||
|
# spool directory. Modern queue handling can also quarantine damaged queue
|
||||||
|
# state into mainq.bad.* when queue.onCorruption="safe" detects an
|
||||||
|
# inconsistent on-disk state. Both outcomes are valid as long as restart
|
||||||
|
# succeeds and no live mainq.* state is left behind.
|
||||||
|
# added 2016-12-01 by Rainer Gerhards
|
||||||
|
# rewritten 2026-03-30 to match current queue corruption semantics
|
||||||
|
# released under ASL 2.0
|
||||||
|
|
||||||
|
. ${srcdir:=.}/diag.sh init
|
||||||
|
|
||||||
|
SPOOL_DIR="${RSYSLOG_DYNNAME}.spool"
|
||||||
|
RSYSLOGD_LOG="${RSYSLOG_DYNNAME}.rsyslogd.log"
|
||||||
|
STARTED_LOG="${RSYSLOG_DYNNAME}.started"
|
||||||
|
|
||||||
|
write_phase1_conf() {
|
||||||
|
generate_conf
|
||||||
|
add_conf '
|
||||||
|
module(load="../plugins/omtesting/.libs/omtesting")
|
||||||
|
|
||||||
|
global(workDirectory="'"$SPOOL_DIR"'")
|
||||||
|
main_queue(
|
||||||
|
queue.filename="mainq"
|
||||||
|
queue.saveOnShutdown="on"
|
||||||
|
queue.timeoutShutdown="1"
|
||||||
|
queue.maxFileSize="1m"
|
||||||
|
queue.timeoutWorkerThreadShutdown="500"
|
||||||
|
queue.size="200000"
|
||||||
|
)
|
||||||
|
|
||||||
|
:msg, contains, "msgnum:" :omtesting:sleep 10 0
|
||||||
|
'
|
||||||
|
}
|
||||||
|
|
||||||
|
write_phase2_conf() {
|
||||||
|
generate_conf
|
||||||
|
add_conf '
|
||||||
|
module(load="../plugins/omtesting/.libs/omtesting")
|
||||||
|
|
||||||
|
global(workDirectory="'"$SPOOL_DIR"'")
|
||||||
|
main_queue(
|
||||||
|
queue.filename="mainq"
|
||||||
|
queue.saveOnShutdown="on"
|
||||||
|
queue.maxFileSize="1m"
|
||||||
|
queue.timeoutWorkerThreadShutdown="500"
|
||||||
|
queue.size="200000"
|
||||||
|
queue.onCorruption="safe"
|
||||||
|
)
|
||||||
|
|
||||||
|
template(name="outfmt" type="string" string="%msg:F,58:2%\\n")
|
||||||
|
action(type="omfile" file="'"$RSYSLOG_OUT_LOG"'" template="outfmt")
|
||||||
|
'
|
||||||
|
}
|
||||||
|
|
||||||
|
list_spool_top() {
|
||||||
|
find "$SPOOL_DIR" -maxdepth 1 -mindepth 1 -printf '%f\n' | sort
|
||||||
|
}
|
||||||
|
|
||||||
|
count_bad_dirs() {
|
||||||
|
find "$SPOOL_DIR" -maxdepth 1 -mindepth 1 -type d -name 'mainq.bad.*' | wc -l
|
||||||
|
}
|
||||||
|
|
||||||
|
check_no_live_mainq_files() {
|
||||||
|
local live_files
|
||||||
|
|
||||||
|
live_files="$(find "$SPOOL_DIR" -maxdepth 1 -mindepth 1 \
|
||||||
|
\( -type f -o -type l \) -name 'mainq*' -printf '%f\n')"
|
||||||
|
if [ -n "$live_files" ]; then
|
||||||
|
echo "FAIL: live mainq files remain after restart recovery"
|
||||||
|
printf '%s\n' "$live_files"
|
||||||
|
ls -l "$SPOOL_DIR"
|
||||||
|
error_exit 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
write_phase1_conf
|
||||||
|
startup
|
||||||
|
injectmsg 0 210000
|
||||||
|
|
||||||
|
echo "spool files immediately before shutdown:"
|
||||||
|
list_spool_top
|
||||||
|
|
||||||
|
shutdown_immediate
|
||||||
|
./msleep 750
|
||||||
|
|
||||||
|
echo "spool files immediately after shutdown (before kill):"
|
||||||
|
list_spool_top
|
||||||
|
|
||||||
|
. "$srcdir/diag.sh" kill-immediate
|
||||||
|
wait_shutdown
|
||||||
|
rm -f "$RSYSLOG_PIDBASE.pid"
|
||||||
|
|
||||||
|
echo "spool files after kill:"
|
||||||
|
list_spool_top
|
||||||
|
|
||||||
|
if [ ! -f "$SPOOL_DIR/mainq.qi" ]; then
|
||||||
|
echo "FAIL: .qi file does not exist after interrupted shutdown"
|
||||||
|
error_exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo ".qi file contents:"
|
||||||
|
cat "$SPOOL_DIR/mainq.qi"
|
||||||
|
|
||||||
|
printf 'RSYSLOG RESTART\n\n'
|
||||||
|
write_phase2_conf
|
||||||
|
|
||||||
|
: > "$RSYSLOGD_LOG"
|
||||||
|
: > "$STARTED_LOG"
|
||||||
|
export RS_REDIR=">>\"$RSYSLOGD_LOG\" 2>&1"
|
||||||
|
|
||||||
|
startup
|
||||||
|
shutdown_when_empty
|
||||||
|
wait_shutdown
|
||||||
|
|
||||||
|
unset RS_REDIR
|
||||||
|
|
||||||
|
echo "spool files after restart recovery:"
|
||||||
|
list_spool_top
|
||||||
|
|
||||||
|
check_no_live_mainq_files
|
||||||
|
|
||||||
|
bad_dirs="$(count_bad_dirs)"
|
||||||
|
if [ "$bad_dirs" -gt 0 ]; then
|
||||||
|
echo "dirty shutdown caused queue quarantine into mainq.bad.*"
|
||||||
|
if [ -f "$RSYSLOG_OUT_LOG" ]; then
|
||||||
|
echo "recovered output was also written before quarantine cleanup"
|
||||||
|
fi
|
||||||
|
exit_test
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -s "$RSYSLOG_OUT_LOG" ]; then
|
||||||
|
echo "FAIL: no recovered output data gathered and no quarantine directory created"
|
||||||
|
ls -l "$SPOOL_DIR"
|
||||||
|
error_exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
exit_test
|
||||||
@ -1,116 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# This test simulates the case where the OS force-terminates rsyslog
|
|
||||||
# before it completely finishes persisting the queue to disk. Obviously,
|
|
||||||
# there is some data loss involved, but rsyslog should try to limit it.
|
|
||||||
# Most importantly, a .qi file needs to be written at "save" places, so that
|
|
||||||
# at least the queue is kind of readable.
|
|
||||||
# To simulate the error condition, we create a DA queue with a large memory
|
|
||||||
# part and fill it via injectmsg (do NOT use tcpflood, as this would add
|
|
||||||
# complexity of TCP window etc to the reception of messages - injectmsg is
|
|
||||||
# synchronous, so we do not have anything in flight after it terminates).
|
|
||||||
# We have a blocking action which prevents actual processing of any of the
|
|
||||||
# injected messages. We then inject a large number of messages, but only
|
|
||||||
# few above the number the memory part of the disk can hold. So the disk queue
|
|
||||||
# begins to get used. Once injection is done, we terminate rsyslog in the
|
|
||||||
# regular way, which will cause the memory part of the queue to be written
|
|
||||||
# out. After a relatively short period, we kill -9 rsyslogd, so that it
|
|
||||||
# does not have any chance to fully persists its state (this actually is
|
|
||||||
# what happens when force-terminated by the OS).
|
|
||||||
# Then, we check that at a minimum the .qi file exists.
|
|
||||||
# Copyright (C) 2016 by Rainer Gerhards
|
|
||||||
# Released under ASL 2.0
|
|
||||||
|
|
||||||
#uncomment the following if you want a log for step 1 of this test
|
|
||||||
#export RSYSLOG_DEBUG="debug nologfuncflow noprintmutexaction nostdout"
|
|
||||||
#export RSYSLOG_DEBUGLOG="log"
|
|
||||||
. ${srcdir:=.}/diag.sh init
|
|
||||||
generate_conf
|
|
||||||
add_conf '
|
|
||||||
module(load="../plugins/omtesting/.libs/omtesting")
|
|
||||||
|
|
||||||
# set spool locations and switch queue to disk-only mode
|
|
||||||
global(workDirectory="'${RSYSLOG_DYNNAME}'.spool")
|
|
||||||
main_queue(queue.filename="mainq" queue.saveonshutdown="on"
|
|
||||||
queue.timeoutshutdown="1" queue.maxfilesize="1m"
|
|
||||||
queue.timeoutworkerthreadshutdown="500" queue.size="200000"
|
|
||||||
)
|
|
||||||
|
|
||||||
:msg, contains, "msgnum:" :omtesting:sleep 10 0
|
|
||||||
'
|
|
||||||
startup
|
|
||||||
injectmsg 0 210000
|
|
||||||
echo spool files immediately before shutdown:
|
|
||||||
ls ${RSYSLOG_DYNNAME}.spool
|
|
||||||
shutdown_immediate # shut down without the ability to fully persist state
|
|
||||||
./msleep 750 # simulate an os timeout (let it run a *very short* bit, else it's done ;))
|
|
||||||
echo spool files immediately after shutdown \(but before kill\):
|
|
||||||
ls -l ${RSYSLOG_DYNNAME}.spool
|
|
||||||
|
|
||||||
|
|
||||||
. $srcdir/diag.sh kill-immediate # do not give it sufficient time to shutdown
|
|
||||||
wait_shutdown
|
|
||||||
rm -f $RSYSLOG_PIDBASE.pid # as we kill, rsyslog does not itself cleanup the pid file
|
|
||||||
|
|
||||||
echo spool files after kill:
|
|
||||||
ls -l ${RSYSLOG_DYNNAME}.spool
|
|
||||||
|
|
||||||
if [ ! -f ${RSYSLOG_DYNNAME}.spool/mainq.qi ]; then
|
|
||||||
echo "FAIL: .qi file does not exist!"
|
|
||||||
error_exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo .qi file contents:
|
|
||||||
cat ${RSYSLOG_DYNNAME}.spool/mainq.qi
|
|
||||||
|
|
||||||
|
|
||||||
# We now restart rsyslog and make sure it'll clean up the disk queue.
|
|
||||||
# So far, we cannot reliably detect if the data is properly shuffled
|
|
||||||
# over, but that's a moot point anyhow because we expect to loss
|
|
||||||
# (large) amounts of the data. In later stages, we however may verify
|
|
||||||
|
|
||||||
#uncomment the following if you want a log for step 2 of this test
|
|
||||||
#export RSYSLOG_DEBUG="debug nologfuncflow noprintmutexaction nostdout"
|
|
||||||
#export RSYSLOG_DEBUGLOG="log2"
|
|
||||||
|
|
||||||
printf 'RSYSLOG RESTART\n\n'
|
|
||||||
# special case: we need to preserve our dynamic settings, as generate_conf
|
|
||||||
# overwrites them. TODO: handle this in diag.sh
|
|
||||||
RSYSLOG_DYNNAME_SAVE="$RSYSLOG_DYNNAME"
|
|
||||||
RSYSLOG_OUT_LOG_SAVE="$RSYSLOG_OUT_LOG"
|
|
||||||
generate_conf
|
|
||||||
RSYSLOG_OUT_LOG="$RSYSLOG_OUT_LOG_SAVE"
|
|
||||||
RSYSLOG_DYNNAME="$RSYSLOG_DYNNAME_SAVE"
|
|
||||||
add_conf '
|
|
||||||
module(load="../plugins/omtesting/.libs/omtesting")
|
|
||||||
|
|
||||||
global(workDirectory="'${RSYSLOG_DYNNAME}'.spool")
|
|
||||||
main_queue(queue.filename="mainq" queue.saveonshutdown="on"
|
|
||||||
queue.maxfilesize="1m" # note: now regular shutdown timeout!
|
|
||||||
queue.timeoutworkerthreadshutdown="500" queue.size="200000"
|
|
||||||
)
|
|
||||||
|
|
||||||
$template outfmt,"%msg:F,58:2%\n"
|
|
||||||
action(type="omfile" file="'$RSYSLOG_OUT_LOG'" template="outfmt")
|
|
||||||
'
|
|
||||||
startup
|
|
||||||
#wait_queueempty
|
|
||||||
#echo existing queue empty, injecting new data
|
|
||||||
#injectmsg 1000000 1000
|
|
||||||
shutdown_when_empty
|
|
||||||
wait_shutdown
|
|
||||||
|
|
||||||
# now the spool directory must be empty
|
|
||||||
spoolFiles=$(ls ${RSYSLOG_DYNNAME}.spool/)
|
|
||||||
|
|
||||||
if [[ ! -z $spoolFiles ]]; then
|
|
||||||
echo "FAIL: spool directory is not empty!"
|
|
||||||
ls -l ${RSYSLOG_DYNNAME}.spool
|
|
||||||
error_exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# check if we got at least some data
|
|
||||||
if [ ! -f $RSYSLOG_OUT_LOG ]; then
|
|
||||||
echo "FAIL: no output data gathered (no ${RSYSLOG_OUT_LOG})!"
|
|
||||||
error_exit 1
|
|
||||||
fi
|
|
||||||
exit_test
|
|
||||||
Loading…
x
Reference in New Issue
Block a user