mirror of
https://github.com/rsyslog/rsyslog.git
synced 2025-12-20 10:50:41 +01:00
344 lines
8.3 KiB
C
344 lines
8.3 KiB
C
/* mmutf8fix.c
|
|
* fix invalid UTF8 sequences. This is begun as a very simple replacer
|
|
* of non-control characters, and actually breaks some UTF-8 encoding
|
|
* right now. If the module turns out to be useful, it should be enhanced
|
|
* to support modes that really detect invalid UTF8. In the longer term
|
|
* it could also be evolved into an any-charset-to-UTF8 converter. But
|
|
* first let's see if it really gets into widespread enough use.
|
|
*
|
|
* Copyright 2013-2016 Adiscon GmbH.
|
|
*
|
|
* This file is part of rsyslog.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
* -or-
|
|
* see COPYING.ASL20 in the source distribution
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
#include "config.h"
|
|
#include "rsyslog.h"
|
|
#include <stdio.h>
|
|
#include <stdarg.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <assert.h>
|
|
#include <signal.h>
|
|
#include <errno.h>
|
|
#include <unistd.h>
|
|
#include <stdint.h>
|
|
#include "conf.h"
|
|
#include "syslogd-types.h"
|
|
#include "srUtils.h"
|
|
#include "template.h"
|
|
#include "module-template.h"
|
|
#include "errmsg.h"
|
|
|
|
MODULE_TYPE_OUTPUT
|
|
MODULE_TYPE_NOKEEP
|
|
MODULE_CNFNAME("mmutf8fix")
|
|
|
|
|
|
DEFobjCurrIf(errmsg);
|
|
DEF_OMOD_STATIC_DATA
|
|
|
|
/* define operation modes we have */
|
|
#define MODE_CC 0 /* just fix control characters */
|
|
#define MODE_UTF8 1 /* do real UTF-8 fixing */
|
|
|
|
/* config variables */
|
|
typedef struct _instanceData {
|
|
uchar replChar;
|
|
uint8_t mode; /* operations mode */
|
|
} instanceData;
|
|
|
|
typedef struct wrkrInstanceData {
|
|
instanceData *pData;
|
|
} wrkrInstanceData_t;
|
|
|
|
struct modConfData_s {
|
|
rsconf_t *pConf; /* our overall config object */
|
|
};
|
|
static modConfData_t *loadModConf = NULL;/* modConf ptr to use for the current load process */
|
|
static modConfData_t *runModConf = NULL;/* modConf ptr to use for the current exec process */
|
|
|
|
|
|
/* tables for interfacing with the v6 config system */
|
|
/* action (instance) parameters */
|
|
static struct cnfparamdescr actpdescr[] = {
|
|
{ "mode", eCmdHdlrGetWord, 0 },
|
|
{ "replacementchar", eCmdHdlrGetChar, 0 }
|
|
};
|
|
static struct cnfparamblk actpblk =
|
|
{ CNFPARAMBLK_VERSION,
|
|
sizeof(actpdescr)/sizeof(struct cnfparamdescr),
|
|
actpdescr
|
|
};
|
|
|
|
BEGINbeginCnfLoad
|
|
CODESTARTbeginCnfLoad
|
|
loadModConf = pModConf;
|
|
pModConf->pConf = pConf;
|
|
ENDbeginCnfLoad
|
|
|
|
BEGINendCnfLoad
|
|
CODESTARTendCnfLoad
|
|
ENDendCnfLoad
|
|
|
|
BEGINcheckCnf
|
|
CODESTARTcheckCnf
|
|
ENDcheckCnf
|
|
|
|
BEGINactivateCnf
|
|
CODESTARTactivateCnf
|
|
runModConf = pModConf;
|
|
ENDactivateCnf
|
|
|
|
BEGINfreeCnf
|
|
CODESTARTfreeCnf
|
|
ENDfreeCnf
|
|
|
|
|
|
BEGINcreateInstance
|
|
CODESTARTcreateInstance
|
|
ENDcreateInstance
|
|
|
|
|
|
BEGINcreateWrkrInstance
|
|
CODESTARTcreateWrkrInstance
|
|
ENDcreateWrkrInstance
|
|
|
|
|
|
BEGINisCompatibleWithFeature
|
|
CODESTARTisCompatibleWithFeature
|
|
ENDisCompatibleWithFeature
|
|
|
|
|
|
BEGINfreeInstance
|
|
CODESTARTfreeInstance
|
|
ENDfreeInstance
|
|
|
|
|
|
BEGINfreeWrkrInstance
|
|
CODESTARTfreeWrkrInstance
|
|
ENDfreeWrkrInstance
|
|
|
|
|
|
static inline void
|
|
setInstParamDefaults(instanceData *pData)
|
|
{
|
|
pData->mode = MODE_UTF8;
|
|
pData->replChar = ' ';
|
|
}
|
|
|
|
BEGINnewActInst
|
|
struct cnfparamvals *pvals;
|
|
int i;
|
|
CODESTARTnewActInst
|
|
DBGPRINTF("newActInst (mmutf8fix)\n");
|
|
if((pvals = nvlstGetParams(lst, &actpblk, NULL)) == NULL) {
|
|
ABORT_FINALIZE(RS_RET_MISSING_CNFPARAMS);
|
|
}
|
|
|
|
CODE_STD_STRING_REQUESTnewActInst(1)
|
|
CHKiRet(OMSRsetEntry(*ppOMSR, 0, NULL, OMSR_TPL_AS_MSG));
|
|
CHKiRet(createInstance(&pData));
|
|
setInstParamDefaults(pData);
|
|
|
|
for(i = 0 ; i < actpblk.nParams ; ++i) {
|
|
if(!pvals[i].bUsed)
|
|
continue;
|
|
if(!strcmp(actpblk.descr[i].name, "mode")) {
|
|
if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"utf-8",
|
|
sizeof("utf-8")-1)) {
|
|
pData->mode = MODE_UTF8;
|
|
} else if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"controlcharacters",
|
|
sizeof("controlcharacters")-1)) {
|
|
pData->mode = MODE_CC;
|
|
} else {
|
|
char *cstr = es_str2cstr(pvals[i].val.d.estr, NULL);
|
|
errmsg.LogError(0, RS_RET_INVLD_MODE,
|
|
"mmutf8fix: invalid mode '%s' - ignored",
|
|
cstr);
|
|
free(cstr);
|
|
}
|
|
} else if(!strcmp(actpblk.descr[i].name, "replacementchar")) {
|
|
pData->replChar = es_getBufAddr(pvals[i].val.d.estr)[0];
|
|
} else {
|
|
dbgprintf("mmutf8fix: program error, non-handled "
|
|
"param '%s'\n", actpblk.descr[i].name);
|
|
}
|
|
}
|
|
|
|
CODE_STD_FINALIZERnewActInst
|
|
cnfparamvalsDestruct(pvals, &actpblk);
|
|
ENDnewActInst
|
|
|
|
|
|
BEGINdbgPrintInstInfo
|
|
CODESTARTdbgPrintInstInfo
|
|
ENDdbgPrintInstInfo
|
|
|
|
|
|
BEGINtryResume
|
|
CODESTARTtryResume
|
|
ENDtryResume
|
|
|
|
|
|
static void
|
|
doCC(instanceData *pData, uchar *msg, int lenMsg)
|
|
{
|
|
int i;
|
|
|
|
for(i = 0 ; i < lenMsg ; ++i) {
|
|
if(msg[i] < 32 || msg[i] > 126) {
|
|
msg[i] = pData->replChar;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* fix an invalid multibyte sequence */
|
|
static void
|
|
fixInvldMBSeq(instanceData *pData, uchar *msg, int lenMsg, int strtIdx, int *endIdx, int8_t seqLen)
|
|
{
|
|
int i;
|
|
|
|
/* startIdx and seqLen always set if bytesLeft is set,
|
|
which is required before this function is called */
|
|
*endIdx = strtIdx + seqLen;
|
|
if(*endIdx > lenMsg)
|
|
*endIdx = lenMsg;
|
|
for(i = strtIdx ; i < *endIdx ; ++i)
|
|
msg[i] = pData->replChar;
|
|
}
|
|
|
|
static void
|
|
doUTF8(instanceData *pData, uchar *msg, int lenMsg)
|
|
{
|
|
uchar c;
|
|
int8_t seqLen = 0, bytesLeft = 0;
|
|
uint32_t codepoint;
|
|
int strtIdx = 0, endIdx = 0;
|
|
int i;
|
|
|
|
for(i = 0 ; i < lenMsg ; ++i) {
|
|
c = msg[i];
|
|
if(bytesLeft) {
|
|
if((c & 0xc0) != 0x80) {
|
|
/* sequence invalid, invalidate all bytes
|
|
startIdx is always set if bytesLeft is set */
|
|
fixInvldMBSeq(pData, msg, lenMsg, strtIdx, &endIdx,
|
|
seqLen);
|
|
i = endIdx - 1;
|
|
bytesLeft = 0;
|
|
} else {
|
|
codepoint = (codepoint << 6) | (c & 0x3f);
|
|
--bytesLeft;
|
|
if(bytesLeft == 0) {
|
|
/* too-large codepoint? */
|
|
if(codepoint > 0x10FFFF) {
|
|
/* sequence invalid, invalidate all bytes
|
|
startIdx is always set if bytesLeft is set */
|
|
fixInvldMBSeq(pData, msg, lenMsg,
|
|
strtIdx, &endIdx,
|
|
seqLen);
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
if((c & 0x80) == 0) {
|
|
/* 1-byte sequence, US-ASCII */
|
|
; /* nothing to do, all well */
|
|
} else if((c & 0xe0) == 0xc0) {
|
|
/* 2-byte sequence */
|
|
/* 0xc0 and 0xc1 are illegal */
|
|
if(c == 0xc0 || c == 0xc1) {
|
|
msg[i] = pData->replChar;
|
|
} else {
|
|
strtIdx = i;
|
|
seqLen = bytesLeft = 1;
|
|
codepoint = c & 0x1f;
|
|
}
|
|
} else if((c & 0xf0) == 0xe0) {
|
|
/* 3-byte sequence */
|
|
strtIdx = i;
|
|
seqLen = bytesLeft = 2;
|
|
codepoint = c & 0x0f;
|
|
} else if((c & 0xf8) == 0xf0) {
|
|
/* 4-byte sequence */
|
|
strtIdx = i;
|
|
seqLen = bytesLeft = 3;
|
|
codepoint = c & 0x07;
|
|
} else { /* invalid (5&6 byte forbidden by RFC3629) */
|
|
msg[i] = pData->replChar;
|
|
}
|
|
if(i+bytesLeft >= lenMsg) {
|
|
int dummy = lenMsg;
|
|
/* invalid, as rest of message cannot contain full char */
|
|
fixInvldMBSeq(pData, msg, lenMsg, strtIdx, &dummy, seqLen);
|
|
i = lenMsg - 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
BEGINdoAction_NoStrings
|
|
smsg_t **ppMsg = (smsg_t **) pMsgData;
|
|
smsg_t *pMsg = ppMsg[0];
|
|
uchar *msg;
|
|
int lenMsg;
|
|
CODESTARTdoAction
|
|
lenMsg = getMSGLen(pMsg);
|
|
msg = getMSG(pMsg);
|
|
if(pWrkrData->pData->mode == MODE_CC) {
|
|
doCC(pWrkrData->pData, msg, lenMsg);
|
|
} else {
|
|
doUTF8(pWrkrData->pData, msg, lenMsg);
|
|
}
|
|
ENDdoAction
|
|
|
|
|
|
BEGINparseSelectorAct
|
|
CODESTARTparseSelectorAct
|
|
CODE_STD_STRING_REQUESTparseSelectorAct(1)
|
|
if(strncmp((char*) p, ":mmutf8fix:", sizeof(":mmutf8fix:") - 1)) {
|
|
errmsg.LogError(0, RS_RET_LEGA_ACT_NOT_SUPPORTED,
|
|
"mmutf8fix supports only v6+ config format, use: "
|
|
"action(type=\"mmutf8fix\" ...)");
|
|
}
|
|
ABORT_FINALIZE(RS_RET_CONFLINE_UNPROCESSED);
|
|
CODE_STD_FINALIZERparseSelectorAct
|
|
ENDparseSelectorAct
|
|
|
|
|
|
BEGINmodExit
|
|
CODESTARTmodExit
|
|
objRelease(errmsg, CORE_COMPONENT);
|
|
ENDmodExit
|
|
|
|
|
|
BEGINqueryEtryPt
|
|
CODESTARTqueryEtryPt
|
|
CODEqueryEtryPt_STD_OMOD_QUERIES
|
|
CODEqueryEtryPt_STD_OMOD8_QUERIES
|
|
CODEqueryEtryPt_STD_CONF2_OMOD_QUERIES
|
|
CODEqueryEtryPt_STD_CONF2_QUERIES
|
|
ENDqueryEtryPt
|
|
|
|
|
|
BEGINmodInit()
|
|
CODESTARTmodInit
|
|
*ipIFVersProvided = CURR_MOD_IF_VERSION; /* we only support the current interface specification */
|
|
CODEmodInit_QueryRegCFSLineHdlr
|
|
DBGPRINTF("mmutf8fix: module compiled with rsyslog version %s.\n", VERSION);
|
|
CHKiRet(objUse(errmsg, CORE_COMPONENT));
|
|
ENDmodInit
|