/* syncdemo - a program to demonstrate the performance and validity of different
* synchronization methods as well as some timing properties.
*
* The task to be done is very simple: a single global integer is to be incremented
* by multiple threads. All this is done in a very-high-concurrency environment. Note that
* the test is unfair to mechanisms like spinlocks, because we have almost only wait
* time but no real processing time between the waits. However, the test provides
* some good insight into atomic instructions vs. other synchronisation methods.
* It also proves that garbling variables by not doing proper synchronisation is
* highly likely. For best results, this program should be executed on a
* multiprocessor machine (on a uniprocessor, it will probably not display the
* problems caused by missing synchronisation).
*
* Note: partitioned processing mode means that all computation is first done
* locally and the final result is then combined using proper synchronization.
* This mode is used as a baseline for uninterrupted processing.
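* (In code terms: in "part" mode each thread increments a thread-local counter and
* adds it to global_int under the mutex exactly once, instead of synchronizing
* every single increment.)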
*
* compile with $ gcc -O1 -o syncdemo syncdemo.c -lpthread
*
* Alternatively, you may use -O0, but not a higher level. Note that
* in neither case does the gcc code generator produce code that is really
* suitable for comparing the "part" and "none" modes. If you absolutely need
* to do that, you need to use inline assembly. However, the results should
* be fairly OK when consistently using either -O0 or -O1. If you see a big loss
* of performance when you compare "none" and "part", be sure to run
* "none" with -t1 and watch out for the results! In any case, looking at the generated
* assembly code is vital to interpret results correctly. Review of the generated assembly
* done on 2010-05-05 indicates that -O0 is probably the best choice. Note that we
* use the volatile attribute in one spot. This is done because it results in the
* most comparable result for our gcc 4.4.3, not really to invoke the volatile semantics.
*
* use "gcc -g -Wa,-ahl=syncdemo.s -lpthread syncdemo.c" to obtain a mixed code/assembly listing.
*
* This program REQUIRES Linux. With slight modification, it may run on Solaris.
* Note that gcc on SPARC does NOT offer atomic instruction support!
*
* Copyright (C) 2010 by Rainer Gerhards <rgerhards@hq.adiscon.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* -or-
* see COPYING.ASL20 in the source distribution
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Inspired by (retrieved 2010-04-13)
* http://www.alexonlinux.com/multithreaded-simple-data-type-access-and-atomic-variables
*/
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <semaphore.h>
#include <stdlib.h>
#include <linux/unistd.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <errno.h>
#include <string.h> /* for strcmp() */
#include <getopt.h>
typedef enum { part, none, atomic, cas, spinlock, mutex, semaphore } syncType_t;
static syncType_t syncTypes[] = { part, none, atomic, cas, spinlock, mutex, semaphore };
/* config settings */
static int bCPUAffinity = 0;
static int procs = 0; /* number of processors */
static int numthrds = 0; /* if zero, => equal num of processors */
static unsigned goal = 50000000; /* 50 million */
static int bCSV = 0; /* generate CSV output? */
static int numIterations = 1; /* number of iterations */
static int dummyLoad = 0; /* number of dummy load iterations to generate */
syncType_t syncType;
static int bAllSyncTypes = 0;
static int global_int = 0; /* our global counter */
static unsigned thrd_WorkToDo; /* number of computations each thread must do */
static volatile int bStartRun = 0; /* flag to indicate when threads should start */
static struct timeval tvStart, tvEnd; /* used for timing one testing iteration */
/* statistic counters */
static long long totalRuntime;
static unsigned minRuntime = 999999999;
static unsigned maxRuntime = 0;
/* sync objects (if needed) */
static pthread_mutex_t mut;
static pthread_spinlock_t spin;
static sem_t sem;
static char*
getSyncMethName(syncType_t st)
{
switch(st) {
case part : return "partition";
case none : return "none";
case atomic : return "atomic op";
case spinlock : return "spin lock";
case mutex : return "mutex";
case semaphore: return "semaphore";
case cas : return "cas";
}
return "unknown"; /* not reached for valid enum values; silences compiler warnings */
}
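/* glibc did not traditionally expose a gettid() wrapper, so we invoke the raw
* syscall; the thread id is later passed to sched_setaffinity(). Note that
* recent glibc versions do declare gettid() in unistd.h, which may clash with
* this definition on newer systems.
*/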
static pid_t
gettid()
{
return syscall( __NR_gettid );
}
void *workerThread( void *arg )
{
int i, j;
volatile int partval = 0; /* use volatile so that gcc generates code similar to global var */
int *partptr;
int oldval, newval; /* for CAS sync mode */
int thrd_num = (int)(long)arg;
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(thrd_num % procs, &set);
if(syncType == part) {
partval = 0;
}
/* if enabled, try to put the thread on a fixed CPU (the one that corresponds to the
* thread ID). This may affect both performance and the reproducibility of results.
*/
if(bCPUAffinity) {
if (sched_setaffinity( gettid(), sizeof( cpu_set_t ), &set )) {
perror( "sched_setaffinity" );
return NULL;
}
}
/* busy-wait for the "go" signal; bStartRun is volatile, so the load is redone
* on every pass and the loop is not optimized away.
*/
while(bStartRun == 0)
/*WAIT!*/;
for (i = 0; i < thrd_WorkToDo; i++) {
switch(syncType) {
case part:
///* one needs to use inline assembly to get this right... */
//asm("addl $1, global_int(%rip)");
partval++;
break;
case none:
global_int++;
break;
case atomic:
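/* gcc's legacy __sync builtin: an atomic read-modify-write with a full memory barrier */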
__sync_fetch_and_add(&global_int,1);
break;
case cas:
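/* optimistic update: read the current value, compute the successor, and retry
* if another thread modified global_int in between.
*/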
do {
oldval = global_int;
newval = oldval + 1;
} while(!__sync_bool_compare_and_swap(&global_int, oldval, newval));
break;
case mutex:
pthread_mutex_lock(&mut);
global_int++;
pthread_mutex_unlock(&mut);
break;
case spinlock:
pthread_spin_lock(&spin);
global_int++;
pthread_spin_unlock(&spin);
break;
case semaphore:
sem_wait(&sem);
global_int++;
sem_post(&sem);
break;
}
/* we now generate "dummy load" if instructed to do so. The idea is that
* we do some other work, as in real life, so that we have a better
* ratio of sync vs. actual work to do.
*/
for(j = 0 ; j < dummyLoad ; ++j) {
/* be careful: compiler may optimize loop out! */;
}
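/* note: at -O1 the compiler may remove the empty dummy-load loop entirely; if that
* matters for a measurement, making j volatile (or inserting a compiler barrier)
* would be one way to keep it, at the cost of different code generation.
*/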
}
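/* in partitioned mode, each thread publishes its private counter exactly once,
* under the mutex, after all local work is done.
*/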
if(syncType == part) {
pthread_mutex_lock(&mut);
global_int += partval;
pthread_mutex_unlock(&mut);
}
return NULL;
}
static void beginTiming(void)
{
if(!(bCSV || bAllSyncTypes)) {
printf("Test Parameters:\n");
printf("\tNumber of Cores.........: %d\n", procs);
printf("\tNumber of Threads.......: %d\n", numthrds);
printf("\tSet Affinity............: %s\n", bCPUAffinity ? "yes" : "no");
printf("\tCount to................: %u\n", goal);
printf("\tWork for each Thread....: %u\n", thrd_WorkToDo);
printf("\tDummy Load Counter......: %d\n", dummyLoad);
printf("\tSync Method used........: %s\n", getSyncMethName(syncType));
}
gettimeofday(&tvStart, NULL);
}
static void endTiming(void)
{
unsigned delta;
long sec, usec;
long runtime;
gettimeofday(&tvEnd, NULL);
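/* normalize the end time: if the start's microseconds exceed the end's, borrow
* one second so the subtraction below yields non-negative microseconds.
*/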
if(tvStart.tv_usec > tvEnd.tv_usec) {
tvEnd.tv_sec--;
tvEnd.tv_usec += 1000000;
}
sec = tvEnd.tv_sec - tvStart.tv_sec;
usec = tvEnd.tv_usec - tvStart.tv_usec;
delta = thrd_WorkToDo * numthrds - global_int;
if(!bAllSyncTypes) {
if(bCSV) {
printf("%s,%d,%d,%d,%u,%u,%ld.%06.6ld\n",
getSyncMethName(syncType), procs, numthrds, bCPUAffinity, goal, delta, sec, usec);
} else {
printf("measured (sytem time) runtime is %ld.% 6.6ld seconds\n", sec, usec);
if(delta == 0) {
printf("Computation was done correctly.\n");
} else {
printf("Computation INCORRECT,\n"
"\texpected %9u\n"
"\treal %9u\n"
"\toff by %9u\n",
thrd_WorkToDo * numthrds,
global_int,
delta);
}
}
}
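/* convert this iteration's runtime to integral milliseconds for the min/avg/max
* statistics reported by doTest().
*/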
runtime = sec * 1000 + (usec / 1000);
totalRuntime += runtime;
if(runtime < minRuntime)
minRuntime = runtime;
if(runtime > maxRuntime)
maxRuntime = runtime;
}
static void
usage(void)
{
fprintf(stderr, "Usage: syncdemo -a -c<num> -t<num>\n");
fprintf(stderr, "\t-a set CPU affinity\n");
fprintf(stderr, "\t-i number of iterations\n");
fprintf(stderr, "\t-c<num> count to <num>\n");
fprintf(stderr, "\t-d<num> dummy load, <num> iterations\n");
fprintf(stderr, "\t-t<num> number of threads to use\n");
fprintf(stderr, "\t-s<type> sync-type to use (part, none, atomic, cas, mutex, spin, semaphore)\n");
fprintf(stderr, "\t-C generate CSV output\n");
fprintf(stderr, "\t-A test ALL sync types\n");
exit(2);
}
/* carry out the actual test (one iteration)
*/
static void
singleTest(void)
{
int i;
pthread_t *thrs;
global_int = 0;
bStartRun = 0;
thrs = malloc(sizeof(pthread_t) * numthrds);
if (thrs == NULL) {
perror( "malloc" );
exit(1);
}
thrd_WorkToDo = goal / numthrds;
for (i = 0; i < numthrds; i++) {
if(pthread_create( &thrs[i], NULL, workerThread, (void *)(long)i )) {
perror( "pthread_create" );
numthrds = i; /* join only the threads that were actually created */
break;
}
}
beginTiming();
bStartRun = 1; /* start the threads (they are busy-waiting so far!) */
for (i = 0; i < numthrds; i++)
pthread_join( thrs[i], NULL );
endTiming();
free( thrs );
}
/* display an unsigned ms runtime count as string. Note that the
* string is inside a dynamically allocated buffer, which the caller
* must free to prevent a memory leak.
*/
char *
dispRuntime(unsigned rt)
{
char *fmtbuf;
fmtbuf = malloc(32);
snprintf(fmtbuf, 32, "%u.%03u",
rt / 1000, rt % 1000);
return(fmtbuf);
}
static void
doTest(syncType_t st)
{
int i;
syncType = st;
totalRuntime = 0;
minRuntime = 999999999;
maxRuntime = 0;
for(i = 0 ; i < numIterations ; ++i) {
//printf("starting iteration %d\n", i);
singleTest();
}
/* we have a memory leak due to calling dispRuntime(), but we don't
* care as we terminate immediately.
*/
printf("%-10s: total runtime %6ld.%3.3u, avg %s, min %s, max %s\n",
getSyncMethName(st),
(long)totalRuntime/1000, (unsigned)(totalRuntime % 1000),
dispRuntime((unsigned) (totalRuntime / numIterations)),
dispRuntime(minRuntime),
dispRuntime(maxRuntime));
}
int
main(int argc, char *argv[])
{
int i;
int opt;
while((opt = getopt(argc, argv, "ac:d:i:t:s:CA")) != -1) {
switch((char)opt) {
case 'A':
bAllSyncTypes = 1;
break;
case 'a':
bCPUAffinity = 1;
break;
case 'c':
goal = (unsigned) atol(optarg);
break;
case 'd':
dummyLoad = atoi(optarg);
break;
case 'i':
numIterations = atoi(optarg);
break;
case 't':
numthrds = atoi(optarg);
break;
case 'C':
bCSV = 1;
break;
case 's':
if(!strcmp(optarg, "none"))
syncType = none;
else if(!strcmp(optarg, "part"))
syncType = part;
else if(!strcmp(optarg, "atomic"))
syncType = atomic;
else if(!strcmp(optarg, "cas"))
syncType = cas;
else if(!strcmp(optarg, "mutex")) {
syncType = mutex;
pthread_mutex_init(&mut, NULL);
} else if(!strcmp(optarg, "spin")) {
syncType = spinlock;
} else if(!strcmp(optarg, "semaphore")) {
syncType = semaphore;
sem_init(&sem, 0, 1);
} else {
fprintf(stderr, "error: invalid sync mode '%s'\n", optarg);
usage();
}
break;
default:usage();
break;
}
}
/* for simplicity, we init all sync helpers, whether or not we actually need them */
pthread_mutex_init(&mut, NULL);
pthread_spin_init(&spin, PTHREAD_PROCESS_PRIVATE);
sem_init(&sem, 0, 1);
/* Getting number of CPUs */
procs = (int)sysconf(_SC_NPROCESSORS_ONLN);
if(procs < 0) {
perror("sysconf");
return -1;
}
if(numthrds < 1) {
numthrds = procs;
}
if(bAllSyncTypes) {
for(i = 0 ; i < sizeof(syncTypes) / sizeof(syncType_t) ; ++i) {
doTest(syncTypes[i]);
}
printf("Done running tests, result based on:\n");
printf("\tNumber of Cores.........: %d\n", procs);
printf("\tNumber of Threads.......: %d\n", numthrds);
printf("\tSet CPU Affinity........: %s\n", bCPUAffinity ? "yes" : "no");
printf("\tCount to................: %u\n", goal);
printf("\tWork for each Thread....: %u\n", thrd_WorkToDo);
printf("\tDummy Load Counter......: %d\n", dummyLoad);
printf("\tIterations..............: %d\n", numIterations);
} else {
doTest(syncType);
}
return 0;
}