rsyslog/tools/syncdemo.c
Rainer Gerhards 89b387c7a5 relicense auxiluary file
it's not even a real rsyslog part...
I am the sole author.
2014-05-27 13:03:47 +02:00

454 lines
12 KiB
C

/* syncdemo - a program to demonstrate the performance and validity of different
* synchronization methods as well as some timing properties.
*
 * The task to be done is very simple: a single global integer is to be incremented
* by multiple threads. All this is done in a very-high concurrency environment. Note that
 * the test is unfair to mechanisms like spinlocks, because we have almost only wait
* time but no real processing time between the waits. However, the test provides
* some good insight into atomic instructions vs. other synchronisation methods.
* It also proves that garbling variables by not doing proper synchronisation is
* highly likely. For best results, this program should be executed on a
* multiprocessor machine (on a uniprocessor, it will probably not display the
* problems caused by missing synchronisation).
*
* Note: partitioned processing mode means that all computation is first done
* locally and the final result is then combined doing proper synchronization.
* This mode is used as a baseline for uninterrupted processing.
*
* compile with $ gcc -O1 -o syncdemo -lpthread syncdemo.c
*
* Alternatively, you may use -O0, but not a higher level. Note that
* the gcc code generator does in neither case generate code really
* suitable to compare "part" and "none" modes. If you absolutely need
* to do that, you need to use inline assembly. However, the results should
 * be fairly OK when consistently using either -O0 or -O1. If you see a big loss
* of performance when you compare "none" and "part", be sure to run
* "none" with -t1 and watch out for the results! In any case, looking at the generated
* assembly code is vital to interpret results correctly. Review of generated assembly
* done on 2010-05-05 indicates that -O0 is probably the best choice. Note that we
* use the volatile attribute in one spot. This is used because it results in the
* best comparable result for our gcc 4.4.3, not really to invoke the volatile semantics.
*
* use "gcc -g -Wa,-ahl=syncdemo.s -lpthread syncdemo.c" to obtain a mixed code/assembly listing.
*
* This program REQUIRES linux. With slight modification, it may run on Solaris.
* Note that gcc on Sparc does NOT offer atomic instruction support!
*
* Copyright (C) 2010 by Rainer Gerhards <rgerhards@hq.adiscon.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* -or-
* see COPYING.ASL20 in the source distribution
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Inspired by (retrieved 2010-04-13)
* http://www.alexonlinux.com/multithreaded-simple-data-type-access-and-atomic-variables
*/
#define _GNU_SOURCE
#include <errno.h>
#include <getopt.h>
#include <pthread.h>
#include <sched.h>
#include <semaphore.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/unistd.h>
#include <sys/syscall.h>
#include <sys/time.h>
typedef enum { part, none, atomic, cas, spinlock, mutex, semaphore } syncType_t;
static syncType_t syncTypes[] = { part, none, atomic, cas, spinlock, mutex, semaphore };
/* config settings */
static int bCPUAffinity = 0;
static int procs = 0; /* number of processors */
static int numthrds = 0; /* if zero, => equal num of processors */
static unsigned goal = 50000000; /* 50 million */
static int bCSV = 0; /* generate CSV output? */
static int numIterations = 1; /* number of iterations */
static int dummyLoad = 0; /* number of dummy load iterations to generate */
syncType_t syncType;
static int bAllSyncTypes = 0;
static int global_int = 0; /* our global counter */
static unsigned thrd_WorkToDo; /* number of computations each thread must do */
static volatile int bStartRun = 0; /* indicate to flag when threads should start */
static struct timeval tvStart, tvEnd; /* used for timing one testing iteration */
/* statistic counters */
static long long totalRuntime;
static unsigned minRuntime = 999999999;
static unsigned maxRuntime = 0;
/* sync objects (if needed) */
static pthread_mutex_t mut;
static pthread_spinlock_t spin;
static sem_t sem;
static char*
getSyncMethName(syncType_t st)
{
switch(st) {
case part : return "partition";
case none : return "none";
case atomic : return "atomic op";
case spinlock : return "spin lock";
case mutex : return "mutex";
case semaphore: return "semaphore";
case cas : return "cas";
}
}
static pid_t
gettid()
{
return syscall( __NR_gettid );
}
void *workerThread( void *arg )
{
int i, j;
volatile int partval = 0; /* use volatile so that gcc generates code similar to global var */
int *partptr;
int oldval, newval; /* for CAS sync mode */
int thrd_num = (int)(long)arg;
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(thrd_num % procs, &set);
if(syncType == part) {
partval = 0;
}
/* if enabled, try to put thread on a fixed CPU (the one that corresponds to the
* thread ID). This may
*/
if(bCPUAffinity) {
if (sched_setaffinity( gettid(), sizeof( cpu_set_t ), &set )) {
perror( "sched_setaffinity" );
return NULL;
}
}
/* wait for "go" */
while(bStartRun == 0)
/*WAIT!*/;
for (i = 0; i < thrd_WorkToDo; i++) {
switch(syncType) {
case part:
///* one needs to use inline assembly to get this right... */
//asm("addl $1, global_int(%rip)");
partval++;
break;
case none:
global_int++;
break;
case atomic:
__sync_fetch_and_add(&global_int,1);
break;
case cas:
do {
oldval = global_int;
newval = oldval + 1;
} while(!__sync_bool_compare_and_swap(&global_int, oldval, newval));
break;
case mutex:
pthread_mutex_lock(&mut);
global_int++;
pthread_mutex_unlock(&mut);
break;
case spinlock:
pthread_spin_lock(&spin);
global_int++;
pthread_spin_unlock(&spin);
break;
case semaphore:
sem_wait(&sem);
global_int++;
sem_post(&sem);
break;
}
/* we now generate "dummy load" if instructed to do so. The idea is that
* we do some other work, as in real life, so that we have a better
* ratio of sync vs. actual work to do.
*/
for(j = 0 ; j < dummyLoad ; ++j) {
/* be careful: compiler may optimize loop out! */;
}
}
if(syncType == part) {
pthread_mutex_lock(&mut);
global_int += partval;
pthread_mutex_unlock(&mut);
}
return NULL;
}
static void beginTiming(void)
{
if(!(bCSV || bAllSyncTypes)) {
printf("Test Parameters:\n");
printf("\tNumber of Cores.........: %d\n", procs);
printf("\tNumber of Threads.......: %d\n", numthrds);
printf("\tSet Affinity............: %s\n", bCPUAffinity ? "yes" : "no");
printf("\tCount to................: %u\n", goal);
printf("\tWork for each Thread....: %u\n", thrd_WorkToDo);
printf("\tDummy Load Counter......: %d\n", dummyLoad);
printf("\tSync Method used........: %s\n", getSyncMethName(syncType));
}
gettimeofday(&tvStart, NULL);
}
static void endTiming(void)
{
unsigned delta;
long sec, usec;
long runtime;
gettimeofday(&tvEnd, NULL);
if(tvStart.tv_usec > tvEnd.tv_usec) {
tvEnd.tv_sec--;
tvEnd.tv_usec += 1000000;
}
sec = tvEnd.tv_sec - tvStart.tv_sec;
usec = tvEnd.tv_usec - tvStart.tv_usec;
delta = thrd_WorkToDo * numthrds - global_int;
if(!bAllSyncTypes) {
if(bCSV) {
printf("%s,%d,%d,%d,%u,%u,%ld.%06.6ld\n",
getSyncMethName(syncType), procs, numthrds, bCPUAffinity, goal, delta, sec, usec);
} else {
printf("measured (sytem time) runtime is %ld.% 6.6ld seconds\n", sec, usec);
if(delta == 0) {
printf("Computation was done correctly.\n");
} else {
printf("Computation INCORRECT,\n"
"\texpected %9u\n"
"\treal %9u\n"
"\toff by %9u\n",
thrd_WorkToDo * numthrds,
global_int,
delta);
}
}
}
runtime = sec * 1000 + (usec / 1000);
totalRuntime += runtime;
if(runtime < minRuntime)
minRuntime = runtime;
if(runtime > maxRuntime)
maxRuntime = runtime;
}
static void
usage(void)
{
fprintf(stderr, "Usage: syncdemo -a -c<num> -t<num>\n");
fprintf(stderr, "\t-a set CPU affinity\n");
fprintf(stderr, "\t-i number of iterations\n");
fprintf(stderr, "\t-c<num> count to <num>\n");
fprintf(stderr, "\t-d<num> dummy load, <num> iterations\n");
fprintf(stderr, "\t-t<num> number of threads to use\n");
fprintf(stderr, "\t-s<type> sync-type to use (none, atomic, mutex, spin, semaphore)\n");
fprintf(stderr, "\t-C generate CSV output\n");
fprintf(stderr, "\t-A test ALL sync types\n");
exit(2);
}
/* carry out the actual test (one iteration)
*/
static void
singleTest(void)
{
int i;
pthread_t *thrs;
global_int = 0;
bStartRun = 0;
thrs = malloc(sizeof(pthread_t) * numthrds);
if (thrs == NULL) {
perror( "malloc" );
exit(1);
}
thrd_WorkToDo = goal / numthrds;
for (i = 0; i < numthrds; i++) {
if(pthread_create( &thrs[i], NULL, workerThread, (void *)(long)i )) {
perror( "pthread_create" );
procs = i;
break;
}
}
beginTiming();
bStartRun = 1; /* start the threads (they are busy-waiting so far!) */
for (i = 0; i < numthrds; i++)
pthread_join( thrs[i], NULL );
endTiming();
free( thrs );
}
/* display an unsigned ms runtime count as string. Note that the
* string is inside a dynamically allocated buffer, which the caller
* must free to prevent a memory leak.
*/
char *
dispRuntime(unsigned rt)
{
static char *fmtbuf;
fmtbuf = malloc(32 * sizeof(char));
snprintf(fmtbuf, 32, "%u.%03.3u",
rt / 1000, rt % 1000);
return(fmtbuf);
}
doTest(syncType_t st)
{
int i;
syncType = st;
totalRuntime = 0;
minRuntime = 999999999;
maxRuntime = 0;
for(i = 0 ; i < numIterations ; ++i) {
//printf("starting iteration %d\n", i);
singleTest();
}
/* we have a memory leak due to calling dispRuntime(), but we don't
* care as we terminate immediately.
*/
printf("%-10s: total runtime %6ld.%3.3u, avg %s, min %s, max %s\n",
getSyncMethName(st),
(long)totalRuntime/1000, (unsigned)(totalRuntime % 1000),
dispRuntime((unsigned) (totalRuntime / numIterations)),
dispRuntime(minRuntime),
dispRuntime(maxRuntime));
}
int
main(int argc, char *argv[])
{
int i;
int opt;
while((opt = getopt(argc, argv, "ac:d:i:t:s:CA")) != EOF) {
switch((char)opt) {
case 'A':
bAllSyncTypes = 1;
break;
case 'a':
bCPUAffinity = 1;
break;
case 'c':
goal = (unsigned) atol(optarg);
break;
case 'd':
dummyLoad = atoi(optarg);
break;
case 'i':
numIterations = atoi(optarg);
break;
case 't':
numthrds = atoi(optarg);
break;
case 'C':
bCSV = 1;
break;
case 's':
if(!strcmp(optarg, "none"))
syncType = none;
else if(!strcmp(optarg, "part"))
syncType = part;
else if(!strcmp(optarg, "atomic"))
syncType = atomic;
else if(!strcmp(optarg, "cas"))
syncType = cas;
else if(!strcmp(optarg, "mutex")) {
syncType = mutex;
pthread_mutex_init(&mut, NULL);
} else if(!strcmp(optarg, "spin")) {
syncType = spinlock;
} else if(!strcmp(optarg, "semaphore")) {
syncType = semaphore;
sem_init(&sem, 0, 1);
} else {
fprintf(stderr, "error: invalid sync mode '%s'\n", optarg);
usage();
}
break;
default:usage();
break;
}
}
/* for simplicity, we init all sync helpers no matter if we need them */
pthread_mutex_init(&mut, NULL);
pthread_spin_init(&spin, PTHREAD_PROCESS_PRIVATE);
sem_init(&sem, 0, 1);
/* Getting number of CPUs */
procs = (int)sysconf(_SC_NPROCESSORS_ONLN);
if(procs < 0) {
perror("sysconf");
return -1;
}
if(numthrds < 1) {
numthrds = procs;
}
if(bAllSyncTypes) {
for(i = 0 ; i < sizeof(syncTypes) / sizeof(syncType_t) ; ++i) {
doTest(syncTypes[i]);
}
printf("Done running tests, result based on:\n");
printf("\tNumber of Cores.........: %d\n", procs);
printf("\tNumber of Threads.......: %d\n", numthrds);
printf("\tSet CPU Affinity........: %s\n", bCPUAffinity ? "yes" : "no");
printf("\tCount to................: %u\n", goal);
printf("\tWork for each Thread....: %u\n", thrd_WorkToDo);
printf("\tDummy Load Counter......: %d\n", dummyLoad);
printf("\tIterations..............: %d\n", numIterations);
} else {
doTest(syncType);
}
return 0;
}