We’re using the Aerospike LibEvent client to connect to our Aerospike 2 clusters.
After sustained stress tests using Apache Bench at 1,000 requests for 10 iterations (10 * 1,000 requests total), we encounter segfaults.
Sometimes we can serve over 100,000 requests before a segfault; other times we get through only about 20,000 requests before one occurs.
The segfault is hard to track down.
Here is a backtrace from one core file (note that this one terminated with signal 6, Aborted, rather than a segmentation fault):
Program terminated with signal 6, Aborted.
#0 0x00007fb78a2e8945 in __setfpucw (set=<value optimized out>) at setfpucw.c:34
34 cw |= set & ~_FPU_RESERVED;
(gdb) bt full
#0 0x00007fb78a2e8945 in __setfpucw (set=<value optimized out>) at setfpucw.c:34
cw = 0
#1 0x00007fb78a2ea25b in sigset (sig=1024, disp=<value optimized out>) at ../sysdeps/posix/sigset.c:92
act = {__sigaction_handler = {sa_handler = 0x4, sa_sigaction = 0x4}, sa_mask = {__val = {5, 140426275203183, 3, 140734354466228, 12, 140426275203187, 2, 140426275203187, 2, 140734354466253, 3, 140426275196513, 1,
140426275203183, 3, 140734354466228}}, sa_flags = 12, sa_restorer = 0x7fb78a402873}
oact = {__sigaction_handler = {sa_handler = 0x20, sa_sigaction = 0x20}, sa_mask = {__val = {0 <repeats 15 times>, 38}}, sa_flags = -1976405138, sa_restorer = 0x7fb78a403bc8 <banner+488>}
set = {__val = {140734354482059, 6, 140426275208161, 2, 140426275208360, 33, 140426275208165, 4, 140734354468656, 16, 140426275208171, 5, 1049320464, 140426274167794, 140426275208171, 5}}
oset = {__val = {140734354466512, 140734354466496, 111950720, 140426274304254, 140734354468656, 16, 140734354466560, 18374403900885434160, 117217384, 140426274304434, 140426275208165, 4, 140734354466608, 30927616, 26782368,
140426274304254}}
#2 0x00007fb78a3271b2 in __fpurge (fp=0x21) at __fpurge.c:37
No locals.
#3 0x0000000000000002 in ?? ()
No symbol table info available.
#4 0x00007fff4534a790 in ?? ()
No symbol table info available.
#5 0x00007fb78a3b90e3 in nrl_domainname () at getnameinfo.c:93
c = <value optimized out>
th = {h_name = 0x6fc9868 "H\252\002", h_aliases = 0x7fb78a3271b2, h_addrtype = -1975501851, h_length = 32695, h_addr_list = 0x4}
tmpbuflen = 140426275208161
tmpbuf = 0x7fff4534a790 "\213\343\064E\377\177"
herror = 26782368
h = <value optimized out>
lock = 0
not_first = 0
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
In this stack trace, the error originates from libevent:
Program terminated with signal 11, Segmentation fault.
#0 event_active_nolock (ev=0x4dfbfd0, res=1, ncalls=1) at event.c:2280
2280 EVENT_BASE_ASSERT_LOCKED(base);
(gdb) bt full
#0 event_active_nolock (ev=0x4dfbfd0, res=1, ncalls=1) at event.c:2280
base = 0x0
#1 event_active_nolock (ev=0x4dfbfd0, res=1, ncalls=1) at event.c:2264
No locals.
#2 0x00007f41824f440d in timeout_process (base=0x46139e0, flags=4) at event.c:2469
now = {tv_sec = 14085976, tv_usec = 467851}
ev = 0x4dfbfd0
#3 event_base_loop (base=0x46139e0, flags=4) at event.c:1601
evsel = 0x7f4182728160
tv = {tv_sec = 0, tv_usec = 184}
tv_p = <value optimized out>
res = <value optimized out>
done = 0
retval = 0
__func__ = "event_base_loop"
Our code is very simple and based upon the example/main.c for ev2citrusleaf_get_all.
We make use of the libevent client for performance and haven’t moved to the Aerospike 3 client yet.