问题描述:
    在服务器上,概率(一个月1次)出现,某个服务停摆了,日志上看,没有任何报错,而且服务进程也在,只是无响应了。

问题分析:
    1,用strace命令查看,发现当前进程阻塞在系统API上, futext_wait_queue_me
    2, 用gdb命令 attach 当前阻塞进程,bt显示 call trace,发现是syslog系统调用出现了死锁。
    3,进一步分析,发现,某个函数在外部流程和信号处理函数(定时任务)中都调到了,而这个函数内部,调用了syslog输出日志。
    4,结论:服务进程和信号处理函数同时调用syslog,可能导致死锁

问题验证:

点击(此处)折叠或打开

  1. #include <errno.h>
  2. #include <syslog.h>
  3. #include <sys/types.h>
  4. #include <sys/socket.h>
  5. #include <netinet/in.h>
  6. #include <sys/select.h>
  7. #include <unistd.h>
  8. #include <signal.h>
  9. #include <sys/types.h>
  10. #include <sys/stat.h>
  11. #include <fcntl.h>
  12. #include <stdio.h>
  13. #include <sys/time.h>
  14. #include <sys/un.h>

  15. int idx = 0;
  16. static void sigroutine(int signo)
  17. {
  18.     char* logcnt = "this log from sigroutine,************************************************************************************************************************************************************ end";
  19.     switch (signo)
  20.     {
  21.         case SIGALRM:
  22.             idx = 0;
  23.             syslog(LOG_DEBUG, "%s\n", logcnt);
  24.             break;
  25.     }
  26.     return;
  27. }

  28. static void srv_heartbeat_start(void)
  29. {
  30.     struct itimerval value, old_value;
  31.     signal(SIGALRM, sigroutine);

  32.     value.it_value.tv_sec = 0;
  33.     value.it_value.tv_usec = 50000;
  34.     value.it_interval.tv_sec = 0;
  35.     value.it_interval.tv_usec = 50000;
  36.     setitimer(ITIMER_REAL, &value, &old_value);
  37. }

  38. int main(int argc, char** argv)
  39. {
  40.     char* logcnt = "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++";

  41. srv_heartbeat_start();
  42.     while (1)
  43.     {
  44.         syslog(LOG_INFO, "--> %d\n", idx++);
  45.         syslog(LOG_DEBUG, "%s\n", logcnt);        
  46.     }
  47. }

如上代码,主服务一直循环输入log,起一个定时器,每50ms 发送一个时钟信号,并且在时钟信号处理函数中打印一条日志。以上程序必须syslog死锁的问题。gdb 打印堆栈如下:

#0  __lll_lock_wait_private () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:95
#1  0x00007f73854ec344 in _L_lock_533 () from /lib/x86_64-linux-gnu/libc.so.6
#2  0x00007f73854ebda2 in __GI___vsyslog_chk (pri=, flag=flag@entry=-1, fmt=0x400802 "%s\n", ap=ap@entry=0x7ffcb7e84c38) at ../misc/syslog.c:258
#3  0x00007f73854ec15f in __syslog (pri=, fmt=) at ../misc/syslog.c:117
#4  0x000000000040060f in sigroutine (signo=14) at syslog_lock.c:40
#5 
#6  0x00007f73854f30bb in __libc_send (fd=3, buf=0x1a95270, n=33, flags=-2058407747, flags@entry=16384) at ../sysdeps/unix/sysv/linux/x86_64/send.c:27
#7  0x00007f73854ebf31 in __GI___vsyslog_chk (pri=, flag=flag@entry=-1, fmt=0x4008b3 "--> %d\n", ap=ap@entry=0x7ffcb7e853e8) at ../misc/syslog.c:279
#8  0x00007f73854ec15f in __syslog (pri=, fmt=) at ../misc/syslog.c:117
#9  0x00000000004006a2 in main (argc=1, argv=0x7ffcb7e855c8) at syslog_lock.c:65
10-27 15:53