本文共 34271 字,大约阅读时间需要 114 分钟。
[root@upucore_1 ~]# top -Hp 8177top - 09:50:33 up 34 min, 8 users, load average: 4.76, 4.32, 3.65Tasks: 2 total, 0 running, 2 sleeping, 0 stopped, 0 zombieCpu0 : 11.8%us, 11.8%sy, 0.0%ni, 76.5%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu1 : 11.1%us, 11.1%sy, 0.0%ni, 77.8%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu2 : 5.9%us, 11.8%sy, 0.0%ni, 82.4%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu3 : 23.5%us, 29.4%sy, 0.0%ni, 47.1%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu4 : 5.6%us, 5.6%sy, 0.0%ni, 88.9%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu5 : 11.1%us, 16.7%sy, 0.0%ni, 72.2%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu6 : 5.3%us, 5.3%sy, 0.0%ni, 84.2%id, 0.0%wa, 0.0%hi, 5.3%si, 0.0%stCpu7 : 16.7%us, 33.3%sy, 0.0%ni, 50.0%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu8 : 5.6%us, 0.0%sy, 0.0%ni, 94.4%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu9 : 5.6%us, 94.4%sy, 0.0%ni, 0.0%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu10 : 10.5%us, 31.6%sy, 0.0%ni, 57.9%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu11 : 12.5%us, 25.0%sy, 0.0%ni, 62.5%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu12 : 10.5%us, 15.8%sy, 0.0%ni, 73.7%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu13 : 0.0%us, 70.6%sy, 0.0%ni, 29.4%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu14 : 0.0%us,100.0%sy, 0.0%ni, 0.0%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stCpu15 : 5.6%us, 0.0%sy, 0.0%ni, 94.4%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%stMem: 32823540k total, 9229512k used, 23594028k free, 214588k buffersSwap: 0k total, 0k used, 0k free, 809616k cached PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 8177 root 20 0 110m 3268 1544 S 0.0 0.0 0:00.66 modb 13784 root 20 0 110m 3268 1544 S 0.0 0.0 0:00.00 modb看起来 CPU 和内存使用没有什么异常。通过 pstack 进行观察
[root@upucore_1 ~]# pstack 8177Thread 2 (Thread 0x7f26f132e700 (LWP 13784)):#0 0x00000032698e8ef3 in epoll_wait () from /lib64/libc.so.6#1 0x00007f26f1f62a96 in epoll_dispatch () from /opt/mcu/modbcore/libevent_core.so#2 0x00007f26f1f529f6 in event_base_loop () from /opt/mcu/modbcore/libevent_core.so#3 0x0000000000408fcc in sql_event_thread_loop(event_thread_t*) ()#4 0x0000003269c079d1 in start_thread () from /lib64/libpthread.so.0#5 0x00000032698e88fd in clone () from /lib64/libc.so.6Thread 1 (Thread 0x7f26f1330720 (LWP 8177)):#0 0x00000032698db53d in write () from /lib64/libc.so.6#1 0x0000003269871ad3 in _IO_new_file_write () from /lib64/libc.so.6#2 0x000000326987199a in _IO_new_file_xsputn () from /lib64/libc.so.6#3 0x0000003269849681 in buffered_vfprintf () from /lib64/libc.so.6#4 0x000000326984421e in vfprintf () from /lib64/libc.so.6#5 0x000000326984f0e8 in fprintf () from /lib64/libc.so.6#6 0x00007f26f1d37f26 in fsm(amqp_connection_state_t_*) () from /opt/mcu/modbcore/librabbitmq_r.so#7 0x00007f26f1f52dbc in event_base_loop () from /opt/mcu/modbcore/libevent_core.so#8 0x0000000000406835 in main ()发现 pid 为 8177 的 modb 调用栈确实会卡在 write() 系统调用上。对应到业务代码,则是 modb(内部链接的共享库 librabbitmq.so)中通过 fprintf 输出日志到 stderr 上的情况。
[root@upucore_1 ~]# strace -t -p 8177Process 8177 attached - interrupt to quit09:53:30 write(2, "110000199\",null,null,null,null,n"..., 1667) = 166709:53:30 open("/var/log/modbcore.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 1809:53:30 fstat(18, {st_mode=S_IFREG|0644, st_size=7607275, ...}) = 009:53:30 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f26f155300009:53:30 fstat(18, {st_mode=S_IFREG|0644, st_size=7607275, ...}) = 009:53:30 lseek(18, 7607275, SEEK_SET) = 760727509:53:30 stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 009:53:30 write(18, "[8177] 17 Sep 09:53:30 \n[Modb] l"..., 4096) = 409609:53:30 write(18, "values(?,?)\"],\"src...\n", 22) = 2209:53:30 close(18) = 009:53:30 munmap(0x7f26f1553000, 4096) = 009:53:30 write(2, "[Modb] ####### curBMC(1755) > l"..., 54) = 5409:53:30 open("/var/log/modbcore.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 1809:53:30 fstat(18, {st_mode=S_IFREG|0644, st_size=7611393, ...}) = 009:53:30 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f26f155300009:53:30 fstat(18, {st_mode=S_IFREG|0644, st_size=7611393, ...}) = 009:53:30 lseek(18, 7611393, SEEK_SET) = 761139309:53:30 stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 009:53:30 write(18, "[8177] 17 Sep 09:53:30 [Modb] ##"..., 77) = 7709:53:30 close(18) = 009:53:30 munmap(0x7f26f1553000, 4096) = 009:53:30 write(2, "[Modb] ####### modbKey = 26277d"..., 90) = 9009:53:30 open("/var/log/modbcore.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 1809:53:30 fstat(18, {st_mode=S_IFREG|0644, st_size=7611470, ...}) = 009:53:30 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f26f155300009:53:30 fstat(18, {st_mode=S_IFREG|0644, st_size=7611470, ...}) = 009:53:30 lseek(18, 7611470, SEEK_SET) = 761147009:53:30 stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 009:53:30 write(18, "[8177] 17 Sep 09:53:30 [Modb] ##"..., 113) = 11309:53:30 close(18) = 009:53:30 munmap(0x7f26f1553000, 4096) = 009:53:30 sendto(8, "\f\0\0\0\3use modb_db", 16, 0, NULL, 0) = 1609:53:30 recvfrom(8, "\7\0\0\1\0\0\0\2\0\0\0", 16384, 0, NULL, NULL) = 1109:53:30 write(2, "[Modb] 'use modb_db' Success!\n", 30) = 3009:53:30 open("/var/log/modbcore.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 1809:53:30 fstat(18, {st_mode=S_IFREG|0644, st_size=7611583, ...}) = 009:53:30 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f26f155300009:53:30 fstat(18, {st_mode=S_IFREG|0644, st_size=7611583, ...}) = 009:53:30 lseek(18, 7611583, SEEK_SET) = 761158309:53:30 stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 009:53:30 write(18, "[8177] 17 Sep 09:53:30 [Modb] 'u"..., 53) = 5309:53:30 close(18) = 009:53:30 munmap(0x7f26f1553000, 4096) = 009:53:30 sendto(8, "\305\0\0\0\3insert into modbVer(bmc_ver"..., 201, 0, NULL, 0) = 20109:53:30 recvfrom(8, "\7\0\0\1\0\1\0\2\0\0\0", 16384, 0, NULL, NULL) = 1109:53:30 write(2, "[Modb] 'insert into modbVer(bmc_"..., 215(此处会卡住,人为回车进行分隔)) = 21509:53:34 open("/var/log/modbcore.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 1809:53:34 fstat(18, {st_mode=S_IFREG|0644, st_size=7611636, ...}) = 009:53:34 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f26f155300009:53:34 fstat(18, {st_mode=S_IFREG|0644, st_size=7611636, ...}) = 009:53:34 lseek(18, 7611636, SEEK_SET) = 761163609:53:34 stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 009:53:34 write(18, "[8177] 17 Sep 09:53:34 [Modb] 'i"..., 238) = 23809:53:34 close(18) = 009:53:34 munmap(0x7f26f1553000, 4096) = 009:53:34 write(2, "\n[Modb] local_comsume_cb: not DE"..., 83) = 8309:53:34 open("/var/log/modbcore.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 1809:53:34 fstat(18, {st_mode=S_IFREG|0644, st_size=7611874, ...}) = 009:53:34 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f26f155300009:53:34 fstat(18, {st_mode=S_IFREG|0644, st_size=7611874, ...}) = 009:53:34 lseek(18, 7611874, SEEK_SET) = 761187409:53:34 stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 009:53:34 write(18, "[8177] 17 Sep 09:53:34 \n[Modb] l"..., 106) = 10609:53:34 close(18) = 0...09:53:34 munmap(0x7f26f1553000, 4096) = 009:53:34 sendto(16, "\1\0\1\0\0\0\r\0<\0P\0\0\0\0\0\0\1\251\0\316", 21, MSG_NOSIGNAL, NULL, 0) = 2109:53:34 write(2, "[MQ][CON_f_LOCAL s:16] --> send"..., 43) = 4309:53:34 write(2, "[MQ][CON_f_LOCAL s:16][fsm:conn_"..., 56) = 5609:53:34 epoll_ctl(3, EPOLL_CTL_ADD, 16, {EPOLLIN, {u32=16, u64=16}}) = 009:53:34 epoll_wait(3, {}, 32, 0) = 009:53:34 epoll_ctl(3, EPOLL_CTL_DEL, 16, {EPOLLIN, {u32=16, u64=16}}) = 009:53:34 stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 009:53:34 write(2, "[MQ][s:17] mq_pop => timestamp ="..., 53) = 5309:53:34 write(2, "[MQ][PRO_t_TRANS s:17][fsm:conn_"..., 79) = 7909:53:34 write(2, "[MQ][PRO_t_TRANS s:17][fsm:conn_"..., 96) = 9609:53:34 write(2, "[MQ][PRO_t_TRANS s:17][fsm:conn_"..., 60) = 6009:53:34 epoll_ctl(3, EPOLL_CTL_ADD, 17, {EPOLLOUT, {u32=17, u64=17}}) = 009:53:34 recvfrom(16, "\1\0\1\0\0\0.\0<\0<\vCON_f_LOCAL\0\0\0\0\0\0\1\252\1"..., 131072, 0, NULL, NULL) = 416809:53:34 write(2, "[MQ][CON_f_LOCAL s:16] <-- recv"..., 47) = 4709:53:34 epoll_ctl(3, EPOLL_CTL_ADD, 16, {EPOLLIN, {u32=16, u64=16}}) = 009:53:34 epoll_wait(3, { {EPOLLOUT, {u32=17, u64=17}}}, 32, 0) = 109:53:34 epoll_ctl(3, EPOLL_CTL_DEL, 16, {EPOLLIN, {u32=16, u64=16}}) = 009:53:34 epoll_ctl(3, EPOLL_CTL_DEL, 17, {EPOLLOUT, {u32=17, u64=17}}) = 009:53:34 sendto(17, "\1\0\1\0\0\0B\0<\0(\0\0\25modb.transfer.ex:c"..., 74, MSG_NOSIGNAL, NULL, 0) = 7409:53:34 stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 009:53:34 write(2, "[MQ][s:17] send Basic.Publish =>"..., 65) = 6509:53:34 write(2, "[MQ][PRO_t_TRANS s:17] --> send"..., 47) = 4709:53:34 epoll_ctl(3, EPOLL_CTL_ADD, 17, {EPOLLOUT, {u32=17, u64=17}}) = 009:53:34 sendto(17, "\2\0\1\0\0\0 \0<\0\0\0\0\0\0\0\0\2\371\220\0\20applicatio"..., 40, MSG_NOSIGNAL, NULL, 0) = 4009:53:34 write(2, "[MQ][PRO_t_TRANS s:17] --> send"..., 55) = 5509:53:34 epoll_ctl(3, EPOLL_CTL_DEL, 17, {EPOLLOUT, {u32=17, u64=17}}) = 009:53:34 epoll_ctl(3, EPOLL_CTL_ADD, 17, {EPOLLOUT, {u32=17, u64=17}}) = 009:53:34 sendto(17, "\3\0\1\0\0\2\371{\"keys\": [], \"app\": \"movi"..., 769, MSG_NOSIGNAL, NULL, 0) = 76909:53:34 stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 009:53:34 write(2, "[MQ][s:17] send Content-Body => "..., 64) = 6409:53:34 write(2, "[MQ][PRO_t_TRANS s:17] --> send"..., 53) = 5309:53:34 epoll_ctl(3, EPOLL_CTL_DEL, 17, {EPOLLOUT, {u32=17, u64=17}}) = 009:53:34 epoll_ctl(3, EPOLL_CTL_ADD, 17, {EPOLLIN, {u32=17, u64=17}}) = 009:53:34 write(2, "[MQ][CON_f_LOCAL s:16] <-- recv"..., 55) = 5509:53:34 epoll_ctl(3, EPOLL_CTL_ADD, 16, {EPOLLIN, {u32=16, u64=16}}) = 009:53:34 epoll_wait(3, {}, 32, 0) = 009:53:34 epoll_ctl(3, EPOLL_CTL_DEL, 16, {EPOLLIN, {u32=16, u64=16}}) = 009:53:34 write(2, "[MQ][CON_f_LOCAL s:16] <-- recv"..., 53) = 5309:53:34 write(2, "\n[Modb] local_comsume_cb: Recv C"..., 4115(此处会卡住,人为回车进行分隔)) = 411509:53:42 open("/var/log/modbcore.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 1809:53:42 fstat(18, {st_mode=S_IFREG|0644, st_size=7613621, ...}) = 009:53:42 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f26f155300009:53:42 fstat(18, {st_mode=S_IFREG|0644, st_size=7613621, ...}) = 009:53:42 lseek(18, 7613621, SEEK_SET) = 761362109:53:42 stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 009:53:42 write(18, "[8177] 17 Sep 09:53:42 \n[Modb] l"..., 4096) = 409609:53:42 write(18, "values(?,?)\"],\"src...\n", 22) = 2209:53:42 close(18) = 0...09:53:51 write(2, "[MQ][s:17] send Content-Body => "..., 64^C通过 strace 发现,每次卡住的情况都发生在进行 write 系统调用时,且为向 fd 2 上输出日志信息。可以看到,输出的日志内容包括 modb 自身的日志和其所链接的 librabbitmq.so 中的日志。Process 8177 detached[root@upucore_1 ~]#
[root@upucore_1 ~]# df -hFilesystem Size Used Avail Use% Mounted on/dev/sda2 39G 9.8G 27G 27% /tmpfs 16G 0 16G 0% /dev/shm/dev/sda1 58G 7.9G 47G 15% /opt/mcu/dev/sda5 4.9G 138M 4.5G 3% /opt/sync/dev/sda3 9.7G 150M 9.0G 2% /usr/mcu/loggfsd1:/web_app 39G 9.8G 27G 27% /mnt/files[root@upucore_1 ~]# [root@upucore_1 ~]# [root@upucore_1 ~]# free -m total used free shared buffers cachedMem: 32054 9081 22972 0 221 818-/+ buffers/cache: 8042 24012Swap: 0 0 0[root@upucore_1 ~]#怀疑是否由于有其他业务程序写磁盘,导致 I/O 能力不够。查看发现只有 mysql 线程的磁盘写比较高,但也在合理范围内。
[root@upucore_1 data]# iotop -oTotal DISK READ: 3.06 K/s | Total DISK WRITE: 1344.33 K/s TID PRIO USER DISK READ DISK WRITE SWAPIN IO> COMMAND 540 be/3 root 0.00 B/s 0.00 B/s 0.00 % 25.83 % [jbd2/sda2-8] 7917 be/4 root 0.00 B/s 45.93 K/s 0.00 % 6.60 % beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/~l/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1" 4806 be/4 mysql 0.00 B/s 58.18 K/s 0.00 % 4.74 % mysqld --basedir=/usr/local/mysql --datadir=/usr/local/mys~ql/data/upucore_1.pid --socket=/tmp/mysql.sock --port=3306 7934 be/4 root 0.00 B/s 30.62 K/s 0.00 % 4.69 % beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/~l/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1" 7916 be/4 root 0.00 B/s 27.56 K/s 0.00 % 2.47 % beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/~l/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1" 1229 be/3 root 0.00 B/s 79.62 K/s 0.00 % 1.08 % [jbd2/sda1-8] 6923 be/4 root 0.00 B/s 3.06 K/s 0.00 % 0.80 % run_erl -daemon /tmp//opt/mcu/upucore/ /opt/mcu/upucore/log exec /opt/mcu/upucore/bin/upu console '' 9449 be/4 root 0.00 B/s 3.06 K/s 0.00 % 0.61 % run_erl -daemon /tmp//opt/mcu/upu/ /opt/mcu/upu/log exec /opt/mcu/upu/bin/upu console '' 4933 be/4 mysql 0.00 B/s 857.43 K/s 0.00 % 0.48 % mysqld --basedir=/usr/local/mysql --datadir=/usr/local/mys~ql/data/upucore_1.pid --socket=/tmp/mysql.sock --port=3306 4935 be/4 mysql 0.00 B/s 6.12 K/s 0.00 % 0.31 % mysqld --basedir=/usr/local/mysql --datadir=/usr/local/mys~ql/data/upucore_1.pid --socket=/tmp/mysql.sock --port=330621292 be/4 mysql 3.06 K/s 9.19 K/s 0.00 % 0.27 % mysqld --basedir=/usr/local/mysql --datadir=/usr/local/mys~ql/data/upucore_1.pid --socket=/tmp/mysql.sock --port=3306 4921 be/4 mysql 0.00 B/s 3.06 K/s 0.00 % 0.22 % mysqld --basedir=/usr/local/mysql --datadir=/usr/local/mys~ql/data/upucore_1.pid --socket=/tmp/mysql.sock --port=3306 4803 be/4 mysql 0.00 B/s 0.00 B/s 0.00 % 0.02 % mysqld --basedir=/usr/local/mysql --datadir=/usr/local/mys~ql/data/upucore_1.pid --socket=/tmp/mysql.sock --port=3306 4804 be/4 mysql 0.00 B/s 0.00 B/s 0.00 % 0.00 % mysqld --basedir=/usr/local/mysql --datadir=/usr/local/mys~ql/data/upucore_1.pid --socket=/tmp/mysql.sock --port=3306 1602 be/4 root 0.00 B/s 21.44 K/s 0.00 % 0.00 % ./sa 1612 be/4 root 0.00 B/s 3.06 K/s 0.00 % 0.00 % ./sa 1638 be/4 root 0.00 B/s 9.19 K/s 0.00 % 0.00 % ./sa13119 be/4 root 0.00 B/s 3.06 K/s 0.00 % 0.00 % nms_adapter 1 be/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % init 2 be/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [kthreadd] 3 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [migration/0] 4 be/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [ksoftirqd/0] 5 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [migration/0] 6 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [watchdog/0] 7 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [migration/1] 8 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [migration/1] 9 be/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [ksoftirqd/1] 10 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [watchdog/1] 11 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [migration/2] 12 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [migration/2] 13 be/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [ksoftirqd/2] 14 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [watchdog/2] 15 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [migration/3] 16 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [migration/3] 17 be/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [ksoftirqd/3] 18 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [watchdog/3] 19 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [migration/4] 20 rt/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [migration/4] 21 be/4 root 0.00 B/s 0.00 B/s 0.00 % 0.00 % [ksoftirqd/4][root@upucore_1 data]#经过一番分析,发现之前的排查方向存在问题。日志输出到 stderr 上,但 stderr 却没有重定向到文件上。所以应该和磁盘 I/O 没直接关系。
[root@upucore_1 ~]# ps aux|grep modbroot 4179 0.0 0.0 111744 2080 ? Sl 16:57 0:00 /opt/mcu/modb/modb /etc/modb.confroot 4189 0.0 0.0 110952 2040 ? Sl 16:57 0:00 /opt/mcu/modbcore/modb /etc/modbcore.confroot 4413 0.0 0.0 108200 1300 ? S 16:57 0:00 /bin/sh /usr/local/keepalived/etc/keepalived/script_master.sh mysql nds modbcore zookeeper root 4750 0.0 0.0 103256 872 pts/0 S+ 16:57 0:00 grep modb[root@upucore_1 ~]# [root@upucore_1 ~]# lsof -p 4179 -a -d 0-2COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAMEmodb 4179 root 0u CHR 5,1 0t0 5559 /dev/consolemodb 4179 root 1u CHR 5,1 0t0 5559 /dev/consolemodb 4179 root 2u CHR 5,1 0t0 5559 /dev/console[root@upucore_1 ~]# [root@upucore_1 ~]# lsof -p 4189 -a -d 0-2COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAMEmodb 4189 root 0u CHR 5,1 0t0 5559 /dev/consolemodb 4189 root 1u CHR 5,1 0t0 5559 /dev/consolemodb 4189 root 2u CHR 5,1 0t0 5559 /dev/console[root@upucore_1 ~]# [root@upucore_1 ~]# lsof /dev/console COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAMEmodb 4179 root 0u CHR 5,1 0t0 5559 /dev/consolemodb 4179 root 1u CHR 5,1 0t0 5559 /dev/consolemodb 4179 root 2u CHR 5,1 0t0 5559 /dev/consolemodb 4189 root 0u CHR 5,1 0t0 5559 /dev/consolemodb 4189 root 1u CHR 5,1 0t0 5559 /dev/consolemodb 4189 root 2u CHR 5,1 0t0 5559 /dev/console[root@upucore_1 ~]#结果发现,确实两个 modb 业务的 fd 0,1,2 均指向了 /dev/console 。并且系统中只有 modb 业务的 fd 0,1,2 是指向 /dev/console 的。
[root@upucore_1 ~]# cat /proc/cmdline ro root=UUID=ef97984a-07ef-4cdb-9825-cb845c23e7ec rd_NO_LUKS LANG=en_US.UTF-8 KEYTABLE=us rd_NO_MD SYSFONT=latarcyrheb-sun16 crashkernel=131M@0M console=ttyS0,9600n8 rd_NO_LVM rd_NO_DM[root@upucore_1 ~]#可以看到 console=ttyS0,9600n8 这个设置,意思是输出到 /dev/console 的内容,实际是发送给了串口,而串口波特率默认值为 9600 b/s 。虽然后续会调整到 115200 ,那也不过才 115 kB/s 的输出速度。
[root@upucore_1 ~]# chkconfig --list...haproxy 0:off 1:off 2:on 3:on 4:on 5:on 6:off...keepalived 0:off 1:off 2:off 3:off 4:off 5:off 6:off...memcached 0:off 1:off 2:on 3:on 4:on 5:on 6:off...modb 0:off 1:off 2:on 3:on 4:on 5:on 6:offmodbcore 0:off 1:off 2:on 3:on 4:on 5:on 6:off...nginx 0:off 1:off 2:on 3:on 4:on 5:on 6:off...rabbitmq 0:off 1:off 2:off 3:off 4:off 5:off 6:off...redis 0:off 1:off 2:on 3:on 4:on 5:on 6:off...zookeeper 0:off 1:off 2:on 3:on 4:on 5:on 6:off...[root@upucore_1 ~]#选择其中几个进行查看
[root@upucore_1 ~]# lsof -c redis -a -d 0-2COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAMEredis-ser 4159 root 0u CHR 1,3 0t0 3968 /dev/nullredis-ser 4159 root 1u CHR 1,3 0t0 3968 /dev/nullredis-ser 4159 root 2u CHR 1,3 0t0 3968 /dev/null[root@upucore_1 ~]# [root@upucore_1 ~]# lsof -c memcached -a -d 0-2COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAMEmemcached 1976 root 0u CHR 1,3 0t0 3968 /dev/nullmemcached 1976 root 1u CHR 1,3 0t0 3968 /dev/nullmemcached 1976 root 2u CHR 1,3 0t0 3968 /dev/null[root@upucore_1 ~]# [root@upucore_1 ~]# ps ajxf|grep nginx 1 1994 1994 1994 ? -1 Ss 0 0:00 nginx: master process /usr/local/nginx/sbin/nginx -c /usr/local/nginx/conf/nginx.conf 1994 1995 1994 1994 ? -1 S 0 0:03 \_ nginx: worker process 1994 1996 1994 1994 ? -1 S 0 0:03 \_ nginx: worker process 1994 1997 1994 1994 ? -1 S 0 0:03 \_ nginx: worker process 1994 1998 1994 1994 ? -1 S 0 0:03 \_ nginx: worker process 1994 2000 1994 1994 ? -1 S 0 0:04 \_ nginx: worker process 1994 2001 1994 1994 ? -1 S 0 0:03 \_ nginx: worker process 1994 2002 1994 1994 ? -1 S 0 0:03 \_ nginx: worker process 1994 2003 1994 1994 ? -1 S 0 0:03 \_ nginx: worker process 1994 2004 1994 1994 ? -1 S 0 0:04 \_ nginx: worker process 1994 2005 1994 1994 ? -1 S 0 0:03 \_ nginx: worker process 1994 2007 1994 1994 ? -1 S 0 0:03 \_ nginx: worker process 1994 2008 1994 1994 ? -1 S 0 0:04 \_ nginx: worker process 1994 2009 1994 1994 ? -1 S 0 0:03 \_ nginx: worker process 1994 2010 1994 1994 ? -1 S 0 0:04 \_ nginx: worker process 1994 2011 1994 1994 ? -1 S 0 0:04 \_ nginx: worker process 1994 2012 1994 1994 ? -1 S 0 0:03 \_ nginx: worker process 1994 2013 1994 1994 ? -1 S 0 0:02 \_ nginx: cache manager process 8264 17252 17251 8264 pts/0 17251 S+ 0 0:00 | \_ grep nginx[root@upucore_1 ~]# [root@upucore_1 ~]# lsof -p 1994 -a -d 0-2COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAMEnginx 1994 root 0u CHR 1,3 0t0 3968 /dev/nullnginx 1994 root 1u CHR 1,3 0t0 3968 /dev/nullnginx 1994 root 2w REG 8,2 9623092 1051452 /usr/local/nginx/logs/error.log[root@upucore_1 ~]# [root@upucore_1 ~]#可以看到,fd 0,1,2 要么被重定向到 /dev/null ,要么被重定向到文件中了。
[root@upucore_1 modbcore]# strace -s 4096 -p 8177Process 8177 attached - interrupt to quit...open("/var/log/modbcore.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 18fstat(18, {st_mode=S_IFREG|0644, st_size=19540496, ...}) = 0mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f26f1553000fstat(18, {st_mode=S_IFREG|0644, st_size=19540496, ...}) = 0lseek(18, 19540496, SEEK_SET) = 19540496stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 0write(18, "[8177] 17 Sep 15:06:45 \n[Modb] local_comsume_cb: Recv Content-Body ==>\n {\"app\":\"movision\",\"data\":{\"account\":\"0512110000195\",\"accountType\":1,\"binded\":false,\"brief\":\"\",\"cmsApproval\":false,\"createdAt\":null,\"dateOfBirth\":null,\"dcsAdmin\":false,\"defaultServiceDomainAdmin\":false,\"defaultUserDomainAdmin\":false,\"deviceGuid\":\"10252200000000000000512110000195\",\"deviceType\":\"522\",\"e164\":\"0512110000195\",\"email\":\"\",\"enable\":true,\"enableBMC\":false,\"enableCall\":false,\"enableDCS\":true,\"enableHD\":true,\"enableHDMeeting\":true,\"enableIncoming\":false,\"enableLive\":false,\"enableMeeting\":true,\"enableMeetingSMS\":false,\"enableNM\":false,\"enableOut\":false,\"enablePlay\":false,\"enableRoam\":false,\"enableSatellite\":false,\"enableSatelliteP2P\":false,\"enableUMC\":false,\"enableVRS\":true,\"enableVenueMonitor\":false,\"enableVideo\":false,\"enableWeibo\":true,\"extNum\":\"\",\"fax\":\"\",\"fullPy\":\"\",\"id\":643,\"ipRegion\":\"\",\"jid\":\"b9da57fa-c384-4247-a797-05fa19537730@hello\",\"jobNum\":\"\",\"limited\":false,\"meetingAdmin\":false,\"mobile\":\"\",\"moid\":\"b9da57fa-c384-4247-a797-05fa19537730\",\"name\":\"0512110000195\",\"nmAdmin\":false,\"nuServerId\":\"3b7cfbb3-b5c5-42b2-a759-209557a1ecf3\",\"officeLocation\":\"\",\"password\":\"21218cca77804d2ba1922c33e0151105\",\"portrait128\":\"\",\"portrait256\":\"\",\"portrait32\":\"\",\"portrait40\":\"\",\"portrait64\":\"\",\"restrictStrategy\":0,\"restrictUsedOn\":\"HD100;HD100S;HD1000;WD1000;TS6610;TS5610;TrueLink;iPad;iPhone;Android_Phone;SKY X500 1080P60;SKY X500 1080P30;SKY X500 720P60;TrueSens X500 1080P;TrueSens X500 720P;SKY X700;SKY X900;SKY for Windows;SKY for Mac;SKY for Android TV;SKY for Android Pad;HD100-TP\",\"seat\":\"\",\"serviceDomainAdmin\":false,\"serviceDomainMoid\":\"26277da4-b84d-4cad-8c69-fc977c9397d0\",\"serviceDomainName\":\"keda\",\"sex\":\"\",\"userDomainAdmin\":false,\"userDomainMoid\":\"zzeq2smvxtq687vgzno1vbco\",\"userDomainName\":\"admin\",\"validityPeriod\":{\"date\":29,\"day\":5,\"hours\":0,\"minutes\":0,\"month\":11,\"seconds\":0,\"time\":1514476800000,\"timezoneOffset\":-480,\"year\":117},\"vrsAdmin\":false,\"weiboAdmin\":false},\"db\":\"use ap\",\"event\":\"createAccount\",\"init\":true,\"keys\":[\"26277da4-b84d-4cad-8c69-fc977c9397d0\"],\"sqlargs\":[[\"b9da57fa-c384-4247-a797-05fa19537730\",\"0512110000195\",null,\"0512110000195\",null,null,\"b9da57fa-c384-4247-a797-05fa19537730@hello\",\"21218cca77804d2ba1922c33e0151105\",\"zzeq2smvxtq687vgzno1vbco\",\"26277da4-b84d-4cad-8c69-fc977c9397d0\",\"10252200000000000000512110000195\",\"3b7cfbb3-b5c5-42b2-a759-209557a1ecf3\",\"522\",1,false,true,false,null],[\"b9da57fa-c384-4247-a797-05fa19537730\",\"0512110000195\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",null,\"\",\"2017-12-29\",\"HD100;HD100S;HD1000;WD1000;TS6610;TS5610;TrueLink;iPad;iPhone;Android_Phone;SKY X500 1080P60;SKY X500 1080P30;SKY X500 720P60;TrueSens X500 1080P;TrueSens X500 720P;SKY X700;SKY X900;SKY for Windows;SKY for Mac;SKY for Android TV;SKY for Android Pad;HD100-TP\",null,null,null,null,null,null],[\"b9da57fa-c384-4247-a797-05fa19537730\",\"enableMeeting\"],[\"b9da57fa-c384-4247-a797-05fa19537730\",\"enableHD\"],[\"b9da57fa-c384-4247-a797-05fa19537730\",\"enableVRS\"],[\"b9da57fa-c384-4247-a797-05fa19537730\",\"enableHDMeeting\"],[\"b9da57fa-c384-4247-a797-05fa19537730\",\"enableDCS\"],[\"b9da57fa-c384-4247-a797-05fa19537730\",\"enableWeibo\"]],\"sqls\":[\"insert into user_info(moid,account,email,e164,mobile,ip_region,jid,password,user_domain_moid,service_domain_moid,device_guid,nu_server_id,device_type,account_type,binded,enable,created_at,limited,manage_user_domain_moid) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,now(),?,?)\",\"insert into user_profile(moid,name,seat,ext_num,fax,job_num,brief,full_py,sex,date_of_birth,office_location,validity_period,restrict_used_on,restrict_strategy,portrait_32,portrait_40,portrait_64,portrait_128,portrait_256) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)\",\"insert into user_privilege_data(moid,privilege_key) values(?,?)\",\"insert into user_privilege_data(moid,privilege_key) values(?,?)\",\"insert into user_privilege_data(moid,privilege_key) values(?,?)\",\"insert into user_privilege_data(moid,privilege_key) values(?,?)\",\"insert into user_privilege_data(moid,privilege_key) values(?,?)\",\"insert into user_privilege_data(moid,privilege_key) values(?,?)\"],\"sr", 4096) = 4096write(18, "c\":\"\",\"version\":\"\"}\n\n", 21) = 21close(18) = 0munmap(0x7f26f1553000, 4096) = 0write(2, "\n[Modb] local_comsume_cb: not DEST Platform, do TRANSFER and BROADCAST if config!\n\n", 83) = 83open("/var/log/modbcore.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 18fstat(18, {st_mode=S_IFREG|0644, st_size=19544613, ...}) = 0mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f26f1553000fstat(18, {st_mode=S_IFREG|0644, st_size=19544613, ...}) = 0lseek(18, 19544613, SEEK_SET) = 19544613stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 0write(18, "[8177] 17 Sep 15:06:45 \n[Modb] local_comsume_cb: not DEST Platform, do TRANSFER and BROADCAST if config!\n\n", 106) = 106close(18) = 0munmap(0x7f26f1553000, 4096) = 0stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 0write(2, "[MQ] mq_push => timestamp = 17 Sep 15:06:45.307\n", 48) = 48epoll_ctl(3, EPOLL_CTL_ADD, 16, { EPOLLOUT, {u32=16, u64=16}}) = 0epoll_wait(3, { { EPOLLIN, {u32=17, u64=17}}, { EPOLLOUT, {u32=16, u64=16}}}, 32, 10) = 2epoll_ctl(3, EPOLL_CTL_DEL, 17, { EPOLLIN, {u32=17, u64=17}}) = 0epoll_ctl(3, EPOLL_CTL_DEL, 16, { EPOLLOUT, {u32=16, u64=16}}) = 0recvfrom(17, "\1\0\1\0\0\0\r\0<\0P\0\0\0\0\0\0\16h\0\316", 131072, 0, NULL, NULL) = 21write(2, "[MQ][PRO_t_TRANS s:17] <-- recv Basic.Ack\n", 43) = 43stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 0write(2, "[MQ][s:17] recv Basic.Ack => timestamp = 17 Sep 15:06:45.308\n", 61) = 61write(2, "[MoDB] ### CB: Publisher Confirm -- [Basic.Ack] Tag:[PRO_t_TRANS] Sock[17] Delivery_Tag:[3688] multiple:[0]\n", 109) = 109open("/var/log/modbcore.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 18fstat(18, {st_mode=S_IFREG|0644, st_size=19544719, ...}) = 0mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f26f1553000fstat(18, {st_mode=S_IFREG|0644, st_size=19544719, ...}) = 0lseek(18, 19544719, SEEK_SET) = 19544719stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 0write(18, "[8177] 17 Sep 15:06:45 [MoDB] ### CB: Publisher Confirm -- [Basic.Ack] Tag:[PRO_t_TRANS] Sock[17] Delivery_Tag:[3688] multiple:[0]\n", 132) = 132close(18) = 0munmap(0x7f26f1553000, 4096) = 0sendto(16, "\1\0\1\0\0\0\r\0<\0P\0\0\0\0\0\0\22\26\0\316", 21, MSG_NOSIGNAL, NULL, 0) = 21write(2, "[MQ][CON_f_LOCAL s:16] --> send Basic.Ack\n", 43) = 43write(2, "[MQ][CON_f_LOCAL s:16][fsm:conn_idle][C] Start to recv!\n", 56(卡住)) = 56...^CProcess 8177 detached[root@upucore_1 modbcore]#
转载地址:http://jjvxl.baihongyu.com/