周梦康 发表于 2021-09-17 54 次浏览

遇到的情况:有一台旧的 PHP 共享服务器收到报警,现状是 CPU 100%,内存使用不到 40%,但是负载到了 100 多。
负载过高导致 MySQL 客户端网络 I/O 模式发生变化的问题

抓几个占用 CPU 的进程

$ ps aux --sort=-pcpu | head -10
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
admin    19782  2.1  0.2 244368 17800 ?        R    15:09   0:02 php-fpm: pool www
admin    20007  2.0  0.1 242748 16160 ?        S    15:09   0:02 php-fpm: pool www
admin    19517  2.0  0.2 247076 20084 ?        S    15:08   0:03 php-fpm: pool www
admin    19740  2.0  0.2 243600 16480 ?        S    15:08   0:02 php-fpm: pool www
admin    17632  2.0  0.2 244388 18016 ?        S    15:04   0:07 php-fpm: pool www
admin    17616  2.0  0.3 251552 24904 ?        S    15:04   0:07 php-fpm: pool www
admin    19765  2.0  0.2 247484 20724 ?        S    15:09   0:02 php-fpm: pool www
admin    19766  2.0  0.2 244880 17728 ?        S    15:09   0:02 php-fpm: pool www

然后统计了下系统调用占用比例情况

$ sudo strace -c -p {pid}

基本如下

% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 34.86    0.014677           6      2626           munmap
 13.80    0.005809           5      1267           poll
 12.03    0.005067           3      1935      1030 access
  9.58    0.004035           1      3909           gettimeofday
  5.47    0.002305           0      7497           fstat
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 20.76    0.008707           3      2543           poll
 17.47    0.007330           1      4919           munmap
 10.84    0.004548           1      3803      2083 access
  8.05    0.003375           0     14324           fstat
  7.15    0.003000          52        58           accept
$ strace -s 1024 -p {pid}
access("xxx/vendor/composer/../xxx.php", F_OK) = -1 ENOENT (No such file or directory)
access("a/xxx.php", F_OK) = -1 ENOENT (No such file or directory)
access("b/xxx.php", F_OK) = -1 ENOENT (No such file or directory)
gettimeofday({1631691482, 828453}, NULL) = 0
gettimeofday({1631691482, 828480}, NULL) = 0
open("c/xxx.php", O_RDONLY) = 4
fstat(4, {st_mode=S_IFREG|0664, st_size=23359, ...}) = 0
fstat(4, {st_mode=S_IFREG|0664, st_size=23359, ...}) = 0
fstat(4, {st_mode=S_IFREG|0664, st_size=23359, ...}) = 0
mmap(NULL, 23359, PROT_READ, MAP_SHARED, 4, 0) = 0x7f72f24ba000
munmap(0x7f72f24ba000, 23359)           = 0

munmap 主要来自加载代码文件:从上面的 trace 可以看到,每个 PHP 文件都是 open、fstat 之后 mmap 读入,随后立即 munmap;access 则主要是自动加载时反复探测不存在的文件路径,返回 ENOENT,有优化的空间。

poll 就非常诡异了

sendto(5, "\207\0\0\0\3select * from * where * order by id asc", 139, MSG_DONTWAIT, NULL, 0) = 139
poll([{fd=5, events=POLLIN|POLLERR|POLLHUP}], 1, 1471228928) = 1 ([{fd=5, revents=POLLIN}])
recvfrom(5, "*", 78, MSG_DONTWAIT, NULL, NULL) = 78
poll([{fd=5, events=POLLIN|POLLERR|POLLHUP}], 1, 1471228928) = 1 ([{fd=5, revents=POLLIN}])
recvfrom(5, "*", 82, MSG_DONTWAIT, NULL, NULL) = 82
poll([{fd=5, events=POLLIN|POLLERR|POLLHUP}], 1, 1471228928) = 1 ([{fd=5, revents=POLLIN}])
recvfrom(5, "*", 82, MSG_DONTWAIT, NULL, NULL) = 82
poll([{fd=5, events=POLLIN|POLLERR|POLLHUP}], 1, 1471228928) = 1 ([{fd=5, revents=POLLIN}])
recvfrom(5, "*", 82, MSG_DONTWAIT, NULL, NULL) = 82
poll([{fd=5, events=POLLIN|POLLERR|POLLHUP}], 1, 1471228928) = 1 ([{fd=5, revents=POLLIN}])
recvfrom(5, "*", 82, MSG_DONTWAIT, NULL, NULL) = 82
poll([{fd=5, events=POLLIN|POLLERR|POLLHUP}], 1, 1471228928) = 1 ([{fd=5, revents=POLLIN}])
recvfrom(5, "*", 82, MSG_DONTWAIT, NULL, NULL) = 82
poll([{fd=5, events=POLLIN|POLLERR|POLLHUP}], 1, 1471228928) = 1 ([{fd=5, revents=POLLIN}])
recvfrom(5, "*", 82, MSG_DONTWAIT, NULL, NULL) = 82
poll([{fd=5, events=POLLIN|POLLERR|POLLHUP}], 1, 1471228928) = 1 ([{fd=5, revents=POLLIN}])
recvfrom(5, "*", 82, MSG_DONTWAIT, NULL, NULL) = 82
poll([{fd=5, events=POLLIN|POLLERR|POLLHUP}], 1, 1471228928) = 1 ([{fd=5, revents=POLLIN}])
recvfrom(5, "*", 82, MSG_DONTWAIT, NULL, NULL) = 40

为什么客户端读取的时候会用 poll?这是同一个网络请求,应该不存在多路复用的场景,而且每次 recvfrom 传入的 buffer 长度只有 82 字节。
后面通过降级方案,负载下来之后,同样的请求的系统调用却变得合理了,一次性读取完毕。

write(5, "\207\0\0\0\3select * from * where * order by id asc", 139) = 139
read(5, "*", 16384) = 692 # 一次性读取完毕

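对比发现,在高负载的情况下,收发数据用的是 sendto/recvfrom,并且多了一个参数 MSG_DONTWAIT。它表示本次调用以非阻塞方式执行(只对这一次调用生效,并不会像 O_NONBLOCK 那样把 fd 本身设置成非阻塞模式),所以客户端需要先用 poll 等待 fd 可读,再去 recvfrom。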

评论列表