问题描述
之前有个同学反馈说现网有机器netstat看不了监听端口的进程与pid,当时让他改用lsof -i:PORT方式规避,没去深究。最近在核对外网监听端口时又碰到了同样的问题:有的监听端口能显示进程名与pid,有的却显示不出来。
定位分析
为啥有的端口ok有的不能显示?是否是监听端口范围问题?用python -m SimpleHTTPServer测试监听不同端口,netstat都不能查出进程名pid
说明这台机器有问题,strace跟踪比较6200、24849这2个进程pid有什么不同,netstat到底做了什么?
进程pid: 6200
进程pid:24849
对比发现,netstat在处理6200这个pid时,没有读取/proc/PID/cmdline。为啥没读取呢?估计得看netstat源码了。偷懒google了一把。发现之前有人遇到过这个问题,原因是:
netstat低版本有bug:当socket的inode号(即上文所说的Socket id)大于 2^31 时,会造成无法显示进程信息。
源码解惑
有点奇怪为啥是大于2^31就不行,这个博客也没讲清楚。下载了当前系统netstat 1.6.0版源码, grep -r -i "cmdline"狂扫。在netstat.c文件找到处理cmdline的函数代码:- static void prg_cache_load(void) /* Scans PATH_PROC and, for each all-digit entry, walks its fd directory; every socket inode found is registered through prg_cache_add() with the label "<entry name>/<command name>". */
- {
- char line[LINE_MAX],eacces=0; /* line: scratch path buffer; eacces: set once any fd directory fails to open with EACCES */
- int procfdlen,fd,cmdllen,lnamelen;
- char lname[30],cmdlbuf[512],finbuf[PROGNAME_WIDTH];
- long inode;
- const char *cs,*cmdlp;
- DIR *dirproc=NULL,*dirfd=NULL;
- struct dirent *direproc,*direfd;
- if (prg_cache_loaded || !flag_prg) return; /* nothing to do when the cache is already loaded or flag_prg is unset */
- prg_cache_loaded=1;
- cmdlbuf[sizeof(cmdlbuf)-1]='\0'; /* pre-terminate: read() below fills at most sizeof(cmdlbuf)-1 bytes */
- if (!(dirproc=opendir(PATH_PROC))) goto fail;
- while (errno=0,direproc=readdir(dirproc)) { /* errno is cleared before every readdir() call */
- #ifdef DIRENT_HAVE_D_TYPE_WORKS
- if (direproc->d_type!=DT_DIR) continue;
- #endif
- for (cs=direproc->d_name;*cs;cs++) /* only entries whose name consists solely of digits are examined */
- if (!isdigit(*cs))
- break;
- if (*cs)
- continue;
- procfdlen=snprintf(line,sizeof(line),PATH_PROC_X_FD,direproc->d_name); /* build this entry's fd directory path */
- if (procfdlen<=0 || procfdlen>=sizeof(line)-5) /* skip on snprintf failure or when fewer than 5 spare bytes would remain */
- continue;
- errno=0;
- dirfd=opendir(line);
- if (! dirfd) {
- if (errno==EACCES)
- eacces=1;
- continue;
- }
- line[procfdlen] = '/'; /* replace the terminating NUL so an fd entry name can be appended at procfdlen+1 */
- cmdlp = NULL; /* command name not read yet for this process; it is read lazily on the first socket fd */
- while ((direfd = readdir(dirfd))) {
- #ifdef DIRENT_HAVE_D_TYPE_WORKS
- if (direfd->d_type!=DT_LNK)
- continue;
- #endif
- if (procfdlen+1+strlen(direfd->d_name)+1>sizeof(line)) /* path + '/' + name + NUL must fit in line */
- continue;
- memcpy(line + procfdlen - PATH_FD_SUFFl, PATH_FD_SUFF "/", /* restore the fd suffix: the strcpy of PATH_CMDLINE further down overwrites this part of line */
- PATH_FD_SUFFl+1);
- strcpy(line + procfdlen + 1, direfd->d_name);
- lnamelen=readlink(line,lname,sizeof(lname)-1);
- lname[lnamelen] = '\0'; /*make it a null-terminated string*/ /* NOTE(review): the readlink() result is not checked; a -1 return would index lname[-1] here */
- extract_type_1_socket_inode(lname, &inode); /* first try the "socket:[N]" link format */
- if (inode < 0) extract_type_2_socket_inode(lname, &inode); /* then the "[0000]:N" format */
- if (inode < 0) continue; /* neither parser accepted the link target: this fd is skipped and the cmdline code below is not reached for it */
- if (!cmdlp) {
- if (procfdlen - PATH_FD_SUFFl + PATH_CMDLINEl >= /* skip this fd if the cmdline path would not fit (same 5-byte margin as above) */
- sizeof(line) - 5)
- continue;
- strcpy(line + procfdlen-PATH_FD_SUFFl, PATH_CMDLINE); /* reuse line: swap the fd suffix for PATH_CMDLINE */
- fd = open(line, O_RDONLY);
- if (fd < 0)
- continue;
- cmdllen = read(fd, cmdlbuf, sizeof(cmdlbuf) - 1); #读取/proc/PID/cmdline
- if (close(fd)) /* a failing close() also discards this fd entry */
- continue;
- if (cmdllen == -1)
- continue;
- if (cmdllen < sizeof(cmdlbuf) - 1) /* short read: terminate right after the data; a full read relies on the NUL stored before the scan */
- cmdlbuf[cmdllen]='\0';
- if ((cmdlp = strrchr(cmdlbuf, '/'))) /* keep only what follows the last '/' of the string in cmdlbuf, or the whole string if there is none */
- cmdlp++;
- else
- cmdlp = cmdlbuf;
- }
- snprintf(finbuf, sizeof(finbuf), "%s/%s", direproc->d_name, cmdlp); /* label is "<entry name>/<command name>", truncated to PROGNAME_WIDTH */
- prg_cache_add(inode, finbuf);
- }
- closedir(dirfd);
- dirfd = NULL; /* reset so the cleanup after the loop cannot close it a second time */
- }
- if (dirproc)
- closedir(dirproc);
- if (dirfd)
- closedir(dirfd);
- if (!eacces) /* no permission failure was seen: finish without printing anything */
- return;
- if (prg_cache_loaded == 1) { /* NOTE(review): set to 1 above and not changed in this function; presumably prg_cache_add() changes it once an entry is stored, which would select the else branch - TODO confirm */
- fail: /* also reached directly by the goto when opendir(PATH_PROC) fails */
- fprintf(stderr,_("(No info could be read for \"-p\": geteuid()=%d but you should be root.)\n"),
- geteuid());
- }
- else
- fprintf(stderr, _("(Not all processes could be identified, non-owned process info\n"
- " will not be shown, you would have to be root to see it all.)\n"));
- }
回头看,如果 inode < 0,就会直接 continue,根本走不到读取 /proc/PID/cmdline 的那段代码。而 inode 的值来自 extract_type_1_socket_inode、extract_type_2_socket_inode 这两个函数:
- static void extract_type_1_socket_inode(const char lname[], long * inode_p) { /* Parses a link target and stores the inode number, or -1 on any mismatch, in *inode_p. */
- /* If lname is of the form "socket:[12345]", extract the "12345"
- as *inode_p. Otherwise, return -1 as *inode_p.
- */
- if (strlen(lname) < PRG_SOCKET_PFXl+3) *inode_p = -1; /* shorter than the prefix plus 3 characters: cannot match */
- else if (memcmp(lname, PRG_SOCKET_PFX, PRG_SOCKET_PFXl)) *inode_p = -1; /* does not start with PRG_SOCKET_PFX */
- else if (lname[strlen(lname)-1] != ']') *inode_p = -1; /* must end with ']' */
- else {
- char inode_str[strlen(lname + 1)]; /* e.g. "12345" */
- const int inode_str_len = strlen(lname) - PRG_SOCKET_PFXl - 1; /* characters between the prefix and the closing ']' */
- char *serr;
- strncpy(inode_str, lname+PRG_SOCKET_PFXl, inode_str_len);
- inode_str[inode_str_len] = '\0'; /* strncpy does not terminate here, so terminate explicitly */
- *inode_p = strtol(inode_str,&serr,0); /* base 0: strtol picks decimal, octal or hex from the text */
- if (!serr || *serr || *inode_p < 0 || *inode_p >= INT_MAX) /* rejects trailing characters, negative values and every value >= INT_MAX, so such inode numbers are reported as -1 */
- *inode_p = -1;
- }
- }
- static void extract_type_2_socket_inode(const char lname[], long * inode_p) { /* Parses a link target and stores the inode number, or -1 on any mismatch, in *inode_p. */
- /* If lname is of the form "[0000]:12345", extract the "12345"
- as *inode_p. Otherwise, return -1 as *inode_p.
- */
- if (strlen(lname) < PRG_SOCKET_PFX2l+1) *inode_p = -1; /* needs at least one character after the prefix */
- else if (memcmp(lname, PRG_SOCKET_PFX2, PRG_SOCKET_PFX2l)) *inode_p = -1; /* does not start with PRG_SOCKET_PFX2 */
- else {
- char *serr;
- *inode_p=strtol(lname + PRG_SOCKET_PFX2l,&serr,0); /* parse everything after the prefix; base 0 lets strtol pick decimal, octal or hex */
- if (!serr || *serr || *inode_p < 0 || *inode_p >= INT_MAX) /* rejects trailing characters, negative values and every value >= INT_MAX, so such inode numbers are reported as -1 */
- *inode_p = -1;
- }
- }
现在可以确认: 当socket inode大于等于INT_MAX(2^31-1)时,上面两个函数都会把inode置为-1,prg_cache_load因此跳过该fd,不会把它记入缓存,结果就是netstat显示不出对应的进程名和pid。
ref: http://xiezhenye.com/2014/01/%E6%9F%90%E4%BA%9B-linux-%E5%8F%91%E8%A1%8C%E7%89%88%E4%B8%AD-netstat-%E7%9A%84-bug.html