英文文章词频统计:
功能:统计一篇英文文章的单词总数及出现频数并输出,之后排序,输出频数前五的单词及其频数。
实现方法:使用C语言,用fopen函数读入txt文件,fscanf函数逐个读入单词,结构体wordNode存储单词及其频数,以链表的形式连接在一起,最后使用归并排序按频数从高到低排序,输出频数最高的5个单词。
头文件
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
定义宏
/* Failure code for functions whose return type is `status`. */
#define ERROR 1
/* Success code for functions whose return type is `status`. */
#define OK 0
/* Size, in chars, of every word buffer (the terminating NUL included). */
#define WORD_LENGTH 250
自定义数据类型
/* Integer result code; see the ERROR and OK macros. */
typedef int status;

/* One entry of the singly linked word-frequency list. */
struct Node
{
    char word[WORD_LENGTH]; // NUL-terminated text of the word
    int time;               // how many times the word has been counted
    struct Node *next;      // following entry, or NULL at the end of the list
};

typedef struct Node wordNode;
定义全局变量
/* Head of the global word-frequency list; NULL while the list is empty. */
wordNode *headNode = NULL;
声明所有使用的函数
/* Find the node holding `word`, adding a new node to the list when absent. */
wordNode *wordSearch(char *word,int *num);
/* Increment the occurrence counter of `word`; returns ERROR if no node is available. */
status wordCount(char *word,int *num);
/* Print the total stored in *num followed by every word with its count. */
void printCountList(int *num);
/* Print the leading entries of the list (the most frequent words once it is sorted). */
void PrintFirstFiveTimes();
/* Merge-sort the list in place through the pointer to its head. */
void mergeSort(wordNode **head);
/* Split a list into a front part (*pre) and a back part (*next). */
void FrontBackSplit(wordNode *head,wordNode **pre,wordNode **next);
/* Clean a word in place: lower-case capitals and strip non-letter characters. */
void wordJob(char word[]);
/* Merge two lists, taking the node with the larger count first. */
wordNode *SortedMerge(wordNode *pre,wordNode *next);
/* Free every node of the global list. */
void release();
主函数
status main(int argc,char *argv[])
{
char temp[WORD_LENGTH];//定义用以临时存放单词的数组
FILE *file;
int count;
int articleWordNum = ;//定义统计结点个数的变量
int *num = &articleWordNum;
if((file = fopen("F:\\zc\\c\\yjs\\file.txt", "r")) == NULL)
{
printf("文件读取失败!");
exit();
}
while((fscanf(file,"%s",temp))!= EOF)
{
wordJob(temp);
count = wordCount(temp,num);
}
fclose(file);
printf("\n输出所有单词的频数\n");
printCountList(num);
printf("\n输出词频最高的5个词\n");
mergeSort(&headNode); //排序
PrintFirstFiveTimes();
release();
return ;
}
查找单词所在结点并返回其地址
/*
 * Return the list node that stores `word`, creating it when necessary.
 *
 * word  NUL-terminated word to look up (compared with strcmp).
 * num   counter of nodes in the list; incremented by one each time a new
 *       node is created.
 *
 * A new node starts with a count of zero (the caller increments it). It
 * becomes the head when the list is empty and is otherwise linked in
 * directly after the head.
 *
 * Returns the matching or newly created node, or NULL when memory for a
 * new node cannot be allocated.
 */
wordNode *wordSearch(char *word,int *num)
{
    for (wordNode *cur = headNode; cur != NULL; cur = cur->next)
    {
        if (strcmp(cur->word, word) == 0)
        {
            return cur;
        }
    }

    wordNode *node = malloc(sizeof *node);
    if (node == NULL)
    {
        return NULL;
    }

    // Bounded copy: always NUL-terminates and never writes past node->word.
    snprintf(node->word, sizeof node->word, "%s", word);
    node->time = 0;

    if (headNode == NULL)
    {
        // The first node must terminate the list; leaving `next` unset
        // makes every later traversal read an indeterminate pointer.
        node->next = NULL;
        headNode = node;
    }
    else
    {
        node->next = headNode->next;
        headNode->next = node;
    }

    *num += 1;
    return node;
}
进行词频统计
status wordCount(char *word,int *num)
{
wordNode *tmpNode = NULL;
tmpNode = wordSearch(word,num); //word所在的节点
if(tmpNode == NULL)
{
return ERROR;
}
tmpNode->time++;
return ;
}
输出所有词频
/*
 * Print the value of *num as the total, then every node of the global list
 * as "word:count", in list order. Prints a notice instead when the list is
 * empty.
 */
void printCountList(int *num)
{
    if (headNode == NULL)
    {
        printf("该文件无内容!");
        return;
    }

    printf("\n\t总计 %d \n", *num);
    for (const wordNode *entry = headNode; entry != NULL; entry = entry->next)
    {
        printf("\n\t%s:%d次\n", entry->word, entry->time);
    }
}
输出词频最高的5个词
/*
 * Print the first five nodes of the global list as "word:count" (fewer if
 * the list is shorter). Call it after mergeSort() so that these are the
 * five most frequent words. Prints a notice instead when the list is empty.
 */
void PrintFirstFiveTimes()
{
    enum { TOP_COUNT = 5 };

    if (headNode == NULL)
    {
        printf("该文件无内容!");
        return;
    }

    int shown = 0;
    for (const wordNode *entry = headNode;
         entry != NULL && shown < TOP_COUNT;
         entry = entry->next, shown++)
    {
        printf("\n\t%s:%d次\n", entry->word, entry->time);
    }
}
对词频统计结果进行归并排序
/*
 * Sort the list whose head pointer is *headnode using recursive merge sort
 * and store the new head back through headnode. The ordering is whatever
 * SortedMerge() produces.
 */
void mergeSort(wordNode **headnode)
{
    wordNode *list = *headnode;
    wordNode *front;
    wordNode *back;

    // An empty or single-node list is already sorted.
    if (list != NULL && list->next != NULL)
    {
        FrontBackSplit(list, &front, &back);
        mergeSort(&front);
        mergeSort(&back);
        *headnode = SortedMerge(front, back);
    }
}
将链表从中间拆分为前后两部分
/*
 * Cut the list starting at `source` into two halves.
 *
 * *pre receives the front half (always `source` itself) and *next the back
 * half. With an odd number of nodes the extra node stays in the front half.
 * A list of fewer than two nodes yields *next == NULL.
 */
void FrontBackSplit(wordNode *source,wordNode **pre,wordNode **next)
{
    *pre = source;
    if (source == NULL || source->next == NULL)
    {
        *next = NULL;
        return;
    }

    // `runner` moves two nodes for every node `middle` moves, so `middle`
    // rests on the last node of the front half when `runner` runs out.
    wordNode *middle = source;
    wordNode *runner = source->next;
    while (runner != NULL && runner->next != NULL)
    {
        middle = middle->next;
        runner = runner->next->next;
    }

    *next = middle->next;
    middle->next = NULL; // detach the front half from the back half
}
按频数从高到低合并两个已排序的链表
/*
 * Merge two lists into one, always taking next the node with the larger
 * `time`; on a tie the node from `pre` is taken first. When both inputs are
 * in descending order of `time`, so is the result.
 *
 * The merge is iterative, so stack usage stays constant however long the
 * lists are (a recursive merge needs one stack frame per node).
 *
 * Returns the head of the merged list, or NULL when both inputs are empty.
 */
wordNode *SortedMerge(wordNode *pre,wordNode *next)
{
    wordNode *merged = NULL;
    wordNode **tail = &merged; // link through which the next node is attached

    while (pre != NULL && next != NULL)
    {
        if (pre->time >= next->time)
        {
            *tail = pre;
            pre = pre->next;
        }
        else
        {
            *tail = next;
            next = next->next;
        }
        tail = &(*tail)->next;
    }

    // At most one input still has nodes; append it unchanged.
    *tail = (pre != NULL) ? pre : next;
    return merged;
}
处理单词
/*
 * Normalise a word in place: ASCII capitals become lower case and every
 * character that is not an ASCII letter is removed. The result may be the
 * empty string when the input holds no letters.
 *
 * word  writable NUL-terminated string; it never grows.
 */
void wordJob(char word[])
{
    size_t kept = 0; // write position; never ahead of the read position

    for (size_t pos = 0; word[pos] != '\0'; pos++)
    {
        char c = word[pos];

        if (c >= 'A' && c <= 'Z')
        {
            c = (char)(c - 'A' + 'a');
        }
        if (c >= 'a' && c <= 'z')
        {
            word[kept++] = c;
        }
    }
    word[kept] = '\0';
}
释放所有结点内存
/*
 * Free every node of the global list and leave headNode set to NULL.
 * Does nothing when the list is already empty.
 */
void release()
{
    wordNode *cur = headNode;

    while (cur != NULL)
    {
        wordNode *following = cur->next; // read the link before freeing the node
        free(cur);
        cur = following;
    }
    headNode = NULL;
}
git@git.coding.net:amberpass/Calculate_words.git
https://git.coding.net/amberpass/Calculate_words.git
程序运行结果: