Closed. This question is off-topic. It is not currently accepting answers. Learn more。
想改进这个问题吗Update the question所以堆栈溢出的值小于aa>。
两个月前关闭。
我开始做一个小的项目来实现一个小型的C控制台程序来进行快速的首字母缩略词搜索我在一家公司/行业工作,那里有1000个缩写!
这些首字母缩略词是从各种文档中收集(复制/粘贴)并放入一个文本文件中的程序将此文件的内容加载到内存中,并将其存储在链接列表中。
我使用的格式是:
缩略词定义
或者在首字母缩略词有多种定义的情况下:
缩写#定义1;定义2;定义3
总的来说,程序运行良好-实现了一些基本的错误处理,节省了我查找多个文档的时间。
不过,令我好奇的是首字母缩略词列表的加载时间对于大约900个缩略词,将列表加载到内存中并准备好接受用户输入大约需要0.35秒。
两个关键功能是:
我只是复制粘贴我现有的清单很多次,看看需要多长时间加载:
缩略词-秒
1640-0.91秒(1800首字母缩写词/秒)
3280-3.64秒
6560-14.27秒
13120-57.30秒(229个缩写词/秒)
26240-231.75秒
52480~927秒
104960~3708秒
209920-14083秒(15个缩写词/秒)
最后一次,我让它通宵运行。
实际上,我认为最多会有5000个首字母缩略词,甚至不太可能——但我只是想看看是否有办法改善这个加载时间。也许使用不同的数据结构?
应避免在每次迭代时重新计算输入长度:
在
还有其他问题:
文件内容应该为空终止符分配一个额外的字节
我在OS/X上做了一些测试:clang能够优化循环外对
想改进这个问题吗Update the question所以堆栈溢出的值小于aa>。
两个月前关闭。
我开始做一个小的项目来实现一个小型的C控制台程序来进行快速的首字母缩略词搜索我在一家公司/行业工作,那里有1000个缩写!
这些首字母缩略词是从各种文档中收集(复制/粘贴)并放入一个文本文件中的程序将此文件的内容加载到内存中,并将其存储在链接列表中。
我使用的格式是:
缩略词定义
或者在首字母缩略词有多种定义的情况下:
缩写#定义1;定义2;定义3
总的来说,程序运行良好-实现了一些基本的错误处理,节省了我查找多个文档的时间。
不过,令我好奇的是首字母缩略词列表的加载时间对于大约900个缩略词,将列表加载到内存中并准备好接受用户输入大约需要0.35秒。
两个关键功能是:
char * loadACR( FILE * fptr, unsigned long int file_size )
AcronymDB * mapAcroDefs( const char * filecont )
loadACR
将整个文件加载到内存中mapAcroDefs
调用其他函数来创建首字母缩略词数据库、处理文件内容、创建新定义、遍历链接列表、存储等。#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <dirent.h>
#include <ctype.h>
#include "resources.h"
/*************** DEFS **********************/
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#define acro_length 16
#define MAX_FILE_SIZE 3145728 //(in bytes == 3MB)
#define SET_GREEN SetConsoleTextAttribute( GetStdHandle( STD_OUTPUT_HANDLE ), FOREGROUND_GREEN | FOREGROUND_INTENSITY );
#define SET_RED SetConsoleTextAttribute( GetStdHandle( STD_OUTPUT_HANDLE ), FOREGROUND_RED | FOREGROUND_INTENSITY );
#define SET_CYAN SetConsoleTextAttribute( GetStdHandle( STD_OUTPUT_HANDLE ), FOREGROUND_INTENSITY | FOREGROUND_GREEN | FOREGROUND_BLUE );
#define SET_WHITE SetConsoleTextAttribute( GetStdHandle( STD_OUTPUT_HANDLE ), FOREGROUND_INTENSITY | FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_BLUE );
/******************************************/
/*************** GLOBALS *******************/
//#include <sys/time.h>
//struct timeval tv1,tv2;
/*******************************************/
/*************** STRUCTS *******************/
//associated definitions in a linked list
typedef struct defnList
{
char * defn;
struct defnList * next;
} Definition;
//acronym entry
typedef struct
{
char * acronym;
Definition * f_defn; //a pointer to the first definition
} Acronym;
//database of acronyms
typedef struct
{
unsigned int entries;
Acronym ** acronyms; //array of pointers to acronym entries
} AcronymDB;
/******************************************/
void throwError( unsigned short int err_code, AcronymDB * entry_list, unsigned int r )
{
SET_RED
switch ( err_code )
{
//retFileHandle errors
case 1:
puts( "Directory path error - check access rights" );
break;
//getFileSize errors
case 5:
puts( "'.acr' file is greater than 3MB, reduce the file size and try again.\n" );
break;
//macroAcroDefs errors
case 35:
case 36:
case 37:
printf( "Error(s) found at lines %u to %u\n", r - 1, r + 1 );
printf( "Code: %d\n", err_code );
break;
//main() errors
case 40:
puts( "File error - ensure a '.acr' definitions file exists in the program folder." );
break;
//malloc errors
default:
printf( "Memory Allocation Error - Code: %d\n", err_code );
}
free( entry_list );
puts ( "Program ended" );
getchar();
exit( EXIT_FAILURE );
}
void chkMalloc( void * chk_me, unsigned short int err_code, AcronymDB * entry_list )
{
if ( !chk_me && ( sizeof ( chk_me ) > 0 ) )
{
throwError( err_code, entry_list, 0 );
}
}
char * getInputAcronym()
{
char input_acro[ acro_length ], c;
unsigned int i = 0;
while( ( c = getchar() ) != '\n' && c != EOF )
{
if( acro_length == i || c == ' ' )
{
fflush( stdin );
return ( "err_format" );
}
//some acronyms contain parentheses, hyphens etc, these are ignored if the user enters it
//as they are removed in the mapAcroDefs function before adding into the list
if ( c != '-' &&
c != '(' &&
c != ')' &&
c != '&' &&
c != '/' )
{
input_acro[ i ] = toupper( c );
i++;
}
}
input_acro[ i ] = '\0';
return strdup( input_acro );
}
FILE * retFileHandle() //errcodes 1 - 4
{
DIR * dptr;
struct dirent * directory;
FILE * acr_file;
dptr = opendir( "." );
if ( dptr == NULL ) throwError( 1, NULL, 0 );
while( ( directory = readdir( dptr ) ) != NULL )
{
//check whether the file name contains .acr
//n.b: only the first found .acr occurrence will be used
if( strstr( directory -> d_name, ".acr" ) )
{
acr_file = fopen( directory -> d_name , "r" );
closedir( dptr );
return ( acr_file );
}
}
return NULL;
}
unsigned long int getFileSize( FILE * fptr ) //errcodes 5 - 9
{
unsigned long int file_size, prev;
prev = ftell( fptr );
fseek( fptr, 0, SEEK_END );
file_size = ftell( fptr );
fseek( fptr, prev, SEEK_SET );
// > 3MB
if( file_size >= MAX_FILE_SIZE ) throwError( 5, NULL, 0 );
else
{
return ( file_size );
}
return 0; // should never get here, but the compiler complains of returning non-void
}
//load the acr file into memory
char * loadACR( FILE * fptr, unsigned long int file_size ) //errcodes 10 - 14
{
char * read_buffer;
read_buffer = ( char * ) malloc ( file_size );
chkMalloc( read_buffer, 10, NULL );
//fread returns the total number of elements, so set that last element to '\0' to remove random characters at the end
read_buffer[ fread( read_buffer, 1, file_size, fptr ) ] = '\0';
return ( read_buffer );
}
unsigned int countRows( const char * filecont )
{
unsigned int rows = 0, i;
for ( i = 0; i < strlen( filecont ); i++ )
{
//disregard blank lines at the top or somewhere in the middle
if ( ( '\n' == filecont[ i ] ) && ( 0 != i ) && ( '\n' != filecont[ i + 1 ] ) )
{
rows++;
}
}
//if there's a new line at the end, disregard
if ( ( '\0' == filecont[ i ] ) && ( '\n' == filecont[ i - 1 ] ) ) rows--;
return ( rows );
}
void remPunct( char * delimbuff )
{
char * src, * dst;
for ( src = dst = delimbuff; * src != '\0'; src++ )
{
* dst = * src;
if ( * dst != '-' &&
* dst != '(' &&
* dst != ')' &&
* dst != '&' &&
* dst != '/' )
{
dst++;
}
}
* dst = '\0';
}
void addAcronym( AcronymDB * entry_list, char * delimbuff, unsigned int r ) //errcodes 15 - 19
{
char * ptr_delim;
//convert the characters to uppercase to make matching easier later, some acronyms have a mix of upper and lowercase
for ( ptr_delim = delimbuff; * ptr_delim != '\0'; ptr_delim++ )
{
* ptr_delim = toupper( * ptr_delim );
}
entry_list -> acronyms[ r ] = malloc( sizeof( Acronym ) );
chkMalloc( entry_list -> acronyms[ r ], 15, entry_list );
entry_list -> acronyms[ r ] -> acronym = strdup( delimbuff );
chkMalloc( entry_list -> acronyms[ r ] -> acronym, 16, entry_list );
}
Definition * initDefn() //errcodes 20 - 24
{
Definition * newDefn = malloc( sizeof( Definition ) );
chkMalloc( newDefn, 20, NULL );
newDefn = NULL;
return ( newDefn );
}
Definition * addDefn( AcronymDB * entry_list, Definition * newDefn, char * delimbuff, unsigned int r ) //errcodes 25 - 29
{
Definition * head = newDefn;
Definition * currDefn;
//first entry if head is NULL, allocate space for the first definition and store it
if ( NULL == head )
{
head = entry_list -> acronyms[ r ] -> f_defn = malloc( sizeof( Definition ) );
chkMalloc( entry_list -> acronyms[ r ] -> f_defn, 25, entry_list );
entry_list -> acronyms[ r ] -> f_defn -> defn = strdup( delimbuff );
chkMalloc( entry_list -> acronyms[ r ] -> f_defn -> defn, 26, entry_list );
head -> next = entry_list -> acronyms[ r ] -> f_defn -> next = NULL;
return ( head );
}
//else go through all the existing definitions and put the definition at the end of the list
currDefn = entry_list -> acronyms[ r ] -> f_defn;
while ( NULL != currDefn -> next )
{
currDefn = currDefn -> next;
}
currDefn -> next = malloc ( sizeof( Definition ) );
chkMalloc( currDefn -> next, 27, entry_list );
currDefn = currDefn -> next;
currDefn -> defn = strdup( delimbuff );
chkMalloc( currDefn -> defn, 28, entry_list );
currDefn -> next = NULL;
return ( head );
}
//create space for the database of acronyms and enough space for the entry list based on rows counted earlier
AcronymDB * initAcroDB( unsigned int rows ) //errcodes 30 - 34
{
AcronymDB * entry_list = malloc( sizeof( AcronymDB ) );
chkMalloc( entry_list, 30, 0 );
entry_list -> acronyms = malloc( sizeof( Acronym * ) * rows );
chkMalloc( entry_list -> acronyms, 31, entry_list );
entry_list -> entries = rows;
return ( entry_list );
}
Definition * procMultiDefs( const char * pStart, const char * pCurrent, AcronymDB * entry_list, Definition * newDefn, unsigned int r )
{
//i hope 250 characters is enough? :S
char tmpdefhold[ 250 ] = { 0 };
//we don't want to memmove here as it will affect pCurrent as well, just change where pStart is pointing
if ( ' ' == pStart[ 0 ] ) ++pStart;
strncpy( tmpdefhold, pStart, pCurrent - pStart );
newDefn = addDefn( entry_list, newDefn, tmpdefhold, r );
return ( newDefn );
}
AcronymDB * mapAcroDefs( const char * filecont ) //errcodes 35 - 39
{
SET_CYAN printf( "Acronym Search v1.1\n" );
SET_WHITE printf( "Reading acronym list...\n" );
unsigned int rows = countRows( filecont ), r = 0;
char * tmp_filecont, * delimbuff;
//copy contents of file into a temporary buffer so it can be passed to strtok
tmp_filecont = ( char * ) malloc ( ( strlen( filecont ) + 1 ) * sizeof( char ) );
if ( NULL == tmp_filecont ) throwError( 30, NULL, 0 );
strncpy( tmp_filecont, filecont, strlen( filecont ) + 1 ); //create a temporary string and null terminate it
tmp_filecont[ strlen( tmp_filecont ) ] = '\0';
//create space for database of acronyms based on entries counted
AcronymDB * entry_list = initAcroDB( rows );
//tokenise the file contents, the first split will be an acronym entry including a space before the hash
delimbuff = strtok( tmp_filecont, "#\n" );
while ( r <= rows )
{ //catch potential errors in the file (not tokenised according to # or \n
if ( NULL == delimbuff )
{
free( tmp_filecont );
throwError( 35, entry_list, r + 1 );
}
//return location of space before the hash
char * spcptr = strchr( delimbuff, ' ' );
if ( NULL == spcptr )
{
free( tmp_filecont );
throwError( 36, entry_list, r + 1 );
}
//terminate the word correctly, hopefully we have a valid acronym entry now
* spcptr = '\0';
remPunct( delimbuff );
addAcronym( entry_list, delimbuff, r );
//allocate memory for a new definition
Definition * newDefn = initDefn();
while ( NULL != delimbuff )
{
delimbuff = strtok( NULL, "#\n" );
//get rid of the space that will be present, otherwise there's a formatting error present, missing hash/other char
if ( ' ' == delimbuff[ 0 ] ) memmove( delimbuff, delimbuff + 1, strlen( delimbuff ) );
//multiple definitions exist
if ( NULL != strstr( delimbuff, ";" ) )
{
char * pStart = delimbuff, * pCurrent = delimbuff;
//go through the entire string and process the multiple definitions
//two pointers are used to mark the string from <start to ';'>
//https://stackoverflow.com/questions/49788179/how-to-get-a-substring-using-strchr-in-multiple-occurrences-of-in-c-and-stor
while ( '\0' != * pCurrent)
{
if ( ';' == * pCurrent )
{
newDefn = procMultiDefs( pStart, pCurrent, entry_list, newDefn, r );
pStart = pCurrent + 1;
}
++pCurrent;
}
//last definition after ; wouldn't be captured by the loop above, so do this again immediately after
newDefn = procMultiDefs( pStart, pCurrent, entry_list, newDefn, r );
}
else
{
newDefn = addDefn( entry_list, newDefn, delimbuff, r );
}
//go to the next line now
if ( NULL == strstr( delimbuff, "\n" ) )
{
r++;
delimbuff = strtok( NULL, "#\n" );
break;
}
}
}
free( tmp_filecont );
return ( entry_list );
}
//start from the beginning of the acronym list and look for a match, print any associated definitions
int lookUpAcro( AcronymDB * entry_list, const char * retstring )
{
unsigned int b = 0;
int found = -1;
Acronym * currAcro = entry_list -> acronyms[ b ];
while ( b <= entry_list -> entries )
{
if ( 0 == strcmp( currAcro -> acronym, retstring ) )
{
Definition * currDefn = entry_list -> acronyms[ b ] -> f_defn;
while ( NULL != currDefn )
{
SET_GREEN
SetConsoleOutputCP( 65001 ); //this is required to print out UTF-8 characters, but doesn't work properly?
printf( " %s\n", currDefn -> defn );
currDefn = currDefn -> next;
}
SET_WHITE
found = 1;
//in case there are duplicate acronym entries with different definitions
currAcro = entry_list -> acronyms[ ++b ];
}
else
{
currAcro = entry_list -> acronyms[ ++b ];
}
}
return ( found );
}
int main () //errcodes 40 - 50
{
char * retstring, * filecont;
FILE * fptr;
unsigned long int file_size = 0;
AcronymDB * entry_list;
system( "Acronym Search" );
//gettimeofday( &tv1, NULL );
//open definitions file
if( ( fptr = retFileHandle() ) == NULL ) throwError( 40, NULL, 0 );
file_size = getFileSize( fptr );
filecont = loadACR( fptr, file_size );
fclose( fptr );
//create linked lists of acronyms and defs from the file contents read in above, then free the temporary read buffer; filecont
entry_list = mapAcroDefs( filecont );
printf( "%u acronyms loaded\n\n", entry_list -> entries + 1 );
free( filecont );
// gettimeofday( &tv2, NULL );
// printf ("Total time = %f seconds\n",
// ( double ) ( tv2.tv_usec - tv1.tv_usec ) / 1000000 +
// ( double ) ( tv2.tv_sec - tv1.tv_sec ) );
while ( NULL != entry_list )
{
//get input acronym, string is NULL terminated within function
retstring = getInputAcronym();
if( 0 == strcmp( retstring, "err_format" ) )
{
free( retstring );
SET_RED printf( "Input format error - try again: \n 1) Maximum word length is 15 characters \n 2) No spaces. \n\n" ); SET_WHITE
continue;
} //multiple ways to exit
else if( 0 == strcmp( retstring, "." ) || 0 == strcmp( retstring, ".." ) ||
0 == strcmp( retstring, ",," ) || 0 == strcmp( retstring, ",." ) ||
0 == strcmp( retstring, ".," ) || 0 == strcmp( retstring, "," ) )
{
break;
}
if ( -1 == lookUpAcro( entry_list, retstring ) ) { SET_RED printf( "Acronym '%s' not found\n\n", retstring ); SET_WHITE }
free( retstring );
}
printf( "Program ended\n" );
free( retstring );
free( entry_list );
return 0;
}
我只是复制粘贴我现有的清单很多次,看看需要多长时间加载:
缩略词-秒
1640-0.91秒(1800首字母缩写词/秒)
3280-3.64秒
6560-14.27秒
13120-57.30秒(229个缩写词/秒)
26240-231.75秒
52480~927秒
104960~3708秒
209920-14083秒(15个缩写词/秒)
最后一次,我让它通宵运行。
实际上,我认为最多会有5000个首字母缩略词,甚至不太可能——但我只是想看看是否有办法改善这个加载时间。也许使用不同的数据结构?
最佳答案
时序清楚地指示了二次时间复杂度O(N2):处理输入大小的两倍需要更长的时间。
有多次出现二次算法:
在functioncount_rows
中,您使用了一种经典但效率极低的方法:
for ( i = 0; i < strlen( filecont ); i++ )
应避免在每次迭代时重新计算输入长度:
for ( i = 0; filecont[i] != '\0'; i++ )
在
addDefn
中,对给定的首字母缩略图遍历现有列表,在列表末尾添加定义。相反,您应该保留一个指向列表最后一个节点的指针,或者在开头插入新定义,以优先于后面的缩略词定义无论如何,这些列表非常小,因为它们是同一首字母缩略词的替代定义,所以这真的不应该是一个问题。还有其他问题:
文件内容应该为空终止符分配一个额外的字节
c
必须定义为int
才能正确处理EOF
返回的值。getchar()
是一个错误的朋友:如果字符串不适合目标数组,则复制的部分不以空结尾,如果适合,则将数组的其余部分设置为'\0',这是低效的。如果知道字符串长度并且目标数组被分配到适当的大小,请使用strncpy()
甚至strcpy
。memcpy
具有未定义的行为。它可能会丢弃某些体系结构上的挂起输入,但这不是一种可移植的方法。我在OS/X上做了一些测试:clang能够优化循环外对
fflush( stdin );
的重复调用,因此当在未优化的strlen()
模式下使用-O3
与2.969s编译时,程序在0.023s中加载5000个缩写词列表删除-O0
调用会将未优化的生成降低到0.016s,将优化的生成降低到0.012s。这显然是程序中的瓶颈。09-16 05:17