Closed. This question is off-topic. It is not currently accepting answers. Learn more
想改进这个问题吗Update the question所以堆栈溢出的值小于aa>。
两个月前关闭。
我开始做一个小的项目来实现一个小型的C控制台程序来进行快速的首字母缩略词搜索我在一家公司/行业工作,那里有1000个缩写!
这些首字母缩略词是从各种文档中收集(复制/粘贴)并放入一个文本文件中的程序将此文件的内容加载到内存中,并将其存储在链接列表中。
我使用的格式是:
缩略词定义
或者在首字母缩略词有多种定义的情况下:
缩写#定义1;定义2;定义3
总的来说,程序运行良好-实现了一些基本的错误处理,节省了我查找多个文档的时间。
不过,令我好奇的是首字母缩略词列表的加载时间对于大约900个缩略词,将列表加载到内存中并准备好接受用户输入大约需要0.35秒。
两个关键功能是:
char * loadACR( FILE * fptr, unsigned long int file_size )
AcronymDB * mapAcroDefs( const char * filecont )
loadACR将整个文件加载到内存中
mapAcroDefs调用其他函数来创建首字母缩略词数据库、处理文件内容、创建新定义、遍历链接列表、存储等。
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <dirent.h>
#include <ctype.h>
#include "resources.h"

/*************** DEFS **********************/
#define WIN32_LEAN_AND_MEAN
#include <windows.h>

#define acro_length 16
#define MAX_FILE_SIZE 3145728 //(in bytes == 3MB)
#define SET_GREEN SetConsoleTextAttribute( GetStdHandle( STD_OUTPUT_HANDLE ), FOREGROUND_GREEN | FOREGROUND_INTENSITY );
#define SET_RED SetConsoleTextAttribute( GetStdHandle( STD_OUTPUT_HANDLE ), FOREGROUND_RED | FOREGROUND_INTENSITY );
#define SET_CYAN SetConsoleTextAttribute( GetStdHandle( STD_OUTPUT_HANDLE ), FOREGROUND_INTENSITY | FOREGROUND_GREEN | FOREGROUND_BLUE );
#define SET_WHITE SetConsoleTextAttribute( GetStdHandle( STD_OUTPUT_HANDLE ), FOREGROUND_INTENSITY | FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_BLUE );

/******************************************/

/*************** GLOBALS *******************/
//#include <sys/time.h>
//struct timeval tv1,tv2;
/*******************************************/

/*************** STRUCTS *******************/

//associated definitions in a linked list
typedef struct defnList
{
    char * defn;
    struct defnList * next;
} Definition;

//acronym entry
typedef struct
{
    char * acronym;
    Definition * f_defn;     //a pointer to the first definition
} Acronym;

//database of acronyms
typedef struct
{
    unsigned int entries;
    Acronym ** acronyms;     //array of pointers to acronym entries
} AcronymDB;

/******************************************/

void throwError( unsigned short int err_code, AcronymDB * entry_list, unsigned int r )
{
    SET_RED
    switch ( err_code )
    {
        //retFileHandle errors
        case 1:
            puts( "Directory path error - check access rights" );
            break;
        //getFileSize errors
        case 5:
            puts( "'.acr' file is greater than 3MB, reduce the file size and try again.\n" );
            break;
        //macroAcroDefs errors
        case 35:
        case 36:
        case 37:
            printf( "Error(s) found at lines %u to %u\n", r - 1, r + 1 );
            printf( "Code: %d\n", err_code );
            break;
        //main() errors
        case 40:
            puts( "File error - ensure a '.acr' definitions file exists in the program folder." );
            break;
        //malloc errors
        default:
            printf( "Memory Allocation Error - Code: %d\n", err_code );
    }

    free( entry_list );
    puts ( "Program ended" );
    getchar();
    exit( EXIT_FAILURE );
}

void chkMalloc( void * chk_me, unsigned short int err_code, AcronymDB * entry_list )
{
    if ( !chk_me && ( sizeof ( chk_me ) > 0 ) )
    {
        throwError( err_code, entry_list, 0 );
    }
}

char * getInputAcronym()
{
    char input_acro[ acro_length ], c;
    unsigned int i = 0;

    while( ( c = getchar() ) != '\n' && c != EOF )
    {
        if( acro_length == i || c == ' ' )
        {
            fflush( stdin );
            return ( "err_format" );
        }

        //some acronyms contain parentheses, hyphens etc, these are ignored if the user enters it
        //as they are removed in the mapAcroDefs function before adding into the list
        if ( c != '-' &&
             c != '(' &&
             c != ')' &&
             c != '&' &&
             c != '/'   )
        {
            input_acro[ i ] = toupper( c );
            i++;
        }
    }

    input_acro[ i ] = '\0';

    return strdup( input_acro );
}

FILE * retFileHandle() //errcodes 1 - 4
{
    DIR           * dptr;
    struct dirent * directory;
    FILE          * acr_file;

    dptr = opendir( "." );
    if ( dptr == NULL ) throwError( 1, NULL, 0 );

    while( ( directory = readdir( dptr ) ) != NULL )
    {
        //check whether the file name contains .acr
        //n.b: only the first found .acr occurrence will be used
        if( strstr( directory -> d_name, ".acr" ) )
        {
            acr_file = fopen( directory -> d_name , "r" );
            closedir( dptr );
            return ( acr_file );
        }
    }

    return NULL;
}

unsigned long int getFileSize( FILE * fptr ) //errcodes 5 - 9
{
    unsigned long int file_size, prev;

    prev = ftell( fptr );
    fseek( fptr, 0, SEEK_END );
    file_size = ftell( fptr );
    fseek( fptr, prev, SEEK_SET );

    // > 3MB
    if( file_size >= MAX_FILE_SIZE ) throwError( 5, NULL, 0 );

    else
    {
        return ( file_size );
    }

    return 0; // should never get here, but the compiler complains of returning non-void
}

//load the acr file into memory
char * loadACR( FILE * fptr, unsigned long int file_size ) //errcodes 10 - 14
{
    char * read_buffer;

    read_buffer = ( char * ) malloc ( file_size );
    chkMalloc( read_buffer, 10, NULL );

    //fread returns the total number of elements, so set that last element to '\0' to remove random characters at the end
    read_buffer[ fread( read_buffer, 1, file_size, fptr ) ] = '\0';

    return ( read_buffer );
}

unsigned int countRows( const char * filecont )
{
    unsigned int rows = 0, i;

    for ( i = 0; i < strlen( filecont ); i++ )
    {
        //disregard blank lines at the top or somewhere in the middle
        if ( ( '\n' == filecont[ i ] ) && ( 0 != i ) && ( '\n' != filecont[ i + 1 ] ) )
        {
            rows++;
        }
    }
    //if there's a new line at the end, disregard
    if ( ( '\0' == filecont[ i ] ) && ( '\n' == filecont[ i - 1 ] ) ) rows--;

    return ( rows );
}

void remPunct( char * delimbuff )
{
    char * src, * dst;

    for ( src = dst = delimbuff; * src != '\0'; src++ )
    {
        * dst = * src;
        if ( * dst != '-' &&
             * dst != '(' &&
             * dst != ')' &&
             * dst != '&' &&
             * dst != '/'   )
        {
            dst++;
        }
    }

    * dst = '\0';
}

void addAcronym( AcronymDB * entry_list, char * delimbuff, unsigned int r ) //errcodes 15 - 19
{

    char * ptr_delim;
    //convert the characters to uppercase to make matching easier later, some acronyms have a mix of upper and lowercase
    for ( ptr_delim = delimbuff; * ptr_delim != '\0'; ptr_delim++ )
    {
        * ptr_delim = toupper( * ptr_delim );
    }

    entry_list -> acronyms[ r ] = malloc( sizeof( Acronym ) );
    chkMalloc( entry_list -> acronyms[ r ], 15, entry_list );

    entry_list -> acronyms[ r ] -> acronym = strdup( delimbuff );
    chkMalloc( entry_list -> acronyms[ r ] -> acronym, 16, entry_list );
}

Definition * initDefn() //errcodes 20 - 24
{
    Definition * newDefn = malloc( sizeof( Definition ) );
    chkMalloc( newDefn, 20, NULL );

    newDefn = NULL;
    return ( newDefn );
}

Definition * addDefn( AcronymDB * entry_list, Definition * newDefn, char * delimbuff, unsigned int r ) //errcodes 25 - 29
{

    Definition * head = newDefn;
    Definition * currDefn;
    //first entry if head is NULL, allocate space for the first definition and store it
    if ( NULL == head )
    {
        head = entry_list -> acronyms[ r ] -> f_defn = malloc( sizeof( Definition ) );
        chkMalloc( entry_list -> acronyms[ r ] -> f_defn, 25, entry_list );

        entry_list -> acronyms[ r ] -> f_defn -> defn = strdup( delimbuff );
        chkMalloc( entry_list -> acronyms[ r ] -> f_defn -> defn, 26, entry_list );

        head -> next = entry_list -> acronyms[ r ] -> f_defn -> next = NULL;

        return ( head );
    }
    //else go through all the existing definitions and put the definition at the end of the list
    currDefn = entry_list -> acronyms[ r ] -> f_defn;

    while ( NULL != currDefn -> next )
    {
        currDefn = currDefn -> next;
    }

    currDefn -> next = malloc ( sizeof( Definition ) );
    chkMalloc( currDefn -> next, 27, entry_list );
    currDefn = currDefn -> next;

    currDefn -> defn = strdup( delimbuff );
    chkMalloc( currDefn -> defn, 28, entry_list );

    currDefn -> next = NULL;

    return ( head );
}

//create space for the database of acronyms and enough space for the entry list based on rows counted earlier
AcronymDB * initAcroDB( unsigned int rows ) //errcodes 30 - 34
{
    AcronymDB * entry_list = malloc( sizeof( AcronymDB ) );
    chkMalloc( entry_list, 30, 0 );

    entry_list -> acronyms = malloc( sizeof( Acronym * ) * rows );
    chkMalloc( entry_list -> acronyms, 31, entry_list );

    entry_list -> entries = rows;

    return ( entry_list );
}

Definition * procMultiDefs( const char * pStart, const char * pCurrent, AcronymDB * entry_list, Definition * newDefn, unsigned int r  )
{
    //i hope 250 characters is enough? :S
    char tmpdefhold[ 250 ] = { 0 };

    //we don't want to memmove here as it will affect pCurrent as well, just change where pStart is pointing
    if ( ' ' == pStart[ 0 ] ) ++pStart;

    strncpy( tmpdefhold, pStart, pCurrent - pStart );
    newDefn = addDefn( entry_list, newDefn, tmpdefhold, r );

    return ( newDefn );
}

AcronymDB * mapAcroDefs( const char * filecont ) //errcodes 35 - 39
{
    SET_CYAN  printf( "Acronym Search v1.1\n" );
    SET_WHITE printf( "Reading acronym list...\n" );

    unsigned int rows = countRows( filecont ), r = 0;
    char * tmp_filecont, * delimbuff;

    //copy contents of file into a temporary buffer so it can be passed to strtok
    tmp_filecont = ( char * ) malloc ( ( strlen( filecont ) + 1 ) * sizeof( char ) );
    if ( NULL == tmp_filecont ) throwError( 30, NULL, 0 );

    strncpy( tmp_filecont, filecont, strlen( filecont ) + 1 ); //create a temporary string and null terminate it
    tmp_filecont[ strlen( tmp_filecont ) ] = '\0';

    //create space for database of acronyms based on entries counted
    AcronymDB * entry_list = initAcroDB( rows );

    //tokenise the file contents, the first split will be an acronym entry including a space before the hash
    delimbuff = strtok( tmp_filecont, "#\n" );

    while ( r <= rows )
    {   //catch potential errors in the file (not tokenised according to # or \n
        if ( NULL == delimbuff )
        {
            free( tmp_filecont );
            throwError( 35, entry_list, r + 1 );
        }
        //return location of space before the hash
        char * spcptr = strchr( delimbuff, ' ' );
        if ( NULL == spcptr )
        {
            free( tmp_filecont );
            throwError( 36, entry_list, r + 1 );
        }
        //terminate the word correctly, hopefully we have a valid acronym entry now
        * spcptr = '\0';

        remPunct( delimbuff );
        addAcronym( entry_list, delimbuff, r );

        //allocate memory for a new definition
        Definition * newDefn = initDefn();

        while ( NULL != delimbuff )
        {
            delimbuff = strtok( NULL, "#\n" );

            //get rid of the space that will be present, otherwise there's a formatting error present, missing hash/other char
            if ( ' ' == delimbuff[ 0 ] ) memmove( delimbuff, delimbuff + 1, strlen( delimbuff ) );

            //multiple definitions exist
            if ( NULL != strstr( delimbuff, ";" ) )
            {
                char * pStart = delimbuff, * pCurrent = delimbuff;

                //go through the entire string and process the multiple definitions
                //two pointers are used to mark the string from <start to ';'>
                //https://stackoverflow.com/questions/49788179/how-to-get-a-substring-using-strchr-in-multiple-occurrences-of-in-c-and-stor
                while ( '\0' != * pCurrent)
                {
                    if ( ';' == * pCurrent )
                    {
                        newDefn = procMultiDefs( pStart, pCurrent, entry_list, newDefn, r );
                        pStart = pCurrent + 1;
                    }
                    ++pCurrent;
                }
                //last definition after ; wouldn't be captured by the loop above, so do this again immediately after
                newDefn = procMultiDefs( pStart, pCurrent, entry_list, newDefn, r );
            }
            else
            {
                newDefn = addDefn( entry_list, newDefn, delimbuff, r );
            }
            //go to the next line now
            if ( NULL == strstr( delimbuff, "\n" ) )
            {
                r++;
                delimbuff = strtok( NULL, "#\n" );
                break;
            }

        }
    }

    free( tmp_filecont );
    return ( entry_list );
}

//start from the beginning of the acronym list and look for a match, print any associated definitions
int lookUpAcro( AcronymDB * entry_list, const char * retstring )
{
    unsigned int b = 0;
    int found = -1;

    Acronym * currAcro = entry_list -> acronyms[ b ];

    while ( b <= entry_list -> entries )
    {
        if ( 0 == strcmp( currAcro -> acronym, retstring ) )
        {
            Definition * currDefn = entry_list -> acronyms[ b ] -> f_defn;

            while ( NULL != currDefn )
            {
                SET_GREEN
                SetConsoleOutputCP( 65001 ); //this is required to print out UTF-8 characters, but doesn't work properly?
                printf( "   %s\n", currDefn -> defn );
                currDefn = currDefn -> next;
            }
            SET_WHITE
            found = 1;
            //in case there are duplicate acronym entries with different definitions
            currAcro = entry_list -> acronyms[ ++b ];
        }
        else
        {
            currAcro = entry_list -> acronyms[ ++b ];
        }

    }
    return ( found );
}

int main () //errcodes 40 - 50
{
    char  * retstring, * filecont;
    FILE  * fptr;
    unsigned long int file_size = 0;
    AcronymDB * entry_list;

    system( "Acronym Search" );
    //gettimeofday( &tv1, NULL );

    //open definitions file
    if( ( fptr = retFileHandle() ) == NULL ) throwError( 40, NULL, 0 );

    file_size = getFileSize( fptr );
    filecont = loadACR( fptr, file_size );
    fclose( fptr );

    //create linked lists of acronyms and defs from the file contents read in above, then free the temporary read buffer; filecont
    entry_list = mapAcroDefs( filecont );
    printf( "%u acronyms loaded\n\n", entry_list -> entries + 1 );
    free( filecont );

//  gettimeofday( &tv2, NULL );
//  printf ("Total time = %f seconds\n",
//           ( double ) ( tv2.tv_usec - tv1.tv_usec ) / 1000000 +
//           ( double ) ( tv2.tv_sec - tv1.tv_sec ) );

    while ( NULL != entry_list )
    {
        //get input acronym, string is NULL terminated within function
        retstring = getInputAcronym();

        if( 0 == strcmp( retstring, "err_format" ) )
        {
            free( retstring );

            SET_RED printf( "Input format error - try again: \n  1) Maximum word length is 15 characters \n  2) No spaces. \n\n" ); SET_WHITE
            continue;
        }                       //multiple ways to exit
        else if( 0 == strcmp( retstring, "." )  || 0 == strcmp( retstring, ".." ) ||
                 0 == strcmp( retstring, ",," ) || 0 == strcmp( retstring, ",." ) ||
                 0 == strcmp( retstring, ".," ) || 0 == strcmp( retstring, "," ) )
        {
            break;
        }

        if ( -1 == lookUpAcro( entry_list, retstring ) )  { SET_RED printf( "Acronym '%s' not found\n\n", retstring ); SET_WHITE }

        free( retstring );
    }

    printf( "Program ended\n" );

    free( retstring );
    free( entry_list );

    return 0;
}

我只是复制粘贴我现有的清单很多次,看看需要多长时间加载:
缩略词-秒
1640-0.91秒(1800首字母缩写词/秒)
3280-3.64秒
6560-14.27秒
13120-57.30秒(229个缩写词/秒)
26240-231.75秒
52480~927秒
104960~3708秒
209920-14083秒(15个缩写词/秒)
最后一次,我让它通宵运行。
实际上,我认为最多会有5000个首字母缩略词,甚至不太可能——但我只是想看看是否有办法改善这个加载时间。也许使用不同的数据结构?

最佳答案

时序清楚地指示了二次时间复杂度O(N2):处理输入大小的两倍需要更长的时间。
有多次出现二次算法:
在functioncount_rows中,您使用了一种经典但效率极低的方法:

for ( i = 0; i < strlen( filecont ); i++ )

应避免在每次迭代时重新计算输入长度:
for ( i = 0; filecont[i] != '\0'; i++ )

addDefn中,对给定的首字母缩略图遍历现有列表,在列表末尾添加定义。相反,您应该保留一个指向列表最后一个节点的指针,或者在开头插入新定义,以优先于后面的缩略词定义无论如何,这些列表非常小,因为它们是同一首字母缩略词的替代定义,所以这真的不应该是一个问题。
还有其他问题:
文件内容应该为空终止符分配一个额外的字节
c必须定义为int才能正确处理EOF返回的值。
getchar()是一个错误的朋友:如果字符串不适合目标数组,则复制的部分不以空结尾,如果适合,则将数组的其余部分设置为'\0',这是低效的。如果知道字符串长度并且目标数组被分配到适当的大小,请使用strncpy()甚至strcpy
memcpy具有未定义的行为。它可能会丢弃某些体系结构上的挂起输入,但这不是一种可移植的方法。
我在OS/X上做了一些测试:clang能够优化循环外对fflush( stdin );的重复调用,因此当在未优化的strlen()模式下使用-O3与2.969s编译时,程序在0.023s中加载5000个缩写词列表删除-O0调用会将未优化的生成降低到0.016s,将优化的生成降低到0.012s。这显然是程序中的瓶颈。

09-16 05:17