我正在编写一个程序,每次读取一个大文件(44GB-63GB)1MB,然后我对这个1MB进行哈希运算。但是,我想看看执行这些散列需要多长时间
我不关心一次读取一个文件需要多长时间,只关心哈希性能时间。目前我正在使用一个非常基本/通用的散列函数
你知道我在哪里开始和结束计时吗?
以下是我目前掌握的情况:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#define HASH_PRIME 65551// prime number for hash table

// generic hash function
static unsigned short hash_Function(char *hash_1MB)
{
    unsigned short hash;
    int i = 0;
    while(hash_1MB[i]!='\0')//each char of the file name
    {
        hash += (unsigned short)hash_1MB[i];//add it to hash
        i++;
    }
    return hash%HASH_PRIME;//mod hash by table size
}

int main()
{
    struct stat fileSize;
    char *buffer;

    FILE *fp;
    clock_t start, stop;
    double duration;
    char fname[40];

    printf("Enter name of file:");
    fgets(fname, 40, stdin);
    while (fname[strlen(fname) - 1] == '\n')
    {
        fname[strlen(fname) - 1] = '\0';
    }

    // handle file, open file, and read in binary form
    fp = fopen(fname, "rb");
    if (fp == NULL)
    {
        printf("Cannot open %s for reading\n", fname);
        exit(1);
    }

    stat(fname, &fileSize);
    size_t size = fileSize.st_size;
    printf("Size of file: %zd\n", size);

    buffer = (char*) malloc(sizeof(*buffer)*1000*1000);

    unsigned long long counter = 0;
    // read in 1MB at a time // & start timing how long it takes to perform the hash
    start = clock();
    clock_t total = 0;
    while (fread(buffer, sizeof(*buffer), (1<<20), fp) == (1<<20))
    {
    start = clock();
    hash_Function(buffer);
    counter++;
    total += (clock() - start);
    }

    //free(buffer);

     fclose (fp); // close files

     duration = (double)((stop - start)/CLOCKS_PER_SEC);

     printf("Counter: %llu\n", counter); // how many MB were hashed
     printf("Hashing took %.2f seconds\n", (float)duration);
     return 0;
}

另外,我的结果并不像预期的那样,我分析的第一个文件是1961893364字节大,所以应该至少有1961MB是散列的
但是当我打印出我的计数器来检查是否正确的MB被散列时,我只得到1871
以下是我的结果:
$ gcc one_mb.c
$ ./a.out
Enter name of file:v.10.nc
Size of file: 1961893364
Counter: 1871
Hashing took 0.00 seconds

提前谢谢你的帮助!
/////结果w/(1000*1000)
Enter name of file:v.13.nc
Size of file: 15695146912
Counter: 15695
Hashing took 18446744.00 seconds

//////结果w/1<
Enter name of file:v.13.nc
Size of file: 15695146912
Counter: 14968
Hashing took 18446744.00 seconds // why this long?!?!? It didn't take 30mins

/////将while循环替换为for循环
// generic hash function
static unsigned short hash_Function(char *hash_1MB)
{
    unsigned short hash;
    int i;

    for(i = 0; i < (1 << 20); i++)
    {
        hash += (unsigned short)hash_1MB[i];//add it to hash
    }

    return hash%HASH_PRIME;//mod hash by table size
}

最佳答案

您需要在while循环中使用时间戳,并保持它们的总和,以避免对文件IO计时。

start = clock();
clock_t total = 0;
while (fread(buffer, 1<<20, (1<<20), fp) == (1<<20))
{
    start = clock();
    hash_Function(buffer);
    counter++;
    total += (clock() - start);
}

注意,我把1000*1000改为1还要确保正确分配缓冲区至少1 MB。
buffer = (char*) malloc(1<<20);

以下值将计算为(字符大小)*1000*1000=1000*1000,这将不起作用。
buffer = (char*) malloc(sizeof(*buffer)*1000*1000);

此外,在执行size of(*buffer)时,还返回字符的大小(1字节)。见更新的fread。

关于c - Clock()无法正常工作;避免IO,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/25064111/

10-09 00:28