c - pthread并没有真正的加速

我正在尝试实现蒙特卡洛算法的多线程版本。这是我的代码：

#define _POSIX_C_SOURCE 200112L

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>
#include <math.h>
#include <semaphore.h>
#include <errno.h>
#include <stdbool.h>
#include <string.h>

#define MAX_THREADS 12
#define MAX_DOTS 10000000

double sum = 0.0;
sem_t sem;

void reset() {
    sum = 0.0;
}

void* check_dot(void* _iterations) {
    int* iterations = (int*)_iterations;
    for(int i = 0; i < *iterations; ++i) {
        double x = (double)(rand() % 314) / 100;
        double y = (double)(rand() % 100) / 100;
        if(y <= sin(x)) {
            sem_wait(&sem);
            sum += x * y;
            sem_post(&sem);
        }
    }
    return NULL;
}

void* check_dots_advanced(void* _iterations) {
    int* iterations = (int*)_iterations;
    double* res = (double*)malloc(sizeof(double));
    for(int i = 0; i < *iterations; ++i) {
        double x = (double)(rand() % 314) / 100;
        double y = (double)(rand() % 100) / 100;
        if(y <= sin(x)) *res += x * y;
    }
    pthread_exit((void*)res);
}

double run(int threads_num, bool advanced) {
    if(!advanced) sem_init(&sem, 0, 1);
    struct timespec begin, end;
    double elapsed;
    pthread_t threads[threads_num];
    int iters = MAX_DOTS / threads_num;
    for(int i = 0; i < threads_num; ++i) {
        if(!advanced) pthread_create(&threads[i], NULL, &check_dot, (void*)&iters);
        else pthread_create(&threads[i], NULL, &check_dots_advanced, (void*)&iters);
    }
    if(clock_gettime(CLOCK_REALTIME, &begin) == -1) {
        perror("Unable to get time");
        exit(-1);
    }
    for(int i = 0; i < threads_num; ++i) {
        if(!advanced) pthread_join(threads[i], NULL);
        else {
            void* tmp;
            pthread_join(threads[i], &tmp);
            sum += *((double*)tmp);
            free(tmp);
        }
    }
    if(clock_gettime(CLOCK_REALTIME, &end) == -1) {
        perror("Unable to get time");
        exit(-1);
    }
    if(!advanced) sem_destroy(&sem);
    elapsed = end.tv_sec - begin.tv_sec;
    elapsed += (end.tv_nsec - begin.tv_nsec) / 1000000000.0;
    return elapsed;
}

int main(int argc, char** argv) {
    bool advanced = false;
    char* filename = NULL;
    for(int i = 1; i < argc; ++i) {
        if(strcmp(argv[i], "-o") == 0 && argc > i + 1) {
            filename = argv[i + 1];
            ++i;
        }
        else if(strcmp(argv[i], "-a") == 0 || strcmp(argv[i], "--advanced") == 0) {
            advanced = true;
        }
    }
    if(!filename) {
        fprintf(stderr, "You should provide the name of the output file.\n");
        exit(-1);
    }
    FILE* fd = fopen(filename, "w");
    if(!fd) {
        perror("Unable to open file");
        exit(-1);
    }
    srand(time(NULL));
    double worst_time = run(1, advanced);
    double result = (3.14 / MAX_DOTS) * sum;
    reset();
    fprintf(fd, "Result: %f\n", result);
    for(int i = 2; i <= MAX_THREADS; ++i) {
        double time = run(i, advanced);
        double accel = time / worst_time;
        fprintf(fd, "%d:%f\n", i, accel);
        reset();
    }
    fclose(fd);
    return 0;
}

但是，随着线程数量的增加，我看不到任何真正的加速（并且我使用的是哪个check_dot（）函数都没有关系）。我尝试使用英特尔酷睿i7-3517u（lscpu表示它具有4个独立的CPU）在笔记本电脑上执行此代码，并且看起来线程数并没有真正影响程序的执行时间：

Number of threads: 1, working time: 0.847277 s
Number of threads: 2, working time: 3.133838 s
Number of threads: 3, working time: 2.331216 s
Number of threads: 4, working time: 3.011819 s
Number of threads: 5, working time: 3.086003 s
Number of threads: 6, working time: 3.118296 s
Number of threads: 7, working time: 3.058180 s
Number of threads: 8, working time: 3.114867 s
Number of threads: 9, working time: 3.179515 s
Number of threads: 10, working time: 3.025266 s
Number of threads: 11, working time: 3.142141 s
Number of threads: 12, working time: 3.064318 s

我以为这应该是至少四个第一个值的执行时间与工作线程数之间的某种线性依赖关系（工作的线程越多，执行时间就越少），但是这里的时间值是相当相等的。这是我的代码中的真正问题还是我要求太高？

最佳答案

您遇到的问题是rand()的内部状态是所有线程之间的共享资源，因此这些线程将在访问rand()时进行序列化。

您需要使用具有每个线程状态的伪随机数生成器-可以使用rand_r()函数（尽管在最新版本的POSIX中已标记为过时）。对于认真的工作，您最好导入一些特定的PRNG算法的实现，例如Mersenne Twister。

关于c - pthread并没有真正的加速，我们在Stack Overflow上找到一个类似的问题：https://stackoverflow.com/questions/39426223/