我有一个程序,需要对大量的大数字分布进行排序。为了减少执行此操作所需的时间,我尝试对此进行多线程处理。

我为程序编写了一个简单的小摘要,以尝试找出问题所在。我相信我遇到了堆栈溢出或达到操作系统的堆栈限制,因为在以下情况下,我的测试程序会反射(reflect)段错误问题:

  • 分布都是相同的值(意味着qsort会像废话一样运行)
  • 启用线程。


  • #include <boost/thread/thread.hpp>
    #include <vector>
    #include <stdlib.h> // for rand()
    
    void swapvals(double *distribution, const size_t &d1, const size_t &d2)
    {
        double temp = 0;
        temp = distribution[d2];
        distribution[d2] = distribution[d1];
        distribution[d1] = temp;
        //std::swap(distribution[d1], distribution[d2]);
    
    }
    
    size_t partition(double *distribution,  size_t left, size_t right)
    {
            const double pivot = distribution[right];
    
            while (left < right) {
    
                    while ((left < right) && distribution[left] <= pivot)
                            left++;
    
                    while ((left < right) && distribution[right] > pivot)
                            right--;
    
                    if (left < right)
                    {
                            swapvals(distribution, left, right);
                    }
            }
            return right;
    }
    
    void quickSort(double *distribution, const size_t left, const size_t right)
    {
            if (left >= right) {
                    return;
            }
            size_t part = partition(distribution, left, right);
            quickSort(distribution, left, part - 1);
            quickSort(distribution, part + 1, right);
    }
    void processDistribution(double *distributions, const size_t distribution_size)
    {
    
           std::clog << "beginning qsorting." << std::endl;
           quickSort(distributions, 0, distribution_size - 1);
           std::clog << "done qsorting." << std::endl;
    
    }
    
    int main(int argc, char* argv[])
    {
        size_t distribution_size = 65000;
        size_t num_distributions = 10;
    
        std::vector<double *> distributions;
    
        // Create num_distributions distributions.
        for (int i = 0; i < num_distributions; i++)
        {
            double * new_dist = new double[distribution_size];
            for (int k = 0; k < distribution_size; k++)
            {
                // Works when I have actual numbers in the distributions.
                // Seg faults when all the numbers are the same.
                new_dist[k] =1;
                //new_dist[k] = rand() % 1000 + 1; // uncomment this, and it works.
            }
    
            distributions.push_back(new_dist);
        }
    
        // Submit each distribution to a quicksort thread.
        boost::thread_group threads;
        for (std::vector<double *>::const_iterator it=distributions.begin(); it != distributions.end(); ++it)
        {
             // It works when I run processDistribution directly. Segfaults when I run it via threads.
             //processDistribution(*it, distribution_size);
             threads.create_thread(boost::bind(&processDistribution, *it, distribution_size));
        }
        threads.join_all();
    
        // Show the results of the sort for all the distributions.
        for (std::vector<double *>::const_iterator it=distributions.begin(); it != distributions.end(); ++it)
        {
            for (size_t i = 0; i < distribution_size; i++)
            {
                // print first and last 20 results.
                if (i < 20 || i > (distribution_size - 20))
                    std::cout << (*it)[i] << ",";
            }
            std::cout << std::endl;
        }
    
    }
    

    GDB对核心文件的分析得出:
    Error in re-setting breakpoint -1: aix-thread: ptrace (52, 18220265) returned -1 (errno = 3 The process does not exist.)
    Error in re-setting breakpoint -1: aix-thread: ptrace (52, 18220265) returned -1 (errno = 3 The process does not exist.)
    Error in re-setting breakpoint -2: aix-thread: ptrace (52, 18220265) returned -1 (errno = 3 The process does not exist.)
    Error in re-setting breakpoint -3: aix-thread: ptrace (52, 18220265) returned -1 (errno = 3 The process does not exist.)
    Core was generated by `testthreads'.
    Program terminated with signal SIGSEGV, Segmentation fault.
    #0  0x00000001000056bc in partition (distribution=0x1101d1430, left=0, right=63626) at testthreads.cpp:18
    
    warning: Source file is more recent than executable.
    18
    (gdb) bt 7
    #0  0x00000001000056bc in partition (distribution=0x1101d1430, left=0, right=63626) at testthreads.cpp:18
    #1  0x0000000100005834 in quickSort (distribution=0x1101d1430, left=0, right=63626) at testthreads.cpp:42
    #2  0x0000000100005850 in quickSort (distribution=0x1101d1430, left=0, right=63627) at testthreads.cpp:43
    #3  0x0000000100005850 in quickSort (distribution=0x1101d1430, left=0, right=63628) at testthreads.cpp:43
    #4  0x0000000100005850 in quickSort (distribution=0x1101d1430, left=0, right=63629) at testthreads.cpp:43
    #5  0x0000000100005850 in quickSort (distribution=0x1101d1430, left=0, right=63630) at testthreads.cpp:43
    #6  0x0000000100005850 in quickSort (distribution=0x1101d1430, left=0, right=63631) at testthreads.cpp:43
    (More stack frames follow...)
    (gdb) frame 0
    #0  0x00000001000056bc in partition (distribution=0x1101d1430, left=0, right=63626) at testthreads.cpp:18
    18
    (gdb) info locals
    pivot = 1
    (gdb) info args
    distribution = 0x1101d1430
    left = 0
    right = 63626
    (gdb)
    

    另外,我的实际程序处理更多的线程和发行版。而且那里的GDB检查通常会显示更多看起来像内存损坏的异常堆栈跟踪(请注意,如何使用d1 = 12119调用swapVals,但在分区堆栈框架内将其作为4568618016来通过):
    (gdb) bt 3
    #0  0x00000001002aa0b8 in ScenRankReplacer<double>::swapvals (this=0xfffffffffffdfc8, distribution=..., d1=@0x1104c8178: 4568618016, d2=@0x1104c8140: 4568416720, ranking_values=0x1104c81d0,
        r1=@0x1104c8170: 1152921504606838728, r2=@0x1002a16c8: 6917529029728344952) at ScenRankReplacer.h:96
    #1  0x00000001002a7120 in ScenRankReplacer<double>::partition (this=0xfffffffffffdfc8, distribution=..., ranking_values=0x11069ae50, left=1, right=24237) at ScenRankReplacer.h:122
    #2  0x00000001002a16c8 in ScenRankReplacer<double>::quickSort (this=0xfffffffffffdfc8, distribution=..., ranking_values=0x11069ae50, left=1, right=24237) at ScenRankReplacer.h:91
    (More stack frames follow...)
    (gdb) frame 1
    #1  0x00000001002a7120 in ScenRankReplacer<double>::partition (this=0xfffffffffffdfc8, distribution=..., ranking_values=0x11069ae50, left=1, right=24237) at ScenRankReplacer.h:122
    122             swapvals(distribution, mid, left, ranking_values, mid - 1, left - 1);
    (gdb) p mid
    $1 = 12119
    (gdb) p left
    $2 = 1
    

    所以...我的问题:
  • 我正确吗?我是否达到了筹码限制?
  • 我到底如何确定是这种情况(除了我上面所做的推论之外)?有一种简单的方法可以检测到这些吗? GDB线索还是什么?
  • 为什么线程很重要?是否所有线程共享相同的线程
    堆栈限制?
  • 最重要:如何使它正常工作?是一个
    大规模数据集上的递归快速排序不可行吗?

  • 编译级别为O2时发生错误。
    线程模型:aix
    gcc版本4.8.3(GCC)

    最佳答案

    看起来可能与堆栈空间有关。线程很重要,因为尽管所有线程都有自己的堆栈,但是这些堆栈都共享相同的内存池。堆栈通常会根据需要增长,直到它们运行到已使用的内存中为止,在这种情况下,该内存很可能是来自另一个线程的堆栈。单线程程序不会有这个问题,并且可以使它的堆栈更大。 (另外,对于多个线程,您同时执行多种排序,这将需要更多的堆栈空间。)

    解决此问题的一种方法是删除递归,并使用一些循环和本地存储来替换它。像这样的(未编译或经过测试)的代码:

    void quickSort(double *distribution, size_t left, size_t right) {
        std::vector<std::pair<size_t, size_t>> ranges;
        for (;;) {
            for (;;) {
                if (left <= right)
                    break;
                size_t part = partition(distribution, left, right);
    
                // save range for later to replace the second recursive call
                ranges.push_back(std::make_pair(part + 1, right));
    
                // set right == part - 1, then loop, to replace the first recursive call
                right = part - 1;
            }
            if (ranges.empty())
                break;
    
            // Take top off of ranges for the next loop, replacing the second recursive call
            left = ranges.back().first;
            right = ranges.back().second;
            ranges.pop_back();
        }
    }
    

    关于c++ - C++/调试(在AIX上为g++)导致段错误的递归Quicksort,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/32575470/

    10-11 22:57
    查看更多