I just don't know what to do...

It works fine in a debug build, but not in a release build.

I'm trying to learn artificial neural networks and C++ vectors.

This is the code I wrote in C++ (the original is in Python 2.7):

http://neuralnetworksanddeeplearning.com/chap1.html#exercise_852508

(just scroll a little to get to it)

I'm using MinGW 7.2.0 from MSYS2 (C++11).

There are some "teste" prints in the backpropagation method, and that's where the problem is (I think). I also overloaded the +, - and * operators to make things easier.

I know there are libraries like Armadillo that would make things easier, but I really want to use this problem to learn.

Here are the files:

neuralnetwork.h

(I made everything public to make things easier to look at)

#pragma once

#include <vector>
#include <random>
#include <utility>

#define MIN_NUMBER_TOLERANCE 1e-8

namespace nn
{
    class neuralnetwork
    {
    //private:
    public:
        //total number of weights. useful to reserve memory
        int numWeights;
        //total number of biases. useful to reserve memory
        int numBiases;
        //total number of layers: 1 for input, n hidden layers and 1 for output
        int numLayers;
        //a vector to store the number of neurons in each layer: 0 index is about the input layer, last index is about the output layer
        std::vector<int> sizes;
        //stores all biases: num of neurons of layer 1 + ... + num of neurons of layer (numLayers - 1) (input layer has no bias)
        std::vector<std::vector<double>> biases;
        //stores all weights: (num of neurons of layer 1) x (num of neurons of layer 0) + ... + ( num of neurons of layer (numLayers - 1) ) x ( num of neurons of layer (numLayers - 2) ) (input layer has no weights)
        std::vector<std::vector<std::vector<double>>> weights;
        //stores the output of each neuron of each layer
        std::vector<std::vector<double>> layersOutput;

        std::vector<std::vector<std::vector<double>>> derivativeWeights;
        std::vector<std::vector<double>> derivativeBiases;

        std::default_random_engine generator;
        std::normal_distribution<double> distribution;

        double randomNormalNumber(void);

        double costDerivatives(const double&, const double&);

        std::vector<double> costDerivatives(const std::vector<double> &, const std::vector<double> &);

        void backPropagation(const std::vector<double>& neuralNetworkInputs, const std::vector<double>& expectedOutputs, // inputs
                      std::vector<std::vector<std::vector<double>>>& derivativeWeights, std::vector<std::vector<double>>& derivativeBiases); // outputs

        void update_mini_batch( const std::vector<std::pair<std::vector<double>,std::vector<double>>> & mini_batch, double eta);

    //public:

        neuralnetwork(const std::vector<int>& sizes);

        std::vector<double> feedforward(const std::vector<double>&);
    };


    std::vector<double> sigmoid(const std::vector<double> &);
    double sigmoid(double);
    std::vector<double> sigmoid_prime(const std::vector<double> &);
    //double sigmoid_prime(double);


}


neuralnetwork.cpp

#include "neuralnetwork.h"
#include <iostream>
#include <assert.h>
#include <algorithm>

namespace nn
{
    int counter = 0;

    neuralnetwork::neuralnetwork(const std::vector<int> &sizes)
    {
        this->distribution = std::normal_distribution<double>( 0.0 , 1.0 );

        this->numLayers = sizes.size();
        this->sizes = sizes;

        this->numWeights = 0;
        this->numBiases = 0;

        for ( int i = 1 ; i < this->numLayers ; i++ )
        {
            numWeights += this->sizes[ i ] * this->sizes[ i - 1 ];
            numBiases += this->sizes[ i ];
        }

        this->weights.reserve( numWeights );
        this->biases.reserve( numBiases );

        this->derivativeWeights.reserve( numWeights );
        this->derivativeBiases.reserve( numBiases );

        this->layersOutput.reserve( this->sizes[ 0 ] + numBiases );

        std::vector<double> auxVectorWeights;
        std::vector<std::vector<double> > auxMatrixWeights;

        std::vector<double> auxVectorBiases;

#ifdef DEBUG_BUILD
        std::cout << "debugging!\n";
#endif

        //just to accommodate the input layer with null biases and inputs (makes iterating and reading easier :D).
        this->layersOutput.push_back( std::vector<double>( this->sizes[ 0 ] ) );
        std::vector<std::vector<double>> matrixNothing( 0 );
        this->weights.push_back( matrixNothing );
        this->biases.push_back( std::vector<double>( 0 ) );

        //start at the second layer (index 1), because the neurons of the first layer have no weights (nor biases)
        for ( int layer = 1 ; layer < this->numLayers ; layer++ )
        {
            //preallocate memory for the output of each layer.
            layersOutput.push_back( std::vector<double>( this->sizes[ layer ] ) );

            //-----------weights begin--------------
            //auxMatrixWeights will store the weight connections between one layer (number of columns) and its subsequent layer (number of rows)
            //auxMatrixWeights = new std::vector(this->sizes[layer], std::vector<double>( this->sizes[layer - 1] )); // it is not working...

            //sizes[layer] stores the number of neurons in the layer
            for ( int i = 0 ; i < this->sizes[ layer ] ; i++ )
            {
                //auxVectorWeights will hold the weights needed to connect neuron i (of this layer) to each neuron j (of the previous layer)
                auxVectorWeights = std::vector<double>( this->sizes[ layer - 1 ] );


                for ( int j = 0 ; j < auxVectorWeights.size() ; j++ )
                {
                    auxVectorWeights[ j ] = this->randomNormalNumber();
                }

                auxMatrixWeights.push_back( auxVectorWeights );
            }

            this->weights.push_back( auxMatrixWeights );

            auxMatrixWeights.clear();

            //-----------weights end----------------


            //-----------biases begin---------------
            auxVectorBiases = std::vector<double>( this->sizes[ layer ] );

            for ( int i = 0 ; i < auxVectorBiases.size() ; i++ )
            {
                auxVectorBiases[ i ] = this->randomNormalNumber();
            }

            this->biases.push_back( auxVectorBiases );
            //-----------biases end-----------------
        }

#ifdef _DEBUG
        for ( int i = 0 ; i < this->weights.size() ; i++ )
        {
            std::cout << "layer " << i << "\n";
            for ( int j = 0 ; j < this->weights[ i ].size() ; j++ )
            {
                std::cout << "neuron" << j << std::endl;
                for ( const auto k : this->weights[ i ][ j ] )
                {
                    std::cout << '\t' << k << ' ';
                }
                std::cout << std::endl;
            }
        }
#endif
    }

    template <class T>
    inline int lastIndex(const std::vector<T> & vector , int tail)
    {
        return (vector.size() - tail);
    }

    double neuralnetwork::randomNormalNumber(void)
    {
        return this->distribution( this->generator );
    }

    double sigmoid(double z)
    {
        return 1.0 / ( 1.0 + exp( -z ) );
    }

    std::vector<double> sigmoid(const std::vector<double> & z)
    {
        int max = z.size();
        std::vector<double> output;
        output.reserve(max);

        for(int i=0;i<max;i++)
        {
            output.push_back(0);
            output[i] = 1.0 / ( 1.0 + exp( -z[i] ) );
        }

        return output;
    }

    /*double sigmoid_prime(double z)
    {
        return sigmoid( z ) * ( 1 - sigmoid( z ) );
    }*/

    std::vector<double> sigmoid_prime(const std::vector<double>& z)
    {
        int max = z.size();
        std::vector<double> output;
        output.reserve(max);

        for(int i=0;i<max;i++)
        {
            output.push_back(sigmoid( z[i] ) * ( 1 - sigmoid( z[i] ) ) );
        }

        return output;
    }

    //scalar times vector
    std::vector<double> operator* (double a , const std::vector<double> & b)
    {
        int size = b.size();

        std::vector<double> result(size);

        for ( int i = 0 ; i < size ; i++ )
        {
            result[i] = a * b[ i ];
        }

        return result;
    }

    // elementwise (Hadamard) product
    std::vector<double> operator* (const std::vector<double> & a , const std::vector<double> & b)
    {

#ifdef _DEBUG
        assert(a.size() == b.size());
#endif

        int size = a.size(); // or b.size(). they should have the same size.

        std::vector<double> result;
        result.reserve(size); // or b.size(). they should have the same size.


        for ( int i = 0 ; i < size ; i++ )
        {
            result.push_back( a[ i ] * b[ i ] );
        }

        return result;
    }

    //matrix times columns vector
    std::vector<double> operator* (const std::vector<std::vector<double>> & a , const std::vector<double> & b)
    {
#ifdef _DEBUG
        assert(a[0].size() == b.size());

        for(int i = 0 ; i < ( lastIndex( a , 1 )) ; i++)
        {
            assert(a[i].size() == a[i+1].size());
        }
#endif

        int lines = a.size();
        int columns = a[0].size();

        std::vector<double> result;
        result.reserve(lines);

        int j = 0;

        for ( int i = 0 ; i < lines ; i++ )
        {
            result.push_back(0);
            for(j = 0 ; j < columns ; j++)
            {
                result[i] += a[ i ][ j ] * b[ j ];
            }
        }

        return result;
    }

    //scalar times matrix (calls scalar times vector)
    std::vector<std::vector<double>> operator* (double a , const std::vector<std::vector<double>> & b)
    {
#ifdef _DEBUG
        for(int i = 0 ; i < b.size()-1 ; i++)
        {
            assert(b[i].size() == b[i+1].size());
        }
#endif

        int lines = b.size();
        int columns = b[0].size();

        std::vector<std::vector<double>> result;
        result.reserve(lines);

        for ( int i = 0 ; i < lines ; i++ )
        {
            result.push_back(a * b[ i ]);
        }

        return result;
    }

    std::vector<double> operator+(const std::vector<double>& a, const std::vector<double>& b)
    {
        assert(a.size() == b.size());

        int size = a.size();

        std::vector<double> result;
        result.reserve(size);

        for(int i = 0 ; i < size ; i++)
        {
            result.push_back(0);
            result[i] = a[i] + b[i];
        }

        return result;
    }

    //sum of matrices
    std::vector<std::vector<double>> operator+(const std::vector<std::vector<double>>& a, const std::vector<std::vector<double>>& b)
    {
#ifdef _DEBUG
        assert(a.size() == b.size());
#endif

        int size = a.size();

#ifdef _DEBUG
        for(int i = 0 ; i < size ; i++)
        {
            assert(a[i].size() == b[i].size());
        }
#endif

        std::vector<std::vector<double>> result;
        result.reserve(size);

        for(int i = 0 ; i < size ; i++)
        {
            result.push_back(a[i] + b[i]);
        }

        return result;
    }


    //subtraction of vectors
    std::vector<double> operator-(const std::vector<double>& a, const std::vector<double>& b)
    {
#ifdef _DEBUG
        assert(a.size() == b.size());
#endif

        int size = a.size();

        std::vector<double> result;
        result.resize(size);

        for(int i = 0 ; i < size ; i++)
        {
            result[i] = a[i] - b[i];
        }

        return result;
    }

    //subtraction of matrices (calls subtraction of vectors)
    std::vector<std::vector<double>> operator-(const std::vector<std::vector<double>>& a, const std::vector<std::vector<double>>& b)
    {
#ifdef _DEBUG
        assert(a.size() == b.size());
#endif

        int size = a.size();

#ifdef _DEBUG
        for(int i = 0 ; i < size ; i++)
        {
            assert(a[i].size() == b[i].size());
        }
#endif

        std::vector<std::vector<double>> result;
        result.reserve(size);

        for(int i = 0 ; i < size ; i++)
        {
            result.push_back(a[i] - b[i]);
        }

        return result;
    }

    //elementwise division
    std::vector<double> operator/(const std::vector<double>& a, const std::vector<double>& b)
    {
        assert(a.size() == b.size());

        int size = a.size();

        std::vector<double> result;
        result.resize(size);

        for(int i = 0 ; i < size ; i++)
        {
            if(std::fabs(b[i]) < MIN_NUMBER_TOLERANCE)
            {
                throw std::runtime_error("Can't divide by zero!");
            }
            result[i] = a[i] / b[i];
        }

        return result;
    }

    double neuralnetwork::costDerivatives(const double &networkOutput , const double &expectedOutput)
    {
        return networkOutput - expectedOutput;
    }

    std::vector<double> neuralnetwork::costDerivatives(const std::vector<double> &networkOutput , const std::vector<double> &expectedOutput)
    {
        assert(expectedOutput.size() == networkOutput.size());
        int size = networkOutput.size();
        std::vector<double> output;
        output.reserve(size);

        for(int i = 0 ; i < size ; i++)
        {
            output.push_back(networkOutput[i] - expectedOutput[i]);
        }

        return output;
    }

    void neuralnetwork::backPropagation(const std::vector<double> &neuralNetworkInputs , const std::vector<double> &expectedOutputs, // inputs
                                        std::vector<std::vector<std::vector<double>>>& derivativeWeights , std::vector<std::vector<double>>& derivativeBiases) // outputs
    {


        std::cout << "teste "<< counter++ << std::endl;
        system("PAUSE");



        derivativeWeights.reserve( sizes.size() - 1 );
        derivativeBiases.reserve( sizes.size() - 1 );

        //to store one activation layer
        std::vector<double> activation = neuralNetworkInputs;
        //to store each one of the activation layers
        std::vector<std::vector<double>> activations;

        activations.reserve(sizes.size()); // numBiases is the same as the number of neurons (except 1st layer)
        activations.push_back(activation);
        int maxLayerSize = 0;


        std::cout << "teste "<< counter++ << std::endl;
        system("PAUSE");




        for ( int i = 1 ; i < numBiases ; i++ )
        {
            maxLayerSize = std::max(sizes[i], maxLayerSize);
        }


        std::cout << "teste "<< counter++ << std::endl;
        system("PAUSE");




        // to store one weighted sum
        std::vector<double> z;
        z.reserve(maxLayerSize);
        // to store each one of the weighted sums
        std::vector<std::vector<double>> zs;
        zs.reserve(sizes.size());

        // layer and neuron counter
        int layer, neuron;

        for ( layer = 1 ; layer < numLayers ; layer++ )
        {
            z = (weights[layer] * activation) + biases[layer];
            zs.push_back(z);
            activation = sigmoid(z);
            activations.push_back(activation);
        }


        std::cout << "teste "<< counter++ << std::endl;
        system("PAUSE");




        std::vector<double> delta = costDerivatives(activations[ lastIndex( activations , 1 )] , expectedOutputs) * sigmoid_prime(z);
        delta.reserve(maxLayerSize);

        derivativeBiases.push_back(delta);

        int j;

        std::vector<std::vector<double>> dummyMatrix;
        dummyMatrix.reserve(maxLayerSize);


        for (neuron = 0; neuron < sizes[ lastIndex( sizes , 1 )]; neuron++)
        {
            dummyMatrix.push_back(std::vector<double>(activations[ lastIndex( activations , 2 )].size()));
            for (j = 0; j < activations[ lastIndex( activations , 2 )].size(); j++)
            {
                dummyMatrix[neuron][j] = delta[neuron] * activations[ lastIndex( activations , 2 )][j];
            }
        }


        std::cout << "teste "<< counter++ << std::endl;
        system("PAUSE");



        derivativeWeights.push_back(dummyMatrix);
        dummyMatrix.clear();

        std::vector<double> sp;
        sp.reserve(maxLayerSize);

        std::vector<double> dummyVector;
        dummyVector.reserve(maxLayerSize);

        double dummyDouble = 0;

        for(layer = 2 ; layer < numLayers ; layer++)
        {
            z = zs[ lastIndex( zs , layer )];
            sp = sigmoid_prime(z);

            for(j = 0 ; j < sizes[ lastIndex( weights , layer )] ; j++)
            {
                for (neuron = 0; neuron < sizes[ lastIndex( sizes , layer - 1 )]; neuron++)
                {
                    dummyDouble += weights[ lastIndex( weights , layer - 1 )][neuron][j] * delta[neuron];
                }
                dummyVector.push_back(dummyDouble * sp[j]);
                dummyDouble = 0;
            }
            delta = dummyVector;
            dummyVector.clear();

            derivativeBiases.push_back(delta);

            for (neuron = 0; neuron < sizes[ lastIndex( sizes , layer )]; neuron++)
            {
                dummyMatrix.push_back(std::vector<double>(sizes[ lastIndex( sizes , layer + 1 )]));
                for (j = 0; j < sizes[ lastIndex( sizes , layer + 1 )]; j++)
                {
                    dummyMatrix[neuron][j] = activations[ lastIndex( activations , layer + 1 )][j] * delta[neuron];
                }
            }
            derivativeWeights.push_back(dummyMatrix);
            dummyMatrix.clear();
        }


        std::cout << "teste "<< counter++ << std::endl;
        system("PAUSE");




        //both derivativeWeights and derivativeBiases are reversed. so let's reverse it.
        std::reverse(derivativeWeights.begin(),derivativeWeights.end());
        std::reverse(derivativeBiases.begin(),derivativeBiases.end());


        std::cout << "teste "<< counter++ << std::endl;
        system("PAUSE");




    }
}


main.cpp

#include <stdio.h>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "neuralnetwork.h"
#include <string>

void printAll(const std::vector<double> & v, const std::string & name)
{
    int size = v.size();

    std::cout << "\t" <<  name << ":\t";

    for(int i = 0 ; i < size ; i++)
    {
        std::cout << v[i] << "\t";
    }

    std::cout << std::endl;
}

template<class T>
void printAll(const std::vector<T> & v, const std::string & name)
{
    int size = v.size();

    std::cout << name << ":" << std::endl;

    for(int i = 0 ; i < size ; i++)
    {
        printAll(v[i], "\t" + ("[" + std::to_string(i)) + "]");
    }
}

int main(int argc, char** argv )
{

    nn::neuralnetwork n({2,4,3});

    n.weights = {{},{{1,2},{3,4},{5,6},{7,8}} , {{9,8,7,6},{5,4,3,2},{1,2,3,4}}};
    n.biases = {{},{1, 4, 6, 8} , {9, 2, 4}};

    printAll(n.weights,"weights");
    printAll(n.biases,"biases");

    std::vector<std::vector<std::vector<double>>> derivativeWeights;
    std::vector<std::vector<double>> derivativeBiases;
    n.backPropagation({1,2},{1,2,3},derivativeWeights,derivativeBiases);

    printAll(n.derivativeWeights,"derivativeWeights");
    printAll(n.derivativeBiases,"derivativeBiases");

    system("PAUSE");

    return 0;
}

Best answer

It looks like your problem is that in the constructor you only reserve memory for the vectors instead of allocating it.

The reserve method does not resize the vector; it is a performance optimization for cases where you know the vector will grow later. It only changes the vector's capacity, not its size, so it creates no elements you can legally access.

In this particular code that causes no problem for 'weights' and 'biases', because you initialize them with vectors of the proper size, which does set them to the correct size. The problem is with derivativeWeights and derivativeBiases: you reserve memory for those vectors but never actually resize them, so indexing into that memory is undefined behavior. Use resize instead of reserve, or push the elements back one by one, which also grows the vector.
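
To make the difference concrete, here is a minimal standalone sketch (not taken from your code) contrasting reserve, resize, and push_back:

#include <cassert>
#include <vector>

int main()
{
    std::vector<double> reserved;
    reserved.reserve(3);          // capacity grows, but size() is still 0
    assert(reserved.size() == 0);
    //reserved[0] = 1.0;          // undefined behavior: element 0 does not exist

    std::vector<double> resized;
    resized.resize(3);            // size() == 3, elements value-initialized to 0.0
    resized[0] = 1.0;             // fine: element 0 exists

    std::vector<double> grown;
    grown.reserve(3);             // optional optimization, avoids reallocations
    grown.push_back(1.0);         // push_back increases size(), so grown[0] is valid

    return 0;
}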

One more comment: you don't have to write this-> for every class member; if you leave it out, this-> is applied implicitly to members of the class.
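
For example, your randomNormalNumber could be written either way; the two bodies below are equivalent:

    double neuralnetwork::randomNormalNumber(void)
    {
        //return this->distribution( this->generator );  // explicit qualification
        return distribution( generator );                // unqualified names find the same members
    }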
