前言

一个小Demo, 用于分析的源文件比较简单, 主要的部分都有, 扩展比较容易.
将正则表达式表示的模式构造为状态转换图. 在本文中只列举状态转换图.

双缓冲区(代码中的Buffer类): https://my.oschina.net/u/3107416/blog/3110834

数字的状态转换

[编译原理-词法分析(二)] 使用状态转换图识别词法单元-LMLPHP

保留字和ID的状态转换

[编译原理-词法分析(二)] 使用状态转换图识别词法单元-LMLPHP

运算符的状态转换

[编译原理-词法分析(二)] 使用状态转换图识别词法单元-LMLPHP

用于分析的源文件

[编译原理-词法分析(二)] 使用状态转换图识别词法单元-LMLPHP

结果

[编译原理-词法分析(二)] 使用状态转换图识别词法单元-LMLPHP

前情提要

一、词素模式
二、打印Token
三、StateTransition类
四、StateTransition的构造与析构函数
五、StateTransition普通函数的实现
六、运算符的状态转换
七、数字的状态转换
八、保留字和ID的状态转换
九、空格, 制表符, 换行符设置
十、调用

正文

将使用<~> 标记来自哪个文件

一、词素模式
<~Token.h>

namespace Lexical_Analysis {
    enum Tag {
        RESERVE_WORD = 256,
        ID,
        METHOD,
        RELOP,
        NUM,
    };

    enum RelopTag {
        LT, // <
        LE, // <=
        EQ, // =
        NE, // <>
        GT, // >
        GE, // >=
    };

    enum ReserveWordTag {
        INT,
        IF,
        ELSE,
        THEN
    };

    class Token {
    public:
        Tag tag;
        explicit Token(Tag t): tag(t) { }
        virtual ~Token() = default;
    };

    // 运算符值
    class RelopToken : public Token {
    public:
        RelopTag relop;
        explicit RelopToken(RelopTag tag): Token(RELOP), relop(tag) { }
    };

    // 保留字tag, 和词素
    class ReserveWordToken : public Token {
    public:
        ReserveWordTag reserveWordTag;
        std::string lexeme;
        ReserveWordToken(ReserveWordTag t, std::string l): Token(RESERVE_WORD), reserveWordTag(t), lexeme(l) { }
    };

    // 存储id 值
    class IdToken : public Token {
    public:
        std::string value;
        explicit IdToken(std::string v): Token(ID), value(v) { }
    };

    // 存储int 值
    class IntToken : public Token {
    public:
        int value;
        explicit IntToken(int v): Token(NUM), value(v) { }
    };

    // 存储double 值
    class DoubleToken : public Token {
    public:
        double value;
        explicit DoubleToken(double v): Token(NUM), value(v) { }
    };

}
二、打印Token
void Lexical_Analysis::StateTransition::print_token(Lexical_Analysis::Token *token) {
    RelopToken* relopToken;
    ReserveWordToken* reserveWordToken;
    IdToken* idToken;
    IntToken* intToken;
    DoubleToken* doubleToken;

    if ((relopToken = dynamic_cast<RelopToken*>(token))) {
        std::cout << "<" << getRelopTagStr(relopToken->relop) << ">" << std::endl;
    } else if ((reserveWordToken = dynamic_cast<ReserveWordToken*>(token))) {
        std::cout << "<" << getTagStr(reserveWordToken->tag) << ", " << getReserveWordTagStr(reserveWordToken->reserveWordTag) << ">" << std::endl;
    } else if ((idToken = dynamic_cast<IdToken*>(token))) {
        std::cout << "<id, " << idToken->value << ">" << std::endl;
    } else if ((intToken = dynamic_cast<IntToken*>(token))) {
        std::cout << "<int, " << intToken->value << ">" << std::endl;
    } else if ((doubleToken = dynamic_cast<DoubleToken*>(token))) {
        std::cout << "<double, " << doubleToken->value << ">" << std::endl;
    }

}

std::string Lexical_Analysis::StateTransition::getTagStr(Lexical_Analysis::Tag val) {
    if (val == Tag::RESERVE_WORD) return "RESERVE_WORD";
    else if (val == Tag::ID) return "ID";
    else if (val == Tag::RELOP) return "RELOP";
    else if (val == Tag::METHOD) return "METHOD";
    else if (val == Tag::NUM) return "NUM";

    return std::__cxx11::string();
}

std::string Lexical_Analysis::StateTransition::getRelopTagStr(Lexical_Analysis::RelopTag val) {
    if (val == RelopTag::LT) return "<";
    else if (val == RelopTag::LE) return "<=";
    else if (val == RelopTag::EQ) return "=";
    else if (val == RelopTag::NE) return "<>";
    else if (val == RelopTag::GT) return ">";
    else if (val == RelopTag::GE) return ">=";

    return std::__cxx11::string();
}

std::string Lexical_Analysis::StateTransition::getReserveWordTagStr(Lexical_Analysis::ReserveWordTag val) {
    if (val == ReserveWordTag::INT) return "INT";
    else if (val == ReserveWordTag::IF) return "IF";
    else if (val == ReserveWordTag::ELSE) return "ELSE";
    else if (val == ReserveWordTag::THEN) return "THEN";

    return std::__cxx11::string();
}

三、StateTransition类
<~StateTransition.h>

namespace Lexical_Analysis {

    class StateTransition {
    public:
        explicit StateTransition(Buffer<>* _buffer);
        ~StateTransition();

    private:
        Buffer<>* buffer; // 缓冲区
        std::map<std::string, ReserveWordToken*> reserveWords; // 记号表, 见构造函数初始化
    private:
        /**
         * 将token存入记号表
         * @param token
         */
        void reserve(ReserveWordToken* token);
    public:
        /**
         * 运算符的状态转换实现
         */
        Token* getRelop();
        /**
         * 数字的状态转换实现
         */
        Token* getNumber();
        /**
         * 字符序列的状态转换实现
         */
        Token* getCharacterSequence();

        /**
         * 更新空格 制表符 换行
         */
        void getBlankTabNewline();

        /**
         * 全局恢复策略, 将forward值重置为lexemeBegin的值
         * 使用另一个状态图从尚未处理的输入部分的真实位置开始识别
         */
        void fail();

        /**
         * 回退一个字符
         */
        void retract();

    public:

        void print_token(Token* token);
        std::string getTagStr(Tag val);
        std::string getRelopTagStr(RelopTag val);
        std::string getReserveWordTagStr(ReserveWordTag val);
    };
}
四、StateTransition的构造与析构函数
<~StateTransition.cpp>

Lexical_Analysis::StateTransition::StateTransition(Lexical_Analysis::Buffer<> *_buffer):buffer(_buffer) {
    reserve(new ReserveWordToken(ReserveWordTag::INT, "int"));
    reserve(new ReserveWordToken(ReserveWordTag::IF, "if"));
    reserve(new ReserveWordToken(ReserveWordTag::ELSE, "else"));
    reserve(new ReserveWordToken(ReserveWordTag::THEN, "then"));
}

Lexical_Analysis::StateTransition::~StateTransition() {

}
五、StateTransition普通函数的实现
<~StateTransition.cpp>

void Lexical_Analysis::StateTransition::fail() {
    buffer->forward = buffer->lexemeBegin;
}

void Lexical_Analysis::StateTransition::retract() {
    buffer->pre();
}

void Lexical_Analysis::StateTransition::reserve(Lexical_Analysis::ReserveWordToken *token) {
    reserveWords[token->lexeme] = token;
}
六、运算符的状态转换, 在下文的三个状态转换中, 都首先检查了当前字符是否满足当前状态转换需要的条件, 如果不满足则直接返回. 然后则是一个while循环与多路分支语句.
<~StateTransition.cpp>

Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getRelop() {
    int state = 0;

    char c = buffer->cur();
    bool is_loop = (c == '<' || c == '=' || c == '>');

    if (!is_loop) return nullptr;

    while (is_loop) {
        switch (state) {
            case 0:
                c = buffer->next();

                if (c == '<') state = 1;
                else if (c == '=') state = 5;
                else if (c == '>') state = 6;
                break;
            case 1:
                c = buffer->next();

                if (c == '=') state = 2;
                else if (c == '>') state = 3;
                else state = 4;
                break;
            case 2:
                // 需要将lexemeBegin指针设置为当前词素之后的第一个字符,
                // 因此使forward向前移动, 更改lexemeBegin指针后在回退
                buffer->next();
                buffer->lexemeBegin = buffer->forward;
                retract();
                return new RelopToken(RelopTag::LE);
            case 3:
                buffer->next();
                buffer->lexemeBegin = buffer->forward;
                retract();
                return new RelopToken(RelopTag::NE);
            case 4:
                buffer->lexemeBegin = buffer->forward;
                retract();
                return new RelopToken(RelopTag::LT);
            case 5:
                buffer->next();
                buffer->lexemeBegin = buffer->forward;
                retract();
                return new RelopToken(RelopTag::EQ);
            case 6:
                c = buffer->next();

                if (c == '=') state = 7;
                else state = 8;
                break;
            case 7:
                buffer->next();
                buffer->lexemeBegin = buffer->forward;
                retract();
                return new RelopToken(RelopTag::GE);
            case 8:
                // 当前forward指针已经指向当前词素之后的第一个位置, 不用更改
                buffer->lexemeBegin = buffer->forward;
                retract();
                return new RelopToken(RelopTag::GT);
        }

    }
}
七、数字的状态转换, 包括小数位
<~StateTransition.cpp>

void calculation(int& result, int val) {
    result = result * 10 + val;
}

Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getNumber() {
    int state = 12;

    bool is_loop = is_digit(buffer->cur());

    /* 直接返回 */
    if (!is_loop) return nullptr;

    int val = 0; // 整数部分

    ////////////////////
    double dVal = 0.0; // 小数部分
    int place = 10; // 小数点后第一位/10, 第二位/100 ...
    bool isDecimal = false; // 是否为小数

    ////////////////////

    int ePos = 0; // E 后面的数 (E06)
    bool isNegative = false; // E后面的数是否为负数 (E-03)

    char c;
    while (is_loop) {
        switch (state) {
            case 12:
                c = buffer->next();
                if (isdigit(c)) {
                    calculation(val, to_digit_10(c));
                    state = 13;
                }
                break;
            case 13:
                c = buffer->next();
                if (isdigit(c)) {
                    calculation(val, to_digit_10(c));
                    state = 13;
                }
                else if (c == '.') {
                    isDecimal = true;
                    state = 14;
                }
                else if (c == 'E') state = 16;
                else state = 19;
                break;
            case 14:
                c = buffer->next();
                if (isdigit(c)) {
                    double d = 1.0 * to_digit_10(c) / place;
                    dVal += d;
                    place *= 10;

                    state = 15;
                }
                else state = 19;
                break;
            case 15:
                c = buffer->next();
                if (isdigit(c)) {
                    double d = 1.0 * to_digit_10(c) / place;
                    dVal += d;
                    place *= 10;

                    state = 15;
                }
                else if (c == 'E') state = 16;
                else state = 19;
                break;
            case 16:
                c = buffer->next();
                if (isdigit(c)) {
                    calculation(ePos, to_digit_10(c));
                    state = 18;
                }
                else if (c == '+' || c == '-') {
                    if (c == '-') isNegative = true;
                    state = 17;
                }
                break;
            case 17:
                c = buffer->next();
                if (isdigit(c)) {
                    calculation(ePos, to_digit_10(c));
                    state = 18;
                }
                break;
            case 18:
                c = buffer->next();
                if (isdigit(c)) {
                    calculation(ePos, to_digit_10(c));
                    state = 18;
                }
                else state = 19;
                break;
            case 19:
                is_loop = false;
                break;
        }
    }

    // 使lexemeBegin指向刚才找到的词素之后的第一个字符, 处理完词素后, forward回退一个位置
    char* tmp = buffer->forward;

    if (isDecimal) { // 如果是小数
        dVal += val;
        if (isNegative) for (int i = 0; i < ePos; i++) dVal /= 10;
        else for (int i = 0; i < ePos; i++) dVal *= 10;

        // 回退, 并设置lexemeBegin值
        retract();
        buffer->lexemeBegin = tmp;

        DoubleToken* token = new DoubleToken(dVal);
        return token;
    } else {
        if (isNegative) for (int i = 0; i < ePos; i++) val /= 10;
        else for (int i = 0; i < ePos; i++) val *= 10;

        retract();
        buffer->lexemeBegin = tmp;

        return new IntToken(val);
    }
}
八、保留字和ID的状态转换
<~StateTransition.cpp>

Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getCharacterSequence() {
    int state = 9;

    bool is_loop = is_letter(buffer->cur());

    /* 直接返回 */
    if (!is_loop) return nullptr;

    char c;
    while (is_loop) {
        switch (state) {
            case 9:
                c = buffer->next();
                if (is_underline(c) || is_letter(c)) state = 10;
                break;
            case 10:
                c = buffer->next();
                if (is_letter(c) || is_digit(c) || is_underline(c)) state = 10;
                else state = 11;
                break;
            case 11:
                is_loop = false;
                break;
        }
    }

    // 使lexemeBegin指向刚才找到的词素之后的第一个字符, 处理完词素后, forward回退一个位置
    char* tmp = buffer->forward;
    if (!buffer->is_end) {
        // 如果没有读到结尾, 执行回退
        retract();
    }
    std::string result = buffer->getString();

    buffer->lexemeBegin = tmp;

    // 从符号表中查找是否为关键词(保留字), 如果是则返回关键词token, 否则返回id token
    auto it = reserveWords.find(result);

    if (it != reserveWords.end()) {
        return (*it).second;
    }

    return new IdToken(result);
}
九、空格, 制表符, 换行符设置
<~StateTransition.cpp>

void Lexical_Analysis::StateTransition::getBlankTabNewline() {

    bool is_loop = is_blank_tab_newline(buffer->cur());
    if (!is_loop) return ;

    while (is_blank_tab_newline(buffer->cur())) {
        buffer->next();
    }
    buffer->lexemeBegin = buffer->forward;
}
十、调用
<~main.cpp>


int main() {

    string fileStr = "/home/yanuas/CompilingPrinciple/LexicalAnalysis/code.src";

    Buffer<1024> buffer(fileStr);
    StateTransition transition(&buffer);

    while (1) {
        Token* token = transition.getRelop();
        if (token != nullptr) transition.print_token(token);

        token = transition.getCharacterSequence();
        if (token != nullptr) transition.print_token(token);

        token = transition.getNumber();
        if (token != nullptr) transition.print_token(token);

        transition.getBlankTabNewline();

        if (transition.buffer->is_end) break;
    }

    return 0;
}
09-26 04:19