类C语言编译器-词法分析器

  词法分析的主要任务就是对源程序代码进行分解为单个的词,并去除其中的注释内容。


字符分类

  在本人实现的词法分析器中,将这些单词分为五类:保留字、运算符、界符、整数常数、标识符。(PS:只做了整型INT的识别,暂不支持浮点型FLOAT/DOUBLE的识别)

  保留字: int | bool | main | if | else | for | while | true | false | break | return

  运算符: + | - | * | / | % | & | && …

  界 符: , | ; | { | } | [ | ] …

  整数常数:(digit)+

  标识符: ( _ | letter ) ( _ | letter | digit )*

namespace Tag
{
    //保留字
    const int
        INT = 1,
        BOOL = 2,
        MAIN = 3,
        IF = 4, ELSE = 5, FOR = 6, WHILE = 7, BREAK = 8, RETURN = 9,
        TRUE = 10, FALSE = 11;

    //运算符
    const int
        NOT = 20,       //!
        NE = 21,        //!=
        AUTOMINUS = 22, //--
        MINUS = 23,     //-
        AUTOADD = 24,   //++
        ADD = 25,       //+
        BITOR = 26,     //|
        OR = 27,        //||
        BITAND = 28,    //&
        AND = 29,       //&&
        MUTIPLY = 30,   //*
        DIVIDE = 31,    ///
        MOD = 32,       //%
        EQ = 33,        //==
        ASSIN = 34,     //=
        GE = 35,        //>=
        GT = 36,        //>
        LE = 37,        //<=
        LS = 38;        //<

    //分界符
    const int
        COMMA = 60,     //,
        SEMICOLON = 61, //;
        LLBRACKET = 62, //(
        RLBRACKET = 63, //)
        LMBRACKET = 64, //[
        RMBRACKET = 65, //]
        LGBRACKET = 66, //{
        RGBRACKET = 67; //}

    //整数常数
    const int NUM = 98;

    //标识符
    const int ID = 99;

    //错误
    const int ERROR = 999;

    //结束符
    const int END = 100;
}

代码实现

数据结构定义

  词法单元

class Word
{
private:
    std::string lexeme;
    int tag;

public:
    Word() = default;
    Word(std::string s, int t) : lexeme(s), tag(t) {}
    std::string getLexeme() { return lexeme; }
    int getTag() { return tag; }
    void setTag(int t) { tag = t; }
    void setLexeme(std::string s) { lexeme = s; }
};

  词法分析器

class Lexer
{
private:
    std::unordered_map<std::string, Word> words;
    int line;

public:
    Lexer();
    void reserve(Word w);
    bool readnext(char c, std::istringstream& in);
    Word scan(std::istringstream& in);
    int getLine() { return line; }
};

具体实现

//判断是否为字母
bool IsLetter(char letter) {
    //注意C语言允许下划线也为标识符的一部分 所以这里把 _ 也看做字母
    if ((letter >= 'a' && letter <= 'z') || (letter >= 'A' && letter <= 'Z') || letter == '_') {
        return true;
    } else {
        return false;
    }
}

//判断是否为数字
bool IsDigit(char digit) {
    if (digit >= '0' && digit <= '9') {
        return true;
    } else {
        return false;
    }
}

void Lexer::reserve(Word w) {
    words.insert({ w.getLexeme(), w });
}

Lexer::Lexer() {
    //存入保留字,为了区分标识符
    reserve(Word("int", Tag::INT));
    reserve(Word("bool", Tag::BOOL));
    reserve(Word("main", Tag::MAIN));
    reserve(Word("if", Tag::IF));
    reserve(Word("else", Tag::ELSE));
    reserve(Word("for", Tag::FOR));
    reserve(Word("while", Tag::WHILE));
    reserve(Word("break", Tag::BREAK));
    reserve(Word("return", Tag::RETURN));
    reserve(Word("true", Tag::TRUE));
    reserve(Word("false", Tag::FALSE));

    line = 1;
}

//方便处理像>=,++等这些两个字符连在一起的运算符
bool Lexer::readnext(char c, std::istringstream& in) {
    if (c != in.peek()) {
        return false;
    }
    in.get();
    return true;
}

Word Lexer::scan(std::istringstream& in) {
    char ch;
    if (in.eof()) {
        return Word("#", Tag::END);
    }
    while (1) {
        ch = in.get();
        if (ch == ' ' || ch == '\t') //忽略空格及制表符
            ;
        else if (ch == '\n') {
            ++line;
        }
        else if (ch == '/') { //忽略注释
            if (readnext('/', in)) { //忽略单行注释    //comment
                while (ch != '\n' && !in.eof()) {
                    ch = in.get();
                }
                if (ch == '\n') {
                    line++;
                }
                else {
                    return Word("#", Tag::END);
                }
            }
            else if (readnext('*', in)) { //忽略多行注释    /*comment*/
                while (!readnext('*', in)) {
                    ch = in.get();
                    if (ch == '\n')
                        line++;
                    else if (ch == -1) //eof
                        return Word("error", Tag::ERROR);
                }
                if (!readnext('/', in)) {
                    return Word("error", Tag::ERROR);
                }
            }
        }
        else if (ch == -1) {//eof
            return Word("#", Tag::END);
        }
        else {
            break;
        }
    }

    //处理分界符、运算符等
    switch (ch) {
    case '!':
        if (readnext('=', in))
            return Word("!=", Tag::NE);
        else
            return Word("!", Tag::NOT);
    case '-':
        if (readnext('-', in))
            return Word("--", Tag::AUTOMINUS);
        else
            return Word("-", Tag::MINUS);
    case '+':
        if (readnext('+', in))
            return Word("++", Tag::AUTOADD);
        else
            return Word("+", Tag::ADD);
    case '|':
        if (readnext('|', in))
            return Word("||", Tag::OR);
        else
            return Word("|", Tag::BITOR);
    case '&':
        if (readnext('&', in))
            return Word("&&", Tag::AND);
        else
            return Word("&", Tag::BITAND);
    case '*':
        return Word("*", Tag::MUTIPLY);
    case '/':
        return Word("/", Tag::DIVIDE);
    case '%':
        return Word("%", Tag::MOD);
    case '=':
        if (readnext('=', in))
            return Word("==", Tag::EQ);
        else
            return Word("=", Tag::ASSIN);
    case '>':
        if (readnext('=', in))
            return Word(">=", Tag::GE);
        else
            return Word(">", Tag::GT);
    case '<':
        if (readnext('=', in))
            return Word("<=", Tag::LE);
        else
            return Word("<", Tag::LS);
    case ',':
        return Word(",", Tag::COMMA);
    case ';':
        return Word(";", Tag::SEMICOLON);
    case '(':
        return Word("(", Tag::LLBRACKET);
    case ')':
        return Word(")", Tag::RLBRACKET);
    case '[':
        return Word("[", Tag::LMBRACKET);
    case ']':
        return Word("]", Tag::RMBRACKET);
    case '{':
        return Word("{", Tag::LGBRACKET);
    case '}':
        return Word("}", Tag::RGBRACKET);
    }

    //处理常数
    if (IsDigit(ch)) {
        std::string tmp;
        tmp.push_back(ch);
        while (IsDigit(in.peek())) {
            ch = in.get();
            tmp.push_back(ch);
        }
        return Word(tmp, Tag::NUM);
    }

    //处理标识符
    if (IsLetter(ch)) {
        std::string tmp;
        tmp.push_back(ch);
        while (IsLetter(in.peek())) {
            ch = in.get();
            tmp.push_back(ch);
        }

        //判断是否为保留字
        if (words.find(tmp) != words.end()) {
            return words[tmp];
        } else {
            Word w = Word(tmp, Tag::ID);
            reserve(w);
            return w;
        }
    }

    if (ch != ' ' && ch != '\t' && ch != '\n') {
        return Word("error", Tag::ERROR);
    }
    return Word("#", Tag::END);
}

  每次调用 Lexer::scan 会返回一个 Word 变量。使用时循环调用 scan 直到输入流到末尾,每次循环对返回的 Word 变量进行输出,即可得到完整的词法分析结果。

参考博客:

https://www.cnblogs.com/vachester/p/6884345.html

https://www.cnblogs.com/zyrblog/p/6885922.html