类C语言编译器-词法分析器
词法分析的主要任务就是对源程序代码进行分解为单个的词,并去除其中的注释内容。
字符分类
在本人实现的词法分析器中,将这些单词分为五类:保留字、运算符、界符、整数常数、标识符。(PS:只做了整型INT的识别,暂不支持浮点型FLOAT/DOUBLE的识别)
保留字: int | bool | main | if | else | for | while | true | false | break | return
运算符: + | - | * | / | % | & | && …
界 符: , | ; | { | } | [ | ] …
整数常数:(digit)+
标识符: ( _ | letter ) ( _ | letter | digit )*
namespace Tag
{
//保留字
const int
INT = 1,
BOOL = 2,
MAIN = 3,
IF = 4, ELSE = 5, FOR = 6, WHILE = 7, BREAK = 8, RETURN = 9,
TRUE = 10, FALSE = 11;
//运算符
const int
NOT = 20, //!
NE = 21, //!=
AUTOMINUS = 22, //--
MINUS = 23, //-
AUTOADD = 24, //++
ADD = 25, //+
BITOR = 26, //|
OR = 27, //||
BITAND = 28, //&
AND = 29, //&&
MUTIPLY = 30, //*
DIVIDE = 31, ///
MOD = 32, //%
EQ = 33, //==
ASSIN = 34, //=
GE = 35, //>=
GT = 36, //>
LE = 37, //<=
LS = 38; //<
//分界符
const int
COMMA = 60, //,
SEMICOLON = 61, //;
LLBRACKET = 62, //(
RLBRACKET = 63, //)
LMBRACKET = 64, //[
RMBRACKET = 65, //]
LGBRACKET = 66, //{
RGBRACKET = 67; //}
//整数常数
const int NUM = 98;
//标识符
const int ID = 99;
//错误
const int ERROR = 999;
//结束符
const int END = 100;
}
代码实现
数据结构定义
词法单元
class Word
{
private:
std::string lexeme;
int tag;
public:
Word() = default;
Word(std::string s, int t) : lexeme(s), tag(t) {}
std::string getLexeme() { return lexeme; }
int getTag() { return tag; }
void setTag(int t) { tag = t; }
void setLexeme(std::string s) { lexeme = s; }
};
词法分析器
class Lexer
{
private:
std::unordered_map<std::string, Word> words;
int line;
public:
Lexer();
void reserve(Word w);
bool readnext(char c, std::istringstream& in);
Word scan(std::istringstream& in);
int getLine() { return line; }
};
具体实现
//判断是否为字母
bool IsLetter(char letter) {
//注意C语言允许下划线也为标识符的一部分 所以这里把 _ 也看做字母
if ((letter >= 'a' && letter <= 'z') || (letter >= 'A' && letter <= 'Z') || letter == '_') {
return true;
} else {
return false;
}
}
//判断是否为数字
bool IsDigit(char digit) {
if (digit >= '0' && digit <= '9') {
return true;
} else {
return false;
}
}
void Lexer::reserve(Word w) {
words.insert({ w.getLexeme(), w });
}
Lexer::Lexer() {
//存入保留字,为了区分标识符
reserve(Word("int", Tag::INT));
reserve(Word("bool", Tag::BOOL));
reserve(Word("main", Tag::MAIN));
reserve(Word("if", Tag::IF));
reserve(Word("else", Tag::ELSE));
reserve(Word("for", Tag::FOR));
reserve(Word("while", Tag::WHILE));
reserve(Word("break", Tag::BREAK));
reserve(Word("return", Tag::RETURN));
reserve(Word("true", Tag::TRUE));
reserve(Word("false", Tag::FALSE));
line = 1;
}
//方便处理像>=,++等这些两个字符连在一起的运算符
bool Lexer::readnext(char c, std::istringstream& in) {
if (c != in.peek()) {
return false;
}
in.get();
return true;
}
Word Lexer::scan(std::istringstream& in) {
char ch;
if (in.eof()) {
return Word("#", Tag::END);
}
while (1) {
ch = in.get();
if (ch == ' ' || ch == '\t') //忽略空格及制表符
;
else if (ch == '\n') {
++line;
}
else if (ch == '/') { //忽略注释
if (readnext('/', in)) { //忽略单行注释 //comment
while (ch != '\n' && !in.eof()) {
ch = in.get();
}
if (ch == '\n') {
line++;
}
else {
return Word("#", Tag::END);
}
}
else if (readnext('*', in)) { //忽略多行注释 /*comment*/
while (!readnext('*', in)) {
ch = in.get();
if (ch == '\n')
line++;
else if (ch == -1) //eof
return Word("error", Tag::ERROR);
}
if (!readnext('/', in)) {
return Word("error", Tag::ERROR);
}
}
}
else if (ch == -1) {//eof
return Word("#", Tag::END);
}
else {
break;
}
}
//处理分界符、运算符等
switch (ch) {
case '!':
if (readnext('=', in))
return Word("!=", Tag::NE);
else
return Word("!", Tag::NOT);
case '-':
if (readnext('-', in))
return Word("--", Tag::AUTOMINUS);
else
return Word("-", Tag::MINUS);
case '+':
if (readnext('+', in))
return Word("++", Tag::AUTOADD);
else
return Word("+", Tag::ADD);
case '|':
if (readnext('|', in))
return Word("||", Tag::OR);
else
return Word("|", Tag::BITOR);
case '&':
if (readnext('&', in))
return Word("&&", Tag::AND);
else
return Word("&", Tag::BITAND);
case '*':
return Word("*", Tag::MUTIPLY);
case '/':
return Word("/", Tag::DIVIDE);
case '%':
return Word("%", Tag::MOD);
case '=':
if (readnext('=', in))
return Word("==", Tag::EQ);
else
return Word("=", Tag::ASSIN);
case '>':
if (readnext('=', in))
return Word(">=", Tag::GE);
else
return Word(">", Tag::GT);
case '<':
if (readnext('=', in))
return Word("<=", Tag::LE);
else
return Word("<", Tag::LS);
case ',':
return Word(",", Tag::COMMA);
case ';':
return Word(";", Tag::SEMICOLON);
case '(':
return Word("(", Tag::LLBRACKET);
case ')':
return Word(")", Tag::RLBRACKET);
case '[':
return Word("[", Tag::LMBRACKET);
case ']':
return Word("]", Tag::RMBRACKET);
case '{':
return Word("{", Tag::LGBRACKET);
case '}':
return Word("}", Tag::RGBRACKET);
}
//处理常数
if (IsDigit(ch)) {
std::string tmp;
tmp.push_back(ch);
while (IsDigit(in.peek())) {
ch = in.get();
tmp.push_back(ch);
}
return Word(tmp, Tag::NUM);
}
//处理标识符
if (IsLetter(ch)) {
std::string tmp;
tmp.push_back(ch);
while (IsLetter(in.peek())) {
ch = in.get();
tmp.push_back(ch);
}
//判断是否为保留字
if (words.find(tmp) != words.end()) {
return words[tmp];
} else {
Word w = Word(tmp, Tag::ID);
reserve(w);
return w;
}
}
if (ch != ' ' && ch != '\t' && ch != '\n') {
return Word("error", Tag::ERROR);
}
return Word("#", Tag::END);
}
每次调用 Lexer::scan 会返回一个 Word 变量。使用时循环调用 scan 直到输入流到末尾,每次循环对返回的 Word 变量进行输出,即可得到完整的词法分析结果。
参考博客: